View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.StringReader;
20  import java.lang.reflect.InvocationTargetException;
21  import java.net.URL;
22  import java.nio.charset.Charset;
23  import java.util.ArrayList;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.concurrent.ConcurrentHashMap;
27  
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.Page;
30  import org.htmlunit.SgmlPage;
31  import org.htmlunit.WebAssert;
32  import org.htmlunit.WebClient;
33  import org.htmlunit.WebResponse;
34  import org.htmlunit.cyberneko.HTMLScanner;
35  import org.htmlunit.cyberneko.HTMLTagBalancer;
36  import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
37  import org.htmlunit.cyberneko.xerces.xni.QName;
38  import org.htmlunit.cyberneko.xerces.xni.XNIException;
39  import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
40  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
41  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
42  import org.htmlunit.html.DefaultElementFactory;
43  import org.htmlunit.html.DomNode;
44  import org.htmlunit.html.ElementFactory;
45  import org.htmlunit.html.Html;
46  import org.htmlunit.html.HtmlPage;
47  import org.htmlunit.html.UnknownElementFactory;
48  import org.htmlunit.html.parser.HTMLParser;
49  import org.htmlunit.html.parser.HTMLParserListener;
50  import org.htmlunit.svg.SvgElementFactory;
51  import org.htmlunit.util.StringUtils;
52  import org.w3c.dom.Node;
53  import org.xml.sax.SAXException;
54  
55  /**
56   * <p>SAX parser implementation that uses the NekoHTML {@link org.htmlunit.cyberneko.HTMLConfiguration}
57   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
58   *
59   * @author Christian Sell
60   * @author David K. Taylor
61   * @author Chris Erskine
62   * @author Ahmed Ashour
63   * @author Marc Guillemot
64   * @author Ethan Glasser-Camp
65   * @author Sudhan Moghe
66   * @author Ronald Brill
67   * @author Frank Danek
68   * @author Carsten Steul
69   */
70  public final class HtmlUnitNekoHtmlParser implements HTMLParser {
71  
72      /**
73       * The SVG factory.
74       */
75      public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
76  
77      private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
78  
79      static {
80          final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
81          for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
82              ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
83          }
84      }
85  
86      /**
87       * {@inheritDoc}
88       */
89      @Override
90      public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
91              final String source, final boolean createdByJavascript)
92          throws SAXException, IOException {
93          final Page page = parent.getPage();
94          if (!(page instanceof HtmlPage)) {
95              return;
96          }
97          final HtmlPage htmlPage = (HtmlPage) page;
98          final URL url = htmlPage.getUrl();
99  
100         final HtmlUnitNekoDOMBuilder domBuilder =
101                 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
102         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
103         // build fragment context stack
104         DomNode node = context;
105         final List<QName> ancestors = new ArrayList<>();
106         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
107             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
108             node = node.getParentNode();
109         }
110         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
111             ancestors.add(new QName(null, "html", null, null));
112             ancestors.add(new QName(null, "body", null, null));
113         }
114         else if (ancestors.size() == 1
115                 || (!"body".equals(ancestors.get(1).getLocalpart())
116                         && !"head".equals(ancestors.get(1).getLocalpart()))) {
117             ancestors.add(new QName(null, "body", null, null));
118         }
119 
120         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
121         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
122 
123         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
124 
125         htmlPage.registerParsingStart();
126         htmlPage.registerSnippetParsingStart();
127         try {
128             domBuilder.parse(in);
129         }
130         finally {
131             htmlPage.registerParsingEnd();
132             htmlPage.registerSnippetParsingEnd();
133         }
134     }
135 
136     /**
137      * {@inheritDoc}
138      */
139     @Override
140     public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
141             final boolean xhtml, final boolean createdByJavascript) throws IOException {
142         final URL url = webResponse.getWebRequest().getUrl();
143         final HtmlUnitNekoDOMBuilder domBuilder =
144                 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
145 
146         final Charset charset = webResponse.getContentCharset();
147         try {
148             if (!webResponse.wasContentCharsetTentative()) {
149                 // The charset is certain so ignore any others found in the document
150                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
151             }
152 
153             // xml content is different
154             if (xhtml) {
155                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
156                 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
157                 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
158                 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
159                 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
160             }
161 
162             if (webClient != null) {
163                 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
164                 if (bufferSize > 0) {
165                     domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
166                 }
167             }
168         }
169         catch (final Exception e) {
170             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
171         }
172 
173         try (InputStream content = webResponse.getContentAsStream()) {
174             final String encoding = charset.name();
175             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
176 
177             page.registerParsingStart();
178             try {
179                 domBuilder.parse(in);
180             }
181             catch (final XNIException e) {
182                 // extract enclosed exception
183                 final Throwable origin = extractNestedException(e);
184                 throw new RuntimeException("Failed parsing content from " + url, origin);
185             }
186         }
187         finally {
188             page.registerParsingEnd();
189         }
190     }
191 
192     /**
193      * Extract nested exception within an XNIException (Nekohtml uses reflection and generated
194      * exceptions are wrapped many times within XNIException and InvocationTargetException)
195      *
196      * @param e the original XNIException
197      * @return the cause exception
198      */
199     static Throwable extractNestedException(final Throwable e) {
200         Throwable originalException = e;
201         Throwable cause = ((XNIException) e).getException();
202         while (cause != null) {
203             originalException = cause;
204             if (cause instanceof XNIException) {
205                 cause = ((XNIException) cause).getException();
206             }
207             else if (cause instanceof InvocationTargetException) {
208                 cause = cause.getCause();
209             }
210             else {
211                 cause = null;
212             }
213         }
214         return originalException;
215     }
216 
217     /**
218      * {@inheritDoc}
219      */
220     @Override
221     public ElementFactory getSvgFactory() {
222         return SVG_FACTORY;
223     }
224 
225     /**
226      * {@inheritDoc}
227      */
228     @Override
229     public ElementFactory getFactory(final String tagName) {
230         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
231 
232         if (result != null) {
233             return result;
234         }
235         return UnknownElementFactory.INSTANCE;
236     }
237 
238     /**
239      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
240      *
241      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
242      * @param page the page
243      * @param namespaceURI the namespace URI
244      * @param qualifiedName the qualified name
245      * @param insideSvg is the node inside an SVG node or not
246      * @param svgSupport true if called from javascript createElementNS
247      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
248      */
249     @Override
250     public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
251             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
252         if (insideSvg) {
253             return SVG_FACTORY;
254         }
255 
256         if (namespaceURI == null || namespaceURI.isEmpty()
257             || Html.XHTML_NAMESPACE.equals(namespaceURI)
258             || Html.SVG_NAMESPACE.equals(namespaceURI)
259             || !qualifiedName.contains(":")) {
260 
261             String tagName = qualifiedName;
262             final int index = tagName.indexOf(':');
263             if (index == -1) {
264                 tagName = StringUtils.toRootLowerCase(tagName);
265             }
266             else {
267                 tagName = tagName.substring(index + 1);
268             }
269             final ElementFactory factory;
270             if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
271                 factory = SVG_FACTORY;
272             }
273             else {
274                 factory = ELEMENT_FACTORIES.get(tagName);
275             }
276 
277             if (factory != null) {
278                 return factory;
279             }
280         }
281         return UnknownElementFactory.INSTANCE;
282     }
283 }
284 
285 /**
286  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
287  */
288 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
289     private final HTMLParserListener listener_;
290     private final URL url_;
291     private final String html_;
292 
293     HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
294         WebAssert.notNull("listener", listener);
295         WebAssert.notNull("url", url);
296         listener_ = listener;
297         url_ = url;
298         html_ = htmlContent;
299     }
300 
301     /**
302      * @see DefaultErrorHandler#error(String,String,XMLParseException)
303      */
304     @Override
305     public void error(final String domain, final String key,
306             final XMLParseException exception) throws XNIException {
307         listener_.error(exception.getMessage(),
308                 url_,
309                 html_,
310                 exception.getLineNumber(),
311                 exception.getColumnNumber(),
312                 key);
313     }
314 
315     /**
316      * @see DefaultErrorHandler#warning(String,String,XMLParseException)
317      */
318     @Override
319     public void warning(final String domain, final String key,
320             final XMLParseException exception) throws XNIException {
321         listener_.warning(exception.getMessage(),
322                 url_,
323                 html_,
324                 exception.getLineNumber(),
325                 exception.getColumnNumber(),
326                 key);
327     }
328 
329     @Override
330     public void fatalError(final String domain, final String key,
331             final XMLParseException exception) throws XNIException {
332         listener_.error(exception.getMessage(),
333                 url_,
334                 html_,
335                 exception.getLineNumber(),
336                 exception.getColumnNumber(),
337                 key);
338     }
339 }