View Javadoc
1   /*
2    * Copyright (c) 2002-2026 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.StringReader;
20  import java.net.URL;
21  import java.nio.charset.Charset;
22  import java.util.ArrayList;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.concurrent.ConcurrentHashMap;
26  
27  import org.htmlunit.ObjectInstantiationException;
28  import org.htmlunit.Page;
29  import org.htmlunit.SgmlPage;
30  import org.htmlunit.WebAssert;
31  import org.htmlunit.WebClient;
32  import org.htmlunit.WebResponse;
33  import org.htmlunit.cyberneko.HTMLScanner;
34  import org.htmlunit.cyberneko.HTMLTagBalancer;
35  import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
36  import org.htmlunit.cyberneko.xerces.xni.QName;
37  import org.htmlunit.cyberneko.xerces.xni.XNIException;
38  import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
39  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
40  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
41  import org.htmlunit.html.DefaultElementFactory;
42  import org.htmlunit.html.DomNode;
43  import org.htmlunit.html.ElementFactory;
44  import org.htmlunit.html.Html;
45  import org.htmlunit.html.HtmlPage;
46  import org.htmlunit.html.UnknownElementFactory;
47  import org.htmlunit.html.parser.HTMLParser;
48  import org.htmlunit.html.parser.HTMLParserListener;
49  import org.htmlunit.svg.SvgElementFactory;
50  import org.htmlunit.util.StringUtils;
51  import org.w3c.dom.Node;
52  import org.xml.sax.SAXException;
53  
54  /**
55   * <p>SAX parser implementation that uses the NekoHTML {@link org.htmlunit.cyberneko.HTMLConfiguration}
56   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
57   *
58   * @author Christian Sell
59   * @author David K. Taylor
60   * @author Chris Erskine
61   * @author Ahmed Ashour
62   * @author Marc Guillemot
63   * @author Ethan Glasser-Camp
64   * @author Sudhan Moghe
65   * @author Ronald Brill
66   * @author Frank Danek
67   * @author Carsten Steul
68   */
69  public final class HtmlUnitNekoHtmlParser implements HTMLParser {
70  
71      /**
72       * The SVG factory.
73       */
74      public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
75  
76      private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
77  
78      static {
79          final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
80          for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
81              ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
82          }
83      }
84  
85      /**
86       * {@inheritDoc}
87       */
88      @Override
89      public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
90              final String source, final boolean createdByJavascript)
91          throws SAXException, IOException {
92          final Page page = parent.getPage();
93          if (!(page instanceof HtmlPage htmlPage)) {
94              return;
95          }
96          final URL url = htmlPage.getUrl();
97  
98          final HtmlUnitNekoDOMBuilder domBuilder =
99                  new HtmlUnitNekoDOMBuilder(this, webClient, parent, url, source, createdByJavascript);
100         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
101         // build fragment context stack
102         DomNode node = context;
103         final List<QName> ancestors = new ArrayList<>();
104         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
105             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
106             node = node.getParentNode();
107         }
108         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
109             ancestors.add(new QName(null, "html", null, null));
110             ancestors.add(new QName(null, "body", null, null));
111         }
112         else if (ancestors.size() == 1
113                 || (!"body".equals(ancestors.get(1).getLocalpart())
114                         && !"head".equals(ancestors.get(1).getLocalpart()))) {
115             ancestors.add(new QName(null, "body", null, null));
116         }
117 
118         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
119         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
120 
121         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
122 
123         htmlPage.registerParsingStart();
124         htmlPage.registerSnippetParsingStart();
125         try {
126             domBuilder.parse(in);
127         }
128         finally {
129             htmlPage.registerParsingEnd();
130             htmlPage.registerSnippetParsingEnd();
131         }
132     }
133 
134     /**
135      * {@inheritDoc}
136      */
137     @Override
138     public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
139             final boolean xhtml, final boolean createdByJavascript) throws IOException {
140         final URL url = webResponse.getWebRequest().getUrl();
141         final HtmlUnitNekoDOMBuilder domBuilder =
142                 new HtmlUnitNekoDOMBuilder(this, webClient, page, url, null, createdByJavascript);
143 
144         final Charset charset = webResponse.getContentCharset();
145         try {
146             if (!webResponse.wasContentCharsetTentative()) {
147                 // The charset is certain so ignore any others found in the document
148                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
149             }
150 
151             // xml content is different
152             if (xhtml) {
153                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
154                 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
155                 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
156                 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
157                 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
158             }
159 
160             if (webClient != null) {
161                 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
162                 if (bufferSize > 0) {
163                     domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
164                 }
165             }
166         }
167         catch (final Exception e) {
168             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
169         }
170 
171         try (InputStream content = webResponse.getContentAsStream()) {
172             final String encoding = charset.name();
173             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
174 
175             page.registerParsingStart();
176             try {
177                 domBuilder.parse(in);
178             }
179             catch (final XNIException e) {
180                 // extract enclosed exception
181                 final Throwable origin = extractNestedException(e);
182                 throw new RuntimeException("Failed parsing content from " + url, origin);
183             }
184         }
185         finally {
186             page.registerParsingEnd();
187         }
188     }
189 
190     /**
191      * Extract nested exception within an XNIException.
192      *
193      * @param e the original XNIException
194      * @return the cause exception
195      */
196     static Throwable extractNestedException(final Throwable e) {
197         Throwable originalException;
198         Throwable cause = e;
199         do {
200             originalException = cause;
201 
202             if (cause instanceof XNIException) {
203                 cause = cause.getCause();
204             }
205             else {
206                 cause = null;
207             }
208         }
209         while (cause != null);
210 
211         return originalException;
212     }
213 
214     /**
215      * {@inheritDoc}
216      */
217     @Override
218     public ElementFactory getSvgFactory() {
219         return SVG_FACTORY;
220     }
221 
222     /**
223      * {@inheritDoc}
224      */
225     @Override
226     public ElementFactory getFactory(final String tagName) {
227         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
228 
229         if (result != null) {
230             return result;
231         }
232         return UnknownElementFactory.INSTANCE;
233     }
234 
235     /**
236      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
237      *
238      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
239      * @param page the page
240      * @param namespaceURI the namespace URI
241      * @param qualifiedName the qualified name
242      * @param insideSvg is the node inside an SVG node or not
243      * @param svgSupport true if called from javascript createElementNS
244      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
245      */
246     @Override
247     public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
248             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
249         if (insideSvg) {
250             return SVG_FACTORY;
251         }
252 
253         if (namespaceURI == null || namespaceURI.isEmpty()
254             || Html.XHTML_NAMESPACE.equals(namespaceURI)
255             || Html.SVG_NAMESPACE.equals(namespaceURI)
256             || !qualifiedName.contains(":")) {
257 
258             String tagName = qualifiedName;
259             final int index = tagName.indexOf(':');
260             if (index == -1) {
261                 tagName = StringUtils.toRootLowerCase(tagName);
262             }
263             else {
264                 tagName = tagName.substring(index + 1);
265             }
266             final ElementFactory factory;
267             if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
268                 factory = SVG_FACTORY;
269             }
270             else {
271                 factory = ELEMENT_FACTORIES.get(tagName);
272             }
273 
274             if (factory != null) {
275                 return factory;
276             }
277         }
278         return UnknownElementFactory.INSTANCE;
279     }
280 }
281 
282 /**
283  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
284  */
285 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
286     private final HTMLParserListener listener_;
287     private final URL url_;
288     private final String html_;
289 
290     HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
291         WebAssert.notNull("listener", listener);
292         WebAssert.notNull("url", url);
293         listener_ = listener;
294         url_ = url;
295         html_ = htmlContent;
296     }
297 
298     /**
299      * @see DefaultErrorHandler#error(String,String,XMLParseException)
300      */
301     @Override
302     public void error(final String domain, final String key,
303             final XMLParseException exception) throws XNIException {
304         listener_.error(exception.getMessage(),
305                 url_,
306                 html_,
307                 exception.getLineNumber(),
308                 exception.getColumnNumber(),
309                 key);
310     }
311 
312     /**
313      * @see DefaultErrorHandler#warning(String,String,XMLParseException)
314      */
315     @Override
316     public void warning(final String domain, final String key,
317             final XMLParseException exception) throws XNIException {
318         listener_.warning(exception.getMessage(),
319                 url_,
320                 html_,
321                 exception.getLineNumber(),
322                 exception.getColumnNumber(),
323                 key);
324     }
325 
326     @Override
327     public void fatalError(final String domain, final String key,
328             final XMLParseException exception) throws XNIException {
329         listener_.error(exception.getMessage(),
330                 url_,
331                 html_,
332                 exception.getLineNumber(),
333                 exception.getColumnNumber(),
334                 key);
335     }
336 }