View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.StringReader;
20  import java.lang.reflect.InvocationTargetException;
21  import java.net.URL;
22  import java.nio.charset.Charset;
23  import java.util.ArrayList;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.concurrent.ConcurrentHashMap;
27  
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.Page;
30  import org.htmlunit.SgmlPage;
31  import org.htmlunit.WebAssert;
32  import org.htmlunit.WebClient;
33  import org.htmlunit.WebResponse;
34  import org.htmlunit.cyberneko.HTMLScanner;
35  import org.htmlunit.cyberneko.HTMLTagBalancer;
36  import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
37  import org.htmlunit.cyberneko.xerces.xni.QName;
38  import org.htmlunit.cyberneko.xerces.xni.XNIException;
39  import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
40  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
41  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
42  import org.htmlunit.html.DefaultElementFactory;
43  import org.htmlunit.html.DomNode;
44  import org.htmlunit.html.ElementFactory;
45  import org.htmlunit.html.Html;
46  import org.htmlunit.html.HtmlPage;
47  import org.htmlunit.html.UnknownElementFactory;
48  import org.htmlunit.html.parser.HTMLParser;
49  import org.htmlunit.html.parser.HTMLParserListener;
50  import org.htmlunit.svg.SvgElementFactory;
51  import org.htmlunit.util.StringUtils;
52  import org.w3c.dom.Node;
53  import org.xml.sax.SAXException;
54  
55  /**
56   * <p>SAX parser implementation that uses the NekoHTML {@link org.htmlunit.cyberneko.HTMLConfiguration}
57   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
58   *
59   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
60   * @author David K. Taylor
61   * @author Chris Erskine
62   * @author Ahmed Ashour
63   * @author Marc Guillemot
64   * @author Ethan Glasser-Camp
65   * @author Sudhan Moghe
66   * @author Ronald Brill
67   * @author Frank Danek
68   * @author Carsten Steul
69   */
70  public final class HtmlUnitNekoHtmlParser implements HTMLParser {
71  
72      /**
73       * The SVG factory.
74       */
75      public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
76  
77      private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
78  
79      static {
80          final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
81          for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
82              ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
83          }
84      }
85  
86      /**
87       *{@inheritDoc}
88       */
89      @Override
90      public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context, final String source,
91              final boolean createdByJavascript)
92          throws SAXException, IOException {
93          final Page page = parent.getPage();
94          if (!(page instanceof HtmlPage)) {
95              return;
96          }
97          final HtmlPage htmlPage = (HtmlPage) page;
98          final URL url = htmlPage.getUrl();
99  
100         final HtmlUnitNekoDOMBuilder domBuilder =
101                 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
102         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
103         // build fragment context stack
104         DomNode node = context;
105         final List<QName> ancestors = new ArrayList<>();
106         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
107             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
108             node = node.getParentNode();
109         }
110         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
111             ancestors.add(new QName(null, "html", null, null));
112             ancestors.add(new QName(null, "body", null, null));
113         }
114         else if (ancestors.size() == 1
115                 || (!"body".equals(ancestors.get(1).getLocalpart())
116                         && !"head".equals(ancestors.get(1).getLocalpart()))) {
117             ancestors.add(new QName(null, "body", null, null));
118         }
119 
120         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
121         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
122 
123         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
124 
125         htmlPage.registerParsingStart();
126         htmlPage.registerSnippetParsingStart();
127         try {
128             domBuilder.parse(in);
129         }
130         finally {
131             htmlPage.registerParsingEnd();
132             htmlPage.registerSnippetParsingEnd();
133         }
134     }
135 
136     /**
137      * {@inheritDoc}
138      */
139     @Override
140     public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
141             final boolean xhtml, final boolean createdByJavascript) throws IOException {
142         final URL url = webResponse.getWebRequest().getUrl();
143         final HtmlUnitNekoDOMBuilder domBuilder =
144                 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
145 
146         final Charset charset = webResponse.getContentCharset();
147         try {
148             if (!webResponse.wasContentCharsetTentative()) {
149                 // The charset is certain so ignore any others found in the document
150                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
151             }
152 
153             // xml content is different
154             if (xhtml) {
155                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
156                 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
157                 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
158                 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
159             }
160 
161             if (webClient != null) {
162                 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
163                 if (bufferSize > 0) {
164                     domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
165                 }
166             }
167         }
168         catch (final Exception e) {
169             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
170         }
171 
172         try (InputStream content = webResponse.getContentAsStream()) {
173             final String encoding = charset.name();
174             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
175 
176             page.registerParsingStart();
177             try {
178                 domBuilder.parse(in);
179             }
180             catch (final XNIException e) {
181                 // extract enclosed exception
182                 final Throwable origin = extractNestedException(e);
183                 throw new RuntimeException("Failed parsing content from " + url, origin);
184             }
185         }
186         finally {
187             page.registerParsingEnd();
188         }
189     }
190 
191     /**
192      * Extract nested exception within an XNIException (Nekohtml uses reflection and generated
193      * exceptions are wrapped many times within XNIException and InvocationTargetException)
194      *
195      * @param e the original XNIException
196      * @return the cause exception
197      */
198     static Throwable extractNestedException(final Throwable e) {
199         Throwable originalException = e;
200         Throwable cause = ((XNIException) e).getException();
201         while (cause != null) {
202             originalException = cause;
203             if (cause instanceof XNIException) {
204                 cause = ((XNIException) cause).getException();
205             }
206             else if (cause instanceof InvocationTargetException) {
207                 cause = cause.getCause();
208             }
209             else {
210                 cause = null;
211             }
212         }
213         return originalException;
214     }
215 
216     /**
217      * {@inheritDoc}
218      */
219     @Override
220     public ElementFactory getSvgFactory() {
221         return SVG_FACTORY;
222     }
223 
224     /**
225      * {@inheritDoc}
226      */
227     @Override
228     public ElementFactory getFactory(final String tagName) {
229         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
230 
231         if (result != null) {
232             return result;
233         }
234         return UnknownElementFactory.INSTANCE;
235     }
236 
237     /**
238      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
239      *
240      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
241      * @param page the page
242      * @param namespaceURI the namespace URI
243      * @param qualifiedName the qualified name
244      * @param insideSvg is the node inside an SVG node or not
245      * @param svgSupport true if called from javascript createElementNS
246      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
247      */
248     @Override
249     public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
250             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
251         if (insideSvg) {
252             return SVG_FACTORY;
253         }
254 
255         if (namespaceURI == null || namespaceURI.isEmpty()
256             || Html.XHTML_NAMESPACE.equals(namespaceURI)
257             || Html.SVG_NAMESPACE.equals(namespaceURI)
258             || !qualifiedName.contains(":")) {
259 
260             String tagName = qualifiedName;
261             final int index = tagName.indexOf(':');
262             if (index == -1) {
263                 tagName = StringUtils.toRootLowerCase(tagName);
264             }
265             else {
266                 tagName = tagName.substring(index + 1);
267             }
268             final ElementFactory factory;
269             if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
270                 factory = SVG_FACTORY;
271             }
272             else {
273                 factory = ELEMENT_FACTORIES.get(tagName);
274             }
275 
276             if (factory != null) {
277                 return factory;
278             }
279         }
280         return UnknownElementFactory.INSTANCE;
281     }
282 }
283 
284 /**
285  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
286  */
287 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
288     private final HTMLParserListener listener_;
289     private final URL url_;
290     private final String html_;
291 
292     HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
293         WebAssert.notNull("listener", listener);
294         WebAssert.notNull("url", url);
295         listener_ = listener;
296         url_ = url;
297         html_ = htmlContent;
298     }
299 
300     /**
301      * @see DefaultErrorHandler#error(String,String,XMLParseException)
302      */
303     @Override
304     public void error(final String domain, final String key,
305             final XMLParseException exception) throws XNIException {
306         listener_.error(exception.getMessage(),
307                 url_,
308                 html_,
309                 exception.getLineNumber(),
310                 exception.getColumnNumber(),
311                 key);
312     }
313 
314     /**
315      * @see DefaultErrorHandler#warning(String,String,XMLParseException)
316      */
317     @Override
318     public void warning(final String domain, final String key,
319             final XMLParseException exception) throws XNIException {
320         listener_.warning(exception.getMessage(),
321                 url_,
322                 html_,
323                 exception.getLineNumber(),
324                 exception.getColumnNumber(),
325                 key);
326     }
327 
328     @Override
329     public void fatalError(final String domain, final String key,
330             final XMLParseException exception) throws XNIException {
331         listener_.error(exception.getMessage(),
332                 url_,
333                 html_,
334                 exception.getLineNumber(),
335                 exception.getColumnNumber(),
336                 key);
337     }
338 }