View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.StringReader;
20  import java.lang.reflect.InvocationTargetException;
21  import java.net.URL;
22  import java.nio.charset.Charset;
23  import java.util.ArrayList;
24  import java.util.HashMap;
25  import java.util.List;
26  import java.util.Map;
27  
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.Page;
30  import org.htmlunit.SgmlPage;
31  import org.htmlunit.WebAssert;
32  import org.htmlunit.WebResponse;
33  import org.htmlunit.cyberneko.HTMLScanner;
34  import org.htmlunit.cyberneko.HTMLTagBalancer;
35  import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
36  import org.htmlunit.cyberneko.xerces.xni.QName;
37  import org.htmlunit.cyberneko.xerces.xni.XNIException;
38  import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
39  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
40  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
41  import org.htmlunit.html.DefaultElementFactory;
42  import org.htmlunit.html.DomNode;
43  import org.htmlunit.html.ElementFactory;
44  import org.htmlunit.html.Html;
45  import org.htmlunit.html.HtmlPage;
46  import org.htmlunit.html.UnknownElementFactory;
47  import org.htmlunit.html.parser.HTMLParser;
48  import org.htmlunit.html.parser.HTMLParserListener;
49  import org.htmlunit.svg.SvgElementFactory;
50  import org.htmlunit.util.StringUtils;
51  import org.w3c.dom.Node;
52  import org.xml.sax.SAXException;
53  
54  /**
55   * <p>SAX parser implementation that uses the NekoHTML {@link org.htmlunit.cyberneko.HTMLConfiguration}
56   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
57   *
58   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
59   * @author David K. Taylor
60   * @author Chris Erskine
61   * @author Ahmed Ashour
62   * @author Marc Guillemot
63   * @author Ethan Glasser-Camp
64   * @author Sudhan Moghe
65   * @author Ronald Brill
66   * @author Frank Danek
67   * @author Carsten Steul
68   */
69  public final class HtmlUnitNekoHtmlParser implements HTMLParser {
70  
71      /**
72       * The SVG factory.
73       */
74      public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
75  
76      private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new HashMap<>();
77  
78      static {
79          final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
80          for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
81              ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
82          }
83      }
84  
85      /**
86       * Parses the HTML content from the given string into an object tree representation.
87       *
88       * @param parent the parent for the new nodes
89       * @param source the (X)HTML to be parsed
90       * @throws SAXException if a SAX error occurs
91       * @throws IOException if an IO error occurs
92       */
93      @Override
94      public void parseFragment(final DomNode parent, final String source) throws SAXException, IOException {
95          parseFragment(parent, parent, source, false);
96      }
97  
98      /**
99       * Parses the HTML content from the given string into an object tree representation.
100      *
101      * @param parent where the new parsed nodes will be added to
102      * @param context the context to build the fragment context stack
103      * @param source the (X)HTML to be parsed
104      * @param createdByJavascript if true the (script) tag was created by javascript
105      * @throws SAXException if a SAX error occurs
106      * @throws IOException if an IO error occurs
107      */
108     @Override
109     public void parseFragment(final DomNode parent, final DomNode context, final String source,
110             final boolean createdByJavascript)
111         throws SAXException, IOException {
112         final Page page = parent.getPage();
113         if (!(page instanceof HtmlPage)) {
114             return;
115         }
116         final HtmlPage htmlPage = (HtmlPage) page;
117         final URL url = htmlPage.getUrl();
118 
119         final HtmlUnitNekoDOMBuilder domBuilder =
120                 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
121         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
122         // build fragment context stack
123         DomNode node = context;
124         final List<QName> ancestors = new ArrayList<>();
125         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
126             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
127             node = node.getParentNode();
128         }
129         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
130             ancestors.add(new QName(null, "html", null, null));
131             ancestors.add(new QName(null, "body", null, null));
132         }
133         else if (ancestors.size() == 1
134                 || (!"body".equals(ancestors.get(1).getLocalpart())
135                         && !"head".equals(ancestors.get(1).getLocalpart()))) {
136             ancestors.add(new QName(null, "body", null, null));
137         }
138 
139         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
140         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
141 
142         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
143 
144         htmlPage.registerParsingStart();
145         htmlPage.registerSnippetParsingStart();
146         try {
147             domBuilder.parse(in);
148         }
149         finally {
150             htmlPage.registerParsingEnd();
151             htmlPage.registerSnippetParsingEnd();
152         }
153     }
154 
155     /**
156      * Parses the WebResponse into an object tree representation.
157      *
158      * @param webResponse the response data
159      * @param page the HtmlPage to add the nodes
160      * @param xhtml if true use the XHtml parser
161      * @param createdByJavascript if true the (script) tag was created by javascript
162      * @throws IOException if there is an IO error
163      */
164     @Override
165     public void parse(final WebResponse webResponse, final HtmlPage page,
166             final boolean xhtml, final boolean createdByJavascript) throws IOException {
167         final URL url = webResponse.getWebRequest().getUrl();
168         final HtmlUnitNekoDOMBuilder domBuilder =
169                 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
170 
171         final Charset charset = webResponse.getContentCharset();
172         try {
173             if (!webResponse.wasContentCharsetTentative()) {
174                 // The charset is certain so ignore any others found in the document
175                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
176             }
177 
178             // xml content is different
179             if (xhtml) {
180                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
181                 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
182                 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
183                 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
184             }
185         }
186         catch (final Exception e) {
187             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
188         }
189 
190         try (InputStream content = webResponse.getContentAsStream()) {
191             final String encoding = charset.name();
192             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
193 
194             page.registerParsingStart();
195             try {
196                 domBuilder.parse(in);
197             }
198             catch (final XNIException e) {
199                 // extract enclosed exception
200                 final Throwable origin = extractNestedException(e);
201                 throw new RuntimeException("Failed parsing content from " + url, origin);
202             }
203         }
204         finally {
205             page.registerParsingEnd();
206         }
207     }
208 
209     /**
210      * Extract nested exception within an XNIException (Nekohtml uses reflection and generated
211      * exceptions are wrapped many times within XNIException and InvocationTargetException)
212      *
213      * @param e the original XNIException
214      * @return the cause exception
215      */
216     static Throwable extractNestedException(final Throwable e) {
217         Throwable originalException = e;
218         Throwable cause = ((XNIException) e).getException();
219         while (cause != null) {
220             originalException = cause;
221             if (cause instanceof XNIException) {
222                 cause = ((XNIException) cause).getException();
223             }
224             else if (cause instanceof InvocationTargetException) {
225                 cause = cause.getCause();
226             }
227             else {
228                 cause = null;
229             }
230         }
231         return originalException;
232     }
233 
234     /**
235      * {@inheritDoc}
236      */
237     @Override
238     public ElementFactory getSvgFactory() {
239         return SVG_FACTORY;
240     }
241 
242     /**
243      * {@inheritDoc}
244      */
245     @Override
246     public ElementFactory getFactory(final String tagName) {
247         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
248 
249         if (result != null) {
250             return result;
251         }
252         return UnknownElementFactory.INSTANCE;
253     }
254 
255     /**
256      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
257      *
258      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
259      * @param page the page
260      * @param namespaceURI the namespace URI
261      * @param qualifiedName the qualified name
262      * @param insideSvg is the node inside an SVG node or not
263      * @param svgSupport true if called from javascript createElementNS
264      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
265      */
266     @Override
267     public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
268             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
269         if (insideSvg) {
270             return SVG_FACTORY;
271         }
272 
273         if (namespaceURI == null || namespaceURI.isEmpty()
274             || Html.XHTML_NAMESPACE.equals(namespaceURI)
275             || Html.SVG_NAMESPACE.equals(namespaceURI)
276             || !qualifiedName.contains(":")) {
277 
278             String tagName = qualifiedName;
279             final int index = tagName.indexOf(':');
280             if (index == -1) {
281                 tagName = StringUtils.toRootLowerCase(tagName);
282             }
283             else {
284                 tagName = tagName.substring(index + 1);
285             }
286             final ElementFactory factory;
287             if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
288                 factory = SVG_FACTORY;
289             }
290             else {
291                 factory = ELEMENT_FACTORIES.get(tagName);
292             }
293 
294             if (factory != null) {
295                 return factory;
296             }
297         }
298         return UnknownElementFactory.INSTANCE;
299     }
300 }
301 
302 /**
303  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
304  */
305 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
306     private final HTMLParserListener listener_;
307     private final URL url_;
308     private final String html_;
309 
310     HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
311         WebAssert.notNull("listener", listener);
312         WebAssert.notNull("url", url);
313         listener_ = listener;
314         url_ = url;
315         html_ = htmlContent;
316     }
317 
318     /**
319      * @see DefaultErrorHandler#error(String,String,XMLParseException)
320      */
321     @Override
322     public void error(final String domain, final String key,
323             final XMLParseException exception) throws XNIException {
324         listener_.error(exception.getMessage(),
325                 url_,
326                 html_,
327                 exception.getLineNumber(),
328                 exception.getColumnNumber(),
329                 key);
330     }
331 
332     /**
333      * @see DefaultErrorHandler#warning(String,String,XMLParseException)
334      */
335     @Override
336     public void warning(final String domain, final String key,
337             final XMLParseException exception) throws XNIException {
338         listener_.warning(exception.getMessage(),
339                 url_,
340                 html_,
341                 exception.getLineNumber(),
342                 exception.getColumnNumber(),
343                 key);
344     }
345 
346     @Override
347     public void fatalError(final String domain, final String key,
348             final XMLParseException exception) throws XNIException {
349         listener_.error(exception.getMessage(),
350                 url_,
351                 html_,
352                 exception.getLineNumber(),
353                 exception.getColumnNumber(),
354                 key);
355     }
356 }