1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.StringReader;
20 import java.lang.reflect.InvocationTargetException;
21 import java.net.URL;
22 import java.nio.charset.Charset;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27
28 import org.htmlunit.ObjectInstantiationException;
29 import org.htmlunit.Page;
30 import org.htmlunit.SgmlPage;
31 import org.htmlunit.WebAssert;
32 import org.htmlunit.WebResponse;
33 import org.htmlunit.cyberneko.HTMLScanner;
34 import org.htmlunit.cyberneko.HTMLTagBalancer;
35 import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
36 import org.htmlunit.cyberneko.xerces.xni.QName;
37 import org.htmlunit.cyberneko.xerces.xni.XNIException;
38 import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
39 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
40 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
41 import org.htmlunit.html.DefaultElementFactory;
42 import org.htmlunit.html.DomNode;
43 import org.htmlunit.html.ElementFactory;
44 import org.htmlunit.html.Html;
45 import org.htmlunit.html.HtmlPage;
46 import org.htmlunit.html.UnknownElementFactory;
47 import org.htmlunit.html.parser.HTMLParser;
48 import org.htmlunit.html.parser.HTMLParserListener;
49 import org.htmlunit.svg.SvgElementFactory;
50 import org.htmlunit.util.StringUtils;
51 import org.w3c.dom.Node;
52 import org.xml.sax.SAXException;
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 public final class HtmlUnitNekoHtmlParser implements HTMLParser {
70
71
72
73
74 public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
75
76 private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new HashMap<>();
77
78 static {
79 final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
80 for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
81 ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
82 }
83 }
84
85
86
87
88
89
90
91
92
93 @Override
94 public void parseFragment(final DomNode parent, final String source) throws SAXException, IOException {
95 parseFragment(parent, parent, source, false);
96 }
97
98
99
100
101
102
103
104
105
106
107
108 @Override
109 public void parseFragment(final DomNode parent, final DomNode context, final String source,
110 final boolean createdByJavascript)
111 throws SAXException, IOException {
112 final Page page = parent.getPage();
113 if (!(page instanceof HtmlPage)) {
114 return;
115 }
116 final HtmlPage htmlPage = (HtmlPage) page;
117 final URL url = htmlPage.getUrl();
118
119 final HtmlUnitNekoDOMBuilder domBuilder =
120 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
121 domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
122
123 DomNode node = context;
124 final List<QName> ancestors = new ArrayList<>();
125 while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
126 ancestors.add(0, new QName(null, node.getNodeName(), null, null));
127 node = node.getParentNode();
128 }
129 if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
130 ancestors.add(new QName(null, "html", null, null));
131 ancestors.add(new QName(null, "body", null, null));
132 }
133 else if (ancestors.size() == 1
134 || (!"body".equals(ancestors.get(1).getLocalpart())
135 && !"head".equals(ancestors.get(1).getLocalpart()))) {
136 ancestors.add(new QName(null, "body", null, null));
137 }
138
139 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
140 domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
141
142 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
143
144 htmlPage.registerParsingStart();
145 htmlPage.registerSnippetParsingStart();
146 try {
147 domBuilder.parse(in);
148 }
149 finally {
150 htmlPage.registerParsingEnd();
151 htmlPage.registerSnippetParsingEnd();
152 }
153 }
154
155
156
157
158
159
160
161
162
163
164 @Override
165 public void parse(final WebResponse webResponse, final HtmlPage page,
166 final boolean xhtml, final boolean createdByJavascript) throws IOException {
167 final URL url = webResponse.getWebRequest().getUrl();
168 final HtmlUnitNekoDOMBuilder domBuilder =
169 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
170
171 final Charset charset = webResponse.getContentCharset();
172 try {
173 if (!webResponse.wasContentCharsetTentative()) {
174
175 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
176 }
177
178
179 if (xhtml) {
180 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
181 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
182 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
183 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
184 }
185 }
186 catch (final Exception e) {
187 throw new ObjectInstantiationException("Error setting HTML parser feature", e);
188 }
189
190 try (InputStream content = webResponse.getContentAsStream()) {
191 final String encoding = charset.name();
192 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
193
194 page.registerParsingStart();
195 try {
196 domBuilder.parse(in);
197 }
198 catch (final XNIException e) {
199
200 final Throwable origin = extractNestedException(e);
201 throw new RuntimeException("Failed parsing content from " + url, origin);
202 }
203 }
204 finally {
205 page.registerParsingEnd();
206 }
207 }
208
209
210
211
212
213
214
215
216 static Throwable extractNestedException(final Throwable e) {
217 Throwable originalException = e;
218 Throwable cause = ((XNIException) e).getException();
219 while (cause != null) {
220 originalException = cause;
221 if (cause instanceof XNIException) {
222 cause = ((XNIException) cause).getException();
223 }
224 else if (cause instanceof InvocationTargetException) {
225 cause = cause.getCause();
226 }
227 else {
228 cause = null;
229 }
230 }
231 return originalException;
232 }
233
234
235
236
237 @Override
238 public ElementFactory getSvgFactory() {
239 return SVG_FACTORY;
240 }
241
242
243
244
245 @Override
246 public ElementFactory getFactory(final String tagName) {
247 final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
248
249 if (result != null) {
250 return result;
251 }
252 return UnknownElementFactory.INSTANCE;
253 }
254
255
256
257
258
259
260
261
262
263
264
265
266 @Override
267 public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
268 final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
269 if (insideSvg) {
270 return SVG_FACTORY;
271 }
272
273 if (namespaceURI == null || namespaceURI.isEmpty()
274 || Html.XHTML_NAMESPACE.equals(namespaceURI)
275 || Html.SVG_NAMESPACE.equals(namespaceURI)
276 || !qualifiedName.contains(":")) {
277
278 String tagName = qualifiedName;
279 final int index = tagName.indexOf(':');
280 if (index == -1) {
281 tagName = StringUtils.toRootLowerCase(tagName);
282 }
283 else {
284 tagName = tagName.substring(index + 1);
285 }
286 final ElementFactory factory;
287 if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
288 factory = SVG_FACTORY;
289 }
290 else {
291 factory = ELEMENT_FACTORIES.get(tagName);
292 }
293
294 if (factory != null) {
295 return factory;
296 }
297 }
298 return UnknownElementFactory.INSTANCE;
299 }
300 }
301
302
303
304
305 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
306 private final HTMLParserListener listener_;
307 private final URL url_;
308 private final String html_;
309
310 HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
311 WebAssert.notNull("listener", listener);
312 WebAssert.notNull("url", url);
313 listener_ = listener;
314 url_ = url;
315 html_ = htmlContent;
316 }
317
318
319
320
321 @Override
322 public void error(final String domain, final String key,
323 final XMLParseException exception) throws XNIException {
324 listener_.error(exception.getMessage(),
325 url_,
326 html_,
327 exception.getLineNumber(),
328 exception.getColumnNumber(),
329 key);
330 }
331
332
333
334
335 @Override
336 public void warning(final String domain, final String key,
337 final XMLParseException exception) throws XNIException {
338 listener_.warning(exception.getMessage(),
339 url_,
340 html_,
341 exception.getLineNumber(),
342 exception.getColumnNumber(),
343 key);
344 }
345
346 @Override
347 public void fatalError(final String domain, final String key,
348 final XMLParseException exception) throws XNIException {
349 listener_.error(exception.getMessage(),
350 url_,
351 html_,
352 exception.getLineNumber(),
353 exception.getColumnNumber(),
354 key);
355 }
356 }