1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.StringReader;
20 import java.lang.reflect.InvocationTargetException;
21 import java.net.URL;
22 import java.nio.charset.Charset;
23 import java.util.ArrayList;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.concurrent.ConcurrentHashMap;
27
28 import org.htmlunit.ObjectInstantiationException;
29 import org.htmlunit.Page;
30 import org.htmlunit.SgmlPage;
31 import org.htmlunit.WebAssert;
32 import org.htmlunit.WebClient;
33 import org.htmlunit.WebResponse;
34 import org.htmlunit.cyberneko.HTMLScanner;
35 import org.htmlunit.cyberneko.HTMLTagBalancer;
36 import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
37 import org.htmlunit.cyberneko.xerces.xni.QName;
38 import org.htmlunit.cyberneko.xerces.xni.XNIException;
39 import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
40 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
41 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
42 import org.htmlunit.html.DefaultElementFactory;
43 import org.htmlunit.html.DomNode;
44 import org.htmlunit.html.ElementFactory;
45 import org.htmlunit.html.Html;
46 import org.htmlunit.html.HtmlPage;
47 import org.htmlunit.html.UnknownElementFactory;
48 import org.htmlunit.html.parser.HTMLParser;
49 import org.htmlunit.html.parser.HTMLParserListener;
50 import org.htmlunit.svg.SvgElementFactory;
51 import org.htmlunit.util.StringUtils;
52 import org.w3c.dom.Node;
53 import org.xml.sax.SAXException;
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public final class HtmlUnitNekoHtmlParser implements HTMLParser {
71
72
73
74
75 public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
76
77 private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
78
79 static {
80 final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
81 for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
82 ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
83 }
84 }
85
86
87
88
89 @Override
90 public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
91 final String source, final boolean createdByJavascript)
92 throws SAXException, IOException {
93 final Page page = parent.getPage();
94 if (!(page instanceof HtmlPage)) {
95 return;
96 }
97 final HtmlPage htmlPage = (HtmlPage) page;
98 final URL url = htmlPage.getUrl();
99
100 final HtmlUnitNekoDOMBuilder domBuilder =
101 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
102 domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
103
104 DomNode node = context;
105 final List<QName> ancestors = new ArrayList<>();
106 while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
107 ancestors.add(0, new QName(null, node.getNodeName(), null, null));
108 node = node.getParentNode();
109 }
110 if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
111 ancestors.add(new QName(null, "html", null, null));
112 ancestors.add(new QName(null, "body", null, null));
113 }
114 else if (ancestors.size() == 1
115 || (!"body".equals(ancestors.get(1).getLocalpart())
116 && !"head".equals(ancestors.get(1).getLocalpart()))) {
117 ancestors.add(new QName(null, "body", null, null));
118 }
119
120 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
121 domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
122
123 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
124
125 htmlPage.registerParsingStart();
126 htmlPage.registerSnippetParsingStart();
127 try {
128 domBuilder.parse(in);
129 }
130 finally {
131 htmlPage.registerParsingEnd();
132 htmlPage.registerSnippetParsingEnd();
133 }
134 }
135
136
137
138
139 @Override
140 public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
141 final boolean xhtml, final boolean createdByJavascript) throws IOException {
142 final URL url = webResponse.getWebRequest().getUrl();
143 final HtmlUnitNekoDOMBuilder domBuilder =
144 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
145
146 final Charset charset = webResponse.getContentCharset();
147 try {
148 if (!webResponse.wasContentCharsetTentative()) {
149
150 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
151 }
152
153
154 if (xhtml) {
155 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
156 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
157 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
158 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
159 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
160 }
161
162 if (webClient != null) {
163 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
164 if (bufferSize > 0) {
165 domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
166 }
167 }
168 }
169 catch (final Exception e) {
170 throw new ObjectInstantiationException("Error setting HTML parser feature", e);
171 }
172
173 try (InputStream content = webResponse.getContentAsStream()) {
174 final String encoding = charset.name();
175 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
176
177 page.registerParsingStart();
178 try {
179 domBuilder.parse(in);
180 }
181 catch (final XNIException e) {
182
183 final Throwable origin = extractNestedException(e);
184 throw new RuntimeException("Failed parsing content from " + url, origin);
185 }
186 }
187 finally {
188 page.registerParsingEnd();
189 }
190 }
191
192
193
194
195
196
197
198
199 static Throwable extractNestedException(final Throwable e) {
200 Throwable originalException = e;
201 Throwable cause = ((XNIException) e).getException();
202 while (cause != null) {
203 originalException = cause;
204 if (cause instanceof XNIException) {
205 cause = ((XNIException) cause).getException();
206 }
207 else if (cause instanceof InvocationTargetException) {
208 cause = cause.getCause();
209 }
210 else {
211 cause = null;
212 }
213 }
214 return originalException;
215 }
216
217
218
219
220 @Override
221 public ElementFactory getSvgFactory() {
222 return SVG_FACTORY;
223 }
224
225
226
227
228 @Override
229 public ElementFactory getFactory(final String tagName) {
230 final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
231
232 if (result != null) {
233 return result;
234 }
235 return UnknownElementFactory.INSTANCE;
236 }
237
238
239
240
241
242
243
244
245
246
247
248
249 @Override
250 public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
251 final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
252 if (insideSvg) {
253 return SVG_FACTORY;
254 }
255
256 if (namespaceURI == null || namespaceURI.isEmpty()
257 || Html.XHTML_NAMESPACE.equals(namespaceURI)
258 || Html.SVG_NAMESPACE.equals(namespaceURI)
259 || !qualifiedName.contains(":")) {
260
261 String tagName = qualifiedName;
262 final int index = tagName.indexOf(':');
263 if (index == -1) {
264 tagName = StringUtils.toRootLowerCase(tagName);
265 }
266 else {
267 tagName = tagName.substring(index + 1);
268 }
269 final ElementFactory factory;
270 if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
271 factory = SVG_FACTORY;
272 }
273 else {
274 factory = ELEMENT_FACTORIES.get(tagName);
275 }
276
277 if (factory != null) {
278 return factory;
279 }
280 }
281 return UnknownElementFactory.INSTANCE;
282 }
283 }
284
285
286
287
288 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
289 private final HTMLParserListener listener_;
290 private final URL url_;
291 private final String html_;
292
293 HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
294 WebAssert.notNull("listener", listener);
295 WebAssert.notNull("url", url);
296 listener_ = listener;
297 url_ = url;
298 html_ = htmlContent;
299 }
300
301
302
303
304 @Override
305 public void error(final String domain, final String key,
306 final XMLParseException exception) throws XNIException {
307 listener_.error(exception.getMessage(),
308 url_,
309 html_,
310 exception.getLineNumber(),
311 exception.getColumnNumber(),
312 key);
313 }
314
315
316
317
318 @Override
319 public void warning(final String domain, final String key,
320 final XMLParseException exception) throws XNIException {
321 listener_.warning(exception.getMessage(),
322 url_,
323 html_,
324 exception.getLineNumber(),
325 exception.getColumnNumber(),
326 key);
327 }
328
329 @Override
330 public void fatalError(final String domain, final String key,
331 final XMLParseException exception) throws XNIException {
332 listener_.error(exception.getMessage(),
333 url_,
334 html_,
335 exception.getLineNumber(),
336 exception.getColumnNumber(),
337 key);
338 }
339 }