1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.StringReader;
20 import java.lang.reflect.InvocationTargetException;
21 import java.net.URL;
22 import java.nio.charset.Charset;
23 import java.util.ArrayList;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.concurrent.ConcurrentHashMap;
27
28 import org.htmlunit.ObjectInstantiationException;
29 import org.htmlunit.Page;
30 import org.htmlunit.SgmlPage;
31 import org.htmlunit.WebAssert;
32 import org.htmlunit.WebClient;
33 import org.htmlunit.WebResponse;
34 import org.htmlunit.cyberneko.HTMLScanner;
35 import org.htmlunit.cyberneko.HTMLTagBalancer;
36 import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
37 import org.htmlunit.cyberneko.xerces.xni.QName;
38 import org.htmlunit.cyberneko.xerces.xni.XNIException;
39 import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
40 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
41 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
42 import org.htmlunit.html.DefaultElementFactory;
43 import org.htmlunit.html.DomNode;
44 import org.htmlunit.html.ElementFactory;
45 import org.htmlunit.html.Html;
46 import org.htmlunit.html.HtmlPage;
47 import org.htmlunit.html.UnknownElementFactory;
48 import org.htmlunit.html.parser.HTMLParser;
49 import org.htmlunit.html.parser.HTMLParserListener;
50 import org.htmlunit.svg.SvgElementFactory;
51 import org.htmlunit.util.StringUtils;
52 import org.w3c.dom.Node;
53 import org.xml.sax.SAXException;
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70 public final class HtmlUnitNekoHtmlParser implements HTMLParser {
71
72
73
74
75 public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
76
77 private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
78
79 static {
80 final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
81 for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
82 ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
83 }
84 }
85
86
87
88
89 @Override
90 public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context, final String source,
91 final boolean createdByJavascript)
92 throws SAXException, IOException {
93 final Page page = parent.getPage();
94 if (!(page instanceof HtmlPage)) {
95 return;
96 }
97 final HtmlPage htmlPage = (HtmlPage) page;
98 final URL url = htmlPage.getUrl();
99
100 final HtmlUnitNekoDOMBuilder domBuilder =
101 new HtmlUnitNekoDOMBuilder(this, parent, url, source, createdByJavascript);
102 domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
103
104 DomNode node = context;
105 final List<QName> ancestors = new ArrayList<>();
106 while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
107 ancestors.add(0, new QName(null, node.getNodeName(), null, null));
108 node = node.getParentNode();
109 }
110 if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
111 ancestors.add(new QName(null, "html", null, null));
112 ancestors.add(new QName(null, "body", null, null));
113 }
114 else if (ancestors.size() == 1
115 || (!"body".equals(ancestors.get(1).getLocalpart())
116 && !"head".equals(ancestors.get(1).getLocalpart()))) {
117 ancestors.add(new QName(null, "body", null, null));
118 }
119
120 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
121 domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
122
123 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
124
125 htmlPage.registerParsingStart();
126 htmlPage.registerSnippetParsingStart();
127 try {
128 domBuilder.parse(in);
129 }
130 finally {
131 htmlPage.registerParsingEnd();
132 htmlPage.registerSnippetParsingEnd();
133 }
134 }
135
136
137
138
139 @Override
140 public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
141 final boolean xhtml, final boolean createdByJavascript) throws IOException {
142 final URL url = webResponse.getWebRequest().getUrl();
143 final HtmlUnitNekoDOMBuilder domBuilder =
144 new HtmlUnitNekoDOMBuilder(this, page, url, null, createdByJavascript);
145
146 final Charset charset = webResponse.getContentCharset();
147 try {
148 if (!webResponse.wasContentCharsetTentative()) {
149
150 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
151 }
152
153
154 if (xhtml) {
155 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
156 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
157 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
158 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
159 }
160
161 if (webClient != null) {
162 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
163 if (bufferSize > 0) {
164 domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
165 }
166 }
167 }
168 catch (final Exception e) {
169 throw new ObjectInstantiationException("Error setting HTML parser feature", e);
170 }
171
172 try (InputStream content = webResponse.getContentAsStream()) {
173 final String encoding = charset.name();
174 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
175
176 page.registerParsingStart();
177 try {
178 domBuilder.parse(in);
179 }
180 catch (final XNIException e) {
181
182 final Throwable origin = extractNestedException(e);
183 throw new RuntimeException("Failed parsing content from " + url, origin);
184 }
185 }
186 finally {
187 page.registerParsingEnd();
188 }
189 }
190
191
192
193
194
195
196
197
198 static Throwable extractNestedException(final Throwable e) {
199 Throwable originalException = e;
200 Throwable cause = ((XNIException) e).getException();
201 while (cause != null) {
202 originalException = cause;
203 if (cause instanceof XNIException) {
204 cause = ((XNIException) cause).getException();
205 }
206 else if (cause instanceof InvocationTargetException) {
207 cause = cause.getCause();
208 }
209 else {
210 cause = null;
211 }
212 }
213 return originalException;
214 }
215
216
217
218
219 @Override
220 public ElementFactory getSvgFactory() {
221 return SVG_FACTORY;
222 }
223
224
225
226
227 @Override
228 public ElementFactory getFactory(final String tagName) {
229 final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
230
231 if (result != null) {
232 return result;
233 }
234 return UnknownElementFactory.INSTANCE;
235 }
236
237
238
239
240
241
242
243
244
245
246
247
248 @Override
249 public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
250 final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
251 if (insideSvg) {
252 return SVG_FACTORY;
253 }
254
255 if (namespaceURI == null || namespaceURI.isEmpty()
256 || Html.XHTML_NAMESPACE.equals(namespaceURI)
257 || Html.SVG_NAMESPACE.equals(namespaceURI)
258 || !qualifiedName.contains(":")) {
259
260 String tagName = qualifiedName;
261 final int index = tagName.indexOf(':');
262 if (index == -1) {
263 tagName = StringUtils.toRootLowerCase(tagName);
264 }
265 else {
266 tagName = tagName.substring(index + 1);
267 }
268 final ElementFactory factory;
269 if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
270 factory = SVG_FACTORY;
271 }
272 else {
273 factory = ELEMENT_FACTORIES.get(tagName);
274 }
275
276 if (factory != null) {
277 return factory;
278 }
279 }
280 return UnknownElementFactory.INSTANCE;
281 }
282 }
283
284
285
286
287 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
288 private final HTMLParserListener listener_;
289 private final URL url_;
290 private final String html_;
291
292 HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
293 WebAssert.notNull("listener", listener);
294 WebAssert.notNull("url", url);
295 listener_ = listener;
296 url_ = url;
297 html_ = htmlContent;
298 }
299
300
301
302
303 @Override
304 public void error(final String domain, final String key,
305 final XMLParseException exception) throws XNIException {
306 listener_.error(exception.getMessage(),
307 url_,
308 html_,
309 exception.getLineNumber(),
310 exception.getColumnNumber(),
311 key);
312 }
313
314
315
316
317 @Override
318 public void warning(final String domain, final String key,
319 final XMLParseException exception) throws XNIException {
320 listener_.warning(exception.getMessage(),
321 url_,
322 html_,
323 exception.getLineNumber(),
324 exception.getColumnNumber(),
325 key);
326 }
327
328 @Override
329 public void fatalError(final String domain, final String key,
330 final XMLParseException exception) throws XNIException {
331 listener_.error(exception.getMessage(),
332 url_,
333 html_,
334 exception.getLineNumber(),
335 exception.getColumnNumber(),
336 key);
337 }
338 }