1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.StringReader;
20 import java.net.URL;
21 import java.nio.charset.Charset;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.concurrent.ConcurrentHashMap;
26
27 import org.htmlunit.ObjectInstantiationException;
28 import org.htmlunit.Page;
29 import org.htmlunit.SgmlPage;
30 import org.htmlunit.WebAssert;
31 import org.htmlunit.WebClient;
32 import org.htmlunit.WebResponse;
33 import org.htmlunit.cyberneko.HTMLScanner;
34 import org.htmlunit.cyberneko.HTMLTagBalancer;
35 import org.htmlunit.cyberneko.xerces.util.DefaultErrorHandler;
36 import org.htmlunit.cyberneko.xerces.xni.QName;
37 import org.htmlunit.cyberneko.xerces.xni.XNIException;
38 import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
39 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
40 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
41 import org.htmlunit.html.DefaultElementFactory;
42 import org.htmlunit.html.DomNode;
43 import org.htmlunit.html.ElementFactory;
44 import org.htmlunit.html.Html;
45 import org.htmlunit.html.HtmlPage;
46 import org.htmlunit.html.UnknownElementFactory;
47 import org.htmlunit.html.parser.HTMLParser;
48 import org.htmlunit.html.parser.HTMLParserListener;
49 import org.htmlunit.svg.SvgElementFactory;
50 import org.htmlunit.util.StringUtils;
51 import org.w3c.dom.Node;
52 import org.xml.sax.SAXException;
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 public final class HtmlUnitNekoHtmlParser implements HTMLParser {
70
71
72
73
74 public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
75
76 private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new ConcurrentHashMap<>();
77
78 static {
79 final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
80 for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
81 ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
82 }
83 }
84
85
86
87
88 @Override
89 public void parseFragment(final WebClient webClient, final DomNode parent, final DomNode context,
90 final String source, final boolean createdByJavascript)
91 throws SAXException, IOException {
92 final Page page = parent.getPage();
93 if (!(page instanceof HtmlPage htmlPage)) {
94 return;
95 }
96 final URL url = htmlPage.getUrl();
97
98 final HtmlUnitNekoDOMBuilder domBuilder =
99 new HtmlUnitNekoDOMBuilder(this, webClient, parent, url, source, createdByJavascript);
100 domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
101
102 DomNode node = context;
103 final List<QName> ancestors = new ArrayList<>();
104 while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
105 ancestors.add(0, new QName(null, node.getNodeName(), null, null));
106 node = node.getParentNode();
107 }
108 if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).getLocalpart())) {
109 ancestors.add(new QName(null, "html", null, null));
110 ancestors.add(new QName(null, "body", null, null));
111 }
112 else if (ancestors.size() == 1
113 || (!"body".equals(ancestors.get(1).getLocalpart())
114 && !"head".equals(ancestors.get(1).getLocalpart()))) {
115 ancestors.add(new QName(null, "body", null, null));
116 }
117
118 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
119 domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[0]));
120
121 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
122
123 htmlPage.registerParsingStart();
124 htmlPage.registerSnippetParsingStart();
125 try {
126 domBuilder.parse(in);
127 }
128 finally {
129 htmlPage.registerParsingEnd();
130 htmlPage.registerSnippetParsingEnd();
131 }
132 }
133
134
135
136
137 @Override
138 public void parse(final WebClient webClient, final WebResponse webResponse, final HtmlPage page,
139 final boolean xhtml, final boolean createdByJavascript) throws IOException {
140 final URL url = webResponse.getWebRequest().getUrl();
141 final HtmlUnitNekoDOMBuilder domBuilder =
142 new HtmlUnitNekoDOMBuilder(this, webClient, page, url, null, createdByJavascript);
143
144 final Charset charset = webResponse.getContentCharset();
145 try {
146 if (!webResponse.wasContentCharsetTentative()) {
147
148 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
149 }
150
151
152 if (xhtml) {
153 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
154 domBuilder.setFeature(HTMLScanner.SCRIPT_STRIP_CDATA_DELIMS, true);
155 domBuilder.setFeature(HTMLScanner.STYLE_STRIP_CDATA_DELIMS, true);
156 domBuilder.setFeature(HTMLScanner.CDATA_SECTIONS, true);
157 domBuilder.setFeature(HTMLScanner.CDATA_EARLY_CLOSING, false);
158 }
159
160 if (webClient != null) {
161 final int bufferSize = webClient.getOptions().getNekoReaderBufferSize();
162 if (bufferSize > 0) {
163 domBuilder.setProperty(HTMLScanner.READER_BUFFER_SIZE, bufferSize);
164 }
165 }
166 }
167 catch (final Exception e) {
168 throw new ObjectInstantiationException("Error setting HTML parser feature", e);
169 }
170
171 try (InputStream content = webResponse.getContentAsStream()) {
172 final String encoding = charset.name();
173 final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
174
175 page.registerParsingStart();
176 try {
177 domBuilder.parse(in);
178 }
179 catch (final XNIException e) {
180
181 final Throwable origin = extractNestedException(e);
182 throw new RuntimeException("Failed parsing content from " + url, origin);
183 }
184 }
185 finally {
186 page.registerParsingEnd();
187 }
188 }
189
190
191
192
193
194
195
196 static Throwable extractNestedException(final Throwable e) {
197 Throwable originalException;
198 Throwable cause = e;
199 do {
200 originalException = cause;
201
202 if (cause instanceof XNIException) {
203 cause = cause.getCause();
204 }
205 else {
206 cause = null;
207 }
208 }
209 while (cause != null);
210
211 return originalException;
212 }
213
214
215
216
217 @Override
218 public ElementFactory getSvgFactory() {
219 return SVG_FACTORY;
220 }
221
222
223
224
225 @Override
226 public ElementFactory getFactory(final String tagName) {
227 final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
228
229 if (result != null) {
230 return result;
231 }
232 return UnknownElementFactory.INSTANCE;
233 }
234
235
236
237
238
239
240
241
242
243
244
245
246 @Override
247 public ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
248 final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
249 if (insideSvg) {
250 return SVG_FACTORY;
251 }
252
253 if (namespaceURI == null || namespaceURI.isEmpty()
254 || Html.XHTML_NAMESPACE.equals(namespaceURI)
255 || Html.SVG_NAMESPACE.equals(namespaceURI)
256 || !qualifiedName.contains(":")) {
257
258 String tagName = qualifiedName;
259 final int index = tagName.indexOf(':');
260 if (index == -1) {
261 tagName = StringUtils.toRootLowerCase(tagName);
262 }
263 else {
264 tagName = tagName.substring(index + 1);
265 }
266 final ElementFactory factory;
267 if (svgSupport && !"svg".equals(tagName) && Html.SVG_NAMESPACE.equals(namespaceURI)) {
268 factory = SVG_FACTORY;
269 }
270 else {
271 factory = ELEMENT_FACTORIES.get(tagName);
272 }
273
274 if (factory != null) {
275 return factory;
276 }
277 }
278 return UnknownElementFactory.INSTANCE;
279 }
280 }
281
282
283
284
285 class HtmlUnitNekoHTMLErrorHandler implements XMLErrorHandler {
286 private final HTMLParserListener listener_;
287 private final URL url_;
288 private final String html_;
289
290 HtmlUnitNekoHTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
291 WebAssert.notNull("listener", listener);
292 WebAssert.notNull("url", url);
293 listener_ = listener;
294 url_ = url;
295 html_ = htmlContent;
296 }
297
298
299
300
301 @Override
302 public void error(final String domain, final String key,
303 final XMLParseException exception) throws XNIException {
304 listener_.error(exception.getMessage(),
305 url_,
306 html_,
307 exception.getLineNumber(),
308 exception.getColumnNumber(),
309 key);
310 }
311
312
313
314
315 @Override
316 public void warning(final String domain, final String key,
317 final XMLParseException exception) throws XNIException {
318 listener_.warning(exception.getMessage(),
319 url_,
320 html_,
321 exception.getLineNumber(),
322 exception.getColumnNumber(),
323 key);
324 }
325
326 @Override
327 public void fatalError(final String domain, final String key,
328 final XMLParseException exception) throws XNIException {
329 listener_.error(exception.getMessage(),
330 url_,
331 html_,
332 exception.getLineNumber(),
333 exception.getColumnNumber(),
334 key);
335 }
336 }