1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
18
19 import java.io.IOException;
20 import java.io.StringReader;
21 import java.net.URL;
22 import java.nio.charset.Charset;
23 import java.util.ArrayDeque;
24 import java.util.Deque;
25
26 import org.htmlunit.BrowserVersion;
27 import org.htmlunit.ObjectInstantiationException;
28 import org.htmlunit.WebClient;
29 import org.htmlunit.WebResponse;
30 import org.htmlunit.cyberneko.HTMLConfiguration;
31 import org.htmlunit.cyberneko.HTMLElements;
32 import org.htmlunit.cyberneko.HTMLScanner;
33 import org.htmlunit.cyberneko.HTMLTagBalancingListener;
34 import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
35 import org.htmlunit.cyberneko.xerces.xni.Augmentations;
36 import org.htmlunit.cyberneko.xerces.xni.QName;
37 import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
38 import org.htmlunit.cyberneko.xerces.xni.XMLString;
39 import org.htmlunit.cyberneko.xerces.xni.XNIException;
40 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
41 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
42 import org.htmlunit.html.DomCDataSection;
43 import org.htmlunit.html.DomComment;
44 import org.htmlunit.html.DomDocumentType;
45 import org.htmlunit.html.DomElement;
46 import org.htmlunit.html.DomNode;
47 import org.htmlunit.html.DomText;
48 import org.htmlunit.html.ElementFactory;
49 import org.htmlunit.html.Html;
50 import org.htmlunit.html.HtmlBody;
51 import org.htmlunit.html.HtmlElement;
52 import org.htmlunit.html.HtmlForm;
53 import org.htmlunit.html.HtmlHiddenInput;
54 import org.htmlunit.html.HtmlImage;
55 import org.htmlunit.html.HtmlPage;
56 import org.htmlunit.html.HtmlSvg;
57 import org.htmlunit.html.HtmlTable;
58 import org.htmlunit.html.HtmlTableRow;
59 import org.htmlunit.html.HtmlTemplate;
60 import org.htmlunit.html.ScriptElement;
61 import org.htmlunit.html.SubmittableElement;
62 import org.htmlunit.html.XHtmlPage;
63 import org.htmlunit.html.parser.HTMLParser;
64 import org.htmlunit.html.parser.HTMLParserDOMBuilder;
65 import org.htmlunit.html.parser.HTMLParserListener;
66 import org.htmlunit.javascript.host.html.HTMLBodyElement;
67 import org.htmlunit.util.StringUtils;
68 import org.w3c.dom.Node;
69 import org.xml.sax.Attributes;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.Locator;
72 import org.xml.sax.SAXException;
73 import org.xml.sax.ext.LexicalHandler;
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95 final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
96 implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
97
98
99 private static final HTMLElements HTMLELEMENTS = new HTMLElements();
100
/**
 * State of the {@code head} handling in this builder: {@code NO} is the initial value,
 * {@code YES} is stored once a head start tag that was not flagged as synthesized has been
 * processed, {@code SYNTHESIZED} is stored when the processed head tag was flagged as synthesized.
 */
private enum HeadParsed { YES, SYNTHESIZED, NO }
102
103 private final HTMLParser htmlParser_;
104 private final HtmlPage page_;
105
106 private Locator locator_;
107 private final Deque<DomNode> stack_ = new ArrayDeque<>();
108
109
110 private boolean snippetStartNodeOverwritten_;
111 private final int initialSize_;
112 private DomNode currentNode_;
113 private final boolean createdByJavascript_;
114 private final XMLString characters_ = new XMLString();
115 private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
116 private HtmlElement body_;
117 private boolean lastTagWasSynthesized_;
118 private HtmlForm consumingForm_;
119 private boolean formEndingIsAdjusting_;
120 private boolean insideSvg_;
121 private boolean insideTemplate_;
122
123 private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
124 private static final String FEATURE_PARSE_NOSCRIPT
125 = "http://cyberneko.org/html/features/parse-noscript-content";
126
127
128
129
130
131 @Override
132 public void pushInputString(final String html) {
133 page_.registerParsingStart();
134 page_.registerInlineSnippetParsingStart();
135 try {
136 final WebResponse webResponse = page_.getWebResponse();
137 final Charset charset = webResponse.getContentCharset();
138 final String url = webResponse.getWebRequest().getUrl().toString();
139 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
140 ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
141 }
142 finally {
143 page_.registerParsingEnd();
144 page_.registerInlineSnippetParsingEnd();
145 }
146 }
147
148
149
150
151
152
153
/**
 * Creates a builder that adds the parsed nodes below the given node.
 *
 * @param htmlParser the parser used to look up element factories
 * @param webClient the client providing the browser version, the parser listener
 *        and the JavaScript settings
 * @param node the node new content is attached to; its page must be an {@link HtmlPage}
 * @param url passed to the error handler when a parser listener is registered
 * @param htmlContent passed to the error handler when a parser listener is registered
 * @param createdByJavascript remembered for the handling of script elements in startElement
 * @throws ObjectInstantiationException if configuring the parser fails with a {@link SAXException}
 */
HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser, final WebClient webClient,
        final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
    super(createConfiguration(webClient.getBrowserVersion()));

    htmlParser_ = htmlParser;
    page_ = (HtmlPage) node.getPage();
    createdByJavascript_ = createdByJavascript;

    // pre-fill the stack from the ancestors of the start node and remember its size;
    // endElement compares against this size while parsing snippets
    currentNode_ = node;
    for (final Node ancestorNode : currentNode_.getAncestors()) {
        stack_.push((DomNode) ancestorNode);
    }
    initialSize_ = stack_.size();

    // errors are only reported if somebody is listening
    final HTMLParserListener parserListener = webClient.getHTMLParserListener();
    if (parserListener != null) {
        parserConfiguration_.setErrorHandler(
                new HtmlUnitNekoHTMLErrorHandler(parserListener, url, htmlContent));
    }

    try {
        setFeature(FEATURE_AUGMENTATIONS, true);
        setFeature("http://cyberneko.org/html/features/report-errors", parserListener != null);
        setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.isJavaScriptEnabled());
        setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);

        // this object receives both the content and the lexical events
        setContentHandler(this);
        setLexicalHandler(this);
    }
    catch (final SAXException e) {
        throw new ObjectInstantiationException("unable to create HTML parser", e);
    }
}
187
188
189
190
191
192 private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
193
194
195
196 return new HTMLConfiguration(new HTMLElements.HTMLElementsWithCache(HTMLELEMENTS));
197 }
198
199
200
201
/**
 * Stores the locator; it is read later to set start and end locations
 * (line and column) on created nodes and on the page.
 *
 * @param locator the locator supplied by the parser
 */
@Override
public void setDocumentLocator(final Locator locator) {
    locator_ = locator;
}
206
207
208
209
/**
 * No-op; this builder does nothing when the document starts.
 *
 * @throws SAXException never thrown by this implementation
 */
@Override
public void startDocument() throws SAXException {
    // intentionally empty
}
214
215
216 @Override
217 public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
218 throws XNIException {
219
220 lastTagWasSynthesized_ = augs.isSynthesized();
221 super.startElement(element, attributes, augs);
222 }
223
224
225
226
227 @Override
228 public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
229 throws SAXException {
230
231 if (snippetStartNodeOverwritten_) {
232 snippetStartNodeOverwritten_ = false;
233 return;
234 }
235 handleCharacters();
236
237 final String tagLower = StringUtils.toRootLowerCase(localName);
238 if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
239
240
241 stack_.push(currentNode_);
242 return;
243 }
244
245 if ("head".equals(tagLower)) {
246 if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
247
248
249 stack_.push(currentNode_);
250 return;
251 }
252
253 headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
254 }
255
256
257
258 HtmlBody oldBody = null;
259 final boolean isBodyTag = "body".equals(tagLower);
260 if (isBodyTag) {
261 final HtmlBody body = page_.getBody();
262 if (body != null) {
263 oldBody = body;
264 }
265 }
266
267 if (namespaceURI != null) {
268 namespaceURI = namespaceURI.trim();
269 }
270
271 if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
272 namespaceURI = null;
273 }
274
275 final ElementFactory factory =
276 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
277 if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
278 namespaceURI = Html.SVG_NAMESPACE;
279 }
280
281 final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
282 newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
283
284
285 addNodeToRightParent(currentNode_, newElement);
286
287 if (newElement instanceof HtmlSvg) {
288 insideSvg_ = true;
289 }
290 else if (newElement instanceof HtmlTemplate) {
291 insideTemplate_ = true;
292 }
293
294
295
296 else if (newElement instanceof HtmlForm form) {
297 consumingForm_ = form;
298 formEndingIsAdjusting_ = false;
299 }
300 else if (consumingForm_ != null) {
301
302 if (newElement instanceof SubmittableElement) {
303
304 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
305 ((HtmlElement) newElement).setOwningForm(consumingForm_);
306 }
307 }
308 }
309
310
311
312 if (oldBody != null) {
313 oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
314 }
315
316 if (!insideSvg_ && isBodyTag) {
317 body_ = (HtmlElement) newElement;
318 }
319 else if (createdByJavascript_
320 && newElement instanceof ScriptElement script
321 && (!insideTemplate_
322 || !page_.getWebClient().getBrowserVersion()
323 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
324 script.markAsCreatedByDomParser();
325 }
326
327 currentNode_ = newElement;
328 stack_.push(currentNode_);
329 }
330
331
332
333
334
335 private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
336 final String currentNodeName = currentNode.getNodeName();
337 final String newNodeName = newElement.getNodeName();
338
339
340 if (isTableChild(newNodeName)) {
341 final DomNode parent =
342 "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
343 appendChild(parent, newElement);
344 return;
345 }
346 if ("tr".equals(newNodeName)) {
347 final DomNode parent =
348 isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
349 appendChild(parent, newElement);
350 return;
351 }
352 if (isTableCell(newNodeName)) {
353 final DomNode parent =
354 "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
355 appendChild(parent, newElement);
356 return;
357 }
358
359
360 if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
361 if ("template".equals(newNodeName)) {
362 currentNode.appendChild(newElement);
363 }
364
365
366 else if (!"colgroup".equals(currentNodeName)
367 && ("script".equals(newNodeName)
368 || "form".equals(newNodeName)
369 || "style".equals(newNodeName))) {
370 currentNode.appendChild(newElement);
371 }
372
373
374 else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
375 currentNode.appendChild(newElement);
376 }
377 else if ("caption".equals(currentNodeName)) {
378 currentNode.appendChild(newElement);
379 }
380 else if (newElement instanceof HtmlHiddenInput) {
381 currentNode.appendChild(newElement);
382 }
383 else {
384
385 final DomNode parent = findElementOnStack("table");
386 parent.insertBefore(newElement);
387 }
388 return;
389 }
390
391 if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
392
393 appendChild(currentNode.getParentNode(), newElement);
394 return;
395 }
396
397
398 appendChild(currentNode, newElement);
399 }
400
401 private DomNode findElementOnStack(final String searchedElementName) {
402 for (final DomNode node : stack_) {
403 if (searchedElementName.equals(node.getNodeName())) {
404 return node;
405 }
406 }
407
408
409 return stack_.peek();
410 }
411
412 private DomNode findElementOnStack(final String... searchedElementNames) {
413 for (final DomNode node : stack_) {
414 for (final String searchedElementName : searchedElementNames) {
415 if (searchedElementName.equals(node.getNodeName())) {
416 return node;
417 }
418 }
419 }
420
421
422 return stack_.peek();
423 }
424
425 private static boolean isTableChild(final String nodeName) {
426 if (nodeName == null || nodeName.length() < 5) {
427 return false;
428 }
429
430 return "thead".equals(nodeName)
431 || "tbody".equals(nodeName)
432 || "tfoot".equals(nodeName)
433 || "caption".equals(nodeName)
434 || "colgroup".equals(nodeName);
435 }
436
437 private static boolean isTableCell(final String nodeName) {
438 if (nodeName == null || nodeName.length() != 2) {
439 return false;
440 }
441 return "td".equals(nodeName) || "th".equals(nodeName);
442 }
443
444
445 @Override
446 public void endElement(final QName element, final Augmentations augs)
447 throws XNIException {
448
449 lastTagWasSynthesized_ = augs.isSynthesized();
450 super.endElement(element, augs);
451 }
452
453
454
455
456 @Override
457 public void endElement(final String namespaceURI, final String localName, final String qName)
458 throws SAXException {
459
460 final String tagLower = StringUtils.toRootLowerCase(localName);
461
462 handleCharacters();
463
464 if (page_.isParsingHtmlSnippet()) {
465 if ("html".equals(tagLower) || "body".equals(tagLower)) {
466 return;
467 }
468 if (stack_.size() == initialSize_) {
469
470
471 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
472 return;
473 }
474 }
475
476 if ("svg".equals(tagLower)) {
477 insideSvg_ = false;
478 }
479 else if ("template".equals(tagLower)) {
480 insideTemplate_ = false;
481 }
482
483
484
485
486 if (stack_.isEmpty()) {
487 return;
488 }
489
490 final DomNode previousNode = stack_.pop();
491 previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
492
493 if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
494
495
496 consumingForm_ = null;
497 }
498
499 if (!stack_.isEmpty()) {
500 currentNode_ = stack_.peek();
501 }
502
503 final boolean postponed = page_.isParsingInlineHtmlSnippet();
504 previousNode.onAllChildrenAddedToPage(postponed);
505 }
506
507
/**
 * Collects the characters in an internal buffer; the text node is created later
 * by {@code handleCharacters()}.
 *
 * @param ch the array holding the characters
 * @param start the index of the first character
 * @param length the number of characters
 * @throws SAXException never thrown by this implementation
 */
@Override
public void characters(final char[] ch, final int start, final int length) throws SAXException {
    characters_.append(ch, start, length);
}
512
513
514 @Override
515 public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
516 characters_.append(ch, start, length);
517 }
518
519
520
521
522 private void handleCharacters() {
523
524 if (characters_.length() == 0) {
525 return;
526 }
527
528
529 final String textValue = characters_.toString();
530 characters_.clear();
531
532 if (StringUtils.isBlank(textValue)) {
533 appendChild(currentNode_, new DomText(page_, textValue));
534 return;
535 }
536
537
538 if (currentNode_ instanceof HtmlTableRow row) {
539 final HtmlTable enclosingTable = row.getEnclosingTable();
540 if (enclosingTable != null) {
541 if (enclosingTable.getPreviousSibling() instanceof DomText domText) {
542 domText.setTextContent(domText.getWholeText() + textValue);
543 }
544 else {
545 enclosingTable.insertBefore(new DomText(page_, textValue));
546 }
547 }
548 }
549 else if (currentNode_ instanceof HtmlTable enclosingTable) {
550 if (enclosingTable.getPreviousSibling() instanceof DomText domText) {
551 domText.setTextContent(domText.getWholeText() + textValue);
552 }
553 else {
554 enclosingTable.insertBefore(new DomText(page_, textValue));
555 }
556 }
557 else if (currentNode_ instanceof HtmlImage) {
558 currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
559 }
560 else {
561 appendChild(currentNode_, new DomText(page_, textValue));
562 }
563 }
564
565
566 @Override
567 public void endDocument() throws SAXException {
568 handleCharacters();
569 if (locator_ != null) {
570 page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
571 }
572 }
573
574
/**
 * No-op; prefix mappings are not processed by this builder.
 *
 * @param prefix not evaluated
 * @param uri not evaluated
 * @throws SAXException never thrown by this implementation
 */
@Override
public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
    // intentionally empty
}
579
580
/**
 * No-op; prefix mappings are not processed by this builder.
 *
 * @param prefix not evaluated
 * @throws SAXException never thrown by this implementation
 */
@Override
public void endPrefixMapping(final String prefix) throws SAXException {
    // intentionally empty
}
585
586
/**
 * No-op; processing instructions create no nodes in this builder.
 *
 * @param target not evaluated
 * @param data not evaluated
 * @throws SAXException never thrown by this implementation
 */
@Override
public void processingInstruction(final String target, final String data) throws SAXException {
    // intentionally empty
}
591
592
/**
 * No-op; skipped entities are not processed by this builder.
 *
 * @param name not evaluated
 * @throws SAXException never thrown by this implementation
 */
@Override
public void skippedEntity(final String name) throws SAXException {
    // intentionally empty
}
597
598
599
600
601 @Override
602 public void comment(final char[] ch, final int start, final int length) {
603 handleCharacters();
604 final String data = new String(ch, start, length);
605 final DomComment comment = new DomComment(page_, data);
606 appendChild(currentNode_, comment);
607 }
608
609
610 @Override
611 public void endCDATA() {
612 final String data = characters_.toString();
613 characters_.clear();
614
615 final DomCDataSection cdataSection = new DomCDataSection(page_, data);
616 appendChild(currentNode_, cdataSection);
617 }
618
619
/**
 * No-op; nothing has to be done at the end of the DTD.
 */
@Override
public void endDTD() {
    // intentionally empty
}
624
625
/**
 * No-op; entity boundaries are not processed by this builder.
 *
 * @param name not evaluated
 */
@Override
public void endEntity(final String name) {
    // intentionally empty
}
630
631
/**
 * Flushes the text buffered so far; this way the buffer only holds the content of
 * the CDATA section when {@code endCDATA()} reads it.
 */
@Override
public void startCDATA() {
    handleCharacters();
}
636
637
638 @Override
639 public void startDTD(final String name, final String publicId, final String systemId) {
640 final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
641 page_.setDocumentType(type);
642
643 final Node child;
644 child = type;
645 page_.appendChild(child);
646 }
647
648
/**
 * No-op; entity boundaries are not processed by this builder.
 *
 * @param name not evaluated
 */
@Override
public void startEntity(final String name) {
    // intentionally empty
}
653
654
655
656
657 @Override
658 public void ignoredEndElement(final QName element, final Augmentations augs) {
659
660
661 if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
662 consumingForm_ = null;
663
664 if (findElementOnStack("table", "form") instanceof HtmlTable) {
665
666 }
667 else {
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695 formEndingIsAdjusting_ = true;
696 }
697 }
698 }
699
700
701
702
703 @Override
704 public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
705
706
707 if (attrs != null && body_ != null) {
708 final String lp = elem.getLocalpart();
709 if (lp != null && lp.length() == 4) {
710 if ("body".equalsIgnoreCase(lp)) {
711 copyAttributes(body_, attrs);
712 }
713 else if ("html".equalsIgnoreCase(lp)) {
714 final DomNode parent = body_.getParentNode();
715 if (parent instanceof DomElement element) {
716 copyAttributes(element, attrs);
717 }
718 }
719 }
720 }
721 }
722
723 private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
724 final int length = attrs.getLength();
725
726 for (int i = 0; i < length; i++) {
727 final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
728 if (to.getAttributes().getNamedItem(attrName) == null) {
729 to.setAttribute(attrName, attrs.getValue(i));
730 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
731 && to.getScriptableObject() instanceof HTMLBodyElement) {
732 final HTMLBodyElement jsBody = to.getScriptableObject();
733 jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
734 }
735 }
736 }
737 }
738
739
740
741
742 @Override
743 public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
744 final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
745 page_.setDOMBuilder(this);
746 try {
747 super.parse(inputSource);
748 }
749 finally {
750 page_.setDOMBuilder(oldBuilder);
751 }
752 }
753
754 private static void appendChild(final DomNode parent, final DomNode child) {
755 if (parent instanceof HtmlTemplate template) {
756 template.getContent().appendChild(child);
757 return;
758 }
759
760 parent.appendChild(child);
761 }
762 }