1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18 import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.net.URL;
23 import java.nio.charset.Charset;
24 import java.util.ArrayDeque;
25 import java.util.Deque;
26
27 import org.htmlunit.BrowserVersion;
28 import org.htmlunit.ObjectInstantiationException;
29 import org.htmlunit.WebClient;
30 import org.htmlunit.WebResponse;
31 import org.htmlunit.cyberneko.HTMLConfiguration;
32 import org.htmlunit.cyberneko.HTMLElements;
33 import org.htmlunit.cyberneko.HTMLScanner;
34 import org.htmlunit.cyberneko.HTMLTagBalancingListener;
35 import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
36 import org.htmlunit.cyberneko.xerces.xni.Augmentations;
37 import org.htmlunit.cyberneko.xerces.xni.QName;
38 import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
39 import org.htmlunit.cyberneko.xerces.xni.XMLString;
40 import org.htmlunit.cyberneko.xerces.xni.XNIException;
41 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
42 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
43 import org.htmlunit.html.DomCDataSection;
44 import org.htmlunit.html.DomComment;
45 import org.htmlunit.html.DomDocumentType;
46 import org.htmlunit.html.DomElement;
47 import org.htmlunit.html.DomNode;
48 import org.htmlunit.html.DomText;
49 import org.htmlunit.html.ElementFactory;
50 import org.htmlunit.html.Html;
51 import org.htmlunit.html.HtmlBody;
52 import org.htmlunit.html.HtmlElement;
53 import org.htmlunit.html.HtmlForm;
54 import org.htmlunit.html.HtmlHiddenInput;
55 import org.htmlunit.html.HtmlImage;
56 import org.htmlunit.html.HtmlPage;
57 import org.htmlunit.html.HtmlSvg;
58 import org.htmlunit.html.HtmlTable;
59 import org.htmlunit.html.HtmlTableRow;
60 import org.htmlunit.html.HtmlTemplate;
61 import org.htmlunit.html.ScriptElement;
62 import org.htmlunit.html.SubmittableElement;
63 import org.htmlunit.html.XHtmlPage;
64 import org.htmlunit.html.parser.HTMLParser;
65 import org.htmlunit.html.parser.HTMLParserDOMBuilder;
66 import org.htmlunit.html.parser.HTMLParserListener;
67 import org.htmlunit.javascript.host.html.HTMLBodyElement;
68 import org.htmlunit.util.StringUtils;
69 import org.w3c.dom.Node;
70 import org.xml.sax.Attributes;
71 import org.xml.sax.ContentHandler;
72 import org.xml.sax.Locator;
73 import org.xml.sax.SAXException;
74 import org.xml.sax.ext.LexicalHandler;
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/**
 * Receives the parse events of the cyberneko HTML parser and builds the matching tree of
 * {@link DomNode}s inside an {@link HtmlPage}.
 *
 * <p>The class is the parser itself (it extends {@link AbstractSAXParser}) and registers
 * itself as its own {@link ContentHandler} and {@link LexicalHandler}. In addition it
 * listens to the tags dropped by the tag balancer ({@link HTMLTagBalancingListener}) and
 * can be installed on a page as its {@link HTMLParserDOMBuilder}.
 */
final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
        implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
98
99
/** Plain element table, used when the browser does not have the HTML_COMMAND_TAG feature. */
private static final HTMLElements HTMLELEMENTS;
/** Element table that additionally contains the COMMAND element. */
private static final HTMLElements HTMLELEMENTS_WITH_CMD;

static {
    // the extra element gets the short code one above HTMLElements.UNKNOWN
    final short cmdCode = HTMLElements.UNKNOWN + 1;
    final short[] bodyAndHead = {HTMLElements.BODY, HTMLElements.HEAD};
    final HTMLElements.Element cmdElement = new HTMLElements.Element(
            cmdCode, "COMMAND", HTMLElements.Element.EMPTY, bodyAndHead, null);

    HTMLELEMENTS = new HTMLElements();

    final HTMLElements extended = new HTMLElements();
    extended.setElement(cmdElement);
    HTMLELEMENTS_WITH_CMD = extended;
}
116
117 private enum HeadParsed { YES, SYNTHESIZED, NO }
118
119 private final HTMLParser htmlParser_;
120 private final HtmlPage page_;
121
122 private Locator locator_;
123 private final Deque<DomNode> stack_ = new ArrayDeque<>();
124
125
126 private boolean snippetStartNodeOverwritten_;
127 private final int initialSize_;
128 private DomNode currentNode_;
129 private final boolean createdByJavascript_;
130 private final XMLString characters_ = new XMLString();
131 private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
132 private HtmlElement body_;
133 private boolean lastTagWasSynthesized_;
134 private HtmlForm consumingForm_;
135 private boolean formEndingIsAdjusting_;
136 private boolean insideSvg_;
137 private boolean insideTemplate_;
138
139 private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
140 private static final String FEATURE_PARSE_NOSCRIPT
141 = "http://cyberneko.org/html/features/parse-noscript-content";
142
143
144
145
146
147 @Override
148 public void pushInputString(final String html) {
149 page_.registerParsingStart();
150 page_.registerInlineSnippetParsingStart();
151 try {
152 final WebResponse webResponse = page_.getWebResponse();
153 final Charset charset = webResponse.getContentCharset();
154 final String url = webResponse.getWebRequest().getUrl().toString();
155 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
156 ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
157 }
158 finally {
159 page_.registerParsingEnd();
160 page_.registerInlineSnippetParsingEnd();
161 }
162 }
163
164
165
166
167
168
169
/**
 * Creates a builder that adds the parsed nodes below the given node.
 *
 * @param htmlParser the parser used to look up element factories
 * @param node the node parsing starts at; its page must be an {@link HtmlPage}
 * @param url passed on to the error handler (only used if the web client has a parser listener)
 * @param htmlContent passed on to the error handler (only used if the web client has a parser listener)
 * @param createdByJavascript whether the builder is created on behalf of JavaScript
 * @throws ObjectInstantiationException if configuring the parser fails with a {@link SAXException}
 */
HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
        final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
    super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));

    htmlParser_ = htmlParser;
    page_ = (HtmlPage) node.getPage();
    createdByJavascript_ = createdByJavascript;

    // start with the ancestors of the start node on the stack
    currentNode_ = node;
    for (final Node ancestor : node.getAncestors()) {
        stack_.push((DomNode) ancestor);
    }
    // nothing below touches the stack, so this is the size the parse starts with
    initialSize_ = stack_.size();

    final WebClient client = page_.getWebClient();
    final HTMLParserListener parserListener = client.getHTMLParserListener();
    final boolean reportErrors = parserListener != null;
    if (reportErrors) {
        parserConfiguration_.setErrorHandler(new HtmlUnitNekoHTMLErrorHandler(parserListener, url, htmlContent));
    }

    try {
        setFeature(FEATURE_AUGMENTATIONS, true);
        setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
        // noscript content is only parsed if JavaScript is switched off
        setFeature(FEATURE_PARSE_NOSCRIPT, !client.isJavaScriptEnabled());
        setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);

        setContentHandler(this);
        setLexicalHandler(this);
    }
    catch (final SAXException e) {
        throw new ObjectInstantiationException("unable to create HTML parser", e);
    }
}
204
205
206
207
208
209 private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
210
211
212
213
214 if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
215 return new HTMLConfiguration(new HTMLElements.HTMLElementsWithCache(HTMLELEMENTS_WITH_CMD));
216 }
217 return new HTMLConfiguration(new HTMLElements.HTMLElementsWithCache(HTMLELEMENTS));
218 }
219
220
221
222
223 @Override
224 public void setDocumentLocator(final Locator locator) {
225 locator_ = locator;
226 }
227
228
229
230
231 @Override
232 public void startDocument() throws SAXException {
233
234 }
235
236
237 @Override
238 public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
239 throws XNIException {
240
241 lastTagWasSynthesized_ = augs.isSynthesized();
242 super.startElement(element, attributes, augs);
243 }
244
245
246
247
248 @Override
249 public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
250 throws SAXException {
251
252 if (snippetStartNodeOverwritten_) {
253 snippetStartNodeOverwritten_ = false;
254 return;
255 }
256 handleCharacters();
257
258 final String tagLower = StringUtils.toRootLowerCase(localName);
259 if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
260
261
262 stack_.push(currentNode_);
263 return;
264 }
265
266 if ("head".equals(tagLower)) {
267 if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
268
269
270 stack_.push(currentNode_);
271 return;
272 }
273
274 headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
275 }
276
277
278
279 HtmlBody oldBody = null;
280 final boolean isBodyTag = "body".equals(tagLower);
281 if (isBodyTag) {
282 final HtmlBody body = page_.getBody();
283 if (body != null) {
284 oldBody = body;
285 }
286 }
287
288 if (namespaceURI != null) {
289 namespaceURI = namespaceURI.trim();
290 }
291
292 if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
293 namespaceURI = null;
294 }
295
296 final ElementFactory factory =
297 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
298 if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
299 namespaceURI = Html.SVG_NAMESPACE;
300 }
301
302 final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
303 newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
304
305
306 addNodeToRightParent(currentNode_, newElement);
307
308 if (newElement instanceof HtmlSvg) {
309 insideSvg_ = true;
310 }
311 else if (newElement instanceof HtmlTemplate) {
312 insideTemplate_ = true;
313 }
314
315
316
317 else if (newElement instanceof HtmlForm) {
318 consumingForm_ = (HtmlForm) newElement;
319 formEndingIsAdjusting_ = false;
320 }
321 else if (consumingForm_ != null) {
322
323 if (newElement instanceof SubmittableElement) {
324
325 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
326 ((HtmlElement) newElement).setOwningForm(consumingForm_);
327 }
328 }
329 }
330
331
332
333 if (oldBody != null) {
334 oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
335 }
336
337 if (!insideSvg_ && isBodyTag) {
338 body_ = (HtmlElement) newElement;
339 }
340 else if (createdByJavascript_
341 && newElement instanceof ScriptElement
342 && (!insideTemplate_
343 || !page_.getWebClient().getBrowserVersion()
344 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
345 final ScriptElement script = (ScriptElement) newElement;
346 script.markAsCreatedByDomParser();
347 }
348
349 currentNode_ = newElement;
350 stack_.push(currentNode_);
351 }
352
353
354
355
356
357 private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
358 final String currentNodeName = currentNode.getNodeName();
359 final String newNodeName = newElement.getNodeName();
360
361
362 if (isTableChild(newNodeName)) {
363 final DomNode parent =
364 "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
365 appendChild(parent, newElement);
366 return;
367 }
368 if ("tr".equals(newNodeName)) {
369 final DomNode parent =
370 isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
371 appendChild(parent, newElement);
372 return;
373 }
374 if (isTableCell(newNodeName)) {
375 final DomNode parent =
376 "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
377 appendChild(parent, newElement);
378 return;
379 }
380
381
382 if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
383 if ("template".equals(newNodeName)) {
384 currentNode.appendChild(newElement);
385 }
386
387
388 else if (!"colgroup".equals(currentNodeName)
389 && ("script".equals(newNodeName)
390 || "form".equals(newNodeName)
391 || "style".equals(newNodeName))) {
392 currentNode.appendChild(newElement);
393 }
394
395
396 else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
397 currentNode.appendChild(newElement);
398 }
399 else if ("caption".equals(currentNodeName)) {
400 currentNode.appendChild(newElement);
401 }
402 else if (newElement instanceof HtmlHiddenInput) {
403 currentNode.appendChild(newElement);
404 }
405 else {
406
407 final DomNode parent = findElementOnStack("table");
408 parent.insertBefore(newElement);
409 }
410 return;
411 }
412
413 if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
414
415 appendChild(currentNode.getParentNode(), newElement);
416 return;
417 }
418
419
420 appendChild(currentNode, newElement);
421 }
422
423 private DomNode findElementOnStack(final String searchedElementName) {
424 for (final DomNode node : stack_) {
425 if (searchedElementName.equals(node.getNodeName())) {
426 return node;
427 }
428 }
429
430
431 return stack_.peek();
432 }
433
434 private DomNode findElementOnStack(final String... searchedElementNames) {
435 for (final DomNode node : stack_) {
436 for (final String searchedElementName : searchedElementNames) {
437 if (searchedElementName.equals(node.getNodeName())) {
438 return node;
439 }
440 }
441 }
442
443
444 return stack_.peek();
445 }
446
447 private static boolean isTableChild(final String nodeName) {
448 if (nodeName == null || nodeName.length() < 5) {
449 return false;
450 }
451
452 return "thead".equals(nodeName)
453 || "tbody".equals(nodeName)
454 || "tfoot".equals(nodeName)
455 || "caption".equals(nodeName)
456 || "colgroup".equals(nodeName);
457 }
458
459 private static boolean isTableCell(final String nodeName) {
460 if (nodeName == null || nodeName.length() != 2) {
461 return false;
462 }
463 return "td".equals(nodeName) || "th".equals(nodeName);
464 }
465
466
467 @Override
468 public void endElement(final QName element, final Augmentations augs)
469 throws XNIException {
470
471 lastTagWasSynthesized_ = augs.isSynthesized();
472 super.endElement(element, augs);
473 }
474
475
476
477
478 @Override
479 public void endElement(final String namespaceURI, final String localName, final String qName)
480 throws SAXException {
481
482 final String tagLower = StringUtils.toRootLowerCase(localName);
483
484 handleCharacters();
485
486 if (page_.isParsingHtmlSnippet()) {
487 if ("html".equals(tagLower) || "body".equals(tagLower)) {
488 return;
489 }
490 if (stack_.size() == initialSize_) {
491
492
493 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
494 return;
495 }
496 }
497
498 if ("svg".equals(tagLower)) {
499 insideSvg_ = false;
500 }
501 else if ("template".equals(tagLower)) {
502 insideTemplate_ = false;
503 }
504
505
506
507
508 if (stack_.isEmpty()) {
509 return;
510 }
511
512 final DomNode previousNode = stack_.pop();
513 previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
514
515 if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
516
517
518 consumingForm_ = null;
519 }
520
521 if (!stack_.isEmpty()) {
522 currentNode_ = stack_.peek();
523 }
524
525 final boolean postponed = page_.isParsingInlineHtmlSnippet();
526 previousNode.onAllChildrenAddedToPage(postponed);
527 }
528
529
530 @Override
531 public void characters(final char[] ch, final int start, final int length) throws SAXException {
532 characters_.append(ch, start, length);
533 }
534
535
536 @Override
537 public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
538 characters_.append(ch, start, length);
539 }
540
541
542
543
544 private void handleCharacters() {
545
546 if (characters_.length() == 0) {
547 return;
548 }
549
550
551 final String textValue = characters_.toString();
552 characters_.clear();
553
554 if (StringUtils.isBlank(textValue)) {
555 appendChild(currentNode_, new DomText(page_, textValue));
556 return;
557 }
558
559
560 if (currentNode_ instanceof HtmlTableRow) {
561 final HtmlTableRow row = (HtmlTableRow) currentNode_;
562 final HtmlTable enclosingTable = row.getEnclosingTable();
563 if (enclosingTable != null) {
564 if (enclosingTable.getPreviousSibling() instanceof DomText) {
565 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
566 domText.setTextContent(domText.getWholeText() + textValue);
567 }
568 else {
569 enclosingTable.insertBefore(new DomText(page_, textValue));
570 }
571 }
572 }
573 else if (currentNode_ instanceof HtmlTable) {
574 final HtmlTable enclosingTable = (HtmlTable) currentNode_;
575 if (enclosingTable.getPreviousSibling() instanceof DomText) {
576 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
577 domText.setTextContent(domText.getWholeText() + textValue);
578 }
579 else {
580 enclosingTable.insertBefore(new DomText(page_, textValue));
581 }
582 }
583 else if (currentNode_ instanceof HtmlImage) {
584 currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
585 }
586 else {
587 appendChild(currentNode_, new DomText(page_, textValue));
588 }
589 }
590
591
592 @Override
593 public void endDocument() throws SAXException {
594 handleCharacters();
595 if (locator_ != null) {
596 page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
597 }
598 }
599
600
601 @Override
602 public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
603
604 }
605
606
607 @Override
608 public void endPrefixMapping(final String prefix) throws SAXException {
609
610 }
611
612
613 @Override
614 public void processingInstruction(final String target, final String data) throws SAXException {
615
616 }
617
618
619 @Override
620 public void skippedEntity(final String name) throws SAXException {
621
622 }
623
624
625
626
627 @Override
628 public void comment(final char[] ch, final int start, final int length) {
629 handleCharacters();
630 final String data = new String(ch, start, length);
631 final DomComment comment = new DomComment(page_, data);
632 appendChild(currentNode_, comment);
633 }
634
635
636 @Override
637 public void endCDATA() {
638 final String data = characters_.toString();
639 characters_.clear();
640
641 final DomCDataSection cdataSection = new DomCDataSection(page_, data);
642 appendChild(currentNode_, cdataSection);
643 }
644
645
646 @Override
647 public void endDTD() {
648
649 }
650
651
652 @Override
653 public void endEntity(final String name) {
654
655 }
656
657
658 @Override
659 public void startCDATA() {
660 handleCharacters();
661 }
662
663
664 @Override
665 public void startDTD(final String name, final String publicId, final String systemId) {
666 final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
667 page_.setDocumentType(type);
668
669 final Node child;
670 child = type;
671 page_.appendChild(child);
672 }
673
674
675 @Override
676 public void startEntity(final String name) {
677
678 }
679
680
681
682
683 @Override
684 public void ignoredEndElement(final QName element, final Augmentations augs) {
685
686
687 if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
688 consumingForm_ = null;
689
690 if (findElementOnStack("table", "form") instanceof HtmlTable) {
691
692 }
693 else {
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721 formEndingIsAdjusting_ = true;
722 }
723 }
724 }
725
726
727
728
729 @Override
730 public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
731
732
733 if (attrs != null && body_ != null) {
734 final String lp = elem.getLocalpart();
735 if (lp != null && lp.length() == 4) {
736 if ("body".equalsIgnoreCase(lp)) {
737 copyAttributes(body_, attrs);
738 }
739 else if ("html".equalsIgnoreCase(lp)) {
740 final DomNode parent = body_.getParentNode();
741 if (parent instanceof DomElement) {
742 copyAttributes((DomElement) parent, attrs);
743 }
744 }
745 }
746 }
747 }
748
749 private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
750 final int length = attrs.getLength();
751
752 for (int i = 0; i < length; i++) {
753 final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
754 if (to.getAttributes().getNamedItem(attrName) == null) {
755 to.setAttribute(attrName, attrs.getValue(i));
756 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
757 && to.getScriptableObject() instanceof HTMLBodyElement) {
758 final HTMLBodyElement jsBody = to.getScriptableObject();
759 jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
760 }
761 }
762 }
763 }
764
765
766
767
768 @Override
769 public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
770 final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
771 page_.setDOMBuilder(this);
772 try {
773 super.parse(inputSource);
774 }
775 finally {
776 page_.setDOMBuilder(oldBuilder);
777 }
778 }
779
780 private static void appendChild(final DomNode parent, final DomNode child) {
781 if (parent instanceof HtmlTemplate) {
782 ((HtmlTemplate) parent).getContent().appendChild(child);
783 return;
784 }
785
786 parent.appendChild(child);
787 }
788 }