1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18 import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.net.URL;
23 import java.nio.charset.Charset;
24 import java.util.ArrayDeque;
25 import java.util.Deque;
26
27 import org.apache.commons.lang3.ArrayUtils;
28 import org.htmlunit.BrowserVersion;
29 import org.htmlunit.ObjectInstantiationException;
30 import org.htmlunit.WebClient;
31 import org.htmlunit.WebResponse;
32 import org.htmlunit.cyberneko.HTMLConfiguration;
33 import org.htmlunit.cyberneko.HTMLElements;
34 import org.htmlunit.cyberneko.HTMLScanner;
35 import org.htmlunit.cyberneko.HTMLTagBalancingListener;
36 import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
37 import org.htmlunit.cyberneko.xerces.xni.Augmentations;
38 import org.htmlunit.cyberneko.xerces.xni.QName;
39 import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
40 import org.htmlunit.cyberneko.xerces.xni.XMLString;
41 import org.htmlunit.cyberneko.xerces.xni.XNIException;
42 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
43 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
44 import org.htmlunit.html.DomComment;
45 import org.htmlunit.html.DomDocumentType;
46 import org.htmlunit.html.DomElement;
47 import org.htmlunit.html.DomNode;
48 import org.htmlunit.html.DomText;
49 import org.htmlunit.html.ElementFactory;
50 import org.htmlunit.html.Html;
51 import org.htmlunit.html.HtmlBody;
52 import org.htmlunit.html.HtmlElement;
53 import org.htmlunit.html.HtmlForm;
54 import org.htmlunit.html.HtmlHiddenInput;
55 import org.htmlunit.html.HtmlImage;
56 import org.htmlunit.html.HtmlPage;
57 import org.htmlunit.html.HtmlSvg;
58 import org.htmlunit.html.HtmlTable;
59 import org.htmlunit.html.HtmlTableRow;
60 import org.htmlunit.html.HtmlTemplate;
61 import org.htmlunit.html.ScriptElement;
62 import org.htmlunit.html.SubmittableElement;
63 import org.htmlunit.html.XHtmlPage;
64 import org.htmlunit.html.parser.HTMLParser;
65 import org.htmlunit.html.parser.HTMLParserDOMBuilder;
66 import org.htmlunit.html.parser.HTMLParserListener;
67 import org.htmlunit.javascript.host.html.HTMLBodyElement;
68 import org.htmlunit.util.StringUtils;
69 import org.w3c.dom.Node;
70 import org.xml.sax.Attributes;
71 import org.xml.sax.ContentHandler;
72 import org.xml.sax.Locator;
73 import org.xml.sax.SAXException;
74 import org.xml.sax.ext.LexicalHandler;
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96 final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
97 implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
98
99
100 private static final HTMLElements HTMLELEMENTS;
101 private static final HTMLElements HTMLELEMENTS_WITH_CMD;
102
103 static {
104
105 final short commandShortCode = HTMLElements.UNKNOWN + 1;
106
107 final HTMLElements.Element command = new HTMLElements.Element(commandShortCode, "COMMAND",
108 HTMLElements.Element.EMPTY, new short[] {HTMLElements.BODY, HTMLElements.HEAD}, null);
109
110 HTMLELEMENTS = new HTMLElements();
111
112 final HTMLElements value = new HTMLElements();
113 value.setElement(command);
114 HTMLELEMENTS_WITH_CMD = value;
115 }
116
117 private enum HeadParsed { YES, SYNTHESIZED, NO }
118
119 private final HTMLParser htmlParser_;
120 private final HtmlPage page_;
121
122 private Locator locator_;
123 private final Deque<DomNode> stack_ = new ArrayDeque<>();
124
125
126 private boolean snippetStartNodeOverwritten_;
127 private final int initialSize_;
128 private DomNode currentNode_;
129 private final boolean createdByJavascript_;
130 private final XMLString characters_ = new XMLString();
131 private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
132 private HtmlElement body_;
133 private boolean lastTagWasSynthesized_;
134 private HtmlForm consumingForm_;
135 private boolean formEndingIsAdjusting_;
136 private boolean insideSvg_;
137 private boolean insideTemplate_;
138
139 private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
140 private static final String FEATURE_PARSE_NOSCRIPT
141 = "http://cyberneko.org/html/features/parse-noscript-content";
142
143
144
145
146
147 @Override
148 public void pushInputString(final String html) {
149 page_.registerParsingStart();
150 page_.registerInlineSnippetParsingStart();
151 try {
152 final WebResponse webResponse = page_.getWebResponse();
153 final Charset charset = webResponse.getContentCharset();
154 final String url = webResponse.getWebRequest().getUrl().toString();
155 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
156 ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
157 }
158 finally {
159 page_.registerParsingEnd();
160 page_.registerInlineSnippetParsingEnd();
161 }
162 }
163
164
165
166
167
168
169
/**
 * Creates a builder that attaches the parsed nodes to the given node.
 *
 * @param htmlParser the parser later asked for element factories
 * @param node the node to start with; its page has to be an {@link HtmlPage}
 * @param url handed to the error handler that is installed if the web client has a parser listener
 * @param htmlContent handed to the same error handler
 * @param createdByJavascript whether the content was created by JavaScript
 * @throws ObjectInstantiationException if a parser feature or handler can not be set
 */
HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
        final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
    super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));

    htmlParser_ = htmlParser;
    createdByJavascript_ = createdByJavascript;
    page_ = (HtmlPage) node.getPage();
    currentNode_ = node;

    // seed the stack with everything getAncestors() reports for the start node
    for (final Node ancestor : node.getAncestors()) {
        stack_.push((DomNode) ancestor);
    }
    // remembered to detect end tags that would close the start node itself
    initialSize_ = stack_.size();

    final WebClient webClient = page_.getWebClient();
    final HTMLParserListener parserListener = webClient.getHTMLParserListener();
    final boolean reportErrors = parserListener != null;
    if (reportErrors) {
        parserConfiguration_.setErrorHandler(new HtmlUnitNekoHTMLErrorHandler(parserListener, url, htmlContent));
    }

    try {
        setFeature(FEATURE_AUGMENTATIONS, true);
        setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
        // noscript content is only parsed when JavaScript is switched off
        setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.isJavaScriptEnabled());
        setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);

        setContentHandler(this);
        setLexicalHandler(this);
    }
    catch (final SAXException e) {
        throw new ObjectInstantiationException("unable to create HTML parser", e);
    }
}
204
205
206
207
208
209 private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
210 if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
211 return new HTMLConfiguration(HTMLELEMENTS_WITH_CMD);
212 }
213 return new HTMLConfiguration(HTMLELEMENTS);
214 }
215
216
217
218
219 @Override
220 public void setDocumentLocator(final Locator locator) {
221 locator_ = locator;
222 }
223
224
225
226
227 @Override
228 public void startDocument() throws SAXException {
229
230 }
231
232
233 @Override
234 public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
235 throws XNIException {
236
237 lastTagWasSynthesized_ = augs.isSynthesized();
238 super.startElement(element, attributes, augs);
239 }
240
241
242
243
244 @Override
245 public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
246 throws SAXException {
247
248 if (snippetStartNodeOverwritten_) {
249 snippetStartNodeOverwritten_ = false;
250 return;
251 }
252 handleCharacters();
253
254 final String tagLower = StringUtils.toRootLowerCase(localName);
255 if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
256
257
258 stack_.push(currentNode_);
259 return;
260 }
261
262 if ("head".equals(tagLower)) {
263 if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
264
265
266 stack_.push(currentNode_);
267 return;
268 }
269
270 headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
271 }
272
273 if (namespaceURI != null) {
274 namespaceURI = namespaceURI.trim();
275 }
276
277
278
279 HtmlBody oldBody = null;
280 final boolean isBodyTag = "body".equals(tagLower);
281 if (isBodyTag) {
282 final HtmlBody body = page_.getBody();
283 if (body != null) {
284 oldBody = body;
285 }
286 }
287
288
289 if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
290 namespaceURI = null;
291 }
292
293 final ElementFactory factory =
294 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
295 if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
296 namespaceURI = Html.SVG_NAMESPACE;
297 }
298 final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
299 newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
300
301
302 addNodeToRightParent(currentNode_, newElement);
303
304 if (newElement instanceof HtmlSvg) {
305 insideSvg_ = true;
306 }
307 else if (newElement instanceof HtmlTemplate) {
308 insideTemplate_ = true;
309 }
310
311
312
313 if (newElement instanceof HtmlForm) {
314 consumingForm_ = (HtmlForm) newElement;
315 formEndingIsAdjusting_ = false;
316 }
317 else if (consumingForm_ != null) {
318
319 if (newElement instanceof SubmittableElement) {
320
321 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
322 ((HtmlElement) newElement).setOwningForm(consumingForm_);
323 }
324 }
325 }
326
327
328
329 if (oldBody != null) {
330 oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
331 }
332
333 if (!insideSvg_ && isBodyTag) {
334 body_ = (HtmlElement) newElement;
335 }
336 else if (createdByJavascript_
337 && newElement instanceof ScriptElement
338 && (!insideTemplate_
339 || !page_.getWebClient().getBrowserVersion()
340 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
341 final ScriptElement script = (ScriptElement) newElement;
342 script.markAsCreatedByDomParser();
343 }
344
345 currentNode_ = newElement;
346 stack_.push(currentNode_);
347 }
348
349
350
351
352
353 private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
354 final String currentNodeName = currentNode.getNodeName();
355 final String newNodeName = newElement.getNodeName();
356
357
358 if (isTableChild(newNodeName)) {
359 final DomNode parent =
360 "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
361 appendChild(parent, newElement);
362 return;
363 }
364 if ("tr".equals(newNodeName)) {
365 final DomNode parent =
366 isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
367 appendChild(parent, newElement);
368 return;
369 }
370 if (isTableCell(newNodeName)) {
371 final DomNode parent =
372 "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
373 appendChild(parent, newElement);
374 return;
375 }
376
377
378 if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
379 if ("template".equals(newNodeName)) {
380 currentNode.appendChild(newElement);
381 }
382
383
384 else if (!"colgroup".equals(currentNodeName)
385 && ("script".equals(newNodeName)
386 || "form".equals(newNodeName)
387 || "style".equals(newNodeName))) {
388 currentNode.appendChild(newElement);
389 }
390
391
392 else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
393 currentNode.appendChild(newElement);
394 }
395 else if ("caption".equals(currentNodeName)) {
396 currentNode.appendChild(newElement);
397 }
398 else if (newElement instanceof HtmlHiddenInput) {
399 currentNode.appendChild(newElement);
400 }
401 else {
402
403 final DomNode parent = findElementOnStack("table");
404 parent.insertBefore(newElement);
405 }
406 return;
407 }
408
409 if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
410
411 appendChild(currentNode.getParentNode(), newElement);
412 return;
413 }
414
415
416 appendChild(currentNode, newElement);
417 }
418
419 private DomNode findElementOnStack(final String... searchedElementNames) {
420 DomNode searchedNode = null;
421 for (final DomNode node : stack_) {
422 if (ArrayUtils.contains(searchedElementNames, node.getNodeName())) {
423 searchedNode = node;
424 break;
425 }
426 }
427
428 if (searchedNode == null) {
429 searchedNode = stack_.peek();
430 }
431
432 return searchedNode;
433 }
434
435 private static boolean isTableChild(final String nodeName) {
436 if (nodeName == null || nodeName.length() < 5) {
437 return false;
438 }
439
440 return "thead".equals(nodeName)
441 || "tbody".equals(nodeName)
442 || "tfoot".equals(nodeName)
443 || "caption".equals(nodeName)
444 || "colgroup".equals(nodeName);
445 }
446
447 private static boolean isTableCell(final String nodeName) {
448 if (nodeName == null || nodeName.length() != 2) {
449 return false;
450 }
451 return "td".equals(nodeName) || "th".equals(nodeName);
452 }
453
454
455 @Override
456 public void endElement(final QName element, final Augmentations augs)
457 throws XNIException {
458
459 lastTagWasSynthesized_ = augs.isSynthesized();
460 super.endElement(element, augs);
461 }
462
463
464
465
466 @Override
467 public void endElement(final String namespaceURI, final String localName, final String qName)
468 throws SAXException {
469
470 final String tagLower = StringUtils.toRootLowerCase(localName);
471
472 handleCharacters();
473
474 if (page_.isParsingHtmlSnippet()) {
475 if ("html".equals(tagLower) || "body".equals(tagLower)) {
476 return;
477 }
478 if (stack_.size() == initialSize_) {
479
480
481 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
482 return;
483 }
484 }
485
486 if ("svg".equals(tagLower)) {
487 insideSvg_ = false;
488 }
489 else if ("template".equals(tagLower)) {
490 insideTemplate_ = false;
491 }
492
493
494
495
496 if (stack_.isEmpty()) {
497 return;
498 }
499
500 final DomNode previousNode = stack_.pop();
501 previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
502
503 if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
504
505
506 consumingForm_ = null;
507 }
508
509 if (!stack_.isEmpty()) {
510 currentNode_ = stack_.peek();
511 }
512
513 final boolean postponed = page_.isParsingInlineHtmlSnippet();
514 previousNode.onAllChildrenAddedToPage(postponed);
515 }
516
517
518 @Override
519 public void characters(final char[] ch, final int start, final int length) throws SAXException {
520 characters_.append(ch, start, length);
521 }
522
523
524 @Override
525 public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
526 characters_.append(ch, start, length);
527 }
528
529
530
531
532 private void handleCharacters() {
533
534 if (characters_.length() == 0) {
535 return;
536 }
537
538
539 final String textValue = characters_.toString();
540 characters_.clear();
541
542 if (org.apache.commons.lang3.StringUtils.isBlank(textValue)) {
543 appendChild(currentNode_, new DomText(page_, textValue));
544 }
545 else {
546
547 if (currentNode_ instanceof HtmlTableRow) {
548 final HtmlTableRow row = (HtmlTableRow) currentNode_;
549 final HtmlTable enclosingTable = row.getEnclosingTable();
550 if (enclosingTable != null) {
551 if (enclosingTable.getPreviousSibling() instanceof DomText) {
552 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
553 domText.setTextContent(domText.getWholeText() + textValue);
554 }
555 else {
556 enclosingTable.insertBefore(new DomText(page_, textValue));
557 }
558 }
559 }
560 else if (currentNode_ instanceof HtmlTable) {
561 final HtmlTable enclosingTable = (HtmlTable) currentNode_;
562 if (enclosingTable.getPreviousSibling() instanceof DomText) {
563 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
564 domText.setTextContent(domText.getWholeText() + textValue);
565 }
566 else {
567 enclosingTable.insertBefore(new DomText(page_, textValue));
568 }
569 }
570 else if (currentNode_ instanceof HtmlImage) {
571 currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
572 }
573 else {
574 appendChild(currentNode_, new DomText(page_, textValue));
575 }
576 }
577 }
578
579
580 @Override
581 public void endDocument() throws SAXException {
582 handleCharacters();
583 if (locator_ != null) {
584 page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
585 }
586 }
587
588
589 @Override
590 public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
591
592 }
593
594
595 @Override
596 public void endPrefixMapping(final String prefix) throws SAXException {
597
598 }
599
600
601 @Override
602 public void processingInstruction(final String target, final String data) throws SAXException {
603
604 }
605
606
607 @Override
608 public void skippedEntity(final String name) throws SAXException {
609
610 }
611
612
613
614
615 @Override
616 public void comment(final char[] ch, final int start, final int length) {
617 handleCharacters();
618 final String data = new String(ch, start, length);
619 final DomComment comment = new DomComment(page_, data);
620 appendChild(currentNode_, comment);
621 }
622
623
624 @Override
625 public void endCDATA() {
626
627 }
628
629
630 @Override
631 public void endDTD() {
632
633 }
634
635
636 @Override
637 public void endEntity(final String name) {
638
639 }
640
641
642 @Override
643 public void startCDATA() {
644
645 }
646
647
648 @Override
649 public void startDTD(final String name, final String publicId, final String systemId) {
650 final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
651 page_.setDocumentType(type);
652
653 final Node child;
654 child = type;
655 page_.appendChild(child);
656 }
657
658
659 @Override
660 public void startEntity(final String name) {
661
662 }
663
664
665
666
667 @Override
668 public void ignoredEndElement(final QName element, final Augmentations augs) {
669
670
671 if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
672 consumingForm_ = null;
673
674 if (findElementOnStack("table", "form") instanceof HtmlTable) {
675
676 }
677 else {
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705 formEndingIsAdjusting_ = true;
706 }
707 }
708 }
709
710
711
712
713 @Override
714 public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
715
716
717 if (attrs != null && body_ != null) {
718 final String lp = elem.getLocalpart();
719 if (lp != null && lp.length() == 4) {
720 if ("body".equalsIgnoreCase(lp)) {
721 copyAttributes(body_, attrs);
722 }
723 else if ("html".equalsIgnoreCase(lp)) {
724 final DomNode parent = body_.getParentNode();
725 if (parent instanceof DomElement) {
726 copyAttributes((DomElement) parent, attrs);
727 }
728 }
729 }
730 }
731 }
732
733 private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
734 final int length = attrs.getLength();
735
736 for (int i = 0; i < length; i++) {
737 final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
738 if (to.getAttributes().getNamedItem(attrName) == null) {
739 to.setAttribute(attrName, attrs.getValue(i));
740 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
741 && to.getScriptableObject() instanceof HTMLBodyElement) {
742 final HTMLBodyElement jsBody = to.getScriptableObject();
743 jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
744 }
745 }
746 }
747 }
748
749
750
751
752 @Override
753 public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
754 final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
755 page_.setDOMBuilder(this);
756 try {
757 super.parse(inputSource);
758 }
759 finally {
760 page_.setDOMBuilder(oldBuilder);
761 }
762 }
763
764 private static void appendChild(final DomNode parent, final DomNode child) {
765 if (parent instanceof HtmlTemplate) {
766 ((HtmlTemplate) parent).getContent().appendChild(child);
767 return;
768 }
769
770 parent.appendChild(child);
771 }
772 }