1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.parser.neko;
16
17 import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18 import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.net.URL;
23 import java.nio.charset.Charset;
24 import java.util.ArrayDeque;
25 import java.util.Deque;
26
27 import org.htmlunit.BrowserVersion;
28 import org.htmlunit.ObjectInstantiationException;
29 import org.htmlunit.WebClient;
30 import org.htmlunit.WebResponse;
31 import org.htmlunit.cyberneko.HTMLConfiguration;
32 import org.htmlunit.cyberneko.HTMLElements;
33 import org.htmlunit.cyberneko.HTMLScanner;
34 import org.htmlunit.cyberneko.HTMLTagBalancingListener;
35 import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
36 import org.htmlunit.cyberneko.xerces.xni.Augmentations;
37 import org.htmlunit.cyberneko.xerces.xni.QName;
38 import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
39 import org.htmlunit.cyberneko.xerces.xni.XMLString;
40 import org.htmlunit.cyberneko.xerces.xni.XNIException;
41 import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
42 import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
43 import org.htmlunit.html.DomComment;
44 import org.htmlunit.html.DomDocumentType;
45 import org.htmlunit.html.DomElement;
46 import org.htmlunit.html.DomNode;
47 import org.htmlunit.html.DomText;
48 import org.htmlunit.html.ElementFactory;
49 import org.htmlunit.html.Html;
50 import org.htmlunit.html.HtmlBody;
51 import org.htmlunit.html.HtmlElement;
52 import org.htmlunit.html.HtmlForm;
53 import org.htmlunit.html.HtmlHiddenInput;
54 import org.htmlunit.html.HtmlImage;
55 import org.htmlunit.html.HtmlPage;
56 import org.htmlunit.html.HtmlSvg;
57 import org.htmlunit.html.HtmlTable;
58 import org.htmlunit.html.HtmlTableRow;
59 import org.htmlunit.html.HtmlTemplate;
60 import org.htmlunit.html.ScriptElement;
61 import org.htmlunit.html.SubmittableElement;
62 import org.htmlunit.html.XHtmlPage;
63 import org.htmlunit.html.parser.HTMLParser;
64 import org.htmlunit.html.parser.HTMLParserDOMBuilder;
65 import org.htmlunit.html.parser.HTMLParserListener;
66 import org.htmlunit.javascript.host.html.HTMLBodyElement;
67 import org.htmlunit.util.StringUtils;
68 import org.w3c.dom.Node;
69 import org.xml.sax.Attributes;
70 import org.xml.sax.ContentHandler;
71 import org.xml.sax.Locator;
72 import org.xml.sax.SAXException;
73 import org.xml.sax.ext.LexicalHandler;
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95 final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
96 implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
97
98
/** Element table used when the browser does not support the COMMAND tag. */
private static final HTMLElements HTMLELEMENTS;

/** Element table that additionally registers the COMMAND element. */
private static final HTMLElements HTMLELEMENTS_WITH_CMD;

static {
    // The additional element uses the short code HTMLElements.UNKNOWN + 1.
    final short commandCode = HTMLElements.UNKNOWN + 1;
    final short[] bodyAndHead = {HTMLElements.BODY, HTMLElements.HEAD};
    final HTMLElements.Element commandElement = new HTMLElements.Element(
            commandCode, "COMMAND", HTMLElements.Element.EMPTY, bodyAndHead, null);

    HTMLELEMENTS = new HTMLElements();

    final HTMLElements withCommand = new HTMLElements();
    withCommand.setElement(commandElement);
    HTMLELEMENTS_WITH_CMD = withCommand;
}
115
116 private enum HeadParsed { YES, SYNTHESIZED, NO }
117
118 private final HTMLParser htmlParser_;
119 private final HtmlPage page_;
120
121 private Locator locator_;
122 private final Deque<DomNode> stack_ = new ArrayDeque<>();
123
124
125 private boolean snippetStartNodeOverwritten_;
126 private final int initialSize_;
127 private DomNode currentNode_;
128 private final boolean createdByJavascript_;
129 private final XMLString characters_ = new XMLString();
130 private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
131 private HtmlElement body_;
132 private boolean lastTagWasSynthesized_;
133 private HtmlForm consumingForm_;
134 private boolean formEndingIsAdjusting_;
135 private boolean insideSvg_;
136 private boolean insideTemplate_;
137
138 private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
139 private static final String FEATURE_PARSE_NOSCRIPT
140 = "http://cyberneko.org/html/features/parse-noscript-content";
141
142
143
144
145
146 @Override
147 public void pushInputString(final String html) {
148 page_.registerParsingStart();
149 page_.registerInlineSnippetParsingStart();
150 try {
151 final WebResponse webResponse = page_.getWebResponse();
152 final Charset charset = webResponse.getContentCharset();
153 final String url = webResponse.getWebRequest().getUrl().toString();
154 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
155 ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
156 }
157 finally {
158 page_.registerParsingEnd();
159 page_.registerInlineSnippetParsingEnd();
160 }
161 }
162
163
164
165
166
167
168
/**
 * Creates a builder that adds the parsed content below the given node.
 *
 * @param htmlParser the parser used to look up element factories
 * @param node the node new content is attached to; its page must be an {@link HtmlPage}
 * @param url passed to the error handler when a parser listener is installed
 * @param htmlContent passed to the error handler when a parser listener is installed
 * @param createdByJavascript whether created script elements may be marked as
 *        created by the DOM parser
 * @throws ObjectInstantiationException if configuring the parser features fails
 */
HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
        final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
    super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));

    htmlParser_ = htmlParser;
    page_ = (HtmlPage) node.getPage();
    createdByJavascript_ = createdByJavascript;

    // seed the stack with the ancestors of the start node
    currentNode_ = node;
    for (final Node ancestor : node.getAncestors()) {
        stack_.push((DomNode) ancestor);
    }

    final WebClient client = page_.getWebClient();
    final HTMLParserListener parserListener = client.getHTMLParserListener();
    final boolean hasListener = parserListener != null;
    if (hasListener) {
        parserConfiguration_.setErrorHandler(
                new HtmlUnitNekoHTMLErrorHandler(parserListener, url, htmlContent));
    }

    try {
        setFeature(FEATURE_AUGMENTATIONS, true);
        setFeature("http://cyberneko.org/html/features/report-errors", hasListener);
        // noscript content is parsed only while JavaScript is disabled
        setFeature(FEATURE_PARSE_NOSCRIPT, !client.isJavaScriptEnabled());
        setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);

        setContentHandler(this);
        setLexicalHandler(this);
    }
    catch (final SAXException e) {
        throw new ObjectInstantiationException("unable to create HTML parser", e);
    }

    initialSize_ = stack_.size();
}
203
204
205
206
207
208 private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
209 if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
210 return new HTMLConfiguration(HTMLELEMENTS_WITH_CMD);
211 }
212 return new HTMLConfiguration(HTMLELEMENTS);
213 }
214
215
216
217
218 @Override
219 public void setDocumentLocator(final Locator locator) {
220 locator_ = locator;
221 }
222
223
224
225
226 @Override
227 public void startDocument() throws SAXException {
228
229 }
230
231
232 @Override
233 public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
234 throws XNIException {
235
236 lastTagWasSynthesized_ = augs.isSynthesized();
237 super.startElement(element, attributes, augs);
238 }
239
240
241
242
243 @Override
244 public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
245 throws SAXException {
246
247 if (snippetStartNodeOverwritten_) {
248 snippetStartNodeOverwritten_ = false;
249 return;
250 }
251 handleCharacters();
252
253 final String tagLower = StringUtils.toRootLowerCase(localName);
254 if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
255
256
257 stack_.push(currentNode_);
258 return;
259 }
260
261 if ("head".equals(tagLower)) {
262 if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
263
264
265 stack_.push(currentNode_);
266 return;
267 }
268
269 headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
270 }
271
272
273
274 HtmlBody oldBody = null;
275 final boolean isBodyTag = "body".equals(tagLower);
276 if (isBodyTag) {
277 final HtmlBody body = page_.getBody();
278 if (body != null) {
279 oldBody = body;
280 }
281 }
282
283 if (namespaceURI != null) {
284 namespaceURI = namespaceURI.trim();
285 }
286
287 if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
288 namespaceURI = null;
289 }
290
291 final ElementFactory factory =
292 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
293 if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
294 namespaceURI = Html.SVG_NAMESPACE;
295 }
296
297 final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
298 newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
299
300
301 addNodeToRightParent(currentNode_, newElement);
302
303 if (newElement instanceof HtmlSvg) {
304 insideSvg_ = true;
305 }
306 else if (newElement instanceof HtmlTemplate) {
307 insideTemplate_ = true;
308 }
309
310
311
312 else if (newElement instanceof HtmlForm) {
313 consumingForm_ = (HtmlForm) newElement;
314 formEndingIsAdjusting_ = false;
315 }
316 else if (consumingForm_ != null) {
317
318 if (newElement instanceof SubmittableElement) {
319
320 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
321 ((HtmlElement) newElement).setOwningForm(consumingForm_);
322 }
323 }
324 }
325
326
327
328 if (oldBody != null) {
329 oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
330 }
331
332 if (!insideSvg_ && isBodyTag) {
333 body_ = (HtmlElement) newElement;
334 }
335 else if (createdByJavascript_
336 && newElement instanceof ScriptElement
337 && (!insideTemplate_
338 || !page_.getWebClient().getBrowserVersion()
339 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
340 final ScriptElement script = (ScriptElement) newElement;
341 script.markAsCreatedByDomParser();
342 }
343
344 currentNode_ = newElement;
345 stack_.push(currentNode_);
346 }
347
348
349
350
351
352 private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
353 final String currentNodeName = currentNode.getNodeName();
354 final String newNodeName = newElement.getNodeName();
355
356
357 if (isTableChild(newNodeName)) {
358 final DomNode parent =
359 "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
360 appendChild(parent, newElement);
361 return;
362 }
363 if ("tr".equals(newNodeName)) {
364 final DomNode parent =
365 isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
366 appendChild(parent, newElement);
367 return;
368 }
369 if (isTableCell(newNodeName)) {
370 final DomNode parent =
371 "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
372 appendChild(parent, newElement);
373 return;
374 }
375
376
377 if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
378 if ("template".equals(newNodeName)) {
379 currentNode.appendChild(newElement);
380 }
381
382
383 else if (!"colgroup".equals(currentNodeName)
384 && ("script".equals(newNodeName)
385 || "form".equals(newNodeName)
386 || "style".equals(newNodeName))) {
387 currentNode.appendChild(newElement);
388 }
389
390
391 else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
392 currentNode.appendChild(newElement);
393 }
394 else if ("caption".equals(currentNodeName)) {
395 currentNode.appendChild(newElement);
396 }
397 else if (newElement instanceof HtmlHiddenInput) {
398 currentNode.appendChild(newElement);
399 }
400 else {
401
402 final DomNode parent = findElementOnStack("table");
403 parent.insertBefore(newElement);
404 }
405 return;
406 }
407
408 if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
409
410 appendChild(currentNode.getParentNode(), newElement);
411 return;
412 }
413
414
415 appendChild(currentNode, newElement);
416 }
417
418 private DomNode findElementOnStack(final String searchedElementName) {
419 for (final DomNode node : stack_) {
420 if (searchedElementName.equals(node.getNodeName())) {
421 return node;
422 }
423 }
424
425
426 return stack_.peek();
427 }
428
429 private DomNode findElementOnStack(final String... searchedElementNames) {
430 for (final DomNode node : stack_) {
431 for (final String searchedElementName : searchedElementNames) {
432 if (searchedElementName.equals(node.getNodeName())) {
433 return node;
434 }
435 }
436 }
437
438
439 return stack_.peek();
440 }
441
442 private static boolean isTableChild(final String nodeName) {
443 if (nodeName == null || nodeName.length() < 5) {
444 return false;
445 }
446
447 return "thead".equals(nodeName)
448 || "tbody".equals(nodeName)
449 || "tfoot".equals(nodeName)
450 || "caption".equals(nodeName)
451 || "colgroup".equals(nodeName);
452 }
453
454 private static boolean isTableCell(final String nodeName) {
455 if (nodeName == null || nodeName.length() != 2) {
456 return false;
457 }
458 return "td".equals(nodeName) || "th".equals(nodeName);
459 }
460
461
462 @Override
463 public void endElement(final QName element, final Augmentations augs)
464 throws XNIException {
465
466 lastTagWasSynthesized_ = augs.isSynthesized();
467 super.endElement(element, augs);
468 }
469
470
471
472
473 @Override
474 public void endElement(final String namespaceURI, final String localName, final String qName)
475 throws SAXException {
476
477 final String tagLower = StringUtils.toRootLowerCase(localName);
478
479 handleCharacters();
480
481 if (page_.isParsingHtmlSnippet()) {
482 if ("html".equals(tagLower) || "body".equals(tagLower)) {
483 return;
484 }
485 if (stack_.size() == initialSize_) {
486
487
488 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
489 return;
490 }
491 }
492
493 if ("svg".equals(tagLower)) {
494 insideSvg_ = false;
495 }
496 else if ("template".equals(tagLower)) {
497 insideTemplate_ = false;
498 }
499
500
501
502
503 if (stack_.isEmpty()) {
504 return;
505 }
506
507 final DomNode previousNode = stack_.pop();
508 previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
509
510 if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
511
512
513 consumingForm_ = null;
514 }
515
516 if (!stack_.isEmpty()) {
517 currentNode_ = stack_.peek();
518 }
519
520 final boolean postponed = page_.isParsingInlineHtmlSnippet();
521 previousNode.onAllChildrenAddedToPage(postponed);
522 }
523
524
525 @Override
526 public void characters(final char[] ch, final int start, final int length) throws SAXException {
527 characters_.append(ch, start, length);
528 }
529
530
531 @Override
532 public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
533 characters_.append(ch, start, length);
534 }
535
536
537
538
539 private void handleCharacters() {
540
541 if (characters_.length() == 0) {
542 return;
543 }
544
545
546 final String textValue = characters_.toString();
547 characters_.clear();
548
549 if (org.apache.commons.lang3.StringUtils.isBlank(textValue)) {
550 appendChild(currentNode_, new DomText(page_, textValue));
551 return;
552 }
553
554
555 if (currentNode_ instanceof HtmlTableRow) {
556 final HtmlTableRow row = (HtmlTableRow) currentNode_;
557 final HtmlTable enclosingTable = row.getEnclosingTable();
558 if (enclosingTable != null) {
559 if (enclosingTable.getPreviousSibling() instanceof DomText) {
560 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
561 domText.setTextContent(domText.getWholeText() + textValue);
562 }
563 else {
564 enclosingTable.insertBefore(new DomText(page_, textValue));
565 }
566 }
567 }
568 else if (currentNode_ instanceof HtmlTable) {
569 final HtmlTable enclosingTable = (HtmlTable) currentNode_;
570 if (enclosingTable.getPreviousSibling() instanceof DomText) {
571 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
572 domText.setTextContent(domText.getWholeText() + textValue);
573 }
574 else {
575 enclosingTable.insertBefore(new DomText(page_, textValue));
576 }
577 }
578 else if (currentNode_ instanceof HtmlImage) {
579 currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
580 }
581 else {
582 appendChild(currentNode_, new DomText(page_, textValue));
583 }
584 }
585
586
587 @Override
588 public void endDocument() throws SAXException {
589 handleCharacters();
590 if (locator_ != null) {
591 page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
592 }
593 }
594
595
596 @Override
597 public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
598
599 }
600
601
602 @Override
603 public void endPrefixMapping(final String prefix) throws SAXException {
604
605 }
606
607
608 @Override
609 public void processingInstruction(final String target, final String data) throws SAXException {
610
611 }
612
613
614 @Override
615 public void skippedEntity(final String name) throws SAXException {
616
617 }
618
619
620
621
622 @Override
623 public void comment(final char[] ch, final int start, final int length) {
624 handleCharacters();
625 final String data = new String(ch, start, length);
626 final DomComment comment = new DomComment(page_, data);
627 appendChild(currentNode_, comment);
628 }
629
630
631 @Override
632 public void endCDATA() {
633
634 }
635
636
637 @Override
638 public void endDTD() {
639
640 }
641
642
643 @Override
644 public void endEntity(final String name) {
645
646 }
647
648
649 @Override
650 public void startCDATA() {
651
652 }
653
654
655 @Override
656 public void startDTD(final String name, final String publicId, final String systemId) {
657 final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
658 page_.setDocumentType(type);
659
660 final Node child;
661 child = type;
662 page_.appendChild(child);
663 }
664
665
666 @Override
667 public void startEntity(final String name) {
668
669 }
670
671
672
673
674 @Override
675 public void ignoredEndElement(final QName element, final Augmentations augs) {
676
677
678 if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
679 consumingForm_ = null;
680
681 if (findElementOnStack("table", "form") instanceof HtmlTable) {
682
683 }
684 else {
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712 formEndingIsAdjusting_ = true;
713 }
714 }
715 }
716
717
718
719
720 @Override
721 public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
722
723
724 if (attrs != null && body_ != null) {
725 final String lp = elem.getLocalpart();
726 if (lp != null && lp.length() == 4) {
727 if ("body".equalsIgnoreCase(lp)) {
728 copyAttributes(body_, attrs);
729 }
730 else if ("html".equalsIgnoreCase(lp)) {
731 final DomNode parent = body_.getParentNode();
732 if (parent instanceof DomElement) {
733 copyAttributes((DomElement) parent, attrs);
734 }
735 }
736 }
737 }
738 }
739
740 private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
741 final int length = attrs.getLength();
742
743 for (int i = 0; i < length; i++) {
744 final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
745 if (to.getAttributes().getNamedItem(attrName) == null) {
746 to.setAttribute(attrName, attrs.getValue(i));
747 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
748 && to.getScriptableObject() instanceof HTMLBodyElement) {
749 final HTMLBodyElement jsBody = to.getScriptableObject();
750 jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
751 }
752 }
753 }
754 }
755
756
757
758
759 @Override
760 public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
761 final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
762 page_.setDOMBuilder(this);
763 try {
764 super.parse(inputSource);
765 }
766 finally {
767 page_.setDOMBuilder(oldBuilder);
768 }
769 }
770
771 private static void appendChild(final DomNode parent, final DomNode child) {
772 if (parent instanceof HtmlTemplate) {
773 ((HtmlTemplate) parent).getContent().appendChild(child);
774 return;
775 }
776
777 parent.appendChild(child);
778 }
779 }