View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser.neko;
16  
17  import static org.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
18  import static org.htmlunit.BrowserVersionFeatures.JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH;
19  
20  import java.io.IOException;
21  import java.io.StringReader;
22  import java.net.URL;
23  import java.nio.charset.Charset;
24  import java.util.ArrayDeque;
25  import java.util.Deque;
26  
27  import org.htmlunit.BrowserVersion;
28  import org.htmlunit.ObjectInstantiationException;
29  import org.htmlunit.WebClient;
30  import org.htmlunit.WebResponse;
31  import org.htmlunit.cyberneko.HTMLConfiguration;
32  import org.htmlunit.cyberneko.HTMLElements;
33  import org.htmlunit.cyberneko.HTMLScanner;
34  import org.htmlunit.cyberneko.HTMLTagBalancingListener;
35  import org.htmlunit.cyberneko.xerces.parsers.AbstractSAXParser;
36  import org.htmlunit.cyberneko.xerces.xni.Augmentations;
37  import org.htmlunit.cyberneko.xerces.xni.QName;
38  import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
39  import org.htmlunit.cyberneko.xerces.xni.XMLString;
40  import org.htmlunit.cyberneko.xerces.xni.XNIException;
41  import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
42  import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
43  import org.htmlunit.html.DomComment;
44  import org.htmlunit.html.DomDocumentType;
45  import org.htmlunit.html.DomElement;
46  import org.htmlunit.html.DomNode;
47  import org.htmlunit.html.DomText;
48  import org.htmlunit.html.ElementFactory;
49  import org.htmlunit.html.Html;
50  import org.htmlunit.html.HtmlBody;
51  import org.htmlunit.html.HtmlElement;
52  import org.htmlunit.html.HtmlForm;
53  import org.htmlunit.html.HtmlHiddenInput;
54  import org.htmlunit.html.HtmlImage;
55  import org.htmlunit.html.HtmlPage;
56  import org.htmlunit.html.HtmlSvg;
57  import org.htmlunit.html.HtmlTable;
58  import org.htmlunit.html.HtmlTableRow;
59  import org.htmlunit.html.HtmlTemplate;
60  import org.htmlunit.html.ScriptElement;
61  import org.htmlunit.html.SubmittableElement;
62  import org.htmlunit.html.XHtmlPage;
63  import org.htmlunit.html.parser.HTMLParser;
64  import org.htmlunit.html.parser.HTMLParserDOMBuilder;
65  import org.htmlunit.html.parser.HTMLParserListener;
66  import org.htmlunit.javascript.host.html.HTMLBodyElement;
67  import org.htmlunit.util.StringUtils;
68  import org.w3c.dom.Node;
69  import org.xml.sax.Attributes;
70  import org.xml.sax.ContentHandler;
71  import org.xml.sax.Locator;
72  import org.xml.sax.SAXException;
73  import org.xml.sax.ext.LexicalHandler;
74  
75  /**
76   * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
77   *
78   * The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
79   * the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
80   * consume SAX events to build the page DOM
81   *
82   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
83   * @author David K. Taylor
84   * @author Chris Erskine
85   * @author Ahmed Ashour
86   * @author Marc Guillemot
87   * @author Ethan Glasser-Camp
88   * @author Sudhan Moghe
89   * @author Ronald Brill
90   * @author Frank Danek
91   * @author Carsten Steul
92   * @author Ronny Shapiro
93   * @author Atsushi Nakagawa
94   */
95  final class HtmlUnitNekoDOMBuilder extends AbstractSAXParser
96          implements ContentHandler, LexicalHandler, HTMLTagBalancingListener, HTMLParserDOMBuilder {
97  
98      // cache Neko Elements for performance and memory efficiency
99      private static final HTMLElements HTMLELEMENTS;
100     private static final HTMLElements HTMLELEMENTS_WITH_CMD;
101 
102     static {
103         // continue short code enumeration
104         final short commandShortCode = HTMLElements.UNKNOWN + 1;
105 
106         final HTMLElements.Element command = new HTMLElements.Element(commandShortCode, "COMMAND",
107                 HTMLElements.Element.EMPTY, new short[] {HTMLElements.BODY, HTMLElements.HEAD}, null);
108 
109         HTMLELEMENTS = new HTMLElements();
110 
111         final HTMLElements value = new HTMLElements();
112         value.setElement(command);
113         HTMLELEMENTS_WITH_CMD = value;
114     }
115 
    /** State of head handling: a real head tag was seen, a synthesized one was seen, or none yet. */
    private enum HeadParsed { YES, SYNTHESIZED, NO }

    /** Used to look up the element factory for every start tag. */
    private final HTMLParser htmlParser_;
    /** The page the created nodes belong to. */
    private final HtmlPage page_;

    /** Set by {@link #setDocumentLocator(Locator)}; supplies line/column for node start and end locations. */
    private Locator locator_;
    /** The currently open nodes; seeded in the constructor from the ancestors reported by the insertion node. */
    private final Deque<DomNode> stack_ = new ArrayDeque<>();

    /** Did the snippet try to overwrite the start node? */
    private boolean snippetStartNodeOverwritten_;
    /** Size of {@link #stack_} at the end of construction, i.e. before any parsed tag was pushed. */
    private final int initialSize_;
    /** The node new children are currently added to. */
    private DomNode currentNode_;
    /** If true the content was created by javascript; script elements are then marked accordingly. */
    private final boolean createdByJavascript_;
    /** Buffer for text received between tags; flushed by handleCharacters(). */
    private final XMLString characters_ = new XMLString();
    private HtmlUnitNekoDOMBuilder.HeadParsed headParsed_ = HeadParsed.NO;
    /** The last body element created outside of svg; target for attributes of ignored body/html tags. */
    private HtmlElement body_;
    /** Synthesized flag of the augmentations of the most recent XNI start/end element event. */
    private boolean lastTagWasSynthesized_;
    /** The form that currently takes ownership of submittable elements, or null. */
    private HtmlForm consumingForm_;
    /** True after an ignored &lt;/form&gt;; following nodes are then moved out of the form. */
    private boolean formEndingIsAdjusting_;
    /** Set when an svg element was started, reset by the next svg end tag. */
    private boolean insideSvg_;
    /** Set when a template element was started, reset by the next template end tag. */
    private boolean insideTemplate_;

    private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
    private static final String FEATURE_PARSE_NOSCRIPT
        = "http://cyberneko.org/html/features/parse-noscript-content";
141 
142     /**
143      * Parses and then inserts the specified HTML content into the HTML content currently being parsed.
144      * @param html the HTML content to push
145      */
146     @Override
147     public void pushInputString(final String html) {
148         page_.registerParsingStart();
149         page_.registerInlineSnippetParsingStart();
150         try {
151             final WebResponse webResponse = page_.getWebResponse();
152             final Charset charset = webResponse.getContentCharset();
153             final String url = webResponse.getWebRequest().getUrl().toString();
154             final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
155             ((HTMLConfiguration) parserConfiguration_).evaluateInputSource(in);
156         }
157         finally {
158             page_.registerParsingEnd();
159             page_.registerInlineSnippetParsingEnd();
160         }
161     }
162 
163     /**
164      * Creates a new builder for parsing the specified response contents.
165      * @param node the location at which to insert the new content
166      * @param url the page's URL
167      * @param createdByJavascript if true the (script) tag was created by javascript
168      */
169     HtmlUnitNekoDOMBuilder(final HTMLParser htmlParser,
170             final DomNode node, final URL url, final String htmlContent, final boolean createdByJavascript) {
171         super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));
172 
173         htmlParser_ = htmlParser;
174         page_ = (HtmlPage) node.getPage();
175 
176         currentNode_ = node;
177         for (final Node ancestor : currentNode_.getAncestors()) {
178             stack_.push((DomNode) ancestor);
179         }
180         createdByJavascript_ = createdByJavascript;
181 
182         final WebClient webClient = page_.getWebClient();
183         final HTMLParserListener listener = webClient.getHTMLParserListener();
184         final boolean reportErrors = listener != null;
185         if (reportErrors) {
186             parserConfiguration_.setErrorHandler(new HtmlUnitNekoHTMLErrorHandler(listener, url, htmlContent));
187         }
188 
189         try {
190             setFeature(FEATURE_AUGMENTATIONS, true);
191             setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
192             setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.isJavaScriptEnabled());
193             setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);
194 
195             setContentHandler(this);
196             setLexicalHandler(this); //comments and CDATA
197         }
198         catch (final SAXException e) {
199             throw new ObjectInstantiationException("unable to create HTML parser", e);
200         }
201         initialSize_ = stack_.size();
202     }
203 
204     /**
205      * Create the configuration depending on the simulated browser
206      * @return the configuration
207      */
208     private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
209         if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
210             return new HTMLConfiguration(HTMLELEMENTS_WITH_CMD);
211         }
212         return new HTMLConfiguration(HTMLELEMENTS);
213     }
214 
    /**
     * {@inheritDoc}
     *
     * The locator is stored and later used to set the start and end locations of the created nodes.
     */
    @Override
    public void setDocumentLocator(final Locator locator) {
        locator_ = locator;
    }

    /**
     * {@inheritDoc}
     *
     * This implementation does nothing.
     */
    @Override
    public void startDocument() throws SAXException {
        // nothing to do
    }
230 
    /**
     * {@inheritDoc}
     *
     * Remembers whether the tag was synthesized before handing the event over to the super class.
     */
    @Override
    public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
        throws XNIException {
        // augs might change so we store only the interesting part
        lastTagWasSynthesized_ = augs.isSynthesized();
        super.startElement(element, attributes, augs);
    }
239 
240     /**
241      * {@inheritDoc}
242      */
243     @Override
244     public void startElement(String namespaceURI, final String localName, final String qName, final Attributes atts)
245         throws SAXException {
246 
247         if (snippetStartNodeOverwritten_) {
248             snippetStartNodeOverwritten_ = false;
249             return;
250         }
251         handleCharacters();
252 
253         final String tagLower = StringUtils.toRootLowerCase(localName);
254         if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
255             // we have to push the current node on the stack to make sure
256             // the endElement call is able to remove a node from the stack
257             stack_.push(currentNode_);
258             return;
259         }
260 
261         if ("head".equals(tagLower)) {
262             if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
263                 // we have to push the current node on the stack to make sure
264                 // the endElement call is able to remove a node from the stack
265                 stack_.push(currentNode_);
266                 return;
267             }
268 
269             headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
270         }
271 
272         // If we're adding a body element, keep track of any temporary synthetic ones
273         // that we may have had to create earlier (for document.write(), for example).
274         HtmlBody oldBody = null;
275         final boolean isBodyTag = "body".equals(tagLower);
276         if (isBodyTag) {
277             final HtmlBody body = page_.getBody();
278             if (body != null) {
279                 oldBody = body;
280             }
281         }
282 
283         if (namespaceURI != null) {
284             namespaceURI = namespaceURI.trim();
285         }
286         // Add the new node.
287         if (!(page_ instanceof XHtmlPage) && Html.XHTML_NAMESPACE.equals(namespaceURI)) {
288             namespaceURI = null;
289         }
290 
291         final ElementFactory factory =
292                 htmlParser_.getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
293         if (factory == HtmlUnitNekoHtmlParser.SVG_FACTORY) {
294             namespaceURI = Html.SVG_NAMESPACE;
295         }
296 
297         final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts);
298         newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
299 
300         // parse can't replace everything as it does not buffer elements while parsing
301         addNodeToRightParent(currentNode_, newElement);
302 
303         if (newElement instanceof HtmlSvg) {
304             insideSvg_ = true;
305         }
306         else if (newElement instanceof HtmlTemplate) {
307             insideTemplate_ = true;
308         }
309 
310         // Forms own elements simply by enclosing source-wise rather than DOM parent-child relationship
311         // Forms without a </form> will keep consuming forever
312         else if (newElement instanceof HtmlForm) {
313             consumingForm_ = (HtmlForm) newElement;
314             formEndingIsAdjusting_ = false;
315         }
316         else if (consumingForm_ != null) {
317             // If the current form enclosed a suitable element
318             if (newElement instanceof SubmittableElement) {
319                 // Let these be owned by the form
320                 if (((HtmlElement) newElement).getEnclosingForm() != consumingForm_) {
321                     ((HtmlElement) newElement).setOwningForm(consumingForm_);
322                 }
323             }
324         }
325 
326         // If we had an old synthetic body and we just added a real body element, quietly
327         // remove the old body and move its children to the real body element we just added.
328         if (oldBody != null) {
329             oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
330         }
331 
332         if (!insideSvg_ && isBodyTag) {
333             body_ = (HtmlElement) newElement;
334         }
335         else if (createdByJavascript_
336                 && newElement instanceof ScriptElement
337                 && (!insideTemplate_
338                         || !page_.getWebClient().getBrowserVersion()
339                                 .hasFeature(JS_SCRIPT_IN_TEMPLATE_EXECUTED_ON_ATTACH))) {
340             final ScriptElement script = (ScriptElement) newElement;
341             script.markAsCreatedByDomParser();
342         }
343 
344         currentNode_ = newElement;
345         stack_.push(currentNode_);
346     }
347 
348     /**
349      * Adds the new node to the right parent that is not necessary the currentNode in case of
350      * malformed HTML code. The method tries to emulate the behavior of Firefox.
351      */
352     private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
353         final String currentNodeName = currentNode.getNodeName();
354         final String newNodeName = newElement.getNodeName();
355 
356         // First ensure table elements are housed correctly
357         if (isTableChild(newNodeName)) {
358             final DomNode parent =
359                     "table".equals(currentNodeName) ? currentNode : findElementOnStack("table");
360             appendChild(parent, newElement);
361             return;
362         }
363         if ("tr".equals(newNodeName)) {
364             final DomNode parent =
365                     isTableChild(currentNodeName) ? currentNode : findElementOnStack("tbody", "thead", "tfoot");
366             appendChild(parent, newElement);
367             return;
368         }
369         if (isTableCell(newNodeName)) {
370             final DomNode parent =
371                     "tr".equals(currentNodeName) ? currentNode : findElementOnStack("tr");
372             appendChild(parent, newElement);
373             return;
374         }
375 
376         // Next ensure non-table elements don't appear in tables
377         if ("table".equals(currentNodeName) || isTableChild(currentNodeName) || "tr".equals(currentNodeName)) {
378             if ("template".equals(newNodeName)) {
379                 currentNode.appendChild(newElement);
380             }
381 
382             // Scripts, forms, and styles are exempt
383             else if (!"colgroup".equals(currentNodeName)
384                     && ("script".equals(newNodeName)
385                         || "form".equals(newNodeName)
386                         || "style".equals(newNodeName))) {
387                 currentNode.appendChild(newElement);
388             }
389 
390             // These are good
391             else if ("col".equals(newNodeName) && "colgroup".equals(currentNodeName)) {
392                 currentNode.appendChild(newElement);
393             }
394             else if ("caption".equals(currentNodeName)) {
395                 currentNode.appendChild(newElement);
396             }
397             else if (newElement instanceof HtmlHiddenInput) {
398                 currentNode.appendChild(newElement);
399             }
400             else {
401                 // Move before the table
402                 final DomNode parent = findElementOnStack("table");
403                 parent.insertBefore(newElement);
404             }
405             return;
406         }
407 
408         if (formEndingIsAdjusting_ && "form".equals(currentNodeName)) {
409             // We cater to HTMLTagBalancer's shortcomings by moving this node out of the <form>
410             appendChild(currentNode.getParentNode(), newElement);
411             return;
412         }
413 
414         // Everything else
415         appendChild(currentNode, newElement);
416     }
417 
418     private DomNode findElementOnStack(final String searchedElementName) {
419         for (final DomNode node : stack_) {
420             if (searchedElementName.equals(node.getNodeName())) {
421                 return node;
422             }
423         }
424 
425         // this is surely wrong but at least it won't throw a NPE
426         return stack_.peek();
427     }
428 
429     private DomNode findElementOnStack(final String... searchedElementNames) {
430         for (final DomNode node : stack_) {
431             for (final String searchedElementName : searchedElementNames) {
432                 if (searchedElementName.equals(node.getNodeName())) {
433                     return node;
434                 }
435             }
436         }
437 
438         // this is surely wrong but at least it won't throw a NPE
439         return stack_.peek();
440     }
441 
442     private static boolean isTableChild(final String nodeName) {
443         if (nodeName == null || nodeName.length() < 5) {
444             return false;
445         }
446 
447         return "thead".equals(nodeName)
448                 || "tbody".equals(nodeName)
449                 || "tfoot".equals(nodeName)
450                 || "caption".equals(nodeName)
451                 || "colgroup".equals(nodeName);
452     }
453 
454     private static boolean isTableCell(final String nodeName) {
455         if (nodeName == null || nodeName.length() != 2) {
456             return false;
457         }
458         return "td".equals(nodeName) || "th".equals(nodeName);
459     }
460 
    /**
     * {@inheritDoc}
     *
     * Remembers whether the tag was synthesized before handing the event over to the super class.
     */
    @Override
    public void endElement(final QName element, final Augmentations augs)
        throws XNIException {
        // augs might change so we store only the interesting part
        lastTagWasSynthesized_ = augs.isSynthesized();
        super.endElement(element, augs);
    }
469 
470     /**
471      * {@inheritDoc}
472      */
473     @Override
474     public void endElement(final String namespaceURI, final String localName, final String qName)
475         throws SAXException {
476 
477         final String tagLower = StringUtils.toRootLowerCase(localName);
478 
479         handleCharacters();
480 
481         if (page_.isParsingHtmlSnippet()) {
482             if ("html".equals(tagLower) || "body".equals(tagLower)) {
483                 return;
484             }
485             if (stack_.size() == initialSize_) {
486                 // a <p> inside a <p> is valid for innerHTML processing
487                 // see HTMLParser2Test for more cases
488                 snippetStartNodeOverwritten_ = !StringUtils.equalsChar('p', tagLower);
489                 return;
490             }
491         }
492 
493         if ("svg".equals(tagLower)) {
494             insideSvg_ = false;
495         }
496         else if ("template".equals(tagLower)) {
497             insideTemplate_ = false;
498         }
499 
500         // this only avoids a problem when the stack is empty here
501         // but for this case we made the problem before - the balancing
502         // is broken already
503         if (stack_.isEmpty()) {
504             return;
505         }
506 
507         final DomNode previousNode = stack_.pop(); //remove currentElement from stack
508         previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
509 
510         if ("form".equals(tagLower) && !lastTagWasSynthesized_) {
511             // We get here if the </form> was on the same DOM tree depth as the <form> that started it,
512             // otherwise HTMLTagBalancer gives us the end through ignoredEndElement()
513             consumingForm_ = null;
514         }
515 
516         if (!stack_.isEmpty()) {
517             currentNode_ = stack_.peek();
518         }
519 
520         final boolean postponed = page_.isParsingInlineHtmlSnippet();
521         previousNode.onAllChildrenAddedToPage(postponed);
522     }
523 
    /**
     * {@inheritDoc}
     *
     * The text is only buffered here; the text node is created by handleCharacters().
     */
    @Override
    public void characters(final char[] ch, final int start, final int length) throws SAXException {
        characters_.append(ch, start, length);
    }

    /**
     * {@inheritDoc}
     *
     * Whitespace is buffered in the same way as any other text.
     */
    @Override
    public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
        characters_.append(ch, start, length);
    }
535 
536     /**
537      * Picks up the character data accumulated so far and add it to the current element as a text node.
538      */
539     private void handleCharacters() {
540         // make the code easier to read because we remove a nesting level
541         if (characters_.length() == 0) {
542             return;
543         }
544 
545         // Use the normal behavior: append a text node for the accumulated text.
546         final String textValue = characters_.toString();
547         characters_.clear();
548 
549         if (org.apache.commons.lang3.StringUtils.isBlank(textValue)) {
550             appendChild(currentNode_, new DomText(page_, textValue));
551             return;
552         }
553 
554         // malformed HTML: </td>some text</tr> => text comes before the table
555         if (currentNode_ instanceof HtmlTableRow) {
556             final HtmlTableRow row = (HtmlTableRow) currentNode_;
557             final HtmlTable enclosingTable = row.getEnclosingTable();
558             if (enclosingTable != null) { // may be null when called from Range.createContextualFragment
559                 if (enclosingTable.getPreviousSibling() instanceof DomText) {
560                     final DomText domText = (DomText) enclosingTable.getPreviousSibling();
561                     domText.setTextContent(domText.getWholeText() + textValue);
562                 }
563                 else {
564                     enclosingTable.insertBefore(new DomText(page_, textValue));
565                 }
566             }
567         }
568         else if (currentNode_ instanceof HtmlTable) {
569             final HtmlTable enclosingTable = (HtmlTable) currentNode_;
570             if (enclosingTable.getPreviousSibling() instanceof DomText) {
571                 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
572                 domText.setTextContent(domText.getWholeText() + textValue);
573             }
574             else {
575                 enclosingTable.insertBefore(new DomText(page_, textValue));
576             }
577         }
578         else if (currentNode_ instanceof HtmlImage) {
579             currentNode_.getParentNode().appendChild(new DomText(page_, textValue));
580         }
581         else {
582             appendChild(currentNode_, new DomText(page_, textValue));
583         }
584     }
585 
586     /** {@inheritDoc} */
587     @Override
588     public void endDocument() throws SAXException {
589         handleCharacters();
590         if (locator_ != null) {
591             page_.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
592         }
593     }
594 
    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void endPrefixMapping(final String prefix) throws SAXException {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event; no node is created for processing instructions here.
     */
    @Override
    public void processingInstruction(final String target, final String data) throws SAXException {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void skippedEntity(final String name) throws SAXException {
        // nothing to do
    }
618 
619     // LexicalHandler methods
620 
621     /** {@inheritDoc} */
622     @Override
623     public void comment(final char[] ch, final int start, final int length) {
624         handleCharacters();
625         final String data = new String(ch, start, length);
626         final DomComment comment = new DomComment(page_, data);
627         appendChild(currentNode_, comment);
628     }
629 
    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void endCDATA() {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event; the doctype node is created in startDTD.
     */
    @Override
    public void endDTD() {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void endEntity(final String name) {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void startCDATA() {
        // nothing to do
    }
653 
654     /** {@inheritDoc} */
655     @Override
656     public void startDTD(final String name, final String publicId, final String systemId) {
657         final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
658         page_.setDocumentType(type);
659 
660         final Node child;
661         child = type;
662         page_.appendChild(child);
663     }
664 
    /**
     * {@inheritDoc}
     *
     * This implementation ignores the event.
     */
    @Override
    public void startEntity(final String name) {
        // nothing to do
    }
670 
671     /**
672      * {@inheritDoc}
673      */
674     @Override
675     public void ignoredEndElement(final QName element, final Augmentations augs) {
676         // HTMLTagBalancer brings us here if </form> was found in the source on a different
677         // DOM tree depth (either above or below) to the <form> that started it
678         if ("form".equals(element.getLocalpart()) && consumingForm_ != null) {
679             consumingForm_ = null;
680 
681             if (findElementOnStack("table", "form") instanceof HtmlTable) {
682                 // The </form> just goes missing for these (really? just tables?)
683             }
684             else {
685                 /*
686                  * This </form> was ignored by HTMLTagBalancer as it generates its own
687                  * </form> at the end of the depth with the starting <form>.
688                  * e.g. This:
689                  * | <form>
690                  * |   <div>
691                  * |     </form> <!--ignored by HTMLTagBalancer-->
692                  * |   </div>
693                  * |   <input>
694                  *
695                  * is turned into:
696                  * | <form>
697                  * |   <div>
698                  * |   </div>
699                  * |   <input>
700                  * | </form> <!--synthesized by HTMLTagBalancer-->
701                  *
702                  * but this isn't suitable for us because </form> shouldn't be ignored but
703                  * rather moved directly behind the tree it's in to instead become:
704                  * | <form>
705                  * |   <div>
706                  * |   </div>
707                  * | </form> <!--moved out of div-->
708                  * | <input> <!--proceeding children are not part of form-->
709                  */
710                 // We cater for this by moving out nodes such as the <input> in the above
711                 // diagram out of the form
712                 formEndingIsAdjusting_ = true;
713             }
714         }
715     }
716 
717     /**
718      * {@inheritDoc}
719      */
720     @Override
721     public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
722         // when multiple html/body elements are encountered, the attributes of the discarded
723         // elements are used when not previously defined
724         if (attrs != null && body_ != null) {
725             final String lp = elem.getLocalpart();
726             if (lp != null && lp.length() == 4) {
727                 if ("body".equalsIgnoreCase(lp)) {
728                     copyAttributes(body_, attrs);
729                 }
730                 else if ("html".equalsIgnoreCase(lp)) {
731                     final DomNode parent = body_.getParentNode();
732                     if (parent instanceof DomElement) {
733                         copyAttributes((DomElement) parent, attrs);
734                     }
735                 }
736             }
737         }
738     }
739 
740     private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
741         final int length = attrs.getLength();
742 
743         for (int i = 0; i < length; i++) {
744             final String attrName = StringUtils.toRootLowerCase(attrs.getLocalName(i));
745             if (to.getAttributes().getNamedItem(attrName) == null) {
746                 to.setAttribute(attrName, attrs.getValue(i));
747                 if (attrName.startsWith("on") && to.getPage().getWebClient().isJavaScriptEngineEnabled()
748                         && to.getScriptableObject() instanceof HTMLBodyElement) {
749                     final HTMLBodyElement jsBody = to.getScriptableObject();
750                     jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
751                 }
752             }
753         }
754     }
755 
756     /**
757      * {@inheritDoc}
758      */
759     @Override
760     public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
761         final HTMLParserDOMBuilder oldBuilder = page_.getDOMBuilder();
762         page_.setDOMBuilder(this);
763         try {
764             super.parse(inputSource);
765         }
766         finally {
767             page_.setDOMBuilder(oldBuilder);
768         }
769     }
770 
771     private static void appendChild(final DomNode parent, final DomNode child) {
772         if (parent instanceof HtmlTemplate) {
773             ((HtmlTemplate) parent).getContent().appendChild(child);
774             return;
775         }
776 
777         parent.appendChild(child);
778     }
779 }