View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.javascript.host.xml;
16  
17  import java.util.Arrays;
18  import java.util.HashSet;
19  import java.util.Set;
20  
21  import org.htmlunit.SgmlPage;
22  import org.htmlunit.html.*;
23  import org.htmlunit.javascript.HtmlUnitScriptable;
24  import org.htmlunit.javascript.configuration.JsxClass;
25  import org.htmlunit.javascript.configuration.JsxConstructor;
26  import org.htmlunit.javascript.configuration.JsxFunction;
27  import org.htmlunit.javascript.host.Element;
28  import org.htmlunit.javascript.host.dom.Document;
29  import org.htmlunit.javascript.host.dom.DocumentFragment;
30  import org.htmlunit.javascript.host.dom.Node;
31  import org.htmlunit.util.StringUtils;
32  import org.w3c.dom.NamedNodeMap;
33  
34  /**
35   * A JavaScript object for {@code XMLSerializer}.
36   * see https://w3c.github.io/DOM-Parsing/#the-xmlserializer-interface
37   *
38   * @author Ahmed Ashour
39   * @author Darrell DeBoer
40   * @author Ronald Brill
41   * @author Frank Danek
42   */
43  @JsxClass
44  public class XMLSerializer extends HtmlUnitScriptable {
45  
46      // this is a bit strange but it is the way FF works
47      // output of empty tags are not allowed for these HTML tags
48      private static final Set<String> NON_EMPTY_TAGS = new HashSet<>(Arrays.asList(
49              HtmlAbbreviated.TAG_NAME, HtmlAcronym.TAG_NAME,
50              HtmlAnchor.TAG_NAME, HtmlAddress.TAG_NAME, HtmlAudio.TAG_NAME,
51              HtmlBidirectionalOverride.TAG_NAME, HtmlBig.TAG_NAME,
52              HtmlBlockQuote.TAG_NAME, HtmlBody.TAG_NAME, HtmlBold.TAG_NAME,
53              HtmlButton.TAG_NAME, HtmlCanvas.TAG_NAME, HtmlCaption.TAG_NAME,
54              HtmlCenter.TAG_NAME, HtmlCitation.TAG_NAME, HtmlCode.TAG_NAME,
55              HtmlDefinition.TAG_NAME, HtmlDefinitionDescription.TAG_NAME,
56              HtmlDeletedText.TAG_NAME, HtmlDirectory.TAG_NAME,
57              HtmlDivision.TAG_NAME,
58              HtmlDefinitionList.TAG_NAME,
59              HtmlDefinitionTerm.TAG_NAME, HtmlEmbed.TAG_NAME,
60              HtmlEmphasis.TAG_NAME, HtmlFieldSet.TAG_NAME,
61              HtmlFont.TAG_NAME, HtmlForm.TAG_NAME,
62              HtmlFrame.TAG_NAME, HtmlFrameSet.TAG_NAME, HtmlHeading1.TAG_NAME,
63              HtmlHeading2.TAG_NAME, HtmlHeading3.TAG_NAME,
64              HtmlHeading4.TAG_NAME, HtmlHeading5.TAG_NAME,
65              HtmlHeading6.TAG_NAME, HtmlHead.TAG_NAME,
66              HtmlHtml.TAG_NAME, HtmlInlineFrame.TAG_NAME,
67              HtmlInsertedText.TAG_NAME,
68              HtmlItalic.TAG_NAME, HtmlKeyboard.TAG_NAME, HtmlLabel.TAG_NAME,
69              HtmlLegend.TAG_NAME, HtmlListing.TAG_NAME, HtmlListItem.TAG_NAME,
70              HtmlMap.TAG_NAME, HtmlMarquee.TAG_NAME,
71              HtmlMenu.TAG_NAME,
72              HtmlNoBreak.TAG_NAME, HtmlNoEmbed.TAG_NAME, HtmlNoFrames.TAG_NAME,
73              HtmlNoScript.TAG_NAME, HtmlObject.TAG_NAME, HtmlOrderedList.TAG_NAME,
74              HtmlOptionGroup.TAG_NAME, HtmlOption.TAG_NAME, HtmlParagraph.TAG_NAME,
75              HtmlPlainText.TAG_NAME, HtmlPreformattedText.TAG_NAME,
76              HtmlInlineQuotation.TAG_NAME, HtmlS.TAG_NAME, HtmlSample.TAG_NAME,
77              HtmlScript.TAG_NAME, HtmlSelect.TAG_NAME, HtmlSmall.TAG_NAME,
78              HtmlSource.TAG_NAME, HtmlSpan.TAG_NAME,
79              HtmlStrike.TAG_NAME, HtmlStrong.TAG_NAME, HtmlStyle.TAG_NAME,
80              HtmlSubscript.TAG_NAME, HtmlSuperscript.TAG_NAME, HtmlTitle.TAG_NAME,
81              HtmlTable.TAG_NAME, HtmlTableColumn.TAG_NAME, HtmlTableColumnGroup.TAG_NAME,
82              HtmlTableBody.TAG_NAME, HtmlTableDataCell.TAG_NAME, HtmlTableHeaderCell.TAG_NAME,
83              HtmlTableRow.TAG_NAME, HtmlTextArea.TAG_NAME, HtmlTableFooter.TAG_NAME,
84              HtmlTableHeader.TAG_NAME, HtmlTeletype.TAG_NAME, HtmlUnderlined.TAG_NAME,
85              HtmlUnorderedList.TAG_NAME, HtmlVariable.TAG_NAME, HtmlVideo.TAG_NAME,
86              HtmlWordBreak.TAG_NAME, HtmlExample.TAG_NAME
87      ));
88  
89      /**
90       * JavaScript constructor.
91       */
92      @JsxConstructor
93      public void jsConstructor() {
94          // nothing to do
95      }
96  
97      /**
98       * The subtree rooted by the specified element is serialized to a string.
99       * @param root the root of the subtree to be serialized (this may be any node, even a document)
100      * @return the serialized string
101      */
102     @JsxFunction
103     public String serializeToString(Node root) {
104         if (root == null) {
105             return "";
106         }
107 
108         if (root instanceof DocumentFragment) {
109             Node node = root.getFirstChild();
110             if (node == null) {
111                 return "";
112             }
113 
114             final StringBuilder builder = new StringBuilder();
115             while (node != null) {
116                 builder.append(serializeToString(node));
117                 node = node.getNextSibling();
118             }
119             return builder.toString().trim();
120         }
121 
122         final boolean rootIsDocument = root instanceof Document;
123         if (rootIsDocument) {
124             root = ((Document) root).getDocumentElement();
125         }
126 
127         if (root instanceof Element) {
128             final StringBuilder builder = new StringBuilder();
129             final DomNode node = root.getDomNodeOrDie();
130             final SgmlPage page = node.getPage();
131             final boolean isHtmlPage = page != null && page.isHtmlPage();
132 
133             String forcedNamespace = null;
134             if (!rootIsDocument && isHtmlPage) {
135                 forcedNamespace = "http://www.w3.org/1999/xhtml";
136             }
137             toXml(1, node, builder, forcedNamespace);
138 
139             return builder.toString();
140         }
141 
142         return root.getDomNodeOrDie().asXml();
143     }
144 
145     private void toXml(final int indent,
146             final DomNode node, final StringBuilder builder, final String foredNamespace) {
147         final String nodeName = node.getNodeName();
148         builder.append('<').append(nodeName);
149 
150         String optionalPrefix = "";
151         final String namespaceURI = node.getNamespaceURI();
152         final String prefix = node.getPrefix();
153         if (namespaceURI != null && prefix != null) {
154             boolean sameNamespace = false;
155             for (DomNode parentNode = node.getParentNode(); parentNode instanceof DomElement;
156                     parentNode = parentNode.getParentNode()) {
157                 if (namespaceURI.equals(parentNode.getNamespaceURI())) {
158                     sameNamespace = true;
159                     break;
160                 }
161             }
162             if (node.getParentNode() == null || !sameNamespace) {
163                 ((DomElement) node).setAttribute("xmlns:" + prefix, namespaceURI);
164             }
165         }
166         else if (foredNamespace != null) {
167             builder.append(" xmlns=\"").append(foredNamespace).append('"');
168             optionalPrefix = " ";
169         }
170 
171         final NamedNodeMap attributesMap = node.getAttributes();
172         final int lenght = attributesMap.getLength();
173         for (int i = 0; i < lenght; i++) {
174             final DomAttr attrib = (DomAttr) attributesMap.item(i);
175             builder.append(' ').append(attrib.getQualifiedName())
176                    .append("=\"").append(attrib.getValue()).append('"');
177         }
178         boolean startTagClosed = false;
179         for (final DomNode child : node.getChildren()) {
180             if (!startTagClosed) {
181                 builder.append(optionalPrefix).append('>');
182                 startTagClosed = true;
183             }
184             switch (child.getNodeType()) {
185                 case Node.ELEMENT_NODE:
186                     toXml(indent + 1, child, builder, null);
187                     break;
188 
189                 case Node.TEXT_NODE:
190                     String value = child.getNodeValue();
191                     value = StringUtils.escapeXmlChars(value);
192                     builder.append(value);
193                     break;
194 
195                 case Node.CDATA_SECTION_NODE:
196                 case Node.COMMENT_NODE:
197                     builder.append(child.asXml());
198                     break;
199 
200                 default:
201                     break;
202             }
203         }
204         if (!startTagClosed) {
205             final String tagName = StringUtils.toRootLowerCase(nodeName);
206             if (NON_EMPTY_TAGS.contains(tagName)) {
207                 builder.append("></").append(nodeName).append('>');
208             }
209             else {
210                 builder.append(optionalPrefix).append("/>");
211             }
212         }
213         else {
214             builder.append("</").append(nodeName).append('>');
215         }
216     }
217 
218 }