/*
 * Copyright (c) 2002-2025 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15  package org.htmlunit.html;
16  
17  import static java.nio.charset.StandardCharsets.ISO_8859_1;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.OutputStream;
23  import java.net.URL;
24  import java.nio.charset.Charset;
25  import java.nio.file.Files;
26  import java.util.HashMap;
27  import java.util.Map;
28  import java.util.regex.Pattern;
29  
30  import org.apache.commons.io.FileUtils;
31  import org.apache.commons.io.IOUtils;
32  import org.apache.commons.lang3.StringUtils;
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.htmlunit.Page;
36  import org.htmlunit.SgmlPage;
37  import org.htmlunit.WebResponse;
38  import org.htmlunit.util.MimeType;
39  
40  /**
41   * Utility to handle conversion from HTML code to XML string.
42   * @author Ahmed Ashour
43   * @author Ronald Brill
44   * @author Marc Guillemot
45   */
46  public class XmlSerializer {
47  
48      private static final String FILE_SEPARATOR = "/";
49      private static final Pattern CREATE_FILE_PATTERN = Pattern.compile(".*/");
50  
51      private static final Log LOG = LogFactory.getLog(XmlSerializer.class);
52  
53      private final StringBuilder builder_ = new StringBuilder();
54      private final StringBuilder indent_ = new StringBuilder();
55      private File outputDir_;
56  
57      /**
58       * Saves the given {@link SgmlPage} to the file.
59       * @param page the page to save
60       * @param file the destination
61       * @throws IOException in case of error
62       */
63      public void save(final SgmlPage page, final File file) throws IOException {
64          save(page, file, false);
65      }
66  
67      private void save(final SgmlPage page, final File file, final boolean append) throws IOException {
68          String fileName = file.getName();
69  
70          if (!append) {
71              if (!fileName.endsWith(".htm") && !fileName.endsWith(".html")) {
72                  fileName += ".html";
73              }
74          }
75          final File outputFile = new File(file.getParentFile(), fileName);
76  
77          if (!append && outputFile.exists()) {
78              throw new IOException("File already exists: " + outputFile);
79          }
80          fileName = fileName.substring(0, fileName.lastIndexOf('.'));
81          outputDir_ = new File(file.getParentFile(), fileName);
82  
83          // don't use asXml here because we have to sync the encoding from the
84          // header with the one used by the writer
85          final DomElement node = page.getDocumentElement();
86          Charset charsetName = ISO_8859_1;
87          builder_.setLength(0);
88          indent_.setLength(0);
89          if (page.isHtmlPage()) {
90              charsetName = page.getCharset();
91              if (charsetName != null && node instanceof HtmlHtml) {
92                  builder_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>\n");
93              }
94          }
95          printXml(node);
96          final String response = builder_.toString();
97          builder_.setLength(0);
98          FileUtils.writeStringToFile(outputFile, response, charsetName, append);
99      }
100 
101     /**
102      * @param node a node
103      * @return the xml representation according to the setting of this serializer
104      * @throws IOException in case of problem saving resources
105      */
106     public String asXml(final DomElement node) throws IOException {
107         builder_.setLength(0);
108         indent_.setLength(0);
109         final SgmlPage page = node.getPage();
110         if (null != page && page.isHtmlPage()) {
111             final Charset charsetName = page.getCharset();
112             if (charsetName != null && node instanceof HtmlHtml) {
113                 builder_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>\n");
114             }
115         }
116         printXml(node);
117         final String response = builder_.toString();
118         builder_.setLength(0);
119         return response;
120     }
121 
122     protected void printXml(final DomElement node) throws IOException {
123         if (!isExcluded(node)) {
124             final boolean hasChildren = node.getFirstChild() != null;
125             builder_.append(indent_).append('<');
126             printOpeningTag(node);
127 
128             if (!hasChildren && !node.isEmptyXmlTagExpanded()) {
129                 builder_.append("/>\n");
130             }
131             else {
132                 builder_.append(">\n");
133                 for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
134                     indent_.append("  ");
135                     if (child instanceof DomElement) {
136                         printXml((DomElement) child);
137                     }
138                     else {
139                         builder_.append(child);
140                     }
141                     indent_.setLength(indent_.length() - 2);
142                 }
143                 builder_.append(indent_).append("</").append(node.getTagName()).append(">\n");
144             }
145         }
146     }
147 
148     /**
149      * @param node a node
150      * @return the text representation according to the setting of this serializer
151      */
152     public String asText(final DomNode node) {
153         builder_.setLength(0);
154 
155         if (node instanceof DomText) {
156             builder_.append(((DomText) node).getData());
157         }
158         else {
159             printText(node);
160         }
161 
162         final String response = builder_.toString();
163         builder_.setLength(0);
164         return response;
165     }
166 
167     /**
168      * Prints the text content from this node and all children.
169      * @param node the node
170      */
171     protected void printText(final DomNode node) {
172         for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
173             if (child instanceof DomText) {
174                 builder_.append(((DomText) child).getData());
175             }
176             else {
177                 printText(child);
178             }
179         }
180     }
181 
182     /**
183      * Prints the content between "&lt;" and "&gt;" (or "/&gt;") in the output of the tag name
184      * and its attributes in XML format.
185      * @param node the node whose opening tag is to be printed
186      * @throws IOException in case of problem saving resources
187      */
188     protected void printOpeningTag(final DomElement node) throws IOException {
189         builder_.append(node.getTagName());
190         final Map<String, DomAttr> attributes = readAttributes(node);
191 
192         for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
193             builder_.append(' ')
194                 .append(entry.getKey())
195                 .append("=\"");
196             final String value = entry.getValue().getNodeValue();
197             builder_.append(org.htmlunit.util.StringUtils.escapeXmlAttributeValue(value))
198                 .append('"');
199         }
200     }
201 
202     private Map<String, DomAttr> readAttributes(final DomElement node) throws IOException {
203         if (node instanceof HtmlImage) {
204             return getAttributesFor((HtmlImage) node);
205         }
206         else if (node instanceof HtmlLink) {
207             return getAttributesFor((HtmlLink) node);
208         }
209         else if (node instanceof BaseFrameElement) {
210             return getAttributesFor((BaseFrameElement) node);
211         }
212 
213         Map<String, DomAttr> attributes = node.getAttributesMap();
214         if (node instanceof HtmlOption) {
215             attributes = new HashMap<>(attributes);
216             final HtmlOption option = (HtmlOption) node;
217             if (option.isSelected()) {
218                 if (!attributes.containsKey("selected")) {
219                     attributes.put("selected", new DomAttr(node.getPage(), null, "selected", "selected", false));
220                 }
221             }
222             else {
223                 attributes.remove("selected");
224             }
225         }
226         return attributes;
227     }
228 
229     /**
230      * @param frame the frame to get the attributes from
231      * @return the attribute map
232      */
233     private Map<String, DomAttr> getAttributesFor(final BaseFrameElement frame) throws IOException {
234         final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(frame, DomElement.SRC_ATTRIBUTE);
235         final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
236         if (srcAttr == null) {
237             return map;
238         }
239 
240         final Page enclosedPage = frame.getEnclosedPage();
241         final String suffix = getFileExtension(enclosedPage);
242         final File file = createFile(srcAttr.getValue(), "." + suffix);
243 
244         if (enclosedPage != null) {
245             if (enclosedPage.isHtmlPage()) {
246                 new XmlSerializer().save((HtmlPage) enclosedPage, file, true);
247             }
248             else {
249                 try (InputStream is = enclosedPage.getWebResponse().getContentAsStream()) {
250                     try (OutputStream fos = Files.newOutputStream(file.toPath())) {
251                         IOUtils.copyLarge(is, fos);
252                     }
253                 }
254             }
255         }
256 
257         srcAttr.setValue(file.getParentFile().getName() + FILE_SEPARATOR + file.getName());
258         return map;
259     }
260 
261     private static String getFileExtension(final Page enclosedPage) {
262         if (enclosedPage != null) {
263             if (enclosedPage.isHtmlPage()) {
264                 return "html";
265             }
266 
267             final URL url = enclosedPage.getUrl();
268             if (url.getPath().contains(".")) {
269                 return StringUtils.substringAfterLast(url.getPath(), ".");
270             }
271         }
272 
273         return ".unknown";
274     }
275 
276     /**
277      * @param link the link to get the attributes from
278      * @return the attribute map
279      * @throws IOException in case of error
280      */
281     protected Map<String, DomAttr> getAttributesFor(final HtmlLink link) throws IOException {
282         final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(link, "href");
283         final DomAttr hrefAttr = map.get("href");
284         if (hrefAttr != null && StringUtils.isNotBlank(hrefAttr.getValue())) {
285             final String protocol = link.getWebRequest().getUrl().getProtocol();
286             if ("http".equals(protocol) || "https".equals(protocol)) {
287                 try {
288                     final WebResponse response = link.getWebResponse(true, null);
289 
290                     final File file = createFile(hrefAttr.getValue(), ".css");
291                     FileUtils.writeStringToFile(file, response.getContentAsString(), ISO_8859_1);
292                     hrefAttr.setValue(outputDir_.getName() + FILE_SEPARATOR + file.getName());
293                 }
294                 catch (final IOException e) {
295                     LOG.error("XmlSerializer: IOException while downloading link content from url '"
296                                 + hrefAttr + "'", e);
297                 }
298                 catch (final IllegalStateException e) {
299                     LOG.error("XmlSerializer: IllegalStateException while downloading link content from url '"
300                                 + hrefAttr + "'", e);
301                 }
302             }
303         }
304 
305         return map;
306     }
307 
308     /**
309      * @param image the image to get the attributes from
310      * @return the attribute map
311      */
312     protected Map<String, DomAttr> getAttributesFor(final HtmlImage image) {
313         final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(image, DomElement.SRC_ATTRIBUTE);
314         final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
315         if (srcAttr != null && StringUtils.isNotBlank(srcAttr.getValue())) {
316             try {
317                 final WebResponse response = image.getWebResponse(true);
318 
319                 try (InputStream inputStream = response.getContentAsStream()) {
320                     final File file = createFile(srcAttr.getValue(), "." + getSuffix(response));
321                     FileUtils.copyInputStreamToFile(inputStream, file);
322 
323                     final String valueOnFileSystem = outputDir_.getName() + FILE_SEPARATOR + file.getName();
324                     // this is the clone attribute node, not the original one of the page
325                     srcAttr.setValue(valueOnFileSystem);
326                 }
327             }
328             catch (final IOException e) {
329                 LOG.error("XmlSerializer: IOException while downloading image content from url '" + srcAttr + "'", e);
330             }
331             catch (final IllegalStateException e) {
332                 LOG.error("XmlSerializer: IllegalStateException while downloading image content from url '"
333                             + srcAttr + "'", e);
334             }
335         }
336 
337         return map;
338     }
339 
340     private static String getSuffix(final WebResponse response) {
341         // first try to take the one from the requested file
342         final String url = response.getWebRequest().getUrl().toString();
343         final String fileName = StringUtils.substringAfterLast(StringUtils.substringBefore(url, "?"), "/");
344         // if there is a suffix with 2-4 letters, the take it
345         final String suffix = StringUtils.substringAfterLast(fileName, ".");
346         if (suffix.length() > 1 && suffix.length() < 5) {
347             return suffix;
348         }
349 
350         // use content type
351         return MimeType.getFileExtension(response.getContentType());
352     }
353 
354     private static Map<String, DomAttr> createAttributesCopyWithClonedAttribute(final HtmlElement elt,
355             final String attrName) {
356         final Map<String, DomAttr> newMap = new HashMap<>(elt.getAttributesMap());
357 
358         // clone the specified element, if possible
359         final DomAttr attr = newMap.get(attrName);
360         if (null == attr) {
361             return newMap;
362         }
363 
364         final DomAttr clonedAttr = new DomAttr(attr.getPage(), attr.getNamespaceURI(),
365             attr.getQualifiedName(), attr.getValue(), attr.getSpecified());
366 
367         newMap.put(attrName, clonedAttr);
368 
369         return newMap;
370     }
371 
372     /**
373      * @param element the element to check
374      * @return true if the element is a HtmlScript
375      */
376     protected boolean isExcluded(final DomElement element) {
377         return element instanceof HtmlScript;
378     }
379 
380     /**
381      * Computes the best file to save the response to the given URL.
382      * @param url the requested URL
383      * @param extension the preferred extension
384      * @return the file to create
385      * @throws IOException if a problem occurs creating the file
386      */
387     private File createFile(final String url, final String extension) throws IOException {
388         String name = url.replaceFirst("/$", "");
389         name = CREATE_FILE_PATTERN.matcher(name).replaceAll("");
390         name = StringUtils.substringBefore(name, "?"); // remove query
391         name = StringUtils.substringBefore(name, ";"); // remove additional info
392         name = StringUtils.substring(name, 0, 30); // many file systems have a limit at 255, let's limit it
393         name = org.htmlunit.util.StringUtils.sanitizeForFileName(name);
394         if (!name.endsWith(extension)) {
395             name += extension;
396         }
397         int counter = 0;
398         while (true) {
399             final String fileName;
400             if (counter != 0) {
401                 fileName = StringUtils.substringBeforeLast(name, ".")
402                     + "_" + counter + "." + StringUtils.substringAfterLast(name, ".");
403             }
404             else {
405                 fileName = name;
406             }
407             FileUtils.forceMkdir(outputDir_);
408             final File f = new File(outputDir_, fileName);
409             if (f.createNewFile()) {
410                 return f;
411             }
412             counter++;
413         }
414     }
415 }