1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html;
16
17 import static java.nio.charset.StandardCharsets.ISO_8859_1;
18
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.OutputStream;
23 import java.net.URL;
24 import java.nio.charset.Charset;
25 import java.nio.file.Files;
26 import java.util.HashMap;
27 import java.util.Map;
28 import java.util.regex.Pattern;
29
30 import org.apache.commons.io.FileUtils;
31 import org.apache.commons.io.IOUtils;
32 import org.apache.commons.lang3.StringUtils;
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.htmlunit.Page;
36 import org.htmlunit.SgmlPage;
37 import org.htmlunit.WebResponse;
38 import org.htmlunit.util.MimeType;
39
40
41
42
43
44
45
46 public class XmlSerializer {
47
/** Separator placed between the resource directory name and the file name in rewritten attribute values. */
private static final String FILE_SEPARATOR = "/";
/** Greedy match of everything up to and including the last '/'; used to strip the directory part of an URL. */
private static final Pattern CREATE_FILE_PATTERN = Pattern.compile(".*/");

private static final Log LOG = LogFactory.getLog(XmlSerializer.class);

/** Buffer collecting the serialized output; cleared at the start and end of each public operation. */
private final StringBuilder builder_ = new StringBuilder();
/** Indentation prefix for the element currently being printed. */
private final StringBuilder indent_ = new StringBuilder();
/** Directory receiving the resources referenced by the page; assigned by the private save method. */
private File outputDir_;
56
57
58
59
60
61
62
63 public void save(final SgmlPage page, final File file) throws IOException {
64 save(page, file, false);
65 }
66
67 private void save(final SgmlPage page, final File file, final boolean append) throws IOException {
68 String fileName = file.getName();
69
70 if (!append) {
71 if (!fileName.endsWith(".htm") && !fileName.endsWith(".html")) {
72 fileName += ".html";
73 }
74 }
75 final File outputFile = new File(file.getParentFile(), fileName);
76
77 if (!append && outputFile.exists()) {
78 throw new IOException("File already exists: " + outputFile);
79 }
80 fileName = fileName.substring(0, fileName.lastIndexOf('.'));
81 outputDir_ = new File(file.getParentFile(), fileName);
82
83
84
85 final DomElement node = page.getDocumentElement();
86 Charset charsetName = ISO_8859_1;
87 builder_.setLength(0);
88 indent_.setLength(0);
89 if (page.isHtmlPage()) {
90 charsetName = page.getCharset();
91 if (charsetName != null && node instanceof HtmlHtml) {
92 builder_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>\n");
93 }
94 }
95 printXml(node);
96 final String response = builder_.toString();
97 builder_.setLength(0);
98 FileUtils.writeStringToFile(outputFile, response, charsetName, append);
99 }
100
101
102
103
104
105
106 public String asXml(final DomElement node) throws IOException {
107 builder_.setLength(0);
108 indent_.setLength(0);
109 final SgmlPage page = node.getPage();
110 if (null != page && page.isHtmlPage()) {
111 final Charset charsetName = page.getCharset();
112 if (charsetName != null && node instanceof HtmlHtml) {
113 builder_.append("<?xml version=\"1.0\" encoding=\"").append(charsetName).append("\"?>\n");
114 }
115 }
116 printXml(node);
117 final String response = builder_.toString();
118 builder_.setLength(0);
119 return response;
120 }
121
122 protected void printXml(final DomElement node) throws IOException {
123 if (!isExcluded(node)) {
124 final boolean hasChildren = node.getFirstChild() != null;
125 builder_.append(indent_).append('<');
126 printOpeningTag(node);
127
128 if (!hasChildren && !node.isEmptyXmlTagExpanded()) {
129 builder_.append("/>\n");
130 }
131 else {
132 builder_.append(">\n");
133 for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
134 indent_.append(" ");
135 if (child instanceof DomElement) {
136 printXml((DomElement) child);
137 }
138 else {
139 builder_.append(child);
140 }
141 indent_.setLength(indent_.length() - 2);
142 }
143 builder_.append(indent_).append("</").append(node.getTagName()).append(">\n");
144 }
145 }
146 }
147
148
149
150
151
152 public String asText(final DomNode node) {
153 builder_.setLength(0);
154
155 if (node instanceof DomText) {
156 builder_.append(((DomText) node).getData());
157 }
158 else {
159 printText(node);
160 }
161
162 final String response = builder_.toString();
163 builder_.setLength(0);
164 return response;
165 }
166
167
168
169
170
171 protected void printText(final DomNode node) {
172 for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
173 if (child instanceof DomText) {
174 builder_.append(((DomText) child).getData());
175 }
176 else {
177 printText(child);
178 }
179 }
180 }
181
182
183
184
185
186
187
188 protected void printOpeningTag(final DomElement node) throws IOException {
189 builder_.append(node.getTagName());
190 final Map<String, DomAttr> attributes = readAttributes(node);
191
192 for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
193 builder_.append(' ')
194 .append(entry.getKey())
195 .append("=\"");
196 final String value = entry.getValue().getNodeValue();
197 builder_.append(org.htmlunit.util.StringUtils.escapeXmlAttributeValue(value))
198 .append('"');
199 }
200 }
201
202 private Map<String, DomAttr> readAttributes(final DomElement node) throws IOException {
203 if (node instanceof HtmlImage) {
204 return getAttributesFor((HtmlImage) node);
205 }
206 else if (node instanceof HtmlLink) {
207 return getAttributesFor((HtmlLink) node);
208 }
209 else if (node instanceof BaseFrameElement) {
210 return getAttributesFor((BaseFrameElement) node);
211 }
212
213 Map<String, DomAttr> attributes = node.getAttributesMap();
214 if (node instanceof HtmlOption) {
215 attributes = new HashMap<>(attributes);
216 final HtmlOption option = (HtmlOption) node;
217 if (option.isSelected()) {
218 if (!attributes.containsKey("selected")) {
219 attributes.put("selected", new DomAttr(node.getPage(), null, "selected", "selected", false));
220 }
221 }
222 else {
223 attributes.remove("selected");
224 }
225 }
226 return attributes;
227 }
228
229
230
231
232
233 private Map<String, DomAttr> getAttributesFor(final BaseFrameElement frame) throws IOException {
234 final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(frame, DomElement.SRC_ATTRIBUTE);
235 final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
236 if (srcAttr == null) {
237 return map;
238 }
239
240 final Page enclosedPage = frame.getEnclosedPage();
241 final String suffix = getFileExtension(enclosedPage);
242 final File file = createFile(srcAttr.getValue(), "." + suffix);
243
244 if (enclosedPage != null) {
245 if (enclosedPage.isHtmlPage()) {
246 new XmlSerializer().save((HtmlPage) enclosedPage, file, true);
247 }
248 else {
249 try (InputStream is = enclosedPage.getWebResponse().getContentAsStream()) {
250 try (OutputStream fos = Files.newOutputStream(file.toPath())) {
251 IOUtils.copyLarge(is, fos);
252 }
253 }
254 }
255 }
256
257 srcAttr.setValue(file.getParentFile().getName() + FILE_SEPARATOR + file.getName());
258 return map;
259 }
260
261 private static String getFileExtension(final Page enclosedPage) {
262 if (enclosedPage != null) {
263 if (enclosedPage.isHtmlPage()) {
264 return "html";
265 }
266
267 final URL url = enclosedPage.getUrl();
268 if (url.getPath().contains(".")) {
269 return StringUtils.substringAfterLast(url.getPath(), ".");
270 }
271 }
272
273 return ".unknown";
274 }
275
276
277
278
279
280
281 protected Map<String, DomAttr> getAttributesFor(final HtmlLink link) throws IOException {
282 final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(link, "href");
283 final DomAttr hrefAttr = map.get("href");
284 if (hrefAttr != null && StringUtils.isNotBlank(hrefAttr.getValue())) {
285 final String protocol = link.getWebRequest().getUrl().getProtocol();
286 if ("http".equals(protocol) || "https".equals(protocol)) {
287 try {
288 final WebResponse response = link.getWebResponse(true, null);
289
290 final File file = createFile(hrefAttr.getValue(), ".css");
291 FileUtils.writeStringToFile(file, response.getContentAsString(), ISO_8859_1);
292 hrefAttr.setValue(outputDir_.getName() + FILE_SEPARATOR + file.getName());
293 }
294 catch (final IOException e) {
295 LOG.error("XmlSerializer: IOException while downloading link content from url '"
296 + hrefAttr + "'", e);
297 }
298 catch (final IllegalStateException e) {
299 LOG.error("XmlSerializer: IllegalStateException while downloading link content from url '"
300 + hrefAttr + "'", e);
301 }
302 }
303 }
304
305 return map;
306 }
307
308
309
310
311
312 protected Map<String, DomAttr> getAttributesFor(final HtmlImage image) {
313 final Map<String, DomAttr> map = createAttributesCopyWithClonedAttribute(image, DomElement.SRC_ATTRIBUTE);
314 final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
315 if (srcAttr != null && StringUtils.isNotBlank(srcAttr.getValue())) {
316 try {
317 final WebResponse response = image.getWebResponse(true);
318
319 try (InputStream inputStream = response.getContentAsStream()) {
320 final File file = createFile(srcAttr.getValue(), "." + getSuffix(response));
321 FileUtils.copyInputStreamToFile(inputStream, file);
322
323 final String valueOnFileSystem = outputDir_.getName() + FILE_SEPARATOR + file.getName();
324
325 srcAttr.setValue(valueOnFileSystem);
326 }
327 }
328 catch (final IOException e) {
329 LOG.error("XmlSerializer: IOException while downloading image content from url '" + srcAttr + "'", e);
330 }
331 catch (final IllegalStateException e) {
332 LOG.error("XmlSerializer: IllegalStateException while downloading image content from url '"
333 + srcAttr + "'", e);
334 }
335 }
336
337 return map;
338 }
339
340 private static String getSuffix(final WebResponse response) {
341
342 final String url = response.getWebRequest().getUrl().toString();
343 final String fileName = StringUtils.substringAfterLast(StringUtils.substringBefore(url, "?"), "/");
344
345 final String suffix = StringUtils.substringAfterLast(fileName, ".");
346 if (suffix.length() > 1 && suffix.length() < 5) {
347 return suffix;
348 }
349
350
351 return MimeType.getFileExtension(response.getContentType());
352 }
353
354 private static Map<String, DomAttr> createAttributesCopyWithClonedAttribute(final HtmlElement elt,
355 final String attrName) {
356 final Map<String, DomAttr> newMap = new HashMap<>(elt.getAttributesMap());
357
358
359 final DomAttr attr = newMap.get(attrName);
360 if (null == attr) {
361 return newMap;
362 }
363
364 final DomAttr clonedAttr = new DomAttr(attr.getPage(), attr.getNamespaceURI(),
365 attr.getQualifiedName(), attr.getValue(), attr.getSpecified());
366
367 newMap.put(attrName, clonedAttr);
368
369 return newMap;
370 }
371
372
373
374
375
376 protected boolean isExcluded(final DomElement element) {
377 return element instanceof HtmlScript;
378 }
379
380
381
382
383
384
385
386
387 private File createFile(final String url, final String extension) throws IOException {
388 String name = url.replaceFirst("/$", "");
389 name = CREATE_FILE_PATTERN.matcher(name).replaceAll("");
390 name = StringUtils.substringBefore(name, "?");
391 name = StringUtils.substringBefore(name, ";");
392 name = StringUtils.substring(name, 0, 30);
393 name = org.htmlunit.util.StringUtils.sanitizeForFileName(name);
394 if (!name.endsWith(extension)) {
395 name += extension;
396 }
397 int counter = 0;
398 while (true) {
399 final String fileName;
400 if (counter != 0) {
401 fileName = StringUtils.substringBeforeLast(name, ".")
402 + "_" + counter + "." + StringUtils.substringAfterLast(name, ".");
403 }
404 else {
405 fileName = name;
406 }
407 FileUtils.forceMkdir(outputDir_);
408 final File f = new File(outputDir_, fileName);
409 if (f.createNewFile()) {
410 return f;
411 }
412 counter++;
413 }
414 }
415 }