View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.Serializable;
20  import java.nio.charset.StandardCharsets;
21  import java.util.Locale;
22  
23  import org.apache.commons.lang3.ArrayUtils;
24  import org.apache.commons.lang3.StringUtils;
25  import org.htmlunit.html.DomElement;
26  import org.htmlunit.html.Html;
27  import org.htmlunit.html.HtmlPage;
28  import org.htmlunit.html.XHtmlPage;
29  import org.htmlunit.html.parser.HTMLParser;
30  import org.htmlunit.html.parser.neko.HtmlUnitNekoHtmlParser;
31  import org.htmlunit.util.MimeType;
32  import org.htmlunit.xml.XmlPage;
33  
34  /**
35   * The default implementation of {@link PageCreator}. Designed to be extended for easier handling of new content
36   * types. Just check the content type in <code>createPage()</code>
37   * and call <code>super(createPage())</code> if your custom
38   * type isn't found. There are also protected <code>createXXXXPage()</code> methods for creating the {@link Page} types
39   * which HtmlUnit already knows about for your custom content types.
40   *
41   * <p>
42   * The following table shows the type of {@link Page} created depending on the content type:<br>
43   * <br>
44   *  <table border="1" style="width:50%;">
45   *    <tr>
46   *      <th>Content type</th>
47   *      <th>Type of page</th>
48   *    </tr>
49   *    <tr>
50   *      <td>text/html<br>
51   *          text/javascript</td>
52   *      <td>{@link HtmlPage}</td>
53   *    </tr>
54   *    <tr>
55   *      <td>text/xml<br>
56   *      application/xml<br>
57   *      text/vnd.wap.wml<br>
58   *      *+xml
59   *      </td>
60   *      <td>{@link XmlPage}, or an {@link XHtmlPage} if an XHTML namespace is used</td>
61   *    </tr>
62   *    <tr>
63   *      <td>text/*</td>
64   *      <td>{@link TextPage}</td>
65   *    </tr>
66   *    <tr>
67   *      <td>Anything else</td>
68   *      <td>{@link UnexpectedPage}</td>
69   *    </tr>
70   *  </table>
71   *
72   * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
73   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
74   * @author <a href="mailto:yourgod@users.sourceforge.net">Brad Clarke</a>
75   * @author Marc Guillemot
76   * @author Ahmed Ashour
77   * @author Daniel Gredler
78   * @author Ronald Brill
79   * @author Antoni Reus
80   */
81  public class DefaultPageCreator implements PageCreator, Serializable {
82  
83      private static final byte[] MARKER_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
84      private static final byte[] MARKER_UTF16BE = {(byte) 0xfe, (byte) 0xff};
85      private static final byte[] MARKER_UTF16LE = {(byte) 0xff, (byte) 0xfe};
86  
87      /**
88       * See <a href="http://tools.ietf.org/html/draft-abarth-mime-sniff-05">
89       * http://tools.ietf.org/html/draft-abarth-mime-sniff-05</a>
90       */
91      private static final String[] HTML_PATTERNS = {"!DOCTYPE HTML", "HTML", "HEAD", "SCRIPT",
92          "IFRAME", "H1", "DIV", "FONT", "TABLE", "A", "STYLE", "TITLE", "B", "BODY", "BR", "P", "!--" };
93  
94      private static final HTMLParser HTML_PARSER = new HtmlUnitNekoHtmlParser();
95  
96      /**
97       * The different supported page types.
98       */
99      public enum PageType {
100         /** html. */
101         HTML,
102         /** javascript. */
103         JAVASCRIPT,
104         /** xml. */
105         XML,
106         /** text. */
107         TEXT,
108         /** unknown. */
109         UNKNOWN
110     }
111 
112     /**
113      * Determines the kind of page to create from the content type.
114      * @param contentType the content type to evaluate
115      * @return "xml", "html", "javascript", "text" or "unknown"
116      */
117     public static PageType determinePageType(final String contentType) {
118         if (null == contentType) {
119             return PageType.UNKNOWN;
120         }
121 
122         final String contentTypeLC = org.htmlunit.util.StringUtils
123                                             .toRootLowerCase(contentType);
124 
125         if (MimeType.isJavascriptMimeType(contentTypeLC)) {
126             return PageType.JAVASCRIPT;
127         }
128         switch (contentTypeLC) {
129             case MimeType.TEXT_HTML:
130             case "image/svg+xml":
131                 return PageType.HTML;
132 
133             case MimeType.TEXT_XML:
134             case MimeType.APPLICATION_XML:
135             case "text/vnd.wap.wml":
136                 return PageType.XML;
137 
138             default:
139                 if (contentTypeLC.endsWith("+xml")) {
140                     return PageType.XML;
141                 }
142 
143                 if (contentTypeLC.startsWith("text/")) {
144                     return PageType.TEXT;
145                 }
146 
147                 return PageType.UNKNOWN;
148         }
149     }
150 
151     /**
152      * Determines the kind of page to create from the content type.
153      * @param webResponse the response to investigate
154      * @exception IOException if an IO problem occurs
155      * @return "xml", "html", "javascript", "text" or "unknown"
156      */
157     public static PageType determinePageType(final WebResponse webResponse) throws IOException {
158         final String contentType = webResponse.getContentType();
159         if (!StringUtils.isEmpty(contentType)) {
160             return determinePageType(contentType);
161         }
162 
163         // sniff - http://tools.ietf.org/html/draft-abarth-mime-sniff-05
164         try (InputStream contentAsStream = webResponse.getContentAsStream()) {
165             final byte[] bytes = read(contentAsStream, 512);
166             if (bytes.length == 0) {
167                 return determinePageType(MimeType.TEXT_PLAIN);
168             }
169 
170             // looks a bit strange but correct
171             // if there is a bom header the browsers are handling this as text page
172             if (startsWith(bytes, MARKER_UTF8) || startsWith(bytes, MARKER_UTF16BE)
173                     || startsWith(bytes, MARKER_UTF16LE)) {
174                 return determinePageType(MimeType.TEXT_PLAIN);
175             }
176 
177             if (isBinary(bytes)) {
178                 return determinePageType(MimeType.APPLICATION_OCTET_STREAM);
179             }
180 
181             final String asAsciiString = new String(bytes, StandardCharsets.US_ASCII).trim().toUpperCase(Locale.ROOT);
182 
183             if (asAsciiString.startsWith("<?XML")) {
184                 return determinePageType(MimeType.TEXT_XML);
185             }
186 
187             for (final String htmlPattern : HTML_PATTERNS) {
188                 try {
189                     if ('<' == asAsciiString.charAt(0)) {
190                         if (asAsciiString.startsWith(htmlPattern, 1)) {
191                             final char spaceOrBracket = asAsciiString.charAt(htmlPattern.length() + 1);
192                             if (' ' == spaceOrBracket || '>' == spaceOrBracket) {
193                                 return determinePageType(MimeType.TEXT_HTML);
194                             }
195                         }
196                     }
197                 }
198                 catch (final ArrayIndexOutOfBoundsException ignored) {
199                     // ignore and try next
200                 }
201             }
202         }
203         return determinePageType(MimeType.TEXT_PLAIN);
204     }
205 
206     /**
207      * Create a Page object for the specified web response.
208      *
209      * @param webResponse the response from the server
210      * @param webWindow the window that this page will be loaded into
211      * @exception IOException if an IO problem occurs
212      * @return the new page object
213      */
214     @Override
215     public Page createPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
216         final PageType pageType = determinePageType(webResponse);
217         switch (pageType) {
218             case HTML:
219                 return createHtmlPage(webResponse, webWindow);
220 
221             case JAVASCRIPT:
222                 return createHtmlPage(webResponse, webWindow);
223 
224             case XML:
225                 final SgmlPage sgmlPage = createXmlPage(webResponse, webWindow);
226                 final DomElement doc = sgmlPage.getDocumentElement();
227                 if (doc != null && Html.XHTML_NAMESPACE.equals(doc.getNamespaceURI())) {
228                     return createXHtmlPage(webResponse, webWindow);
229                 }
230                 return sgmlPage;
231 
232             case TEXT:
233                 return createTextPage(webResponse, webWindow);
234 
235             default:
236                 return createUnexpectedPage(webResponse, webWindow);
237         }
238     }
239 
240     /**
241      * {@inheritDoc}
242      */
243     @Override
244     public HTMLParser getHtmlParser() {
245         return HTML_PARSER;
246     }
247 
248     /**
249      * See <a href="http://tools.ietf.org/html/draft-abarth-mime-sniff-05#section-4">
250      * http://tools.ietf.org/html/draft-abarth-mime-sniff-05#section-4</a>
251      * @param bytes the bytes to check
252      */
253     private static boolean isBinary(final byte[] bytes) {
254         for (final byte b : bytes) {
255             if ((b >= 0x00 && b < 0x08)
256                 || b == 0x0B
257                 || (b >= 0x0E && b <= 0x1A)
258                 || (b >= 0x1C && b <= 0x1F)) {
259                 return true;
260             }
261         }
262         return false;
263     }
264 
265     private static boolean startsWith(final byte[] bytes, final byte[] lookFor) {
266         if (bytes.length < lookFor.length) {
267             return false;
268         }
269 
270         for (int i = 0; i < lookFor.length; i++) {
271             if (bytes[i] != lookFor[i]) {
272                 return false;
273             }
274         }
275 
276         return true;
277     }
278 
279     private static byte[] read(final InputStream stream, final int maxNb) throws IOException {
280         final byte[] buffer = new byte[maxNb];
281         final int nbRead = stream.read(buffer);
282         if (nbRead == buffer.length) {
283             return buffer;
284         }
285         return ArrayUtils.subarray(buffer, 0, nbRead);
286     }
287 
288     /**
289      * Creates an HtmlPage for this WebResponse.
290      *
291      * @param webResponse the page's source
292      * @param webWindow the WebWindow to place the HtmlPage in
293      * @return the newly created HtmlPage
294      * @throws IOException if the page could not be created
295      */
296     protected HtmlPage createHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
297         final HtmlPage page = new HtmlPage(webResponse, webWindow);
298         webWindow.setEnclosedPage(page);
299 
300         HTML_PARSER.parse(webResponse, page, false, false);
301         return page;
302     }
303 
304     /**
305      * Creates an XHtmlPage for this WebResponse.
306      *
307      * @param webResponse the page's source
308      * @param webWindow the WebWindow to place the HtmlPage in
309      * @return the newly created XHtmlPage
310      * @throws IOException if the page could not be created
311      */
312     protected XHtmlPage createXHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
313         final XHtmlPage page = new XHtmlPage(webResponse, webWindow);
314         webWindow.setEnclosedPage(page);
315 
316         HTML_PARSER.parse(webResponse, page, true, false);
317         return page;
318     }
319 
320     /**
321      * Creates a TextPage for this WebResponse.
322      *
323      * @param webResponse the page's source
324      * @param webWindow the WebWindow to place the TextPage in
325      * @return the newly created TextPage
326      */
327     protected TextPage createTextPage(final WebResponse webResponse, final WebWindow webWindow) {
328         final TextPage newPage = new TextPage(webResponse, webWindow);
329         webWindow.setEnclosedPage(newPage);
330         return newPage;
331     }
332 
333     /**
334      * Creates an UnexpectedPage for this WebResponse.
335      *
336      * @param webResponse the page's source
337      * @param webWindow the WebWindow to place the UnexpectedPage in
338      * @return the newly created UnexpectedPage
339      */
340     protected UnexpectedPage createUnexpectedPage(final WebResponse webResponse, final WebWindow webWindow) {
341         final UnexpectedPage newPage = new UnexpectedPage(webResponse, webWindow);
342         webWindow.setEnclosedPage(newPage);
343         return newPage;
344     }
345 
346     /**
347      * Creates an SgmlPage for this WebResponse.
348      *
349      * @param webResponse the page's source
350      * @param webWindow the WebWindow to place the TextPage in
351      * @return the newly created TextPage
352      * @throws IOException if the page could not be created
353      */
354     protected SgmlPage createXmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
355         final SgmlPage page = new XmlPage(webResponse, webWindow);
356         webWindow.setEnclosedPage(page);
357         return page;
358     }
359 
360 }