1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit;
16
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.Serializable;
20 import java.nio.charset.StandardCharsets;
21 import java.util.Locale;
22
23 import org.apache.commons.lang3.ArrayUtils;
24 import org.apache.commons.lang3.StringUtils;
25 import org.htmlunit.html.DomElement;
26 import org.htmlunit.html.Html;
27 import org.htmlunit.html.HtmlPage;
28 import org.htmlunit.html.XHtmlPage;
29 import org.htmlunit.html.parser.HTMLParser;
30 import org.htmlunit.html.parser.neko.HtmlUnitNekoHtmlParser;
31 import org.htmlunit.util.MimeType;
32 import org.htmlunit.xml.XmlPage;
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81 public class DefaultPageCreator implements PageCreator, Serializable {
82
83 private static final byte[] MARKER_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
84 private static final byte[] MARKER_UTF16BE = {(byte) 0xfe, (byte) 0xff};
85 private static final byte[] MARKER_UTF16LE = {(byte) 0xff, (byte) 0xfe};
86
87
88
89
90
91 private static final String[] HTML_PATTERNS = {"!DOCTYPE HTML", "HTML", "HEAD", "SCRIPT",
92 "IFRAME", "H1", "DIV", "FONT", "TABLE", "A", "STYLE", "TITLE", "B", "BODY", "BR", "P", "!--" };
93
94 private static final HTMLParser HTML_PARSER = new HtmlUnitNekoHtmlParser();
95
96
97
98
99 public enum PageType {
100
101 HTML,
102
103 JAVASCRIPT,
104
105 XML,
106
107 TEXT,
108
109 UNKNOWN
110 }
111
112
113
114
115
116
117 public static PageType determinePageType(final String contentType) {
118 if (null == contentType) {
119 return PageType.UNKNOWN;
120 }
121
122 final String contentTypeLC = org.htmlunit.util.StringUtils
123 .toRootLowerCase(contentType);
124
125 if (MimeType.isJavascriptMimeType(contentTypeLC)) {
126 return PageType.JAVASCRIPT;
127 }
128 switch (contentTypeLC) {
129 case MimeType.TEXT_HTML:
130 case "image/svg+xml":
131 return PageType.HTML;
132
133 case MimeType.TEXT_XML:
134 case MimeType.APPLICATION_XML:
135 case "text/vnd.wap.wml":
136 return PageType.XML;
137
138 default:
139 if (contentTypeLC.endsWith("+xml")) {
140 return PageType.XML;
141 }
142
143 if (contentTypeLC.startsWith("text/")) {
144 return PageType.TEXT;
145 }
146
147 return PageType.UNKNOWN;
148 }
149 }
150
151
152
153
154
155
156
157 public static PageType determinePageType(final WebResponse webResponse) throws IOException {
158 final String contentType = webResponse.getContentType();
159 if (!StringUtils.isEmpty(contentType)) {
160 return determinePageType(contentType);
161 }
162
163
164 try (InputStream contentAsStream = webResponse.getContentAsStream()) {
165 final byte[] bytes = read(contentAsStream, 512);
166 if (bytes.length == 0) {
167 return determinePageType(MimeType.TEXT_PLAIN);
168 }
169
170
171
172 if (startsWith(bytes, MARKER_UTF8) || startsWith(bytes, MARKER_UTF16BE)
173 || startsWith(bytes, MARKER_UTF16LE)) {
174 return determinePageType(MimeType.TEXT_PLAIN);
175 }
176
177 if (isBinary(bytes)) {
178 return determinePageType(MimeType.APPLICATION_OCTET_STREAM);
179 }
180
181 final String asAsciiString = new String(bytes, StandardCharsets.US_ASCII).trim().toUpperCase(Locale.ROOT);
182
183 if (asAsciiString.startsWith("<?XML")) {
184 return determinePageType(MimeType.TEXT_XML);
185 }
186
187 for (final String htmlPattern : HTML_PATTERNS) {
188 try {
189 if ('<' == asAsciiString.charAt(0)) {
190 if (asAsciiString.startsWith(htmlPattern, 1)) {
191 final char spaceOrBracket = asAsciiString.charAt(htmlPattern.length() + 1);
192 if (' ' == spaceOrBracket || '>' == spaceOrBracket) {
193 return determinePageType(MimeType.TEXT_HTML);
194 }
195 }
196 }
197 }
198 catch (final ArrayIndexOutOfBoundsException ignored) {
199
200 }
201 }
202 }
203 return determinePageType(MimeType.TEXT_PLAIN);
204 }
205
206
207
208
209
210
211
212
213
214 @Override
215 public Page createPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
216 final PageType pageType = determinePageType(webResponse);
217 switch (pageType) {
218 case HTML:
219 return createHtmlPage(webResponse, webWindow);
220
221 case JAVASCRIPT:
222 return createHtmlPage(webResponse, webWindow);
223
224 case XML:
225 final SgmlPage sgmlPage = createXmlPage(webResponse, webWindow);
226 final DomElement doc = sgmlPage.getDocumentElement();
227 if (doc != null && Html.XHTML_NAMESPACE.equals(doc.getNamespaceURI())) {
228 return createXHtmlPage(webResponse, webWindow);
229 }
230 return sgmlPage;
231
232 case TEXT:
233 return createTextPage(webResponse, webWindow);
234
235 default:
236 return createUnexpectedPage(webResponse, webWindow);
237 }
238 }
239
240
241
242
243 @Override
244 public HTMLParser getHtmlParser() {
245 return HTML_PARSER;
246 }
247
248
249
250
251
252
253 private static boolean isBinary(final byte[] bytes) {
254 for (final byte b : bytes) {
255 if ((b >= 0x00 && b < 0x08)
256 || b == 0x0B
257 || (b >= 0x0E && b <= 0x1A)
258 || (b >= 0x1C && b <= 0x1F)) {
259 return true;
260 }
261 }
262 return false;
263 }
264
265 private static boolean startsWith(final byte[] bytes, final byte[] lookFor) {
266 if (bytes.length < lookFor.length) {
267 return false;
268 }
269
270 for (int i = 0; i < lookFor.length; i++) {
271 if (bytes[i] != lookFor[i]) {
272 return false;
273 }
274 }
275
276 return true;
277 }
278
279 private static byte[] read(final InputStream stream, final int maxNb) throws IOException {
280 final byte[] buffer = new byte[maxNb];
281 final int nbRead = stream.read(buffer);
282 if (nbRead == buffer.length) {
283 return buffer;
284 }
285 return ArrayUtils.subarray(buffer, 0, nbRead);
286 }
287
288
289
290
291
292
293
294
295
296 protected HtmlPage createHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
297 final HtmlPage page = new HtmlPage(webResponse, webWindow);
298 webWindow.setEnclosedPage(page);
299
300 HTML_PARSER.parse(webResponse, page, false, false);
301 return page;
302 }
303
304
305
306
307
308
309
310
311
312 protected XHtmlPage createXHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
313 final XHtmlPage page = new XHtmlPage(webResponse, webWindow);
314 webWindow.setEnclosedPage(page);
315
316 HTML_PARSER.parse(webResponse, page, true, false);
317 return page;
318 }
319
320
321
322
323
324
325
326
327 protected TextPage createTextPage(final WebResponse webResponse, final WebWindow webWindow) {
328 final TextPage newPage = new TextPage(webResponse, webWindow);
329 webWindow.setEnclosedPage(newPage);
330 return newPage;
331 }
332
333
334
335
336
337
338
339
340 protected UnexpectedPage createUnexpectedPage(final WebResponse webResponse, final WebWindow webWindow) {
341 final UnexpectedPage newPage = new UnexpectedPage(webResponse, webWindow);
342 webWindow.setEnclosedPage(newPage);
343 return newPage;
344 }
345
346
347
348
349
350
351
352
353
354 protected SgmlPage createXmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
355 final SgmlPage page = new XmlPage(webResponse, webWindow);
356 webWindow.setEnclosedPage(page);
357 return page;
358 }
359
360 }