View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit;
16  
17  import static java.nio.charset.StandardCharsets.UTF_16BE;
18  import static java.nio.charset.StandardCharsets.UTF_16LE;
19  import static java.nio.charset.StandardCharsets.UTF_8;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.Serializable;
24  import java.net.URL;
25  import java.nio.charset.Charset;
26  import java.util.List;
27  
28  import org.apache.commons.io.ByteOrderMark;
29  import org.apache.commons.io.IOUtils;
30  import org.apache.commons.io.input.BOMInputStream;
31  import org.apache.commons.lang3.StringUtils;
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.htmlunit.http.HttpStatus;
35  import org.htmlunit.util.EncodingSniffer;
36  import org.htmlunit.util.MimeType;
37  import org.htmlunit.util.NameValuePair;
38  
39  /**
40   * A response from a web server.
41   *
42   * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
43   * @author Brad Clarke
44   * @author Noboru Sinohara
45   * @author Marc Guillemot
46   * @author Ahmed Ashour
47   * @author Ronald Brill
48   * @author Lai Quang Duong
49   */
50  public class WebResponse implements Serializable {
51  
52      private static final Log LOG = LogFactory.getLog(WebResponse.class);
53      private static final ByteOrderMark[] BOM_HEADERS = {
54          ByteOrderMark.UTF_8,
55          ByteOrderMark.UTF_16LE,
56          ByteOrderMark.UTF_16BE};
57  
58      private final long loadTime_;
59      private final WebResponseData responseData_;
60      private final WebRequest request_;
61      private boolean wasContentCharsetTentative_;
62      private boolean wasBlocked_;
63      private String blockReason_;
64  
65      /**
66       * Constructs with all data.
67       *
68       * @param responseData      Data that was send back
69       * @param url               Where this response came from
70       * @param requestMethod     the method used to get this response
71       * @param loadTime          How long the response took to be sent
72       */
73      public WebResponse(final WebResponseData responseData, final URL url,
74              final HttpMethod requestMethod, final long loadTime) {
75          this(responseData, new WebRequest(url, requestMethod), loadTime);
76      }
77  
78      /**
79       * Constructs with all data.
80       *
81       * @param responseData      Data that was send back
82       * @param request           the request used to get this response
83       * @param loadTime          How long the response took to be sent
84       */
85      public WebResponse(final WebResponseData responseData,
86              final WebRequest request, final long loadTime) {
87          responseData_ = responseData;
88          request_ = request;
89          loadTime_ = loadTime;
90      }
91  
92      /**
93       * Returns the request used to load this response.
94       * @return the request used to load this response
95       */
96      public WebRequest getWebRequest() {
97          return request_;
98      }
99  
100     /**
101      * Returns the response headers as a list of {@link NameValuePair}s.
102      * @return the response headers as a list of {@link NameValuePair}s
103      */
104     public List<NameValuePair> getResponseHeaders() {
105         return responseData_.getResponseHeaders();
106     }
107 
108     /**
109      * Returns the value of the specified response header.
110      * @param headerName the name of the header whose value is to be returned
111      * @return the header value, {@code null} if no response header exists with this name
112      */
113     public String getResponseHeaderValue(final String headerName) {
114         for (final NameValuePair pair : responseData_.getResponseHeaders()) {
115             if (pair.getName().equalsIgnoreCase(headerName)) {
116                 return pair.getValue();
117             }
118         }
119         return null;
120     }
121 
122     /**
123      * Returns the status code that was returned by the server.
124      * @return the status code that was returned by the server
125      */
126     public int getStatusCode() {
127         return responseData_.getStatusCode();
128     }
129 
130     /**
131      * Returns the status message that was returned from the server.
132      * @return the status message that was returned from the server
133      */
134     public String getStatusMessage() {
135         return responseData_.getStatusMessage();
136     }
137 
138     /**
139      * Returns the content type returned from the server, e.g. "text/html".
140      * @return the content type returned from the server, e.g. "text/html"
141      */
142     public String getContentType() {
143         final String contentTypeHeader = getResponseHeaderValue(HttpHeader.CONTENT_TYPE_LC);
144         if (contentTypeHeader == null) {
145             // Not technically legal but some servers don't return a content-type
146             return "";
147         }
148         final int index = contentTypeHeader.indexOf(';');
149         if (index == -1) {
150             return contentTypeHeader;
151         }
152         return contentTypeHeader.substring(0, index);
153     }
154 
155     /**
156      * Returns the content charset specified explicitly in the {@code Content-Type} header
157      * or {@code null} if none was specified.
158      * @return the content charset specified header or {@code null} if none was specified
159      */
160     public Charset getHeaderContentCharset() {
161         final String contentType = getResponseHeaderValue(HttpHeader.CONTENT_TYPE_LC);
162         if (contentType == null) {
163             return null;
164         }
165 
166         final int index = contentType.indexOf(';');
167         if (index == -1 || index == 0) {
168             return null;
169         }
170         if (StringUtils.isBlank(contentType.substring(0, index))) {
171             return null;
172         }
173 
174         return EncodingSniffer.extractEncodingFromContentType(contentType);
175     }
176 
177     /**
178      * Returns the content charset for this response, even if no charset was specified explicitly.
179      * <p>
180      * This method always returns a valid charset. This method first checks the {@code Content-Type}
181      * header or in the content BOM for viable charset. If not found, it attempts to determine the
182      * charset based on the type of the content. As a last resort, this method returns the
183      * value of {@link org.htmlunit.WebRequest#getDefaultResponseContentCharset()} which is
184      * {@link java.nio.charset.StandardCharsets#UTF_8} by default.
185      * @return the content charset for this response
186      */
187     public Charset getContentCharset() {
188         wasContentCharsetTentative_ = false;
189 
190         try (InputStream is = getContentAsStreamWithBomIfApplicable()) {
191             if (is instanceof BOMInputStream) {
192                 final String bomCharsetName = ((BOMInputStream) is).getBOMCharsetName();
193                 if (bomCharsetName != null) {
194                     return Charset.forName(bomCharsetName);
195                 }
196             }
197 
198             Charset charset = getHeaderContentCharset();
199             if (charset != null) {
200                 return charset;
201             }
202 
203             final String contentType = getContentType();
204             switch (DefaultPageCreator.determinePageType(contentType)) {
205                 case HTML:
206                     charset = EncodingSniffer.sniffEncodingFromMetaTag(is);
207                     wasContentCharsetTentative_ = true;
208                     break;
209                 case XML:
210                     charset = EncodingSniffer.sniffEncodingFromXmlDeclaration(is);
211                     if (charset == null) {
212                         charset = UTF_8;
213                     }
214                     break;
215                 default:
216                     if (MimeType.TEXT_CSS.equals(contentType)) {
217                         charset = EncodingSniffer.sniffEncodingFromCssDeclaration(is);
218                     }
219                     break;
220             }
221 
222             if (charset != null) {
223                 return charset;
224             }
225         }
226         catch (final IOException e) {
227             LOG.warn("Error trying to sniff encoding.", e);
228             wasContentCharsetTentative_ = true;
229         }
230         return getWebRequest().getDefaultResponseContentCharset();
231     }
232 
233     /**
234      * Returns whether the charset of the previous call to {@link #getContentCharset()} was "tentative".
235      * <p>
236      * A charset is classed as "tentative" if its detection is prone to false positive/negatives.
237      * <p>
238      * For example, HTML meta-tag sniffing can be fooled by text that looks-like-a-meta-tag inside
239      * JavaScript code (false positive) or if the meta-tag is after the first 1024 bytes (false negative).
240      * @return {@code true} if the charset of the previous call to {@link #getContentCharset()} was
241      *         "tentative".
242      * @see <a href="https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence">
243      * https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence</a>
244      */
245     public boolean wasContentCharsetTentative() {
246         return wasContentCharsetTentative_;
247     }
248 
249     /**
250      * Returns the response content as a string, using the charset/encoding specified in the server response.
251      * @return the response content as a string, using the charset/encoding specified in the server response
252      *         or null if the content retrieval was failing
253      */
254     public String getContentAsString() {
255         return getContentAsString(getContentCharset());
256     }
257 
258     /**
259      * Returns the response content as a string, using the specified charset,
260      * rather than the charset/encoding specified in the server response.
261      * If there is a bom header the charset parameter will be overwritten by the bom.
262      * @param encoding the charset/encoding to use to convert the response content into a string
263      * @return the response content as a string or null if the content retrieval was failing
264      */
265     public String getContentAsString(final Charset encoding) {
266         if (responseData_ != null) {
267             try (InputStream in = responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS)) {
268                 if (in instanceof BOMInputStream) {
269                     try (BOMInputStream bomIn = (BOMInputStream) in) {
270                         // there seems to be a bug in BOMInputStream
271                         // we have to call this before hasBOM(ByteOrderMark)
272                         if (bomIn.hasBOM()) {
273                             if (bomIn.hasBOM(ByteOrderMark.UTF_8)) {
274                                 return IOUtils.toString(bomIn, UTF_8);
275                             }
276                             if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
277                                 return IOUtils.toString(bomIn, UTF_16BE);
278                             }
279                             if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
280                                 return IOUtils.toString(bomIn, UTF_16LE);
281                             }
282                         }
283                         return IOUtils.toString(bomIn, encoding);
284                     }
285                 }
286 
287                 return IOUtils.toString(in, encoding);
288             }
289             catch (final IOException e) {
290                 LOG.warn(e.getMessage(), e);
291             }
292         }
293         return null;
294     }
295 
296     /**
297      * Returns length of the content data.
298      * @return the length
299      */
300     public long getContentLength() {
301         if (responseData_ == null) {
302             return 0;
303         }
304         return responseData_.getContentLength();
305     }
306 
307     /**
308      * Returns the response content as an input stream.
309      * @return the response content as an input stream
310      * @throws IOException in case of IOProblems
311      */
312     public InputStream getContentAsStream() throws IOException {
313         return responseData_.getInputStream();
314     }
315 
316     /**
317      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
318      *
319      * @return the associated InputStream wrapped with a bom input stream if applicable
320      * @throws IOException in case of IO problems
321      */
322     public InputStream getContentAsStreamWithBomIfApplicable() throws IOException {
323         if (responseData_ != null) {
324             return responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS);
325         }
326         return null;
327     }
328 
329     /**
330      * Returns the time it took to load this web response, in milliseconds.
331      * @return the time it took to load this web response, in milliseconds
332      */
333     public long getLoadTime() {
334         return loadTime_;
335     }
336 
337     /**
338      * Clean up the response data.
339      */
340     public void cleanUp() {
341         if (responseData_ != null) {
342             responseData_.cleanUp();
343         }
344     }
345 
346     /**
347      * @return true if the 2xx
348      */
349     public boolean isSuccess() {
350         final int statusCode = getStatusCode();
351         return statusCode >= HttpStatus.OK_200 && statusCode < HttpStatus.MULTIPLE_CHOICES_300;
352     }
353 
354     /**
355      * @return true if the 2xx or 305
356      */
357     public boolean isSuccessOrUseProxy() {
358         final int statusCode = getStatusCode();
359         return (statusCode >= HttpStatus.OK_200 && statusCode < HttpStatus.MULTIPLE_CHOICES_300)
360                 || statusCode == HttpStatus.USE_PROXY_305;
361     }
362 
363     /**
364      * @return true if the 2xx or 305
365      */
366     public boolean isSuccessOrUseProxyOrNotModified() {
367         final int statusCode = getStatusCode();
368         return (statusCode >= HttpStatus.OK_200 && statusCode < HttpStatus.MULTIPLE_CHOICES_300)
369                 || statusCode == HttpStatus.USE_PROXY_305
370                 || statusCode == HttpStatus.NOT_MODIFIED_304;
371     }
372 
373     /**
374      * @return true if the request was blocked
375      */
376     public boolean wasBlocked() {
377         return wasBlocked_;
378     }
379 
380     /**
381      * @return the reason for blocking or null
382      */
383     public String getBlockReason() {
384         return blockReason_;
385     }
386 
387     /**
388      * Sets the wasBlocked state to true.
389      *
390      * @param blockReason the reason
391      */
392     public void markAsBlocked(final String blockReason) {
393         wasBlocked_ = true;
394         blockReason_ = blockReason;
395     }
396 }