View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit;
16  
17  import java.io.Serializable;
18  import java.net.URL;
19  import java.util.Collections;
20  import java.util.Date;
21  import java.util.HashMap;
22  import java.util.Map;
23  import java.util.regex.Matcher;
24  import java.util.regex.Pattern;
25  
26  import org.htmlunit.cssparser.dom.CSSStyleSheetImpl;
27  import org.htmlunit.http.HttpUtils;
28  import org.htmlunit.util.HeaderUtils;
29  import org.htmlunit.util.UrlUtils;
30  
31  /**
32   * <p>Simple cache implementation which caches compiled JavaScript files and parsed CSS snippets. Caching
33   * compiled JavaScript files avoids unnecessary web requests and additional compilation overhead, while
34   * caching parsed CSS snippets avoids very expensive CSS parsing.</p>
35   *
36   * @author Marc Guillemot
37   * @author Daniel Gredler
38   * @author Ahmed Ashour
39   * @author Anton Demydenko
40   * @author Ronald Brill
41   * @author Ashley Frieze
42   */
43  public class Cache implements Serializable {
44  
45      /** The maximum size of the cache. */
46      private int maxSize_ = 40;
47  
48      private static final Pattern DATE_HEADER_PATTERN = Pattern.compile("-?\\d+");
49      static final long DELAY = 10 * org.apache.commons.lang3.time.DateUtils.MILLIS_PER_MINUTE;
50  
51      // for taking ten percent of a number in milliseconds and converting that to the amount in seconds
52      private static final double TEN_PERCENT_OF_MILLISECONDS_IN_SECONDS = 0.0001;
53  
54      /**
55       * The map which holds the cached responses. Note that when keying on URLs, we key on the string version
56       * of the URLs, rather than on the URLs themselves. This is done for performance, because a) the
57       * {@link java.net.URL#hashCode()} method is synchronized, and b) the {@link java.net.URL#hashCode()}
58       * method triggers DNS lookups of the URL hostnames' IPs. As of this writing, the HtmlUnit unit tests
59       * run ~20% faster whey keying on strings rather than on {@link java.net.URL} instances.
60       */
61      private final Map<String, Entry> entries_ = Collections.synchronizedMap(new HashMap<>(maxSize_));
62  
63      /**
64       * A cache entry.
65       */
66      private static class Entry implements Comparable<Entry>, Serializable {
67          private final String key_;
68          private final WebResponse response_;
69          private final Object value_;
70          private long lastAccess_;
71          private final long createdAt_;
72  
73          Entry(final String key, final WebResponse response, final Object value) {
74              key_ = key;
75              response_ = response;
76              value_ = value;
77              createdAt_ = System.currentTimeMillis();
78              lastAccess_ = createdAt_;
79          }
80  
81          /**
82           * {@inheritDoc}
83           */
84          @Override
85          public int compareTo(final Entry other) {
86              return Long.compare(lastAccess_, other.lastAccess_);
87          }
88  
89          /**
90           * {@inheritDoc}
91           */
92          @Override
93          public boolean equals(final Object obj) {
94              return obj instanceof Entry && lastAccess_ == ((Entry) obj).lastAccess_;
95          }
96  
97          /**
98           * {@inheritDoc}
99           */
100         @Override
101         public int hashCode() {
102             return ((Long) lastAccess_).hashCode();
103         }
104 
105         /**
106          * Updates the last access date.
107          */
108         public void touch() {
109             lastAccess_ = System.currentTimeMillis();
110         }
111 
112         /**
113          * Is this cached entry still fresh?
114          * @param now the current time
115          * @return <code>true</code> if can keep in the cache
116          * @see #isWithinCacheWindow(WebResponse, long, long)
117          */
118         boolean isStillFresh(final long now) {
119             return Cache.isWithinCacheWindow(response_, now, createdAt_);
120         }
121     }
122 
123     /**
124      * <p>Find expiry time using
125      * a) s-maxage specified<br />
126      * b) max-age specified<br />
127      * c) expired specified<br />
128      * d) A Last-Update is specified and the time is now within 10% of the difference between download time and update
129      * time</p>
130      *
131      * @see <a href="https://datatracker.ietf.org/doc/html/rfc7234#section-4.2.2">RFC 7234</a>
132      *
133      * @param response {@link WebResponse}
134      * @param now the current time
135      * @param createdAt when the request was downloaded
136      * @return true if still fresh
137      */
138     static boolean isWithinCacheWindow(final WebResponse response, final long now, final long createdAt) {
139         long freshnessLifetime = 0;
140         if (!HeaderUtils.containsPrivate(response) && HeaderUtils.containsSMaxage(response)) {
141             // check s-maxage
142             freshnessLifetime = HeaderUtils.sMaxage(response);
143         }
144         else if (HeaderUtils.containsMaxAge(response)) {
145             // check max-age
146             freshnessLifetime = HeaderUtils.maxAge(response);
147         }
148         else if (response.getResponseHeaderValue(HttpHeader.EXPIRES) != null) {
149             final Date expires = parseDateHeader(response, HttpHeader.EXPIRES);
150             if (expires != null) {
151                 // use the same logic as in isCacheableContent()
152                 return expires.getTime() - now > DELAY;
153             }
154         }
155         else if (response.getResponseHeaderValue(HttpHeader.LAST_MODIFIED) != null) {
156             final Date lastModified = parseDateHeader(response, HttpHeader.LAST_MODIFIED);
157             if (lastModified != null) {
158                 freshnessLifetime = (long) ((createdAt - lastModified.getTime())
159                         * TEN_PERCENT_OF_MILLISECONDS_IN_SECONDS);
160             }
161         }
162         return now - createdAt < freshnessLifetime * org.apache.commons.lang3.time.DateUtils.MILLIS_PER_SECOND;
163     }
164 
165     /**
166      * Caches the specified object, if the corresponding request and response objects indicate
167      * that it is cacheable.
168      *
169      * @param request the request corresponding to the specified compiled script
170      * @param response the response corresponding to the specified compiled script
171      * @param toCache the object that is to be cached, if possible (may be for instance a compiled script or
172      *        simply a WebResponse)
173      * @return whether the response was cached or not
174      */
175     public boolean cacheIfPossible(final WebRequest request, final WebResponse response, final Object toCache) {
176         if (isCacheable(request, response)) {
177             final URL url = request.getUrl();
178             if (url == null) {
179                 return false;
180             }
181 
182             final Entry entry = new Entry(UrlUtils.normalize(url), response, toCache);
183             entries_.put(entry.key_, entry);
184             deleteOverflow();
185             return true;
186         }
187 
188         return false;
189     }
190 
191     /**
192      * Caches the parsed version of the specified CSS snippet. We key the cache based on CSS snippets (rather
193      * than requests and responses as is done above) because a) this allows us to cache inline CSS, b) CSS is
194      * extremely expensive to parse, so we want to avoid it as much as possible, c) CSS files aren't usually
195      * nearly as large as JavaScript files, so memory bloat won't be too bad, and d) caching on requests and
196      * responses requires checking dynamically (see {@link #isCacheableContent(WebResponse)}), and headers often
197      * aren't set up correctly, disallowing caching when in fact it should be allowed.
198      *
199      * @param css the CSS snippet from which <code>styleSheet</code> is derived
200      * @param styleSheet the parsed version of <code>css</code>
201      */
202     public void cache(final String css, final CSSStyleSheetImpl styleSheet) {
203         final Entry entry = new Entry(css, null, styleSheet);
204         entries_.put(entry.key_, entry);
205         deleteOverflow();
206     }
207 
208     /**
209      * Truncates the cache to the maximal number of entries.
210      */
211     protected void deleteOverflow() {
212         synchronized (entries_) {
213             while (entries_.size() > maxSize_) {
214                 final Entry oldestEntry = Collections.min(entries_.values());
215                 entries_.remove(oldestEntry.key_);
216                 if (oldestEntry.response_ != null) {
217                     oldestEntry.response_.cleanUp();
218                 }
219             }
220         }
221     }
222 
223     /**
224      * Determines if the specified response can be cached.
225      *
226      * @param request the performed request
227      * @param response the received response
228      * @return {@code true} if the response can be cached
229      */
230     protected boolean isCacheable(final WebRequest request, final WebResponse response) {
231         return HttpMethod.GET == response.getWebRequest().getHttpMethod()
232             && UrlUtils.URL_ABOUT_BLANK != request.getUrl()
233             && isCacheableContent(response);
234     }
235 
236     /**
237      * <p>Perform prior validation for 'no-store' directive in Cache-Control header.</p>
238      *
239      * <p>Tries to guess if the content is dynamic or not.</p>
240      *
241      * <p>"Since origin servers do not always provide explicit expiration times, HTTP caches typically
242      * assign heuristic expiration times, employing algorithms that use other header values (such as the
243      * <code>Last-Modified</code> time) to estimate a plausible expiration time".</p>
244      *
245      * <p>The current implementation considers as dynamic content everything except responses with a
246      * <code>Last-Modified</code> header with a date older than 10 minutes or with an <code>Expires</code> header
247      * specifying expiration in more than 10 minutes.</p>
248      *
249      * @see <a href="https://tools.ietf.org/html/rfc7234">RFC 7234</a>
250      * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html">RFC 2616</a>
251      * @param response the response to examine
252      * @return {@code true} if the response should be considered as cacheable
253      */
254     protected boolean isCacheableContent(final WebResponse response) {
255         if (HeaderUtils.containsNoStore(response)) {
256             return false;
257         }
258 
259         final long now = getCurrentTimestamp();
260         return isWithinCacheWindow(response, now, now);
261     }
262 
263     /**
264      * Gets the current time stamp. As method to allow overriding it, when simulating another time.
265      * @return the current time stamp
266      */
267     protected long getCurrentTimestamp() {
268         return System.currentTimeMillis();
269     }
270 
271     /**
272      * Parses and returns the specified date header of the specified response. This method
273      * returns {@code null} if the specified header cannot be found or cannot be parsed as a date.
274      *
275      * @param response the response
276      * @param headerName the header name
277      * @return the specified date header of the specified response
278      */
279     protected static Date parseDateHeader(final WebResponse response, final String headerName) {
280         final String value = response.getResponseHeaderValue(headerName);
281         if (value == null) {
282             return null;
283         }
284         final Matcher matcher = DATE_HEADER_PATTERN.matcher(value);
285         if (matcher.matches()) {
286             return new Date();
287         }
288         return HttpUtils.parseDate(value);
289     }
290 
291     /**
292      * Returns the cached response corresponding to the specified request. If there is
293      * no corresponding cached object, this method returns {@code null}.
294      *
295      * <p>Calculates and check if object still fresh(RFC 7234) otherwise returns {@code null}.</p>
296      * @see <a href="https://tools.ietf.org/html/rfc7234">RFC 7234</a>
297      *
298      * @param request the request whose corresponding response is sought
299      * @return the cached response corresponding to the specified request if any
300      */
301     public WebResponse getCachedResponse(final WebRequest request) {
302         final Entry cachedEntry = getCacheEntry(request);
303         if (cachedEntry == null) {
304             return null;
305         }
306         return cachedEntry.response_;
307     }
308 
309     /**
310      * Returns the cached object corresponding to the specified request. If there is
311      * no corresponding cached object, this method returns {@code null}.
312      *
313      * <p>Calculates and check if object still fresh(RFC 7234) otherwise returns {@code null}.</p>
314      * @see <a href="https://tools.ietf.org/html/rfc7234">RFC 7234</a>
315      *
316      * @param request the request whose corresponding cached compiled script is sought
317      * @return the cached object corresponding to the specified request if any
318      */
319     public Object getCachedObject(final WebRequest request) {
320         final Entry cachedEntry = getCacheEntry(request);
321         if (cachedEntry == null) {
322             return null;
323         }
324         return cachedEntry.value_;
325     }
326 
327     private Entry getCacheEntry(final WebRequest request) {
328         if (HttpMethod.GET != request.getHttpMethod()) {
329             return null;
330         }
331 
332         final URL url = request.getUrl();
333         if (url == null) {
334             return null;
335         }
336 
337         final String normalizedUrl = UrlUtils.normalize(url);
338         final Entry cachedEntry = entries_.get(normalizedUrl);
339         if (cachedEntry == null) {
340             return null;
341         }
342 
343         if (cachedEntry.isStillFresh(getCurrentTimestamp())) {
344             synchronized (entries_) {
345                 cachedEntry.touch();
346             }
347             return cachedEntry;
348         }
349         entries_.remove(UrlUtils.normalize(url));
350         return null;
351     }
352 
353     /**
354      * Returns the cached parsed version of the specified CSS snippet. If there is no
355      * corresponding cached stylesheet, this method returns {@code null}.
356      *
357      * @param css the CSS snippet whose cached stylesheet is sought
358      * @return the cached stylesheet corresponding to the specified CSS snippet
359      */
360     public CSSStyleSheetImpl getCachedStyleSheet(final String css) {
361         final Entry cachedEntry = entries_.get(css);
362         if (cachedEntry == null) {
363             return null;
364         }
365         synchronized (entries_) {
366             cachedEntry.touch();
367         }
368         return (CSSStyleSheetImpl) cachedEntry.value_;
369     }
370 
371     /**
372      * Returns the cache's maximum size. This is the maximum number of files that will
373      * be cached. The default is <code>25</code>.
374      *
375      * @return the cache's maximum size
376      */
377     public int getMaxSize() {
378         return maxSize_;
379     }
380 
381     /**
382      * Sets the cache's maximum size. This is the maximum number of files that will
383      * be cached. The default is <code>25</code>.
384      *
385      * @param maxSize the cache's maximum size (must be &gt;= 0)
386      */
387     public void setMaxSize(final int maxSize) {
388         if (maxSize < 0) {
389             throw new IllegalArgumentException("Illegal value for maxSize: " + maxSize);
390         }
391         maxSize_ = maxSize;
392         deleteOverflow();
393     }
394 
395     /**
396      * Returns the number of entries in the cache.
397      *
398      * @return the number of entries in the cache
399      */
400     public int getSize() {
401         return entries_.size();
402     }
403 
404     /**
405      * Clears the cache.
406      */
407     public void clear() {
408         synchronized (entries_) {
409             for (final Entry entry : entries_.values()) {
410                 if (entry.response_ != null) {
411                     entry.response_.cleanUp();
412                 }
413             }
414             entries_.clear();
415         }
416     }
417 
418     /**
419      * Removes outdated entries from the cache.
420      */
421     public void clearOutdated() {
422         synchronized (entries_) {
423             final long now = getCurrentTimestamp();
424 
425             entries_.entrySet().removeIf(entry -> entry.getValue().response_ == null
426                     || !entry.getValue().isStillFresh(now));
427         }
428     }
429 }