View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.protocol.data;
16  
17  import static java.nio.charset.StandardCharsets.US_ASCII;
18  import static org.htmlunit.protocol.data.DataURLConnection.DATA_PREFIX;
19  
20  import java.io.UnsupportedEncodingException;
21  import java.net.URL;
22  import java.nio.charset.Charset;
23  import java.nio.charset.IllegalCharsetNameException;
24  import java.nio.charset.UnsupportedCharsetException;
25  import java.util.Base64;
26  
27  import org.apache.commons.lang3.StringUtils;
28  import org.htmlunit.util.MimeType;
29  import org.htmlunit.util.UrlUtils;
30  
31  /**
32   * Helper to work with data URLs.
33   * @see <a href="http://www.ietf.org/rfc/rfc2397.txt">RFC2397</a>
34   * @author Marc Guillemot
35   * @author Ronald Brill
36   * @author Carsten Steul
37   */
38  public class DataUrlDecoder {
39      private static final Charset DEFAULT_CHARSET = US_ASCII;
40      private static final String DEFAULT_MEDIA_TYPE = MimeType.TEXT_PLAIN;
41      private final String mediaType_;
42      private final Charset charset_;
43      private final byte[] content_;
44  
45      /**
46       * C'tor.
47       * @param data the data
48       * @param mediaType the media type
49       * @param charset the charset
50       */
51      protected DataUrlDecoder(final byte[] data, final String mediaType, final Charset charset) {
52          content_ = data;
53          mediaType_ = mediaType;
54          charset_ = charset;
55      }
56  
57      /**
58       * Decodes a data URL providing simple access to the information contained by the URL.
59       * @param url the URL to decode
60       * @return the {@link DataUrlDecoder} holding decoded information
61       * @throws UnsupportedEncodingException if the encoding specified by the data URL is invalid or not
62       */
63      public static DataUrlDecoder decode(final URL url) throws UnsupportedEncodingException {
64          return decodeDataURL(url.toExternalForm());
65      }
66  
67      /**
68       * Decodes a data URL providing simple access to the information contained by the URL.
69       * @param url the string representation of the URL to decode
70       * @return the {@link DataUrlDecoder} holding decoded information
71       * @throws UnsupportedEncodingException if the encoding specified by the data URL is invalid or not
72       *         available on the JVM
73       */
74      public static DataUrlDecoder decodeDataURL(final String url) throws UnsupportedEncodingException {
75          if (!url.startsWith(DATA_PREFIX)) {
76              throw new UnsupportedEncodingException("Invalid data url: '" + url + "' (wrong prefix)");
77          }
78          final int comma = url.indexOf(',');
79          if (comma < 0) {
80              throw new UnsupportedEncodingException("Invalid data url: '" + url + "' (no data)");
81          }
82  
83          String beforeData = url.substring(DATA_PREFIX.length(), comma);
84          final boolean base64 = beforeData.endsWith(";base64");
85          if (base64) {
86              beforeData = beforeData.substring(0, beforeData.length() - 7);
87          }
88          final String mediaType = extractMediaType(beforeData);
89          final Charset charset = extractCharset(beforeData);
90  
91          try {
92              byte[] data = url.substring(comma + 1).getBytes(charset);
93              data = UrlUtils.decodeDataUrl(data, base64);
94              if (base64) {
95                  data = Base64.getDecoder().decode(data);
96              }
97              return new DataUrlDecoder(data, mediaType, charset);
98          }
99          catch (final IllegalArgumentException e) {
100             final UnsupportedEncodingException ex =
101                     new UnsupportedEncodingException("Invalid data url: '" + url + "' (data decoding failed)");
102             ex.initCause(e);
103             throw ex;
104         }
105     }
106 
107     private static Charset extractCharset(final String beforeData) {
108         if (beforeData.contains(";")) {
109             String charsetName = StringUtils.substringAfter(beforeData, ";");
110             charsetName = charsetName.trim();
111             if (charsetName.startsWith("charset=")) {
112                 charsetName = charsetName.substring(8);
113             }
114             try {
115                 return Charset.forName(charsetName);
116             }
117             catch (final UnsupportedCharsetException | IllegalCharsetNameException e) {
118                 return DEFAULT_CHARSET;
119             }
120         }
121         return DEFAULT_CHARSET;
122     }
123 
124     private static String extractMediaType(final String beforeData) {
125         if (beforeData.contains("/")) {
126             if (beforeData.contains(";")) {
127                 return StringUtils.substringBefore(beforeData, ";");
128             }
129             return beforeData;
130         }
131         return DEFAULT_MEDIA_TYPE;
132     }
133 
134     /**
135      * Gets the media type information contained in the data URL.
136      * @return "text/plain" if the URL didn't contain any media type information
137      */
138     public String getMediaType() {
139         return mediaType_;
140     }
141 
142     /**
143      * Gets the charset information specified in the data URL.
144      * @return "US-ASCII" if the URL didn't contain any charset information
145      */
146     public String getCharset() {
147         return charset_.name();
148     }
149 
150     /**
151      * Gets the bytes contained in the data URL.
152      * @return the content
153      */
154     public byte[] getBytes() {
155         return content_;
156     }
157 
158     /**
159      * Gets the text content of the data URL. This makes sense only for data URL that
160      * represents some text.
161      * @return the text content
162      * @throws UnsupportedEncodingException if decoding failed using the specified charset
163      */
164     public String getDataAsString() throws UnsupportedEncodingException {
165         return new String(content_, charset_);
166     }
167 }