View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.util;
16  
17  import static java.nio.charset.StandardCharsets.UTF_8;
18  import static org.htmlunit.util.EncodingSniffer.extractEncodingFromContentType;
19  import static org.htmlunit.util.EncodingSniffer.sniffEncodingFromCssDeclaration;
20  import static org.htmlunit.util.EncodingSniffer.sniffEncodingFromMetaTag;
21  import static org.htmlunit.util.EncodingSniffer.sniffEncodingFromXmlDeclaration;
22  import static org.junit.Assert.assertSame;
23  
24  import java.io.ByteArrayInputStream;
25  import java.nio.charset.Charset;
26  
27  import org.htmlunit.HttpHeader;
28  import org.junit.Test;
29  
30  /**
31   * Unit tests for {@link EncodingSniffer}.
32   *
33   * @author Daniel Gredler
34   * @author Ahmed Ashour
35   * @author Ronald Brill
36   * @author Lai Quang Duong
37   */
38  public class EncodingSnifferTest {
39  
40      /**
41       * @throws Exception if an error occurs
42       */
43      @Test
44      public void fromHttpHeaders() throws Exception {
45          header(null, null, null);
46          header(null, "foo", "bar");
47          header(null, HttpHeader.CONTENT_TYPE, "blah");
48  
49          header(null, HttpHeader.CONTENT_TYPE, "text/html;charset=blah");
50          header(UTF_8, HttpHeader.CONTENT_TYPE, "text/html;charset=utf-8");
51          header(UTF_8, HttpHeader.CONTENT_TYPE, "text/html;charset=utf-8;");
52  
53          header(UTF_8, HttpHeader.CONTENT_TYPE, "text/xml;charset=UTF-8");
54      }
55  
56      /**
57       * @throws Exception if an error occurs
58       */
59      @Test
60      public void fromHttpHeadersNoContentType() throws Exception {
61          header(null, HttpHeader.CONTENT_TYPE, "charset=blah");
62          header(UTF_8, HttpHeader.CONTENT_TYPE, "charset=utf-8");
63          header(UTF_8, HttpHeader.CONTENT_TYPE, ";charset=utf-8;");
64      }
65  
66      private static void header(final Charset expectedEncoding, final String headerName, final String headerValue) {
67          assertSame(expectedEncoding, extractEncodingFromContentType(headerValue));
68      }
69  
70      /**
71       * @throws Exception if an error occurs
72       */
73      @Test
74      public void fromMetaTag() throws Exception {
75          meta(null, "");
76          meta(null, "foo");
77          meta(null, "<!--");
78          meta(null, " <!-- blah");
79          meta(null, " <!-- blah --> ");
80          meta(null, "<");
81          meta(null, "</");
82          meta(null, "<meta/>");
83          meta(null, "<meta />");
84          meta(null, "<meta blah />");
85          meta(null, "<meta");
86          meta(null, "<meta ");
87          meta(null, "<meta blah");
88          meta(null, "<meta blah  ");
89          meta(null, "<meta a='b'");
90          meta(null, "<meta a='b' c=d e=\"f\"/>");
91          meta(null, "<meta a='b' c=d e=\"f\" content='text/html; charset=blah' />");
92          meta(UTF_8, "<meta a='b' c=d e=\"f\" content='text/html; charset=utf-8' />");
93          meta(UTF_8, "abc <meta http-equiv='Content-Type' content='text/html; charset=utf-8'/>");
94          meta(UTF_8, "abc <meta http-equiv='Content-Type' content='text/html; CHARSET=UTF-8'/>");
95          meta(UTF_8, "abc <meta http-equiv='Content-Type' content='text/html; chArsEt=UtF-8'/>");
96          meta(UTF_8, "<meta a='b' c=d e=\"f\" CONTENT='text/html; CHARSET=utf-8' />");
97      }
98  
99      private static void meta(final Charset expectedEncoding, final String content) throws Exception {
100         assertSame(expectedEncoding, sniffEncodingFromMetaTag(new ByteArrayInputStream(content.getBytes())));
101     }
102 
103     /**
104      * @throws Exception if an error occurs
105      */
106     @Test
107     public void fromXmlDeclaration() throws Exception {
108         xmlDeclaration(null, "");
109         xmlDeclaration(null, "foo");
110         xmlDeclaration(null, "<?");
111         xmlDeclaration(null, "<?xml");
112         xmlDeclaration(null, "<?xml ");
113         xmlDeclaration(null, "<?xml encoding");
114         xmlDeclaration(null, "<?xml encoding=");
115         xmlDeclaration(null, "<?xml encoding='utf-8");
116         xmlDeclaration(null, "<?xml encoding='utf-8'");
117         xmlDeclaration(null, "<?xml encoding='blah'?>");
118         xmlDeclaration(UTF_8, "<?xml encoding='utf-8'?>");
119         xmlDeclaration(null, "<?xml encoding=\"utf-8");
120         xmlDeclaration(null, "<?xml encoding=\"utf-8\"");
121         xmlDeclaration(UTF_8, "<?xml encoding=\"utf-8\"?>");
122     }
123 
124     private static void xmlDeclaration(final Charset expectedEncoding, final String content) throws Exception {
125         assertSame(expectedEncoding, sniffEncodingFromXmlDeclaration(new ByteArrayInputStream(content.getBytes())));
126     }
127 
128     /**
129      * @throws Exception if an error occurs
130      */
131     @Test
132     public void fromCssDeclaration() throws Exception {
133         cssDeclaration(null, "");
134         cssDeclaration(null, "foo");
135         cssDeclaration(null, "@charset");
136         cssDeclaration(null, "@charset \"utf-8");
137         cssDeclaration(null, "@charset \"utf-8\"");
138         cssDeclaration(null, "@charset\"utf-8\";");
139         cssDeclaration(null, "@charset 'utf-8';");
140         cssDeclaration(UTF_8, "@charset \"utf-8\";");
141         cssDeclaration(null, " @charset \"utf-8\";");
142         cssDeclaration(null, "@charset \"blah\";");
143     }
144 
145     private static void cssDeclaration(final Charset expectedEncoding, final String content) throws Exception {
146         assertSame(expectedEncoding, sniffEncodingFromCssDeclaration(new ByteArrayInputStream(content.getBytes())));
147     }
148 
149     /**
150      * @throws Exception if an error occurs
151      */
152     @Test
153     public void fromContentType() throws Exception {
154         contentType(null, null);
155         contentType(null, "");
156         contentType(null, " \t \n ");
157         contentType(null, "foo");
158         contentType(null, MimeType.TEXT_HTML);
159         contentType(null, "\n text/html \t");
160         contentType(null, "\n text/html ; char \t");
161         contentType(null, "\n text/html ; charset \t");
162         contentType(null, "\n text/html ; charset=");
163         contentType(null, "\n text/html ; charset= \t");
164         contentType(null, "\n text/html ; charset =");
165         contentType(null, "\n text/html ; charset = \n");
166         contentType(null, "\n text/html ; charset=blah");
167         contentType(UTF_8, "\n text/html ; charset=utf-8");
168         contentType(UTF_8, "\n text/html ; charset=utf-8;");
169         contentType(UTF_8, "\n text/html ; charset = \n utf-8 ");
170         contentType(UTF_8, "\n text/html ; charset = \n utf-8 ; ");
171         contentType(null, "\n text/html ; charset = \n'");
172         contentType(null, "\n text/html ; charset = \n' ");
173         contentType(null, "\n text/html ; charset = \n' utf-8");
174         contentType(UTF_8, "\n text/html ; charset = \n'utf-8'");
175         contentType(null, "\n text/html ; charset = \n\"");
176         contentType(null, "\n text/html ; charset = \n\" ");
177         contentType(null, "\n text/html ; charset = \n\" utf-8");
178         contentType(UTF_8, "\n text/html ; charset = \n\"utf-8\"");
179     }
180 
181     private static void contentType(final Charset expectedEncoding, final String contentType) {
182         assertSame(expectedEncoding, extractEncodingFromContentType(contentType));
183     }
184 
185 }