View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.serializer;
16  
17  import static org.junit.Assert.assertEquals;
18  import static org.junit.Assert.assertTrue;
19  
20  import java.io.IOException;
21  import java.util.Arrays;
22  
23  import org.apache.commons.lang3.StringUtils;
24  import org.htmlunit.WebClient;
25  import org.htmlunit.html.HtmlPage;
26  import org.htmlunit.html.serializer.HtmlSerializerNormalizedText.HtmlSerializerTextBuilder;
27  import org.htmlunit.html.serializer.HtmlSerializerNormalizedText.HtmlSerializerTextBuilder.Mode;
28  import org.junit.Test;
29  
30  /**
31   * Tests for {@link HtmlSerializerNormalizedText}.
32   *
33   * @author Ronald Brill
34   */
35  public class HtmlSerializerNormalizedTextTest {
36  
37      /**
38       * Test {@link HtmlSerializerTextBuilder}.
39       */
40      @Test
41      public void normalize() {
42          HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
43          serializer.append("", Mode.NORMALIZE);
44          assertEquals("", serializer.getText());
45  
46          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
47          serializer.append(" \t\r\n ", Mode.NORMALIZE);
48          assertEquals("", serializer.getText());
49  
50          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
51          serializer.appendBlockSeparator();
52          assertEquals("", serializer.getText());
53  
54          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
55          serializer.appendBlockSeparator();
56          serializer.append(" ", Mode.NORMALIZE);
57          assertEquals("", serializer.getText());
58  
59          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
60          serializer.append(" ", Mode.NORMALIZE);
61          serializer.appendBlockSeparator();
62          assertEquals("", serializer.getText());
63  
64          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
65          serializer.append(" ", Mode.NORMALIZE);
66          serializer.appendBlockSeparator();
67          serializer.append(" ", Mode.NORMALIZE);
68          assertEquals("", serializer.getText());
69  
70          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
71          serializer.append(" a  ", Mode.NORMALIZE);
72          serializer.appendBlockSeparator();
73          assertEquals("a", serializer.getText());
74  
75          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
76          serializer.append(" a  ", Mode.NORMALIZE);
77          serializer.appendBlockSeparator();
78          serializer.append("  x ", Mode.NORMALIZE);
79          assertEquals("a\nx", serializer.getText());
80  
81          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
82          serializer.append("a", Mode.NORMALIZE);
83          serializer.appendBlockSeparator();
84          serializer.append("x", Mode.NORMALIZE);
85          assertEquals("a\nx", serializer.getText());
86  
87          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
88          serializer.append("a", Mode.NORMALIZE);
89          serializer.appendBlockSeparator();
90          serializer.appendBlockSeparator();
91          serializer.append("x", Mode.NORMALIZE);
92          assertEquals("a\nx", serializer.getText());
93  
94          serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
95          serializer.append("a", Mode.NORMALIZE);
96          serializer.appendBlockSeparator();
97          serializer.append("  ", Mode.NORMALIZE);
98          serializer.appendBlockSeparator();
99          serializer.append("x", Mode.NORMALIZE);
100         assertEquals("a\nx", serializer.getText());
101 
102         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
103         serializer.appendNewLine();
104         assertEquals("\n", serializer.getText());
105 
106         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
107         serializer.appendNewLine();
108         serializer.append(" ", Mode.NORMALIZE);
109         assertEquals("\n", serializer.getText());
110 
111         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
112         serializer.append(" ", Mode.NORMALIZE);
113         serializer.appendNewLine();
114         assertEquals("\n", serializer.getText());
115 
116         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
117         serializer.append(" ", Mode.NORMALIZE);
118         serializer.appendNewLine();
119         serializer.append(" ", Mode.NORMALIZE);
120         assertEquals("\n", serializer.getText());
121 
122         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
123         serializer.appendNewLine();
124         serializer.appendBlockSeparator();
125         serializer.append("x", Mode.NORMALIZE);
126         assertEquals("x", serializer.getText());
127 
128         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
129         serializer.append("a", Mode.NORMALIZE);
130         serializer.appendNewLine();
131         serializer.appendBlockSeparator();
132         serializer.append("x", Mode.NORMALIZE);
133         assertEquals("a\nx", serializer.getText());
134 
135         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
136         serializer.append("a", Mode.NORMALIZE);
137         serializer.appendBlockSeparator();
138         serializer.appendBlockSeparator();
139         serializer.appendBlockSeparator();
140         serializer.append("x", Mode.NORMALIZE);
141         assertEquals("a\nx", serializer.getText());
142 
143         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
144         serializer.append("a", Mode.NORMALIZE);
145         serializer.appendTab();
146         serializer.append(" ", Mode.NORMALIZE);
147         serializer.appendTab();
148         serializer.append("x", Mode.NORMALIZE);
149         assertEquals("a\t \tx", serializer.getText());
150 
151         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
152         serializer.appendBlockSeparator();
153         serializer.append("\n", Mode.NORMALIZE);
154         serializer.appendBlockSeparator();
155         serializer.append("x", Mode.NORMALIZE);
156         serializer.appendBlockSeparator();
157         serializer.append("y", Mode.NORMALIZE);
158         serializer.appendNewLine();
159         serializer.appendBlockSeparator();
160         serializer.appendBlockSeparator();
161         assertEquals("x\ny", serializer.getText());
162 
163         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
164         serializer.append("abc", Mode.NORMALIZE);
165         assertEquals("abc", serializer.getText());
166 
167         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
168         serializer.append("a     b \t\t\t c \r \r o \n\n\n", Mode.NORMALIZE);
169         assertEquals("a b c o", serializer.getText());
170     }
171 
172     /**
173      * Test {@link HtmlSerializerTextBuilder}.
174      */
175     @Test
176     public void normalizeNbsp() {
177         HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
178         serializer.append("abc" + (char) 160 + "x", Mode.NORMALIZE);
179         assertEquals("abc x", serializer.getText());
180 
181         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
182         serializer.append((char) 160 + "x" + (char) 160, Mode.NORMALIZE);
183         assertEquals(" x ", serializer.getText());
184 
185         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
186         serializer.appendBlockSeparator();
187         serializer.append((char) 160 + "x" + (char) 160, Mode.NORMALIZE);
188         serializer.appendBlockSeparator();
189         assertEquals(" x ", serializer.getText());
190     }
191 
192     /**
193      * Test {@link HtmlSerializerTextBuilder}.
194      */
195     @Test
196     public void normalize2() {
197         HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
198         serializer.append("a", Mode.NORMALIZE);
199         serializer.appendBlockSeparator();
200         serializer.appendBlockSeparator();
201         assertEquals("a", serializer.getText());
202 
203         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
204         serializer.append("a", Mode.NORMALIZE);
205         serializer.appendBlockSeparator();
206         serializer.append("  ", Mode.NORMALIZE);
207         serializer.appendBlockSeparator();
208         assertEquals("a", serializer.getText());
209     }
210 
211     /**
212      * Test {@link HtmlSerializerTextBuilder}.
213      */
214     @Test
215     public void pre() {
216         final HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
217         serializer.append("  hello \t abc ", Mode.PRESERVE_BLANK_TAB_NEWLINE);
218         assertEquals("  hello \t abc ", serializer.getText());
219     }
220 
221     /**
222      * Test {@link HtmlSerializerTextBuilder}.
223      */
224     @Test
225     public void textArea() {
226         final HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
227         serializer.append("  hello \t abc ", Mode.PRESERVE_BLANK_NEWLINE);
228         assertEquals("  hello   abc", serializer.getText());
229     }
230 
231     /**
232      * Test {@link HtmlSerializerTextBuilder}.
233      */
234     @Test
235     public void performanceWhitespace() {
236         final int length = 100_000;
237         final char[] charArray = new char[length];
238         Arrays.fill(charArray, ' ');
239         charArray[0] = 'a';
240         charArray[length - 1] = 'a';
241         final String text = new String(charArray);
242 
243         final long time = System.currentTimeMillis();
244         final HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
245         serializer.append(text, Mode.NORMALIZE);
246         serializer.getText();
247 
248         final long runTime = System.currentTimeMillis() - time;
249         assertTrue("cleanUp() took too much time", runTime < 200);
250     }
251 
252     /**
253      * Test {@link HtmlSerializerTextBuilder}.
254      */
255     @Test
256     public void performanceManyReplaces() {
257         final String expected = StringUtils.repeat("x\n", 100_000).trim();
258 
259         final long time = System.currentTimeMillis();
260 
261         final HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
262 
263         for (int i = 0; i < 100_000; i++) {
264             serializer.append(" x ", Mode.NORMALIZE);
265             serializer.appendBlockSeparator();
266         }
267 
268         assertEquals(expected, serializer.getText());
269 
270         final long runTime = System.currentTimeMillis() - time;
271         assertTrue("cleanUp() took too much time", runTime < 200);
272     }
273 
274     /**
275      * Test {@link HtmlSerializerTextBuilder} special spaces.
276      */
277     @Test
278     public void specialSpaces() {
279         HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
280         serializer.append("\u3000", Mode.NORMALIZE);
281         assertEquals("\u3000", serializer.getText());
282 
283         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
284         serializer.append("\uFEFF", Mode.NORMALIZE);
285         assertEquals("\uFEFF", serializer.getText());
286 
287         serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
288         serializer.append("\u200B", Mode.NORMALIZE);
289         assertEquals("\u200B", serializer.getText());
290     }
291 
292     /**
293      * Test {@link HtmlSerializerTextBuilder} special spaces.
294      */
295     @Test
296     public void variousNewLines() {
297         final HtmlSerializerTextBuilder serializer = new HtmlSerializerNormalizedText.HtmlSerializerTextBuilder();
298         serializer.appendNewLine();
299         serializer.append("\n", Mode.NORMALIZE);
300         serializer.appendBlockSeparator();
301         assertEquals("", serializer.getText());
302     }
303 
304     /**
305      * @throws IOException in case of errors
306      */
307     @Test
308     public void cssEnableDisable1() throws IOException {
309         final String html =
310                 "<div>\r\n"
311                   + "<p>p\r\n"
312                     + "<br>br\r\n"
313                   + "</p>\r\n"
314                 + "</div>";
315         final String expected = "p \nbr";
316 
317         try (WebClient webClient = new WebClient()) {
318             final HtmlPage page = webClient.loadHtmlCodeIntoCurrentWindow(html);
319 
320             assertEquals(expected, page.asNormalizedText());
321 
322             webClient.getOptions().setCssEnabled(false);
323             assertEquals(expected, page.asNormalizedText());
324         }
325     }
326 
327     /**
328      * @throws IOException in case of errors
329      */
330     @Test
331     public void cssEnableDisable2() throws IOException {
332         final String html =
333                 "<div>\r\n"
334                   + "<p>p\r\n"
335                     + "<br>br\r\n"
336                   + "</p>\r\n"
337                   + "<p>p</p>\r\n"
338                 + "</div>";
339         final String expected = "p \nbr\np";
340 
341         try (WebClient webClient = new WebClient()) {
342             final HtmlPage page = webClient.loadHtmlCodeIntoCurrentWindow(html);
343 
344             assertEquals(expected, page.asNormalizedText());
345 
346             webClient.getOptions().setCssEnabled(false);
347             assertEquals(expected, page.asNormalizedText());
348         }
349     }
350 
351     /**
352      * @throws IOException in case of errors
353      */
354     @Test
355     public void cssEnableDisable3() throws IOException {
356         final String html =
357                 "<div>\r\n"
358                   + "<p>p\r\n"
359                     + "<br>br\r\n"
360                   + "</p>\r\n"
361                   + "<p>p</p>\r\n"
362                   + "<p>p\r\n"
363                     + "<br>br\r\n"
364                     + "<br>br\r\n"
365                     + "<br>br\r\n"
366                   + "</p>\r\n"
367                   + "<p>p</p>\r\n"
368                 + "</div>";
369 
370         final String expected =
371                 "p \n"
372                 + "br\n"
373                 + "p\n"
374                 + "p \n"
375                 + "br \n"
376                 + "br \n"
377                 + "br\n"
378                 + "p";
379 
380         try (WebClient webClient = new WebClient()) {
381             final HtmlPage page = webClient.loadHtmlCodeIntoCurrentWindow(html);
382 
383             assertEquals(expected, page.asNormalizedText());
384 
385             webClient.getOptions().setCssEnabled(false);
386             assertEquals(expected, page.asNormalizedText());
387         }
388     }
389 }