/*
 * Copyright (c) 2002-2025 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15  package org.htmlunit.html;
16  
17  import static java.nio.charset.StandardCharsets.UTF_8;
18  
19  import java.util.ArrayList;
20  import java.util.List;
21  
22  import org.htmlunit.MockWebConnection;
23  import org.htmlunit.SimpleWebTestCase;
24  import org.htmlunit.WebClient;
25  import org.htmlunit.util.MimeType;
26  import org.htmlunit.util.StringUtils;
27  import org.junit.jupiter.api.Test;
28  
29  /**
30   * Tests for {@link DomText}.
31   *
32   * @author Marc Guillemot
33   * @author Ahmed Ashour
34   * @author Rodney Gitzel
35   * @author Sudhan Moghe
36   * @author Philip Graf
37   * @author Ronald Brill
38   */
39  public class DomTextTest extends SimpleWebTestCase {
40  
41      /**
42       * Test the clean up of   in strings.
43       * @throws Exception if the test fails
44       */
45      @Test
46      public void asText_nbsp() throws Exception {
47          testPlainText("a b c  d  e",  "a b c d  e");
48          testPlainText("a b c  d   e", "a b c d   e");
49          testPlainText(" a ", " a ");
50          testPlainText("  a ", "  a ");
51          testPlainText(" a  ", " a  ");
52      }
53  
54      /**
55       * Test font formats, as per bug #490.
56       * See http://sourceforge.net/p/htmlunit/bugs/490/.
57       *
58       * @throws Exception if the test fails
59       */
60      @Test
61      public void asText_fontFormat() throws Exception {
62          testAsText("a <b>b</b> c",  "a b c");
63          testAsText("a <b>b</b>c",   "a bc");
64          testAsText("a<b>b</b> c",   "ab c");
65          testAsText("a<b>b</b>c",    "abc");
66  
67          // italics and teletype should work the same way
68          testAsText("a <i>b</i> c",  "a b c");
69          testAsText("a <i>b</i>c",   "a bc");
70          testAsText("a<i>b</i> c",   "ab c");
71          testAsText("a<i>b</i>c",    "abc");
72  
73          testAsText("a <tt>b</tt> c",  "a b c");
74          testAsText("a <tt>b</tt>c",   "a bc");
75          testAsText("a<tt>b</tt> c",   "ab c");
76          testAsText("a<tt>b</tt>c",    "abc");
77  
78          testAsText("a <font>b</font> c",  "a b c");
79          testAsText("a<font>b</font> c",   "ab c");
80          testAsText("a <font>b</font>c",   "a bc");
81          testAsText("a<font>b</font>c",    "abc");
82  
83          testAsText("a <span>b</span> c",  "a b c");
84          testAsText("a<span>b</span> c",   "ab c");
85          testAsText("a <span>b</span>c",   "a bc");
86          testAsText("a<span>b</span>c",    "abc");
87  
88          testAsText("a<b><font><i>b</i></font></b>c",  "abc");
89          testAsText("a<b><font> <i>b</i></font></b>c", "a bc");
90      }
91  
92      /**
93       * This test once tested regression for bug #490 but the expectations have been changed
94       * as asNormalizedText() should now use new lines when appropriate.
95       * @throws Exception if the test fails
96       */
97      @Test
98      public void asNormalizedTextRegression() throws Exception {
99          String expected = "a\nb\nc";
100         testAsText("a<ul><li>b</ul>c", expected);
101         testAsText("a<p>b<br>c", expected);
102         testAsText("a<table><tr><td>b</td></tr></table>c", expected);
103         testAsText("a<div>b</div>c", expected);
104 
105         expected = "a\nb\nb\nc";
106         testAsText("a<table><tr><td> b </td></tr>\n<tr><td> b </td></tr></table>c", expected);
107     }
108 
109     /**
110      * Checks the HtmlTable* objects themselves.
111      * @throws Exception if the test fails
112      */
113     @Test
114     public void asText_table_elements() throws Exception {
115         final String html = "<table id='table'><tr id='row'><td id='cell'> b </td></tr>\n</table>\n";
116         final String content = DOCTYPE_HTML + "<html><body><span id='foo'>" + html + "</span></body></html>";
117 
118         final HtmlPage page = loadPage(content);
119 
120         assertEquals("b", page.getHtmlElementById("cell").asNormalizedText());
121         assertEquals("b", page.getHtmlElementById("row").asNormalizedText());
122         assertEquals("b", page.getHtmlElementById("table").asNormalizedText());
123     }
124 
125     private void testPlainText(final String html, final String expectedText) throws Exception {
126         final String content = DOCTYPE_HTML + "<html><body><span id='foo'>" + html + "</span></body></html>";
127 
128         final HtmlPage page = loadPage(content);
129         assertEquals(expectedText, page.asNormalizedText());
130 
131         final HtmlElement elt = page.getHtmlElementById("foo");
132         assertEquals(expectedText, elt.asNormalizedText());
133 
134         final DomNode node = elt.getFirstChild();
135         assertEquals(expectedText, node.asNormalizedText());
136     }
137 
138     private void testAsText(final String html, final String expectedText) throws Exception {
139         final String content = DOCTYPE_HTML + "<html><body><span id='foo'>" + html + "</span></body></html>";
140 
141         final HtmlPage page = loadPage(content);
142         final HtmlElement elt = page.getHtmlElementById("foo");
143         assertEquals(expectedText, elt.asNormalizedText());
144     }
145 
146     /**
147      * @throws Exception if the test fails
148      */
149     @Test
150     public void asXml() throws Exception {
151         final String unicodeString = "\u064A\u0627 \u0644\u064A\u064A\u0644";
152         final String html = DOCTYPE_HTML
153             + "<html>\n"
154             + "<head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'></head>\n"
155             + "<body><span id='foo'>" + unicodeString + "</span></body></html>";
156 
157         final int[] expectedValues = {1610, 1575, 32, 1604, 1610, 1610, 1604};
158 
159         final WebClient client = getWebClient();
160         final MockWebConnection webConnection = new MockWebConnection();
161 
162         webConnection.setDefaultResponse(StringUtils.toByteArray(html, UTF_8), 200, "OK", MimeType.TEXT_HTML);
163         client.setWebConnection(webConnection);
164 
165         final HtmlPage page = client.getPage(URL_FIRST);
166         final String xml = page.getHtmlElementById("foo").getFirstChild().asXml();
167         assertEquals(expectedValues.length, xml.length());
168         int index = 0;
169         for (final int expectedValue : expectedValues) {
170             assertEquals(expectedValue, xml.codePointAt(index++));
171         }
172     }
173 
174     /**
175      * @throws Exception if the test fails
176      */
177     @Test
178     public void splitText() throws Exception {
179         final String html = DOCTYPE_HTML
180             + "<html><head></head><body>\n"
181             + "<br><div id='tag'></div><br></body></html>";
182         final HtmlPage page = loadPage(html);
183 
184         final DomNode divNode = page.getElementById("tag");
185 
186         final DomText node = new DomText(page, "test split");
187         divNode.insertBefore(node);
188 
189         final DomNode previousSibling = node.getPreviousSibling();
190         final DomNode nextSibling = node.getNextSibling();
191         final DomNode parent = node.getParentNode();
192 
193         // position among parent's children
194         final int position = readPositionAmongParentChildren(node);
195 
196         final DomText newNode = node.splitText(5);
197 
198         assertSame("new node previous sibling", node, newNode.getPreviousSibling());
199         assertSame("previous sibling", previousSibling, node.getPreviousSibling());
200         assertSame("new node next sibling", nextSibling, newNode.getNextSibling());
201         assertSame("next sibling", newNode, node.getNextSibling());
202         assertSame("parent", parent, newNode.getParentNode());
203         assertSame(node, previousSibling.getNextSibling());
204         assertSame(newNode, nextSibling.getPreviousSibling());
205         assertEquals(position + 1, readPositionAmongParentChildren(newNode));
206     }
207 
208     /**
209      * @throws Exception if the test fails
210      */
211     @Test
212     public void splitLastDomText() throws Exception {
213         final String content = DOCTYPE_HTML
214             + "<html><head></head><body>\n"
215             + "<br><div id='tag'></div><br></body></html>";
216         final HtmlPage page = loadPage(content);
217 
218         final DomNode divNode = page.getElementById("tag");
219 
220         final DomText firstNode = new DomText(page, "test split");
221         divNode.appendChild(firstNode);
222 
223         assertNull(firstNode.getPreviousSibling());
224 
225         final DomText secondNode = firstNode.splitText(5);
226 
227         final DomText thirdNode = new DomText(page, "test split");
228         divNode.appendChild(thirdNode);
229 
230         assertSame(secondNode, firstNode.getNextSibling());
231         assertNull(firstNode.getPreviousSibling());
232         assertSame(firstNode, secondNode.getPreviousSibling());
233         assertSame(thirdNode, secondNode.getNextSibling());
234         assertSame(secondNode, thirdNode.getPreviousSibling());
235         assertNull(thirdNode.getNextSibling());
236         assertSame(divNode, secondNode.getParentNode());
237         assertSame(divNode, thirdNode.getParentNode());
238         assertEquals(0, readPositionAmongParentChildren(firstNode));
239         assertEquals(1, readPositionAmongParentChildren(secondNode));
240         assertEquals(2, readPositionAmongParentChildren(thirdNode));
241     }
242 
243     /**
244      * Reads the position of the node among the children of its parent
245      * @param node the node to look at
246      * @return the position
247      */
248     private static int readPositionAmongParentChildren(final DomNode node) {
249         int i = 0;
250         for (final DomNode child : node.getParentNode().getChildren()) {
251             if (child == node) {
252                 return i;
253             }
254             i++;
255         }
256         return -1;
257     }
258 
259     /**
260      * @throws Exception if the test fails
261      */
262     @Test
263     public void splitText2() throws Exception {
264         final String html = DOCTYPE_HTML
265             + "<html><head><title>foo</title><script>\n"
266             + "  function test() {\n"
267             + "    var div = document.getElementById('myDiv');\n"
268             + "    div.appendChild(document.createElement('a'));\n"
269             + "    var text = document.createTextNode('123456');\n"
270             + "    div.appendChild(text);\n"
271             + "    div.appendChild(document.createElement('hr'));\n"
272             + "    alert(div.childNodes.length);\n"
273             + "    text.splitText(3);\n"
274             + "    alert(div.childNodes.length);\n"
275             + "    alert(div.childNodes.item(2).nodeValue);\n"
276             + "  }\n"
277             + "</script></head><body onload='test()'>\n"
278             + "  <div id='myDiv'></div>\n"
279             + "</body></html>";
280         final String[] expectedAlerts = {"3", "4", "456"};
281         final List<String> collectedAlerts = new ArrayList<>();
282         loadPage(html, collectedAlerts);
283         assertEquals(expectedAlerts, collectedAlerts);
284     }
285 
286     /**
287      * @throws Exception if an error occurs
288      */
289     @Test
290     public void setTextContent() throws Exception {
291         final String html = DOCTYPE_HTML + "<html><body><span id='s'>abc</span></body></html>";
292         final HtmlPage page = loadPage(html);
293         final DomText text = (DomText) page.getElementById("s").getFirstChild();
294         assertEquals("abc", text.getTextContent());
295         text.setTextContent("xyz");
296         assertEquals("xyz", text.getTextContent());
297         assertEquals("xyz", page.asNormalizedText());
298     }
299 
300     /**
301      * Test case for #1366.
302      * @throws Exception if an error occurs
303      */
304     @Test
305     public void getTextContentWhitespace() throws Exception {
306         final String html = DOCTYPE_HTML + "<html><body><div id='s'><b>Hello</b> <b>World</b>!</div></body></html>";
307         final HtmlPage page = loadPage(html);
308         final HtmlElement text = page.getHtmlElementById("s");
309         assertEquals("Hello World!", text.getTextContent());
310     }
311 
312     /**
313      * Tests if {@code getCanonicalXPath()} returns the correct XPath for a text
314      * node without other text node siblings.
315      * @throws Exception if an error occurs
316      */
317     @Test
318     public void getCanonicalXPath_withoutTextSiblings() throws Exception {
319         final String html = DOCTYPE_HTML + "<html><body><span id='s'>abc</span></body></html>";
320         final HtmlPage page = loadPage(html);
321         final DomText text = (DomText) page.getElementById("s").getFirstChild();
322         assertEquals("/html/body/span/text()", text.getCanonicalXPath());
323         assertEquals(text, page.getFirstByXPath(text.getCanonicalXPath()));
324     }
325 
326     /**
327      * Tests if {@code getCanonicalXPath()} returns the correct XPath for a text
328      * node with other text node siblings.
329      * @throws Exception if an error occurs
330      */
331     @Test
332     public void getCanonicalXPath_withTextSiblings() throws Exception {
333         final String html = DOCTYPE_HTML + "<html><body><span id='s'>abc<br/>def</span></body></html>";
334         final HtmlPage page = loadPage(html);
335 
336         final DomText text1 = (DomText) page.getElementById("s").getFirstChild();
337         assertEquals("abc", text1.getData());
338         assertEquals("/html/body/span/text()[1]", text1.getCanonicalXPath());
339         assertEquals(text1, page.getFirstByXPath(text1.getCanonicalXPath()));
340 
341         final DomText text2 = (DomText) page.getElementById("s").getChildNodes().get(2);
342         assertEquals("def", text2.getData());
343         assertEquals("/html/body/span/text()[2]", text2.getCanonicalXPath());
344         assertEquals(text2, page.getFirstByXPath(text2.getCanonicalXPath()));
345     }
346 
347 }