View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html;
16  
17  import static java.nio.charset.StandardCharsets.UTF_8;
18  
19  import java.util.ArrayList;
20  import java.util.List;
21  
22  import org.htmlunit.MockWebConnection;
23  import org.htmlunit.SimpleWebTestCase;
24  import org.htmlunit.WebClient;
25  import org.htmlunit.junit.BrowserRunner;
26  import org.htmlunit.util.MimeType;
27  import org.htmlunit.util.StringUtils;
28  import org.junit.Test;
29  import org.junit.runner.RunWith;
30  
31  /**
32   * Tests for {@link DomText}.
33   *
34   * @author Marc Guillemot
35   * @author Ahmed Ashour
36   * @author Rodney Gitzel
37   * @author Sudhan Moghe
38   * @author Philip Graf
39   */
40  @RunWith(BrowserRunner.class)
41  public class DomTextTest extends SimpleWebTestCase {
42  
43      /**
44       * Test the clean up of   in strings.
45       * @throws Exception if the test fails
46       */
47      @Test
48      public void asText_nbsp() throws Exception {
49          testPlainText("a b c  d  e",  "a b c d  e");
50          testPlainText("a b c  d   e", "a b c d   e");
51          testPlainText(" a ", " a ");
52          testPlainText("  a ", "  a ");
53          testPlainText(" a  ", " a  ");
54      }
55  
56      /**
57       * Test font formats, as per bug #490.
58       * See http://sourceforge.net/p/htmlunit/bugs/490/.
59       *
60       * @throws Exception if the test fails
61       */
62      @Test
63      public void asText_fontFormat() throws Exception {
64          testAsText("a <b>b</b> c",  "a b c");
65          testAsText("a <b>b</b>c",   "a bc");
66          testAsText("a<b>b</b> c",   "ab c");
67          testAsText("a<b>b</b>c",    "abc");
68  
69          // italics and teletype should work the same way
70          testAsText("a <i>b</i> c",  "a b c");
71          testAsText("a <i>b</i>c",   "a bc");
72          testAsText("a<i>b</i> c",   "ab c");
73          testAsText("a<i>b</i>c",    "abc");
74  
75          testAsText("a <tt>b</tt> c",  "a b c");
76          testAsText("a <tt>b</tt>c",   "a bc");
77          testAsText("a<tt>b</tt> c",   "ab c");
78          testAsText("a<tt>b</tt>c",    "abc");
79  
80          testAsText("a <font>b</font> c",  "a b c");
81          testAsText("a<font>b</font> c",   "ab c");
82          testAsText("a <font>b</font>c",   "a bc");
83          testAsText("a<font>b</font>c",    "abc");
84  
85          testAsText("a <span>b</span> c",  "a b c");
86          testAsText("a<span>b</span> c",   "ab c");
87          testAsText("a <span>b</span>c",   "a bc");
88          testAsText("a<span>b</span>c",    "abc");
89  
90          testAsText("a<b><font><i>b</i></font></b>c",  "abc");
91          testAsText("a<b><font> <i>b</i></font></b>c", "a bc");
92      }
93  
94      /**
95       * This test once tested regression for bug #490 but the expectations have been changed
96       * as asNormalizedText() should now use new lines when appropriate.
97       * @throws Exception if the test fails
98       */
99      @Test
100     public void asNormalizedTextRegression() throws Exception {
101         String expected = "a\nb\nc";
102         testAsText("a<ul><li>b</ul>c", expected);
103         testAsText("a<p>b<br>c", expected);
104         testAsText("a<table><tr><td>b</td></tr></table>c", expected);
105         testAsText("a<div>b</div>c", expected);
106 
107         expected = "a\nb\nb\nc";
108         testAsText("a<table><tr><td> b </td></tr>\n<tr><td> b </td></tr></table>c", expected);
109     }
110 
111     /**
112      * Checks the HtmlTable* objects themselves.
113      * @throws Exception if the test fails
114      */
115     @Test
116     public void asText_table_elements() throws Exception {
117         final String html = "<table id='table'><tr id='row'><td id='cell'> b </td></tr>\n</table>\n";
118         final String content = DOCTYPE_HTML + "<html><body><span id='foo'>" + html + "</span></body></html>";
119 
120         final HtmlPage page = loadPage(content);
121 
122         assertEquals("b", page.getHtmlElementById("cell").asNormalizedText());
123         assertEquals("b", page.getHtmlElementById("row").asNormalizedText());
124         assertEquals("b", page.getHtmlElementById("table").asNormalizedText());
125     }
126 
127     private void testPlainText(final String html, final String expectedText) throws Exception {
128         final String content = DOCTYPE_HTML + "<html><body><span id='foo'>" + html + "</span></body></html>";
129 
130         final HtmlPage page = loadPage(content);
131         assertEquals(expectedText, page.asNormalizedText());
132 
133         final HtmlElement elt = page.getHtmlElementById("foo");
134         assertEquals(expectedText, elt.asNormalizedText());
135 
136         final DomNode node = elt.getFirstChild();
137         assertEquals(expectedText, node.asNormalizedText());
138     }
139 
140     private void testAsText(final String html, final String expectedText) throws Exception {
141         final String content = DOCTYPE_HTML + "<html><body><span id='foo'>" + html + "</span></body></html>";
142 
143         final HtmlPage page = loadPage(content);
144         final HtmlElement elt = page.getHtmlElementById("foo");
145         assertEquals(expectedText, elt.asNormalizedText());
146     }
147 
148     /**
149      * @throws Exception if the test fails
150      */
151     @Test
152     public void asXml() throws Exception {
153         final String unicodeString = "\u064A\u0627 \u0644\u064A\u064A\u0644";
154         final String html = DOCTYPE_HTML
155             + "<html>\n"
156             + "<head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'></head>\n"
157             + "<body><span id='foo'>" + unicodeString + "</span></body></html>";
158 
159         final int[] expectedValues = {1610, 1575, 32, 1604, 1610, 1610, 1604};
160 
161         final WebClient client = getWebClient();
162         final MockWebConnection webConnection = new MockWebConnection();
163 
164         webConnection.setDefaultResponse(StringUtils.toByteArray(html, UTF_8), 200, "OK", MimeType.TEXT_HTML);
165         client.setWebConnection(webConnection);
166 
167         final HtmlPage page = client.getPage(URL_FIRST);
168         final String xml = page.getHtmlElementById("foo").getFirstChild().asXml().trim();
169         assertEquals(expectedValues.length, xml.length());
170         int index = 0;
171         for (final int expectedValue : expectedValues) {
172             assertEquals(expectedValue, xml.codePointAt(index++));
173         }
174     }
175 
176     /**
177      * @throws Exception if the test fails
178      */
179     @Test
180     public void splitText() throws Exception {
181         final String html = DOCTYPE_HTML
182             + "<html><head></head><body>\n"
183             + "<br><div id='tag'></div><br></body></html>";
184         final HtmlPage page = loadPage(html);
185 
186         final DomNode divNode = page.getElementById("tag");
187 
188         final DomText node = new DomText(page, "test split");
189         divNode.insertBefore(node);
190 
191         final DomNode previousSibling = node.getPreviousSibling();
192         final DomNode nextSibling = node.getNextSibling();
193         final DomNode parent = node.getParentNode();
194 
195         // position among parent's children
196         final int position = readPositionAmongParentChildren(node);
197 
198         final DomText newNode = node.splitText(5);
199 
200         assertSame("new node previous sibling", node, newNode.getPreviousSibling());
201         assertSame("previous sibling", previousSibling, node.getPreviousSibling());
202         assertSame("new node next sibling", nextSibling, newNode.getNextSibling());
203         assertSame("next sibling", newNode, node.getNextSibling());
204         assertSame("parent", parent, newNode.getParentNode());
205         assertSame(node, previousSibling.getNextSibling());
206         assertSame(newNode, nextSibling.getPreviousSibling());
207         assertEquals(position + 1, readPositionAmongParentChildren(newNode));
208     }
209 
210     /**
211      * @throws Exception if the test fails
212      */
213     @Test
214     public void splitLastDomText() throws Exception {
215         final String content = DOCTYPE_HTML
216             + "<html><head></head><body>\n"
217             + "<br><div id='tag'></div><br></body></html>";
218         final HtmlPage page = loadPage(content);
219 
220         final DomNode divNode = page.getElementById("tag");
221 
222         final DomText firstNode = new DomText(page, "test split");
223         divNode.appendChild(firstNode);
224 
225         assertNull(firstNode.getPreviousSibling());
226 
227         final DomText secondNode = firstNode.splitText(5);
228 
229         final DomText thirdNode = new DomText(page, "test split");
230         divNode.appendChild(thirdNode);
231 
232         assertSame(secondNode, firstNode.getNextSibling());
233         assertNull(firstNode.getPreviousSibling());
234         assertSame(firstNode, secondNode.getPreviousSibling());
235         assertSame(thirdNode, secondNode.getNextSibling());
236         assertSame(secondNode, thirdNode.getPreviousSibling());
237         assertNull(thirdNode.getNextSibling());
238         assertSame(divNode, secondNode.getParentNode());
239         assertSame(divNode, thirdNode.getParentNode());
240         assertEquals(0, readPositionAmongParentChildren(firstNode));
241         assertEquals(1, readPositionAmongParentChildren(secondNode));
242         assertEquals(2, readPositionAmongParentChildren(thirdNode));
243     }
244 
245     /**
246      * Reads the position of the node among the children of its parent
247      * @param node the node to look at
248      * @return the position
249      */
250     private static int readPositionAmongParentChildren(final DomNode node) {
251         int i = 0;
252         for (final DomNode child : node.getParentNode().getChildren()) {
253             if (child == node) {
254                 return i;
255             }
256             i++;
257         }
258         return -1;
259     }
260 
261     /**
262      * @throws Exception if the test fails
263      */
264     @Test
265     public void splitText2() throws Exception {
266         final String html = DOCTYPE_HTML
267             + "<html><head><title>foo</title><script>\n"
268             + "  function test() {\n"
269             + "    var div = document.getElementById('myDiv');\n"
270             + "    div.appendChild(document.createElement('a'));\n"
271             + "    var text = document.createTextNode('123456');\n"
272             + "    div.appendChild(text);\n"
273             + "    div.appendChild(document.createElement('hr'));\n"
274             + "    alert(div.childNodes.length);\n"
275             + "    text.splitText(3);\n"
276             + "    alert(div.childNodes.length);\n"
277             + "    alert(div.childNodes.item(2).nodeValue);\n"
278             + "  }\n"
279             + "</script></head><body onload='test()'>\n"
280             + "  <div id='myDiv'></div>\n"
281             + "</body></html>";
282         final String[] expectedAlerts = {"3", "4", "456"};
283         final List<String> collectedAlerts = new ArrayList<>();
284         loadPage(html, collectedAlerts);
285         assertEquals(expectedAlerts, collectedAlerts);
286     }
287 
288     /**
289      * @throws Exception if an error occurs
290      */
291     @Test
292     public void setTextContent() throws Exception {
293         final String html = DOCTYPE_HTML + "<html><body><span id='s'>abc</span></body></html>";
294         final HtmlPage page = loadPage(html);
295         final DomText text = (DomText) page.getElementById("s").getFirstChild();
296         assertEquals("abc", text.getTextContent());
297         text.setTextContent("xyz");
298         assertEquals("xyz", text.getTextContent());
299         assertEquals("xyz", page.asNormalizedText());
300     }
301 
302     /**
303      * Test case for #1366.
304      * @throws Exception if an error occurs
305      */
306     @Test
307     public void getTextContentWhitespace() throws Exception {
308         final String html = DOCTYPE_HTML + "<html><body><div id='s'><b>Hello</b> <b>World</b>!</div></body></html>";
309         final HtmlPage page = loadPage(html);
310         final HtmlElement text = page.getHtmlElementById("s");
311         assertEquals("Hello World!", text.getTextContent());
312     }
313 
314     /**
315      * Tests if {@code getCanonicalXPath()} returns the correct XPath for a text
316      * node without other text node siblings.
317      * @throws Exception if an error occurs
318      */
319     @Test
320     public void getCanonicalXPath_withoutTextSiblings() throws Exception {
321         final String html = DOCTYPE_HTML + "<html><body><span id='s'>abc</span></body></html>";
322         final HtmlPage page = loadPage(html);
323         final DomText text = (DomText) page.getElementById("s").getFirstChild();
324         assertEquals("/html/body/span/text()", text.getCanonicalXPath());
325         assertEquals(text, page.getFirstByXPath(text.getCanonicalXPath()));
326     }
327 
328     /**
329      * Tests if {@code getCanonicalXPath()} returns the correct XPath for a text
330      * node with other text node siblings.
331      * @throws Exception if an error occurs
332      */
333     @Test
334     public void getCanonicalXPath_withTextSiblings() throws Exception {
335         final String html = DOCTYPE_HTML + "<html><body><span id='s'>abc<br/>def</span></body></html>";
336         final HtmlPage page = loadPage(html);
337 
338         final DomText text1 = (DomText) page.getElementById("s").getFirstChild();
339         assertEquals("abc", text1.getData());
340         assertEquals("/html/body/span/text()[1]", text1.getCanonicalXPath());
341         assertEquals(text1, page.getFirstByXPath(text1.getCanonicalXPath()));
342 
343         final DomText text2 = (DomText) page.getElementById("s").getChildNodes().get(2);
344         assertEquals("def", text2.getData());
345         assertEquals("/html/body/span/text()[2]", text2.getCanonicalXPath());
346         assertEquals(text2, page.getFirstByXPath(text2.getCanonicalXPath()));
347     }
348 
349 }