View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser;
16  
17  import java.net.URL;
18  
19  import org.htmlunit.SimpleWebTestCase;
20  import org.htmlunit.StringWebResponse;
21  import org.htmlunit.WebClient;
22  import org.htmlunit.WebResponse;
23  import org.htmlunit.html.DomElement;
24  import org.htmlunit.html.HtmlDivision;
25  import org.htmlunit.html.HtmlElement;
26  import org.htmlunit.html.HtmlPage;
27  import org.htmlunit.html.HtmlPageTest;
28  import org.htmlunit.html.HtmlTableColumnGroup;
29  import org.htmlunit.html.XHtmlPage;
30  import org.junit.jupiter.api.Test;
31  
32  /**
33   * Test class for {@link HTMLParser}.
34   *
35   * @author Christian Sell
36   * @author Marc Guillemot
37   * @author Ahmed Ashour
38   * @author Sudhan Moghe
39   * @author Frank Danek
40   */
41  public class HTMLParserTest extends SimpleWebTestCase {
42  
43      /**
44       * Tests the new HTMLParser on a simple HTML string.
45       * @throws Exception failure
46       */
47      @Test
48      public void simpleHTMLString() throws Exception {
49          final WebClient webClient = getWebClient();
50          final WebResponse webResponse = new StringWebResponse(
51              "<html><head><title>TITLE</title></head><body><div>TEST</div></body></html>", URL_FIRST);
52  
53          final HtmlPage page = new HtmlPage(webResponse, webClient.getCurrentWindow());
54          webClient.getCurrentWindow().setEnclosedPage(page);
55  
56          webClient.getPageCreator().getHtmlParser().parse(null, webResponse, page, false, false);
57  
58          final String stringVal = page.<HtmlDivision>getFirstByXPath("//div").getFirstChild().getNodeValue();
59          assertEquals("TEST", stringVal);
60  
61          final HtmlElement node = (HtmlElement) page.getFirstByXPath("//*[./text() = 'TEST']");
62          assertEquals(node.getTagName(), HtmlDivision.TAG_NAME);
63      }
64  
65      /**
66       * Regression test for bug #766: parse failure when parsing page with UTF-8 BOM (byte order mark).
67       * The HTML file used is from NekoHTML's bug number #54.
68       * @throws Exception if an error occurs
69       */
70      @Test
71      public void bomUtf8() throws Exception {
72          final String resource = "bom-utf8.html";
73          final URL url = getClass().getClassLoader().getResource(resource);
74          assertNotNull(url);
75  
76          final WebClient client = getWebClient();
77          final HtmlPage page = client.getPage(url);
78          assertEquals("Welcome to Suffolk Coastal District Council online", page.getTitleText());
79      }
80  
81      /**
82       * This HTML was causing an EmptyStackException to be thrown.
83       * @throws Exception if an error occurs
84       */
85      @Test
86      public void emptyStack() throws Exception {
87          final String html =
88                "<html>\n"
89              + "  <body onload='document.getElementById(\"s\").innerHTML = "
90              + "    \"<h1><span><span></span></span><span><span></span></span></h1>\";'>\n"
91              + "    <div>\n"
92              + "      <div>\n"
93              + "        <table>\n"
94              + "          <tbody>\n"
95              + "            <tr>\n"
96              + "              <td>\n"
97              + "                <table>\n"
98              + "                  <tbody>\n"
99              + "                    <tr>\n"
100             + "                      <td>\n"
101             + "                        <div>\n"
102             + "                          <div>\n"
103             + "                            <h1>\n"
104             + "                              <span id='s'>blah</span>\n"
105             + "                            </h1>\n"
106             + "                          </div>\n"
107             + "                        </div>\n"
108             + "                      </td>\n"
109             + "                    </tr>\n"
110             + "                  </tbody>\n"
111             + "                </table>\n"
112             + "              </td>\n"
113             + "            </tr>\n"
114             + "          </tbody>\n"
115             + "        </table>\n"
116             + "      </div>\n"
117             + "    </div>\n"
118             + "  </body>\n"
119             + "</html>";
120         final HtmlPage page = loadPage(html);
121         assertNotNull(page);
122     }
123 
124     /**
125      * @throws Exception failure
126      */
127     @Test
128     public void tableWithoutColgroup() throws Exception {
129         final String html = HtmlPageTest.STANDARDS_MODE_PREFIX_
130             + "<html><head>\n"
131             + "</head>\n"
132             + "<body>\n"
133             + "  <table><col width='7'/><col width='1'/><tbody><tr><td>seven</td><td>One</td></tr></tbody></table>\n"
134             + "</body></html>";
135 
136         final WebClient webClient = getWebClient();
137         final WebResponse webResponse = new StringWebResponse(html, URL_FIRST);
138 
139         final XHtmlPage page = new XHtmlPage(webResponse, webClient.getCurrentWindow());
140         webClient.getCurrentWindow().setEnclosedPage(page);
141 
142         webClient.getPageCreator().getHtmlParser().parse(null, webResponse, page, true, false);
143 
144         final DomElement col = page.getElementsByTagName("col").get(0);
145         assertEquals(col.getParentNode().getNodeName(), HtmlTableColumnGroup.TAG_NAME);
146     }
147 }