View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html.parser;
16  
17  import java.net.URL;
18  
19  import org.htmlunit.SimpleWebTestCase;
20  import org.htmlunit.StringWebResponse;
21  import org.htmlunit.WebClient;
22  import org.htmlunit.WebResponse;
23  import org.htmlunit.html.DomElement;
24  import org.htmlunit.html.HtmlDivision;
25  import org.htmlunit.html.HtmlElement;
26  import org.htmlunit.html.HtmlPage;
27  import org.htmlunit.html.HtmlPageTest;
28  import org.htmlunit.html.HtmlTableColumnGroup;
29  import org.htmlunit.html.XHtmlPage;
30  import org.htmlunit.junit.BrowserRunner;
31  import org.junit.Test;
32  import org.junit.runner.RunWith;
33  
34  /**
35   * Test class for {@link HTMLParser}.
36   *
37   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
38   * @author Marc Guillemot
39   * @author Ahmed Ashour
40   * @author Sudhan Moghe
41   * @author Frank Danek
42   */
43  @RunWith(BrowserRunner.class)
44  public class HTMLParserTest extends SimpleWebTestCase {
45  
46      /**
47       * Tests the new HTMLParser on a simple HTML string.
48       * @throws Exception failure
49       */
50      @Test
51      public void simpleHTMLString() throws Exception {
52          final WebClient webClient = getWebClient();
53          final WebResponse webResponse = new StringWebResponse(
54              "<html><head><title>TITLE</title></head><body><div>TEST</div></body></html>", URL_FIRST);
55  
56          final HtmlPage page = new HtmlPage(webResponse, webClient.getCurrentWindow());
57          webClient.getCurrentWindow().setEnclosedPage(page);
58  
59          webClient.getPageCreator().getHtmlParser().parse(webResponse, page, false, false);
60  
61          final String stringVal = page.<HtmlDivision>getFirstByXPath("//div").getFirstChild().getNodeValue();
62          assertEquals("TEST", stringVal);
63  
64          final HtmlElement node = (HtmlElement) page.getFirstByXPath("//*[./text() = 'TEST']");
65          assertEquals(node.getTagName(), HtmlDivision.TAG_NAME);
66      }
67  
68      /**
69       * Regression test for bug #766: parse failure when parsing page with UTF-8 BOM (byte order mark).
70       * The HTML file used is from NekoHTML's bug number #54.
71       * @throws Exception if an error occurs
72       */
73      @Test
74      public void bomUtf8() throws Exception {
75          final String resource = "bom-utf8.html";
76          final URL url = getClass().getClassLoader().getResource(resource);
77          assertNotNull(url);
78  
79          final WebClient client = getWebClient();
80          final HtmlPage page = client.getPage(url);
81          assertEquals("Welcome to Suffolk Coastal District Council online", page.getTitleText());
82      }
83  
84      /**
85       * This HTML was causing an EmptyStackException to be thrown.
86       * @throws Exception if an error occurs
87       */
88      @Test
89      public void emptyStack() throws Exception {
90          final String html =
91                "<html>\n"
92              + "  <body onload='document.getElementById(\"s\").innerHTML = "
93              + "    \"<h1><span><span></span></span><span><span></span></span></h1>\";'>\n"
94              + "    <div>\n"
95              + "      <div>\n"
96              + "        <table>\n"
97              + "          <tbody>\n"
98              + "            <tr>\n"
99              + "              <td>\n"
100             + "                <table>\n"
101             + "                  <tbody>\n"
102             + "                    <tr>\n"
103             + "                      <td>\n"
104             + "                        <div>\n"
105             + "                          <div>\n"
106             + "                            <h1>\n"
107             + "                              <span id='s'>blah</span>\n"
108             + "                            </h1>\n"
109             + "                          </div>\n"
110             + "                        </div>\n"
111             + "                      </td>\n"
112             + "                    </tr>\n"
113             + "                  </tbody>\n"
114             + "                </table>\n"
115             + "              </td>\n"
116             + "            </tr>\n"
117             + "          </tbody>\n"
118             + "        </table>\n"
119             + "      </div>\n"
120             + "    </div>\n"
121             + "  </body>\n"
122             + "</html>";
123         final HtmlPage page = loadPage(html);
124         assertNotNull(page);
125     }
126 
127     /**
128      * @throws Exception failure
129      */
130     @Test
131     public void tableWithoutColgroup() throws Exception {
132         final String html = HtmlPageTest.STANDARDS_MODE_PREFIX_
133             + "<html><head>\n"
134             + "</head>\n"
135             + "<body>\n"
136             + "  <table><col width='7'/><col width='1'/><tbody><tr><td>seven</td><td>One</td></tr></tbody></table>\n"
137             + "</body></html>";
138 
139         final WebClient webClient = getWebClient();
140         final WebResponse webResponse = new StringWebResponse(html, URL_FIRST);
141 
142         final XHtmlPage page = new XHtmlPage(webResponse, webClient.getCurrentWindow());
143         webClient.getCurrentWindow().setEnclosedPage(page);
144 
145         webClient.getPageCreator().getHtmlParser().parse(webResponse, page, true, false);
146 
147         final DomElement col = page.getElementsByTagName("col").get(0);
148         assertEquals(col.getParentNode().getNodeName(), HtmlTableColumnGroup.TAG_NAME);
149     }
150 }