View Javadoc
1   /*
2    * Copyright (c) 2002-2025 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * https://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package org.htmlunit.html;
16  
17  import java.io.Serializable;
18  
19  import org.w3c.dom.DOMException;
20  import org.w3c.dom.Node;
21  import org.w3c.dom.traversal.NodeFilter;
22  
23  /**
24   * In general this is an implementation of org.w3c.dom.traversal.TreeWalker.
25   * The class org.w3c.dom.traversal.TreeWalker is not available on Android
26   * therefore we have this impl as backend.
27   *
28   * @see <a href="http://www.w3.org/TR/DOM-Level-2-Traversal-Range/traversal.html">
29   * DOM-Level-2-Traversal-Range</a>
30   * @author <a href="mailto:mike@10gen.com">Mike Dirolf</a>
31   * @author Frank Danek
32   * @author Ahmed Ashour
33   * @author Ronald Brill
34   */
35  public class HtmlDomTreeWalker implements Serializable {
36  
37      private final DomNode root_;
38      private DomNode currentNode_;
39      private final int whatToShow_;
40      private final NodeFilter filter_;
41      private final boolean expandEntityReferences_;
42  
43      /**
44       * Creates an instance.
45       *
46       * @param root The root node of the TreeWalker. Must not be
47       *          {@code null}.
48       * @param whatToShow Flag specifying which types of nodes appear in the
49       *          logical view of the TreeWalker. See {@link NodeFilter} for the
50       *          set of possible Show_ values.
51       * @param filter The {@link NodeFilter} to be used with this TreeWalker,
52       *          or {@code null} to indicate no filter.
53       * @param expandEntityReferences If false, the contents of
54       *          EntityReference nodes are not present in the logical view.
55       * @throws DOMException on attempt to create a TreeWalker with a root that
56       *          is {@code null}.
57       */
58      public HtmlDomTreeWalker(final DomNode root, final int whatToShow, final NodeFilter filter,
59              final boolean expandEntityReferences) throws DOMException {
60          if (root == null) {
61              throw new IllegalArgumentException("root must not be null");
62          }
63          root_ = root;
64          whatToShow_ = whatToShow;
65          filter_ = filter;
66          expandEntityReferences_ = expandEntityReferences;
67          currentNode_ = root_;
68      }
69  
70      /**
71       * @see org.w3c.dom.traversal.TreeWalker#getRoot()
72       * @return the root node
73       */
74      public DomNode getRoot() {
75          return root_;
76      }
77  
78      /**
79       * @see org.w3c.dom.traversal.TreeWalker#getWhatToShow()
80       * @return NodeFilter constant
81       */
82      public int getWhatToShow() {
83          return whatToShow_;
84      }
85  
86      /**
87       * @see org.w3c.dom.traversal.TreeWalker#getFilter()
88       * @return the filter
89       */
90      public NodeFilter getFilter() {
91          return filter_;
92      }
93  
94      /**
95       * @see org.w3c.dom.traversal.TreeWalker#getExpandEntityReferences()
96       * @return the ExpandEntityReferences setting
97       */
98      public boolean getExpandEntityReferences() {
99          return expandEntityReferences_;
100     }
101 
102     /**
103      * @see org.w3c.dom.traversal.TreeWalker#getCurrentNode()
104      * @return the current node
105      */
106     public DomNode getCurrentNode() {
107         return currentNode_;
108     }
109 
110     /**
111      * @see org.w3c.dom.traversal.TreeWalker#setCurrentNode(Node)
112      * @param currentNode the current node
113      * @throws DOMException if the current node provides is null
114      */
115     public void setCurrentNode(final Node currentNode) throws DOMException {
116         if (currentNode == null) {
117             throw new DOMException(DOMException.NOT_SUPPORTED_ERR,
118                     "currentNode cannot be set to null");
119         }
120         currentNode_ = (DomNode) currentNode;
121     }
122 
123     /**
124      * @see org.w3c.dom.traversal.TreeWalker#nextNode()
125      * @return the next node
126      */
127     public DomNode nextNode() {
128         final DomNode leftChild = getEquivalentLogical(currentNode_.getFirstChild(), false);
129         if (leftChild != null) {
130             currentNode_ = leftChild;
131             return leftChild;
132         }
133         final DomNode rightSibling = getEquivalentLogical(currentNode_.getNextSibling(), false);
134         if (rightSibling != null) {
135             currentNode_ = rightSibling;
136             return rightSibling;
137         }
138 
139         final DomNode uncle = getFirstUncleNode(currentNode_);
140         if (uncle != null) {
141             currentNode_ = uncle;
142             return uncle;
143         }
144 
145         return null;
146     }
147 
148     /**
149      * Helper method to get the first uncle node in document order (preorder
150      * traversal) from the given node.
151      */
152     private DomNode getFirstUncleNode(final DomNode n) {
153         if (n == root_ || n == null) {
154             return null;
155         }
156 
157         final DomNode parent = n.getParentNode();
158         if (parent == null) {
159             return null;
160         }
161 
162         final DomNode uncle = getEquivalentLogical(parent.getNextSibling(), false);
163         if (uncle != null) {
164             return uncle;
165         }
166 
167         return getFirstUncleNode(parent);
168     }
169 
170     /**
171      * Recursively find the logical node occupying the same position as this
172      * _actual_ node. It could be the same node, a different node, or null
173      * depending on filtering.
174      *
175      * @param n The actual node we are trying to find the "equivalent" of
176      * @param lookLeft If true, traverse the tree in the left direction. If
177      *          false, traverse the tree to the right.
178      * @return the logical node in the same position as n
179      */
180     private DomNode getEquivalentLogical(final DomNode n, final boolean lookLeft) {
181         // Base cases
182         if (n == null) {
183             return null;
184         }
185         if (isNodeVisible(n)) {
186             return n;
187         }
188 
189         // If a node is skipped, try getting one of its descendants
190         if (isNodeSkipped(n)) {
191             final DomNode child;
192             if (lookLeft) {
193                 child = getEquivalentLogical(n.getLastChild(), lookLeft);
194             }
195             else {
196                 child = getEquivalentLogical(n.getFirstChild(), lookLeft);
197             }
198 
199             if (child != null) {
200                 return child;
201             }
202         }
203 
204         // If this node is rejected or has no descendants that will work, go
205         // to its sibling.
206         return getSibling(n, lookLeft);
207     }
208 
209     /**
210      * Returns whether the node is visible by the TreeWalker.
211      */
212     private boolean isNodeVisible(final Node n) {
213         if (acceptNode(n) == NodeFilter.FILTER_ACCEPT) {
214             if (filter_ == null || filter_.acceptNode(n) == NodeFilter.FILTER_ACCEPT) {
215                 return expandEntityReferences_ || n.getParentNode() == null
216                         || n.getParentNode().getNodeType() != Node.ENTITY_REFERENCE_NODE;
217             }
218         }
219         return false;
220     }
221 
222     /**
223      * Test whether a specified node is visible in the logical view of a
224      * TreeWalker, based solely on the whatToShow constant.
225      *
226      * @param n The node to check to see if it should be shown or not
227      * @return a constant to determine whether the node is accepted, rejected,
228      *          or skipped.
229      */
230     private short acceptNode(final Node n) {
231         final int flag = getFlagForNode(n);
232 
233         if ((whatToShow_ & flag) != 0) {
234             return NodeFilter.FILTER_ACCEPT;
235         }
236         // Skip, don't reject.
237         return NodeFilter.FILTER_SKIP;
238     }
239 
240     /**
241      * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
242      *
243      * Given a {@link Node}, return the appropriate constant for whatToShow.
244      *
245      * @param node the node
246      * @return the whatToShow constant for the type of specified node
247      */
248     public static int getFlagForNode(final Node node) {
249         switch (node.getNodeType()) {
250             case Node.ELEMENT_NODE:
251                 return NodeFilter.SHOW_ELEMENT;
252             case Node.ATTRIBUTE_NODE:
253                 return NodeFilter.SHOW_ATTRIBUTE;
254             case Node.TEXT_NODE:
255                 return NodeFilter.SHOW_TEXT;
256             case Node.CDATA_SECTION_NODE:
257                 return NodeFilter.SHOW_CDATA_SECTION;
258             case Node.ENTITY_REFERENCE_NODE:
259                 return NodeFilter.SHOW_ENTITY_REFERENCE;
260             case Node.ENTITY_NODE:
261                 return NodeFilter.SHOW_ENTITY;
262             case Node.PROCESSING_INSTRUCTION_NODE:
263                 return NodeFilter.SHOW_PROCESSING_INSTRUCTION;
264             case Node.COMMENT_NODE:
265                 return NodeFilter.SHOW_COMMENT;
266             case Node.DOCUMENT_NODE:
267                 return NodeFilter.SHOW_DOCUMENT;
268             case Node.DOCUMENT_TYPE_NODE:
269                 return NodeFilter.SHOW_DOCUMENT_TYPE;
270             case Node.DOCUMENT_FRAGMENT_NODE:
271                 return NodeFilter.SHOW_DOCUMENT_FRAGMENT;
272             case Node.NOTATION_NODE:
273                 return NodeFilter.SHOW_NOTATION;
274             default:
275                 return 0;
276         }
277     }
278 
279     /* Returns whether the node is skipped by the TreeWalker. */
280     private boolean isNodeSkipped(final Node n) {
281         return !isNodeVisible(n) && !isNodeRejected(n);
282     }
283 
284     /* Returns whether the node is rejected by the TreeWalker. */
285     private boolean isNodeRejected(final Node n) {
286         if (acceptNode(n) == NodeFilter.FILTER_REJECT) {
287             return true;
288         }
289         if (filter_ != null && filter_.acceptNode(n) == NodeFilter.FILTER_REJECT) {
290             return true;
291         }
292         return !expandEntityReferences_ && n.getParentNode() != null
293                 && n.getParentNode().getNodeType() == Node.ENTITY_REFERENCE_NODE;
294     }
295 
296     // Helper method for getEquivalentLogical
297     private DomNode getSibling(final DomNode n, final boolean lookLeft) {
298         if (n == null) {
299             return null;
300         }
301 
302         if (isNodeVisible(n)) {
303             return null;
304         }
305 
306         final DomNode sibling;
307         if (lookLeft) {
308             sibling = n.getPreviousSibling();
309         }
310         else {
311             sibling = n.getNextSibling();
312         }
313 
314         if (sibling == null) {
315             // If this node has no logical siblings at or below it's "level", it might have one above
316             if (n == root_) {
317                 return null;
318             }
319             return getSibling(n.getParentNode(), lookLeft);
320 
321         }
322         return getEquivalentLogical(sibling, lookLeft);
323     }
324 
325     /**
326      * @see org.w3c.dom.traversal.TreeWalker#nextSibling()
327      * @return the next sibling node
328      */
329     public DomNode nextSibling() {
330         if (currentNode_ == root_) {
331             return null;
332         }
333 
334         final DomNode newNode = getEquivalentLogical(currentNode_.getNextSibling(), false);
335 
336         if (newNode != null) {
337             currentNode_ = newNode;
338         }
339 
340         return newNode;
341     }
342 
343     /**
344      * @see org.w3c.dom.traversal.TreeWalker#parentNode()
345      * @return the parent node
346      */
347     public DomNode parentNode() {
348         if (currentNode_ == root_) {
349             return null;
350         }
351 
352         DomNode newNode = currentNode_;
353 
354         do {
355             newNode = newNode.getParentNode();
356         }
357         while (newNode != null && !isNodeVisible(newNode) && newNode != root_);
358 
359         if (newNode == null || !isNodeVisible(newNode)) {
360             return null;
361         }
362         currentNode_ = newNode;
363         return newNode;
364     }
365 
366     /**
367      * @see org.w3c.dom.traversal.TreeWalker#previousSibling()
368      * @return the previous sibling node
369      */
370     public DomNode previousSibling() {
371         if (currentNode_ == root_) {
372             return null;
373         }
374 
375         final DomNode newNode = getEquivalentLogical(currentNode_.getPreviousSibling(), true);
376 
377         if (newNode != null) {
378             currentNode_ = newNode;
379         }
380 
381         return newNode;
382     }
383 
384     /**
385      * @see org.w3c.dom.traversal.TreeWalker#lastChild()
386      * @return the last child node
387      */
388     public DomNode lastChild() {
389         final DomNode newNode = getEquivalentLogical(currentNode_.getLastChild(), true);
390 
391         if (newNode != null) {
392             currentNode_ = newNode;
393         }
394 
395         return newNode;
396     }
397 
398     /**
399      * @see org.w3c.dom.traversal.TreeWalker#previousNode()
400      * @return the previous node
401      */
402     public DomNode previousNode() {
403         final DomNode newNode = getPreviousNode(currentNode_);
404 
405         if (newNode != null) {
406             currentNode_ = newNode;
407         }
408 
409         return newNode;
410     }
411 
412     /**
413      * Helper method to get the previous node in document order (preorder
414      * traversal) from the given node.
415      */
416     private DomNode getPreviousNode(final DomNode n) {
417         if (n == root_) {
418             return null;
419         }
420         final DomNode left = getEquivalentLogical(n.getPreviousSibling(), true);
421         if (left == null) {
422             final DomNode parent = n.getParentNode();
423             if (parent == null) {
424                 return null;
425             }
426             if (isNodeVisible(parent)) {
427                 return parent;
428             }
429         }
430 
431         DomNode follow = left;
432         if (follow != null) {
433             while (follow.hasChildNodes()) {
434                 final DomNode toFollow = getEquivalentLogical(follow.getLastChild(), true);
435                 if (toFollow == null) {
436                     break;
437                 }
438                 follow = toFollow;
439             }
440         }
441         return follow;
442     }
443 
444     /**
445      * @see org.w3c.dom.traversal.TreeWalker#firstChild()
446      * @return the first child node
447      */
448     public DomNode firstChild() {
449         final DomNode newNode = getEquivalentLogical(currentNode_.getFirstChild(), false);
450 
451         if (newNode != null) {
452             currentNode_ = newNode;
453         }
454 
455         return newNode;
456     }
457 }