1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.serializer;
16
17 import static org.htmlunit.css.CssStyleSheet.BLOCK;
18 import static org.htmlunit.html.DomElement.ATTRIBUTE_NOT_DEFINED;
19
20 import java.util.Iterator;
21 import java.util.List;
22
23 import org.htmlunit.Page;
24 import org.htmlunit.SgmlPage;
25 import org.htmlunit.WebWindow;
26 import org.htmlunit.html.DomComment;
27 import org.htmlunit.html.DomElement;
28 import org.htmlunit.html.DomNode;
29 import org.htmlunit.html.DomText;
30 import org.htmlunit.html.HtmlBody;
31 import org.htmlunit.html.HtmlBreak;
32 import org.htmlunit.html.HtmlCheckBoxInput;
33 import org.htmlunit.html.HtmlDetails;
34 import org.htmlunit.html.HtmlElement;
35 import org.htmlunit.html.HtmlElement.DisplayStyle;
36 import org.htmlunit.html.HtmlHiddenInput;
37 import org.htmlunit.html.HtmlInlineFrame;
38 import org.htmlunit.html.HtmlInput;
39 import org.htmlunit.html.HtmlListItem;
40 import org.htmlunit.html.HtmlNoFrames;
41 import org.htmlunit.html.HtmlNoScript;
42 import org.htmlunit.html.HtmlNumberInput;
43 import org.htmlunit.html.HtmlOption;
44 import org.htmlunit.html.HtmlOrderedList;
45 import org.htmlunit.html.HtmlPreformattedText;
46 import org.htmlunit.html.HtmlRadioButtonInput;
47 import org.htmlunit.html.HtmlResetInput;
48 import org.htmlunit.html.HtmlScript;
49 import org.htmlunit.html.HtmlSelect;
50 import org.htmlunit.html.HtmlStyle;
51 import org.htmlunit.html.HtmlSubmitInput;
52 import org.htmlunit.html.HtmlSummary;
53 import org.htmlunit.html.HtmlTable;
54 import org.htmlunit.html.HtmlTableCell;
55 import org.htmlunit.html.HtmlTableFooter;
56 import org.htmlunit.html.HtmlTableHeader;
57 import org.htmlunit.html.HtmlTableRow;
58 import org.htmlunit.html.HtmlTextArea;
59 import org.htmlunit.html.HtmlTitle;
60 import org.htmlunit.html.HtmlUnorderedList;
61 import org.htmlunit.html.TableRowGroup;
62 import org.htmlunit.html.serializer.HtmlSerializerNormalizedText.HtmlSerializerTextBuilder.Mode;
63 import org.htmlunit.util.StringUtils;
64
65
66
67
68
69
70
71
72
73
74
75
76 public class HtmlSerializerNormalizedText {
77
78 private boolean ignoreMaskedElements_ = true;
79
80
81
82
83
84
85 public String asText(final DomNode node) {
86 final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
87 appendNode(builder, node);
88 return builder.getText();
89 }
90
91
92
93
94
95
96
97 protected void appendChildren(final HtmlSerializerTextBuilder builder, final DomNode node) {
98 for (final DomNode child : node.getChildren()) {
99 appendNode(builder, child);
100 }
101 }
102
103
104
105
106
107
108
109
110 protected void appendNode(final HtmlSerializerTextBuilder builder, final DomNode node) {
111 if (node instanceof DomText text1) {
112 appendText(builder, text1);
113 }
114 else if (node instanceof DomComment) {
115
116 }
117 else if (node instanceof HtmlBreak break1) {
118 appendBreak(builder, break1);
119 }
120 else if (node instanceof HtmlHiddenInput) {
121
122 }
123 else if (node instanceof HtmlScript) {
124
125 }
126 else if (node instanceof HtmlStyle) {
127
128 }
129 else if (node instanceof HtmlNoFrames) {
130
131 }
132 else if (node instanceof HtmlTextArea area) {
133 appendTextArea(builder, area);
134 }
135 else if (node instanceof HtmlTitle title) {
136 appendTitle(builder, title);
137 }
138 else if (node instanceof HtmlTableRow row) {
139 appendTableRow(builder, row);
140 }
141 else if (node instanceof HtmlSelect select) {
142 appendSelect(builder, select);
143 }
144 else if (node instanceof HtmlSubmitInput input5) {
145 appendSubmitInput(builder, input5);
146 }
147 else if (node instanceof HtmlResetInput input4) {
148 appendResetInput(builder, input4);
149 }
150 else if (node instanceof HtmlCheckBoxInput input3) {
151 appendCheckBoxInput(builder, input3);
152 }
153 else if (node instanceof HtmlRadioButtonInput input2) {
154 appendRadioButtonInput(builder, input2);
155 }
156 else if (node instanceof HtmlNumberInput input1) {
157 appendNumberInput(builder, input1);
158 }
159 else if (node instanceof HtmlInput input) {
160 appendInput(builder, input);
161 }
162 else if (node instanceof HtmlTable table) {
163 appendTable(builder, table);
164 }
165 else if (node instanceof HtmlOrderedList list1) {
166 appendOrderedList(builder, list1);
167 }
168 else if (node instanceof HtmlUnorderedList list) {
169 appendUnorderedList(builder, list);
170 }
171 else if (node instanceof HtmlPreformattedText text) {
172 appendPreformattedText(builder, text);
173 }
174 else if (node instanceof HtmlInlineFrame frame) {
175 appendInlineFrame(builder, frame);
176 }
177 else if (node instanceof HtmlDetails details) {
178 appendDetails(builder, details);
179 }
180 else if (node instanceof HtmlNoScript && node.getPage().getWebClient().isJavaScriptEnabled()) {
181
182 }
183 else {
184 appendDomNode(builder, node);
185 }
186 }
187
188
189
190
191
192
193
194 protected void appendDomNode(final HtmlSerializerTextBuilder builder, final DomNode domNode) {
195 boolean block = false;
196 if (!(domNode instanceof HtmlBody)) {
197 final SgmlPage page = domNode.getPage();
198 final WebWindow window = page.getEnclosingWindow();
199 if (window.getWebClient().getOptions().isCssEnabled()) {
200 if (domNode instanceof DomElement element) {
201 final String display = window.getComputedStyle(element, null).getDisplay();
202 block = BLOCK.equals(display);
203 }
204 }
205 else if (domNode instanceof HtmlElement element) {
206 block = DisplayStyle.BLOCK == element.getDefaultStyleDisplay();
207 }
208 }
209
210 if (block) {
211 builder.appendBlockSeparator();
212 }
213 appendChildren(builder, domNode);
214 if (block) {
215 builder.appendBlockSeparator();
216 }
217 }
218
219
220
221
222
223
224
225 protected void appendSubmitInput(final HtmlSerializerTextBuilder builder, final HtmlSubmitInput htmlSubmitInput) {
226 String text = htmlSubmitInput.getValueAttribute();
227 if (ATTRIBUTE_NOT_DEFINED == text) {
228 text = HtmlSubmitInput.DEFAULT_VALUE;
229 }
230
231 builder.append(text, Mode.NORMALIZE);
232 }
233
234
235
236
237
238
239
240 protected void appendInput(final HtmlSerializerTextBuilder builder, final HtmlInput htmlInput) {
241 builder.append(" ", Mode.NORMALIZE);
242 builder.append(htmlInput.getRawValue(), Mode.NORMALIZE);
243 builder.append(" ", Mode.NORMALIZE);
244 }
245
246
247
248
249
250
251
252 protected void appendNumberInput(final HtmlSerializerTextBuilder builder, final HtmlNumberInput htmlNumberInput) {
253 builder.append(" ", Mode.NORMALIZE);
254
255 String val = htmlNumberInput.getRawValue();
256 final int lastPos = val.length() - 1;
257 if (lastPos >= 0 && val.charAt(lastPos) == '.') {
258 val = val.substring(0, lastPos);
259 }
260 builder.append(val, Mode.NORMALIZE);
261
262 builder.append(" ", Mode.NORMALIZE);
263 }
264
265
266
267
268
269
270
271 protected void appendResetInput(final HtmlSerializerTextBuilder builder, final HtmlResetInput htmlResetInput) {
272 String text = htmlResetInput.getValueAttribute();
273 if (ATTRIBUTE_NOT_DEFINED == text) {
274 text = HtmlResetInput.DEFAULT_VALUE;
275 }
276
277 builder.append(text, Mode.NORMALIZE);
278 }
279
280
281
282
283
284
285 protected void appendUnorderedList(final HtmlSerializerTextBuilder builder,
286 final HtmlUnorderedList htmlUnorderedList) {
287 builder.appendBlockSeparator();
288 boolean first = true;
289 for (final DomNode item : htmlUnorderedList.getChildren()) {
290 if (!first) {
291 builder.appendBlockSeparator();
292 }
293 first = false;
294 appendNode(builder, item);
295 }
296 builder.appendBlockSeparator();
297 }
298
299
300
301
302
303
304 protected void appendDetails(final HtmlSerializerTextBuilder builder,
305 final HtmlDetails htmlDetails) {
306 if (htmlDetails.isOpen()) {
307 appendChildren(builder, htmlDetails);
308 return;
309 }
310
311 for (final DomNode child : htmlDetails.getChildren()) {
312 if (child instanceof HtmlSummary) {
313 appendNode(builder, child);
314 }
315 }
316 }
317
318
319
320
321
322
323 protected void appendTitle(final HtmlSerializerTextBuilder builder, final HtmlTitle htmlTitle) {
324
325
326
327
328 final DomNode child = htmlTitle.getFirstChild();
329 if (child instanceof DomText text) {
330 builder.append(text.getData(), Mode.NORMALIZE);
331 builder.appendBlockSeparator();
332 }
333 }
334
335
336
337
338
339
340
341 protected void appendTableRow(final HtmlSerializerTextBuilder builder, final HtmlTableRow htmlTableRow) {
342 boolean first = true;
343 for (final HtmlTableCell cell : htmlTableRow.getCells()) {
344 if (first) {
345 first = false;
346 }
347 else {
348 builder.appendTab();
349 }
350 appendChildren(builder, cell);
351 }
352 }
353
354
355
356
357
358
359
360 protected void appendTextArea(final HtmlSerializerTextBuilder builder, final HtmlTextArea htmlTextArea) {
361 if (isVisible(htmlTextArea)) {
362 builder.append(htmlTextArea.getText(), Mode.PRESERVE_BLANK_NEWLINE);
363 }
364 }
365
366
367
368
369
370
371
372 protected void appendTable(final HtmlSerializerTextBuilder builder, final HtmlTable htmlTable) {
373 builder.appendBlockSeparator();
374 final String caption = htmlTable.getCaptionText();
375 if (caption != null) {
376 builder.append(caption, Mode.NORMALIZE);
377 builder.appendBlockSeparator();
378 }
379
380 boolean first = true;
381
382
383 final HtmlTableHeader tableHeader = htmlTable.getHeader();
384 if (tableHeader != null) {
385 first = appendTableRows(builder, tableHeader.getRows(), true, null, null);
386 }
387 final HtmlTableFooter tableFooter = htmlTable.getFooter();
388
389 final List<HtmlTableRow> tableRows = htmlTable.getRows();
390 first = appendTableRows(builder, tableRows, first, tableHeader, tableFooter);
391
392 if (tableFooter != null) {
393 first = appendTableRows(builder, tableFooter.getRows(), first, null, null);
394 }
395 else if (tableRows.isEmpty()) {
396 final DomNode firstChild = htmlTable.getFirstChild();
397 if (firstChild != null) {
398 appendNode(builder, firstChild);
399 }
400 }
401
402 builder.appendBlockSeparator();
403 }
404
405
406
407
408
409
410
411
412
413
414
415 protected boolean appendTableRows(final HtmlSerializerTextBuilder builder,
416 final List<HtmlTableRow> rows, boolean first, final TableRowGroup skipParent1,
417 final TableRowGroup skipParent2) {
418 for (final HtmlTableRow row : rows) {
419 if (row.getParentNode() == skipParent1 || row.getParentNode() == skipParent2) {
420 continue;
421 }
422 if (!first) {
423 builder.appendBlockSeparator();
424 }
425 first = false;
426 appendTableRow(builder, row);
427 }
428 return first;
429 }
430
431
432
433
434
435
436
437 protected void appendSelect(final HtmlSerializerTextBuilder builder, final HtmlSelect htmlSelect) {
438 final List<HtmlOption> options = htmlSelect.getSelectedOptions();
439
440 for (final Iterator<HtmlOption> i = options.iterator(); i.hasNext();) {
441 final HtmlOption currentOption = i.next();
442 appendChildren(builder, currentOption);
443 if (i.hasNext()) {
444 builder.appendBlockSeparator();
445 }
446 }
447 }
448
449
450
451
452
453
454
455 protected void appendOrderedList(final HtmlSerializerTextBuilder builder, final HtmlOrderedList htmlOrderedList) {
456 builder.appendBlockSeparator();
457 boolean first = true;
458 int i = 1;
459
460 final String start = htmlOrderedList.getStartAttribute();
461 if (ATTRIBUTE_NOT_DEFINED != start) {
462 try {
463 i = (int) Math.round(Double.parseDouble(start));
464 }
465 catch (final Exception e) {
466
467 }
468 }
469
470 for (final DomNode item : htmlOrderedList.getChildren()) {
471 if (!first) {
472 builder.appendBlockSeparator();
473 }
474 first = false;
475 if (item instanceof HtmlListItem) {
476 builder.append(Integer.toString(i++), Mode.NORMALIZE);
477 builder.append(". ", Mode.NORMALIZE);
478 appendChildren(builder, item);
479 }
480 else {
481 appendNode(builder, item);
482 }
483 }
484 builder.appendBlockSeparator();
485 }
486
487
488
489
490
491
492
493 protected void appendPreformattedText(final HtmlSerializerTextBuilder builder,
494 final HtmlPreformattedText htmlPreformattedText) {
495 if (isVisible(htmlPreformattedText)) {
496 builder.appendBlockSeparator();
497 builder.append(htmlPreformattedText.getTextContent(), Mode.PRESERVE_BLANK_TAB_NEWLINE);
498 builder.appendBlockSeparator();
499 }
500 }
501
502
503
504
505
506
507
508 protected void appendInlineFrame(final HtmlSerializerTextBuilder builder,
509 final HtmlInlineFrame htmlInlineFrame) {
510 if (isVisible(htmlInlineFrame)) {
511 builder.appendBlockSeparator();
512 final Page page = htmlInlineFrame.getEnclosedPage();
513 if (page instanceof SgmlPage sgmlPage) {
514 builder.append(sgmlPage.asNormalizedText(), Mode.NORMALIZE);
515 }
516 builder.appendBlockSeparator();
517 }
518 }
519
520
521
522
523
524
525
526 protected void appendText(final HtmlSerializerTextBuilder builder, final DomText domText) {
527 final DomNode parent = domText.getParentNode();
528 if (parent == null || parent instanceof HtmlTitle || isVisible(parent)) {
529 builder.append(domText.getData(), Mode.NORMALIZE);
530 }
531 }
532
533
534
535
536
537
538
539 protected void appendBreak(final HtmlSerializerTextBuilder builder, final HtmlBreak htmlBreak) {
540 builder.appendNewLine();
541 }
542
543
544
545
546
547
548
549 protected void appendCheckBoxInput(final HtmlSerializerTextBuilder builder,
550 final HtmlCheckBoxInput htmlCheckBoxInput) {
551 if (htmlCheckBoxInput.isChecked()) {
552 builder.append("checked", Mode.NORMALIZE);
553 }
554 else {
555 builder.append("unchecked", Mode.NORMALIZE);
556 }
557 }
558
559
560
561
562
563
564
565 protected void appendRadioButtonInput(final HtmlSerializerTextBuilder builder,
566 final HtmlRadioButtonInput htmlRadioButtonInput) {
567 if (htmlRadioButtonInput.isChecked()) {
568 builder.append("checked", Mode.NORMALIZE);
569 }
570 else {
571 builder.append("unchecked", Mode.NORMALIZE);
572 }
573 }
574
575 private boolean isVisible(final DomNode node) {
576 return !ignoreMaskedElements_ || node.isDisplayed();
577 }
578
579
580
581
582
583
584 public void setIgnoreMaskedElements(final boolean ignore) {
585 ignoreMaskedElements_ = ignore;
586 }
587
588
589
590
591 protected static class HtmlSerializerTextBuilder {
592
593
594 protected enum Mode {
595
596 NORMALIZE,
597
598
599 PRESERVE_BLANK_TAB_NEWLINE,
600
601
602 PRESERVE_BLANK_NEWLINE
603 }
604
605 private enum State {
606 DEFAULT,
607 EMPTY,
608 TRIM,
609 BLANK_AT_END,
610 BLANK_AT_END_AFTER_NEWLINE,
611 NEWLINE_AT_END,
612 BLOCK_SEPARATOR_AT_END
613 }
614
615 private static final String LINE_SEPARATOR = "\n";
616 private static final int LINE_SEPARATOR_LENGTH = LINE_SEPARATOR.length();
617
618 private State state_;
619 private final StringBuilder builder_;
620 private int trimRightPos_;
621
622
623
624
625 public HtmlSerializerTextBuilder() {
626 builder_ = new StringBuilder();
627 state_ = State.EMPTY;
628 trimRightPos_ = builder_.length();
629 }
630
631
632
633
634
635
636
637 public void append(final String content, final Mode mode) {
638 if (content == null) {
639 return;
640 }
641 final int length = content.length();
642 if (length == 0) {
643 return;
644 }
645
646 String text = content;
647 if (mode == Mode.PRESERVE_BLANK_NEWLINE) {
648 text = StringUtils.trimRight(text);
649 }
650
651 boolean crFound = false;
652 final int textLength = text.length();
653 for (int i = 0; i < textLength; i++) {
654 final char c = text.charAt(i);
655
656 if (mode == Mode.NORMALIZE) {
657 if (isSpace(c)) {
658 switch (state_) {
659 case EMPTY:
660 case TRIM:
661 case BLANK_AT_END:
662 case BLANK_AT_END_AFTER_NEWLINE:
663 case BLOCK_SEPARATOR_AT_END:
664 break;
665 case NEWLINE_AT_END:
666 builder_.append(' ');
667 state_ = State.BLANK_AT_END_AFTER_NEWLINE;
668 break;
669 default:
670 builder_.append(' ');
671 state_ = State.BLANK_AT_END;
672 break;
673 }
674 }
675 else if (c == (char) 160) {
676 builder_.append(' ');
677 state_ = State.DEFAULT;
678 trimRightPos_ = builder_.length();
679 }
680 else {
681 builder_.append(c);
682 state_ = State.DEFAULT;
683 trimRightPos_ = builder_.length();
684 }
685 continue;
686 }
687
688
689 if (c == '\n') {
690 appendNewLine();
691 crFound = false;
692 }
693 else {
694 if (crFound) {
695 appendNewLine();
696 }
697 crFound = c == '\r';
698
699 if (c == '\t') {
700 if (mode == Mode.PRESERVE_BLANK_TAB_NEWLINE) {
701 appendTab();
702 }
703 else if (state_ != State.BLOCK_SEPARATOR_AT_END) {
704 builder_.append(' ');
705 }
706 }
707 else if (c == (char) 160) {
708 appendBlank();
709 }
710 else if (c == ' ') {
711 appendBlank();
712 }
713 else {
714 builder_.append(c);
715 }
716 trimRightPos_ = builder_.length();
717 }
718 }
719
720 if (crFound) {
721 appendNewLine();
722 }
723
724 if (mode != Mode.NORMALIZE) {
725
726 state_ = State.TRIM;
727 }
728 }
729
730
731
732
733 public void appendBlockSeparator() {
734 switch (state_) {
735 case EMPTY:
736 break;
737 case BLANK_AT_END:
738 builder_.setLength(trimRightPos_);
739 if (builder_.length() == 0) {
740 state_ = State.EMPTY;
741 }
742 else {
743 builder_.append(LINE_SEPARATOR);
744 state_ = State.BLOCK_SEPARATOR_AT_END;
745 }
746 break;
747 case BLANK_AT_END_AFTER_NEWLINE:
748 builder_.setLength(trimRightPos_ - LINE_SEPARATOR_LENGTH);
749 trimRightPos_ = trimRightPos_ - LINE_SEPARATOR_LENGTH;
750 if (builder_.length() == 0) {
751 state_ = State.EMPTY;
752 }
753 else {
754 builder_.append(LINE_SEPARATOR);
755 state_ = State.BLOCK_SEPARATOR_AT_END;
756 }
757 break;
758 case BLOCK_SEPARATOR_AT_END:
759 break;
760 case NEWLINE_AT_END:
761 builder_.setLength(builder_.length() - LINE_SEPARATOR_LENGTH);
762 trimRightPos_ = trimRightPos_ - LINE_SEPARATOR_LENGTH;
763 if (builder_.length() == 0) {
764 state_ = State.EMPTY;
765 }
766 else {
767 builder_.append(LINE_SEPARATOR);
768 state_ = State.BLOCK_SEPARATOR_AT_END;
769 }
770 break;
771 default:
772 builder_.append(LINE_SEPARATOR);
773 state_ = State.BLOCK_SEPARATOR_AT_END;
774 break;
775 }
776 }
777
778
779
780
781 public void appendNewLine() {
782 builder_.append(LINE_SEPARATOR);
783 state_ = State.NEWLINE_AT_END;
784 trimRightPos_ = builder_.length();
785 }
786
787
788
789
790 public void appendTab() {
791 builder_.append('\t');
792 trimRightPos_ = builder_.length();
793 }
794
795
796
797
798 private void appendBlank() {
799 builder_.append(' ');
800 trimRightPos_ = builder_.length();
801 }
802
803
804
805
806 public String getText() {
807 return builder_.substring(0, trimRightPos_);
808 }
809
810 private static boolean isSpace(final char ch) {
811 return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r';
812 }
813 }
814 }