1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package org.htmlunit.html.serializer;
16
17 import static org.htmlunit.css.CssStyleSheet.BLOCK;
18 import static org.htmlunit.html.DomElement.ATTRIBUTE_NOT_DEFINED;
19
20 import java.util.Iterator;
21 import java.util.List;
22
23 import org.apache.commons.lang3.StringUtils;
24 import org.htmlunit.Page;
25 import org.htmlunit.SgmlPage;
26 import org.htmlunit.WebWindow;
27 import org.htmlunit.html.DomComment;
28 import org.htmlunit.html.DomElement;
29 import org.htmlunit.html.DomNode;
30 import org.htmlunit.html.DomText;
31 import org.htmlunit.html.HtmlBody;
32 import org.htmlunit.html.HtmlBreak;
33 import org.htmlunit.html.HtmlCheckBoxInput;
34 import org.htmlunit.html.HtmlDetails;
35 import org.htmlunit.html.HtmlElement;
36 import org.htmlunit.html.HtmlElement.DisplayStyle;
37 import org.htmlunit.html.HtmlHiddenInput;
38 import org.htmlunit.html.HtmlInlineFrame;
39 import org.htmlunit.html.HtmlInput;
40 import org.htmlunit.html.HtmlListItem;
41 import org.htmlunit.html.HtmlNoFrames;
42 import org.htmlunit.html.HtmlNoScript;
43 import org.htmlunit.html.HtmlNumberInput;
44 import org.htmlunit.html.HtmlOption;
45 import org.htmlunit.html.HtmlOrderedList;
46 import org.htmlunit.html.HtmlPreformattedText;
47 import org.htmlunit.html.HtmlRadioButtonInput;
48 import org.htmlunit.html.HtmlResetInput;
49 import org.htmlunit.html.HtmlScript;
50 import org.htmlunit.html.HtmlSelect;
51 import org.htmlunit.html.HtmlStyle;
52 import org.htmlunit.html.HtmlSubmitInput;
53 import org.htmlunit.html.HtmlSummary;
54 import org.htmlunit.html.HtmlTable;
55 import org.htmlunit.html.HtmlTableCell;
56 import org.htmlunit.html.HtmlTableFooter;
57 import org.htmlunit.html.HtmlTableHeader;
58 import org.htmlunit.html.HtmlTableRow;
59 import org.htmlunit.html.HtmlTextArea;
60 import org.htmlunit.html.HtmlTitle;
61 import org.htmlunit.html.HtmlUnorderedList;
62 import org.htmlunit.html.TableRowGroup;
63 import org.htmlunit.html.serializer.HtmlSerializerNormalizedText.HtmlSerializerTextBuilder.Mode;
64
65
66
67
68
69
70
71
72
73
74
75
76 public class HtmlSerializerNormalizedText {
77
78 private boolean ignoreMaskedElements_ = true;
79
80
81
82
83
84
85 public String asText(final DomNode node) {
86 final HtmlSerializerTextBuilder builder = new HtmlSerializerTextBuilder();
87 appendNode(builder, node);
88 return builder.getText();
89 }
90
91
92
93
94
95
96
97 protected void appendChildren(final HtmlSerializerTextBuilder builder, final DomNode node) {
98 for (final DomNode child : node.getChildren()) {
99 appendNode(builder, child);
100 }
101 }
102
103
104
105
106
107
108
109
110 protected void appendNode(final HtmlSerializerTextBuilder builder, final DomNode node) {
111 if (node instanceof DomText) {
112 appendText(builder, (DomText) node);
113 }
114 else if (node instanceof DomComment) {
115
116 }
117 else if (node instanceof HtmlBreak) {
118 appendBreak(builder, (HtmlBreak) node);
119 }
120 else if (node instanceof HtmlHiddenInput) {
121
122 }
123 else if (node instanceof HtmlScript) {
124
125 }
126 else if (node instanceof HtmlStyle) {
127
128 }
129 else if (node instanceof HtmlNoFrames) {
130
131 }
132 else if (node instanceof HtmlTextArea) {
133 appendTextArea(builder, (HtmlTextArea) node);
134 }
135 else if (node instanceof HtmlTitle) {
136 appendTitle(builder, (HtmlTitle) node);
137 }
138 else if (node instanceof HtmlTableRow) {
139 appendTableRow(builder, (HtmlTableRow) node);
140 }
141 else if (node instanceof HtmlSelect) {
142 appendSelect(builder, (HtmlSelect) node);
143 }
144 else if (node instanceof HtmlSubmitInput) {
145 appendSubmitInput(builder, (HtmlSubmitInput) node);
146 }
147 else if (node instanceof HtmlResetInput) {
148 appendResetInput(builder, (HtmlResetInput) node);
149 }
150 else if (node instanceof HtmlCheckBoxInput) {
151 appendCheckBoxInput(builder, (HtmlCheckBoxInput) node);
152 }
153 else if (node instanceof HtmlRadioButtonInput) {
154 appendRadioButtonInput(builder, (HtmlRadioButtonInput) node);
155 }
156 else if (node instanceof HtmlNumberInput) {
157 appendNumberInput(builder, (HtmlNumberInput) node);
158 }
159 else if (node instanceof HtmlInput) {
160 appendInput(builder, (HtmlInput) node);
161 }
162 else if (node instanceof HtmlTable) {
163 appendTable(builder, (HtmlTable) node);
164 }
165 else if (node instanceof HtmlOrderedList) {
166 appendOrderedList(builder, (HtmlOrderedList) node);
167 }
168 else if (node instanceof HtmlUnorderedList) {
169 appendUnorderedList(builder, (HtmlUnorderedList) node);
170 }
171 else if (node instanceof HtmlPreformattedText) {
172 appendPreformattedText(builder, (HtmlPreformattedText) node);
173 }
174 else if (node instanceof HtmlInlineFrame) {
175 appendInlineFrame(builder, (HtmlInlineFrame) node);
176 }
177 else if (node instanceof HtmlDetails) {
178 appendDetails(builder, (HtmlDetails) node);
179 }
180 else if (node instanceof HtmlNoScript && node.getPage().getWebClient().isJavaScriptEnabled()) {
181
182 }
183 else {
184 appendDomNode(builder, node);
185 }
186 }
187
188
189
190
191
192
193
194 protected void appendDomNode(final HtmlSerializerTextBuilder builder, final DomNode domNode) {
195 boolean block = false;
196 if (!(domNode instanceof HtmlBody)) {
197 final SgmlPage page = domNode.getPage();
198 final WebWindow window = page.getEnclosingWindow();
199 if (window.getWebClient().getOptions().isCssEnabled()) {
200 if (domNode instanceof DomElement) {
201 final String display = window.getComputedStyle((DomElement) domNode, null).getDisplay();
202 block = BLOCK.equals(display);
203 }
204 }
205 else if (domNode instanceof HtmlElement) {
206 block = DisplayStyle.BLOCK == ((HtmlElement) domNode).getDefaultStyleDisplay();
207 }
208 }
209
210 if (block) {
211 builder.appendBlockSeparator();
212 }
213 appendChildren(builder, domNode);
214 if (block) {
215 builder.appendBlockSeparator();
216 }
217 }
218
219
220
221
222
223
224
225 protected void appendSubmitInput(final HtmlSerializerTextBuilder builder, final HtmlSubmitInput htmlSubmitInput) {
226 String text = htmlSubmitInput.getValueAttribute();
227 if (ATTRIBUTE_NOT_DEFINED == text) {
228 text = HtmlSubmitInput.DEFAULT_VALUE;
229 }
230
231 builder.append(text, Mode.NORMALIZE);
232 }
233
234
235
236
237
238
239
240 protected void appendInput(final HtmlSerializerTextBuilder builder, final HtmlInput htmlInput) {
241 builder.append(" ", Mode.NORMALIZE);
242 builder.append(htmlInput.getRawValue(), Mode.NORMALIZE);
243 builder.append(" ", Mode.NORMALIZE);
244 }
245
246
247
248
249
250
251
252 protected void appendNumberInput(final HtmlSerializerTextBuilder builder, final HtmlNumberInput htmlNumberInput) {
253 builder.append(" ", Mode.NORMALIZE);
254
255 String val = htmlNumberInput.getRawValue();
256 final int lastPos = val.length() - 1;
257 if (lastPos >= 0 && val.charAt(lastPos) == '.') {
258 val = val.substring(0, lastPos);
259 }
260 builder.append(val, Mode.NORMALIZE);
261
262 builder.append(" ", Mode.NORMALIZE);
263 }
264
265
266
267
268
269
270
271 protected void appendResetInput(final HtmlSerializerTextBuilder builder, final HtmlResetInput htmlResetInput) {
272 String text = htmlResetInput.getValueAttribute();
273 if (ATTRIBUTE_NOT_DEFINED == text) {
274 text = HtmlResetInput.DEFAULT_VALUE;
275 }
276
277 builder.append(text, Mode.NORMALIZE);
278 }
279
280
281
282
283
284
285 protected void appendUnorderedList(final HtmlSerializerTextBuilder builder,
286 final HtmlUnorderedList htmlUnorderedList) {
287 builder.appendBlockSeparator();
288 boolean first = true;
289 for (final DomNode item : htmlUnorderedList.getChildren()) {
290 if (!first) {
291 builder.appendBlockSeparator();
292 }
293 first = false;
294 appendNode(builder, item);
295 }
296 builder.appendBlockSeparator();
297 }
298
299
300
301
302
303
304 protected void appendDetails(final HtmlSerializerTextBuilder builder,
305 final HtmlDetails htmlDetails) {
306 if (htmlDetails.isOpen()) {
307 appendChildren(builder, htmlDetails);
308 return;
309 }
310
311 for (final DomNode child : htmlDetails.getChildren()) {
312 if (child instanceof HtmlSummary) {
313 appendNode(builder, child);
314 }
315 }
316 }
317
318
319
320
321
322
323 protected void appendTitle(final HtmlSerializerTextBuilder builder, final HtmlTitle htmlTitle) {
324
325
326
327
328 final DomNode child = htmlTitle.getFirstChild();
329 if (child instanceof DomText) {
330 builder.append(((DomText) child).getData(), Mode.NORMALIZE);
331 builder.appendBlockSeparator();
332 }
333 }
334
335
336
337
338
339
340
341 protected void appendTableRow(final HtmlSerializerTextBuilder builder, final HtmlTableRow htmlTableRow) {
342 boolean first = true;
343 for (final HtmlTableCell cell : htmlTableRow.getCells()) {
344 if (!first) {
345 builder.appendTab();
346 }
347 else {
348 first = false;
349 }
350 appendChildren(builder, cell);
351 }
352 }
353
354
355
356
357
358
359
360 protected void appendTextArea(final HtmlSerializerTextBuilder builder, final HtmlTextArea htmlTextArea) {
361 if (isVisible(htmlTextArea)) {
362 builder.append(htmlTextArea.getText(), Mode.PRESERVE_BLANK_NEWLINE);
363 }
364 }
365
366
367
368
369
370
371
372 protected void appendTable(final HtmlSerializerTextBuilder builder, final HtmlTable htmlTable) {
373 builder.appendBlockSeparator();
374 final String caption = htmlTable.getCaptionText();
375 if (caption != null) {
376 builder.append(caption, Mode.NORMALIZE);
377 builder.appendBlockSeparator();
378 }
379
380 boolean first = true;
381
382
383 final HtmlTableHeader tableHeader = htmlTable.getHeader();
384 if (tableHeader != null) {
385 first = appendTableRows(builder, tableHeader.getRows(), true, null, null);
386 }
387 final HtmlTableFooter tableFooter = htmlTable.getFooter();
388
389 final List<HtmlTableRow> tableRows = htmlTable.getRows();
390 first = appendTableRows(builder, tableRows, first, tableHeader, tableFooter);
391
392 if (tableFooter != null) {
393 first = appendTableRows(builder, tableFooter.getRows(), first, null, null);
394 }
395 else if (tableRows.isEmpty()) {
396 final DomNode firstChild = htmlTable.getFirstChild();
397 if (firstChild != null) {
398 appendNode(builder, firstChild);
399 }
400 }
401
402 builder.appendBlockSeparator();
403 }
404
405
406
407
408
409
410
411
412
413
414
415 protected boolean appendTableRows(final HtmlSerializerTextBuilder builder,
416 final List<HtmlTableRow> rows, boolean first, final TableRowGroup skipParent1,
417 final TableRowGroup skipParent2) {
418 for (final HtmlTableRow row : rows) {
419 if (row.getParentNode() == skipParent1 || row.getParentNode() == skipParent2) {
420 continue;
421 }
422 if (!first) {
423 builder.appendBlockSeparator();
424 }
425 first = false;
426 appendTableRow(builder, row);
427 }
428 return first;
429 }
430
431
432
433
434
435
436
437 protected void appendSelect(final HtmlSerializerTextBuilder builder, final HtmlSelect htmlSelect) {
438 final List<HtmlOption> options = htmlSelect.getSelectedOptions();
439
440 for (final Iterator<HtmlOption> i = options.iterator(); i.hasNext();) {
441 final HtmlOption currentOption = i.next();
442 appendChildren(builder, currentOption);
443 if (i.hasNext()) {
444 builder.appendBlockSeparator();
445 }
446 }
447 }
448
449
450
451
452
453
454
455 protected void appendOrderedList(final HtmlSerializerTextBuilder builder, final HtmlOrderedList htmlOrderedList) {
456 builder.appendBlockSeparator();
457 boolean first = true;
458 int i = 1;
459 for (final DomNode item : htmlOrderedList.getChildren()) {
460 if (!first) {
461 builder.appendBlockSeparator();
462 }
463 first = false;
464 if (item instanceof HtmlListItem) {
465 builder.append(Integer.toString(i++), Mode.NORMALIZE);
466 builder.append(". ", Mode.NORMALIZE);
467 appendChildren(builder, item);
468 }
469 else {
470 appendNode(builder, item);
471 }
472 }
473 builder.appendBlockSeparator();
474 }
475
476
477
478
479
480
481
482 protected void appendPreformattedText(final HtmlSerializerTextBuilder builder,
483 final HtmlPreformattedText htmlPreformattedText) {
484 if (isVisible(htmlPreformattedText)) {
485 builder.appendBlockSeparator();
486 builder.append(htmlPreformattedText.getTextContent(), Mode.PRESERVE_BLANK_TAB_NEWLINE);
487 builder.appendBlockSeparator();
488 }
489 }
490
491
492
493
494
495
496
497 protected void appendInlineFrame(final HtmlSerializerTextBuilder builder,
498 final HtmlInlineFrame htmlInlineFrame) {
499 if (isVisible(htmlInlineFrame)) {
500 builder.appendBlockSeparator();
501 final Page page = htmlInlineFrame.getEnclosedPage();
502 if (page instanceof SgmlPage) {
503 builder.append(((SgmlPage) page).asNormalizedText(), Mode.NORMALIZE);
504 }
505 builder.appendBlockSeparator();
506 }
507 }
508
509
510
511
512
513
514
515 protected void appendText(final HtmlSerializerTextBuilder builder, final DomText domText) {
516 final DomNode parent = domText.getParentNode();
517 if (parent == null || parent instanceof HtmlTitle || isVisible(parent)) {
518 builder.append(domText.getData(), Mode.NORMALIZE);
519 }
520 }
521
522
523
524
525
526
527
528 protected void appendBreak(final HtmlSerializerTextBuilder builder, final HtmlBreak htmlBreak) {
529 builder.appendNewLine();
530 }
531
532
533
534
535
536
537
538 protected void appendCheckBoxInput(final HtmlSerializerTextBuilder builder,
539 final HtmlCheckBoxInput htmlCheckBoxInput) {
540 if (htmlCheckBoxInput.isChecked()) {
541 builder.append("checked", Mode.NORMALIZE);
542 }
543 else {
544 builder.append("unchecked", Mode.NORMALIZE);
545 }
546 }
547
548
549
550
551
552
553
554 protected void appendRadioButtonInput(final HtmlSerializerTextBuilder builder,
555 final HtmlRadioButtonInput htmlRadioButtonInput) {
556 if (htmlRadioButtonInput.isChecked()) {
557 builder.append("checked", Mode.NORMALIZE);
558 }
559 else {
560 builder.append("unchecked", Mode.NORMALIZE);
561 }
562 }
563
564 private boolean isVisible(final DomNode node) {
565 return !ignoreMaskedElements_ || node.isDisplayed();
566 }
567
568
569
570
571
572
573 public void setIgnoreMaskedElements(final boolean ignore) {
574 ignoreMaskedElements_ = ignore;
575 }
576
577
578
579
580 protected static class HtmlSerializerTextBuilder {
581
582
583 protected enum Mode {
584
585 NORMALIZE,
586
587
588 PRESERVE_BLANK_TAB_NEWLINE,
589
590
591 PRESERVE_BLANK_NEWLINE
592 }
593
594 private enum State {
595 DEFAULT,
596 EMPTY,
597 TRIM,
598 BLANK_AT_END,
599 BLANK_AT_END_AFTER_NEWLINE,
600 NEWLINE_AT_END,
601 BLOCK_SEPARATOR_AT_END
602 }
603
604 private static final String LINE_SEPARATOR = "\n";
605 private static final int LINE_SEPARATOR_LENGTH = LINE_SEPARATOR.length();
606
607 private State state_;
608 private final StringBuilder builder_;
609 private int trimRightPos_;
610
611
612
613
614 public HtmlSerializerTextBuilder() {
615 builder_ = new StringBuilder();
616 state_ = State.EMPTY;
617 trimRightPos_ = builder_.length();
618 }
619
620
621
622
623
624
625
626 public void append(final String content, final Mode mode) {
627 if (content == null) {
628 return;
629 }
630 final int length = content.length();
631 if (length == 0) {
632 return;
633 }
634
635 String text = content;
636 if (mode == Mode.PRESERVE_BLANK_NEWLINE) {
637 text = StringUtils.stripEnd(text, null);
638 }
639
640 boolean crFound = false;
641 for (final char c : text.toCharArray()) {
642 if (mode == Mode.NORMALIZE) {
643 if (isSpace(c)) {
644 switch (state_) {
645 case EMPTY:
646 case TRIM:
647 case BLANK_AT_END:
648 case BLANK_AT_END_AFTER_NEWLINE:
649 case BLOCK_SEPARATOR_AT_END:
650 break;
651 case NEWLINE_AT_END:
652 builder_.append(' ');
653 state_ = State.BLANK_AT_END_AFTER_NEWLINE;
654 break;
655 default:
656 builder_.append(' ');
657 state_ = State.BLANK_AT_END;
658 break;
659 }
660 }
661 else if (c == (char) 160) {
662 builder_.append(' ');
663 state_ = State.DEFAULT;
664 trimRightPos_ = builder_.length();
665 }
666 else {
667 builder_.append(c);
668 state_ = State.DEFAULT;
669 trimRightPos_ = builder_.length();
670 }
671 continue;
672 }
673
674
675 if (c == '\n') {
676 appendNewLine();
677 crFound = false;
678 }
679 else {
680 if (crFound) {
681 appendNewLine();
682 }
683 crFound = c == '\r';
684
685 if (c == '\t') {
686 if (mode == Mode.PRESERVE_BLANK_TAB_NEWLINE) {
687 appendTab();
688 }
689 else if (state_ != State.BLOCK_SEPARATOR_AT_END) {
690 builder_.append(' ');
691 }
692 }
693 else if (c == (char) 160) {
694 appendBlank();
695 }
696 else if (c == ' ') {
697 appendBlank();
698 }
699 else {
700 builder_.append(c);
701 }
702 trimRightPos_ = builder_.length();
703 }
704 }
705
706 if (crFound) {
707 appendNewLine();
708 }
709
710 if (mode != Mode.NORMALIZE) {
711
712 state_ = State.TRIM;
713 }
714 }
715
716
717
718
719 public void appendBlockSeparator() {
720 switch (state_) {
721 case EMPTY:
722 break;
723 case BLANK_AT_END:
724 builder_.setLength(trimRightPos_);
725 if (builder_.length() == 0) {
726 state_ = State.EMPTY;
727 }
728 else {
729 builder_.append(LINE_SEPARATOR);
730 state_ = State.BLOCK_SEPARATOR_AT_END;
731 }
732 break;
733 case BLANK_AT_END_AFTER_NEWLINE:
734 builder_.setLength(trimRightPos_ - LINE_SEPARATOR_LENGTH);
735 trimRightPos_ = trimRightPos_ - LINE_SEPARATOR_LENGTH;
736 if (builder_.length() == 0) {
737 state_ = State.EMPTY;
738 }
739 else {
740 builder_.append(LINE_SEPARATOR);
741 state_ = State.BLOCK_SEPARATOR_AT_END;
742 }
743 break;
744 case BLOCK_SEPARATOR_AT_END:
745 break;
746 case NEWLINE_AT_END:
747 builder_.setLength(builder_.length() - LINE_SEPARATOR_LENGTH);
748 trimRightPos_ = trimRightPos_ - LINE_SEPARATOR_LENGTH;
749 if (builder_.length() == 0) {
750 state_ = State.EMPTY;
751 }
752 else {
753 builder_.append(LINE_SEPARATOR);
754 state_ = State.BLOCK_SEPARATOR_AT_END;
755 }
756 break;
757 default:
758 builder_.append(LINE_SEPARATOR);
759 state_ = State.BLOCK_SEPARATOR_AT_END;
760 break;
761 }
762 }
763
764
765
766
767 public void appendNewLine() {
768 builder_.append(LINE_SEPARATOR);
769 state_ = State.NEWLINE_AT_END;
770 trimRightPos_ = builder_.length();
771 }
772
773
774
775
776 public void appendTab() {
777 builder_.append('\t');
778 trimRightPos_ = builder_.length();
779 }
780
781
782
783
784 private void appendBlank() {
785 builder_.append(' ');
786 trimRightPos_ = builder_.length();
787 }
788
789
790
791
792 public String getText() {
793 return builder_.substring(0, trimRightPos_);
794 }
795
796 private static boolean isSpace(final char ch) {
797 return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r';
798 }
799 }
800 }