stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2024 Amebis
4*/
5
6#pragma once
7
8#include "assert.hpp"
9#include "compat.hpp"
10#include "exception.hpp"
11#include "interval.hpp"
12#include "mapping.hpp"
13#include "parser.hpp"
14#include "progress.hpp"
15#include "sgml.hpp"
16#include "string.hpp"
17#include "system.hpp"
18#include "unicode.hpp"
19#include <exception>
20#include <list>
21#include <map>
22#include <memory>
23#include <stdexcept>
24#include <string_view>
25#include <string>
26#include <vector>
27
28#ifdef _WIN32
29#undef small
30#endif
31
32namespace stdex
33{
34 namespace html
35 {
43 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
44 void escape(
45 _Inout_ std::basic_string<char, TR, AX>& dst,
46 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
47 {
48 stdex_assert(src || !num_chars);
49 for (size_t i = 0; i < num_chars && src[i]; ++i) {
50 switch (src[i]) {
51 case '&': dst += "&amp;"; break;
52 case ';': dst += "&semi;"; break;
53 case '\"': dst += "&quot;"; break;
54 case '\'': dst += "&#x27;"; break;
55 case '<': dst += "&lt;"; break;
56 case '>': dst += "&gt;"; break;
57 case 0x00a0: dst += "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
58 default: dst += src[i]; break;
59 }
60 }
61 }
62
70 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
71 void escape(
72 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
73 _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
74 {
75 stdex_assert(src || !num_chars);
76 for (size_t i = 0; i < num_chars && src[i]; ++i) {
77 switch (src[i]) {
78 case L'&': dst += L"&amp;"; break;
79 case L';': dst += L"&semi;"; break;
80 case L'\"': dst += L"&quot;"; break;
81 case L'\'': dst += L"&#x27;"; break;
82 case L'<': dst += L"&lt;"; break;
83 case L'>': dst += L"&gt;"; break;
84 case L'\u00a0': dst += L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
85 default: dst += src[i]; break;
86 }
87 }
88 }
89
96 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
97 void escape(
98 _Inout_ std::basic_string<T, TR, AX>& dst,
99 _In_ const T (&src)[N])
100 {
101 escape(dst, src, N);
102 }
103
110 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
111 void escape(
112 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
113 _In_ const std::basic_string<T, TR_src, AX_src>& src)
114 {
115 escape(dst, src.data(), src.size());
116 }
117
124 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
125 void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_ char chr)
126 {
127 switch (chr) {
128 case '&': dst += "&amp;"; break;
129 case '<': dst += "&lt;"; break;
130 case '>': dst += "&gt;"; break;
131 case 0x00a0: dst += "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
132 default: dst += chr; break;
133 }
134 }
135
142 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
143 void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_ wchar_t chr)
144 {
145 switch (chr) {
146 case L'&': dst += L"&amp;"; break;
147 case L'<': dst += L"&lt;"; break;
148 case L'>': dst += L"&gt;"; break;
149 case L'\u00a0': dst += L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
150 default: dst += chr; break;
151 }
152 }
153
161 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
162 void escape_min(
163 _Inout_ std::basic_string<char, TR, AX>& dst,
164 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
165 {
166 stdex_assert(src || !num_chars);
167 for (size_t i = 0; i < num_chars && src[i]; ++i) {
168 switch (src[i]) {
169 case '&': dst += "&amp;"; break;
170 case '<': dst += "&lt;"; break;
171 case '>': dst += "&gt;"; break;
172 case 0x00a0: dst += "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
173 default: dst += src[i]; break;
174 }
175 }
176 }
177
185 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
186 void escape_min(
187 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
188 _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
189 {
190 stdex_assert(src || !num_chars);
191 for (size_t i = 0; i < num_chars && src[i]; ++i) {
192 switch (src[i]) {
193 case L'&': dst += L"&amp;"; break;
194 case L'<': dst += L"&lt;"; break;
195 case L'>': dst += L"&gt;"; break;
196 case L'\u00a0': dst += L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
197 default: dst += src[i]; break;
198 }
199 }
200 }
201
208 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
209 void escape_min(
210 _Inout_ std::basic_string<T, TR, AX>& dst,
211 _In_ const T (&src)[N])
212 {
213 escape_min(dst, src, N);
214 }
215
222 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
223 void escape_min(
224 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
225 _In_ const std::basic_string<T, TR_src, AX_src>& src)
226 {
227 escape_min(dst, src.data(), src.size());
228 }
229
237 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
238 void url_unescape(
239 _Inout_ std::basic_string<char, TR, AX>& dst,
240 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
241 {
242 stdex_assert(src || !num_chars);
243 for (size_t i = 0; i < num_chars && src[i];) {
244 switch (src[i]) {
245 case '+':
246 dst += ' '; i++;
247 break;
248
249 case '%': {
250 i++;
251
252 char chr;
253 if ('0' <= src[i] && src[i] <= '9') chr = static_cast<char>((src[i++] - '0') << 4);
254 else if ('A' <= src[i] && src[i] <= 'F') chr = static_cast<char>((src[i++] - 'A' + 10) << 4);
255 else if ('a' <= src[i] && src[i] <= 'f') chr = static_cast<char>((src[i++] - 'a' + 10) << 4);
256 else { dst += '%'; continue; }
257 if ('0' <= src[i] && src[i] <= '9') chr |= static_cast<char>((src[i++] - '0'));
258 else if ('A' <= src[i] && src[i] <= 'F') chr |= static_cast<char>((src[i++] - 'A' + 10));
259 else if ('a' <= src[i] && src[i] <= 'f') chr |= static_cast<char>((src[i++] - 'a' + 10));
260 else { dst += '%'; dst += src[i - 1]; continue; }
261
262 dst += chr;
263 break;
264 }
265
266 default:
267 dst += src[i++];
268 }
269 }
270 }
271
278 template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
279 void url_unescape(
280 _Inout_ std::basic_string<char, TR, AX>& dst,
281 _In_ const char (&src)[N])
282 {
283 url_unescape(dst, src, N);
284 }
285
292 template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
293 void url_unescape(
294 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
295 _In_ const std::basic_string_view<char, std::char_traits<char>> src)
296 {
297 url_unescape(dst, src.data(), src.size());
298 }
299
307 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
308 void url_escape(
309 _Inout_ std::basic_string<char, TR, AX>& dst,
310 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
311 {
312 stdex_assert(src || !num_chars);
313 for (size_t i = 0; i < num_chars && src[i]; ++i) {
314 switch (src[i]) {
315 case ' ': dst += "%20"; break;
316 case '<': dst += "%3C"; break;
317 case '>': dst += "%3E"; break;
318 case '#': dst += "%23"; break;
319 case '%': dst += "%25"; break;
320 case '{': dst += "%7B"; break;
321 case '}': dst += "%7D"; break;
322 case '|': dst += "%7C"; break;
323 case '\\': dst += "%5C"; break;
324 case '^': dst += "%5E"; break;
325 case '~': dst += "%7E"; break;
326 case '[': dst += "%5B"; break;
327 case ']': dst += "%5D"; break;
328 case '`': dst += "%60"; break;
329 case ';': dst += "%3B"; break;
330 case '+': dst += "%2B"; break;
331 case '/': dst += "%2F"; break;
332 case '?': dst += "%3F"; break;
333 case ':': dst += "%3A"; break;
334 case '@': dst += "%40"; break;
335 case '=': dst += "%3D"; break;
336 case '&': dst += "%26"; break;
337 case '$': dst += "%24"; break;
338 default:
339 if (0x20 < static_cast<uint8_t>(src[i]) && static_cast<uint8_t>(src[i]) < 0x7f)
340 dst += src[i];
341 else {
342 dst += '%';
343 uint8_t n = (static_cast<uint8_t>(src[i]) & 0xf0) >> 4;
344 dst += n < 10 ? static_cast<char>('0' + n) : static_cast<char>('A' + n - 10);
345 n = ((uint8_t)src[i] & 0x0f);
346 dst += n < 10 ? static_cast<char>('0' + n) : static_cast<char>('A' + n - 10);
347 }
348 }
349 }
350 }
351
358 template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
359 void url_escape(
360 _Inout_ std::basic_string<char, TR, AX>& dst,
361 _In_ const char (&src)[N])
362 {
363 url_escape(dst, src, N);
364 }
365
372 template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
373 void url_escape(
374 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
375 _In_ const std::basic_string_view<char, std::char_traits<char>> src)
376 {
377 url_escape(dst, src.data(), src.size());
378 }
379
387 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
388 void css_unescape(
389 _Inout_ std::basic_string<T, TR, AX>& dst,
390 _In_reads_or_z_opt_(num_chars) const T* src, _In_ size_t num_chars)
391 {
392 stdex_assert(src || !num_chars);
393 for (size_t i = 0; i < num_chars && src[i];) {
394 if (src[i] != '\\')
395 dst += src[i++];
396 else if (i + 1 < num_chars) {
397 i++;
398
399 switch (src[i]) {
400 // Classic escapes
401 case 'n': dst += '\n'; i++; break;
402 case 'r': dst += '\r'; i++; break;
403 case 't': dst += '\t'; i++; break;
404
405 // `\` at the end of the line
406 case '\n': i++; break;
407
408 // `\nnnn` escape
409 case '0':
410 case '1':
411 case '2':
412 case '3':
413 case '4':
414 case '5':
415 case '6':
416 case '7':
417 case '8':
418 case '9':
419 case 'A': case 'a':
420 case 'B': case 'b':
421 case 'C': case 'c':
422 case 'D': case 'd':
423 case 'E': case 'e':
424 case 'F': case 'f': {
425 wchar_t chr = 0;
426 size_t end = std::min(num_chars, i + 6);
427
428 for (; i < end; ++i) {
429 if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
430 else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
431 else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
432 else break;
433 }
434
435 dst += static_cast<T>(chr);
436
437 if (i < end && src[i] == ' ') {
438 // Skip space after `\nnnn`.
439 i++;
440 }
441 break;
442 }
443
444 default: dst += src[i++];
445 }
446 }
447 }
448 }
449
456 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
457 void css_unescape(
458 _Inout_ std::basic_string<T, TR, AX>& dst,
459 _In_ const T (&src)[N])
460 {
461 css_unescape(dst, src, N);
462 }
463
470 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
471 void css_unescape(
472 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
473 _In_ const std::basic_string<T, TR_src, AX_src>& src)
474 {
475 css_unescape(dst, src.data(), src.size());
476 }
477
485 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
486 void css_escape(
487 _Inout_ std::basic_string<char, TR, AX>& dst,
488 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
489 {
490 stdex_assert(src || !num_chars);
491 for (size_t i = 0; i < num_chars && src[i]; ++i) {
492 switch (src[i]) {
493 case '\\': dst += "\\\\"; break;
494 case '\n': dst += "\\n"; break;
495 case '\r': dst += "\\r"; break;
496 case '\t': dst += "\\t"; break;
497 case '\"': dst += "\\\""; break;
498 case '\'': dst += "\\'"; break;
499 default: dst += src[i]; break;
500 }
501 }
502 }
503
511 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
512 void css_escape(
513 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
514 _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
515 {
516 stdex_assert(src || !num_chars);
517 for (size_t i = 0; i < num_chars && src[i]; ++i) {
518 switch (src[i]) {
519 case L'\\': dst += L"\\\\"; break;
520 case L'\n': dst += L"\\n"; break;
521 case L'\r': dst += L"\\r"; break;
522 case L'\t': dst += L"\\t"; break;
523 case L'\"': dst += L"\\\""; break;
524 case L'\'': dst += L"\\'"; break;
525 default: dst += src[i]; break;
526 }
527 }
528 }
529
536 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
537 void css_escape(
538 _Inout_ std::basic_string<T, TR, AX>& dst,
539 _In_ const T (&src)[N])
540 {
541 css_escape(dst, src, N);
542 }
543
550 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
551 void css_escape(
552 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
553 _In_ const std::basic_string<T, TR_src, AX_src>& src)
554 {
555 css_escape(dst, src.data(), src.size());
556 }
557
/// HTML element codes.
///
/// Named elements are listed alphabetically and numbered consecutively starting at 1
/// (`a`) up to `xmp`. Zero and the negative values are reserved for the special codes below.
/// Elements tagged "MS" are marked as Microsoft specific.
enum class element_t : int {
    // Special codes
    CDATA = -3,
    PCDATA = -2,
    unknown = -1,
    empty = 0,

    // A
    a, abbr, acronym, address, applet, area,
    // B (MS: bgsound, blink)
    b, base, basefont, bdo, bgsound, big, blink, blockquote, body, br, button,
    // C (MS: comment)
    caption, center, cite, code, col, colgroup, comment,
    // D
    dd, del, dfn, dir, div, dl, dt,
    // E (MS: embed)
    em, embed,
    // F
    fieldset, font, form, frame, frameset,
    // H
    h1, h2, h3, h4, h5, h6, head, hr, html,
    // I
    i, iframe, img, input, ins, isindex,
    // K
    kbd,
    // L (MS: listing)
    label, legend, li, link, listing,
    // M (MS: marquee)
    map, marquee, menu, meta,
    // N (MS: nextid, nobr, noembed)
    nextid, nobr, noembed, noframes, noscript,
    // O
    object, ol, optgroup, option,
    // P (MS: plaintext)
    p, param, plaintext, pre,
    // Q
    q,
    // R (MS: rt, ruby)
    rt, ruby,
    // S
    s, samp, script, select, small, span, strike, strong, style, sub, sup,
    // T
    table, tbody, td, textarea, tfoot, th, thead, title, tr, tt,
    // U
    u, ul,
    // V
    var,
    // W (MS: wbr)
    wbr,
    // X (MS: xmp)
    xmp,
};
673
/// Classification of an element by how its end tag is treated.
enum class element_span_t : int {
    needs_end = 0,    ///< Default classification, also returned by span() for codes it does not list
    end_optional = 1, ///< Second classification (name suggests the end tag may be omitted)
    immediate = 2,    ///< Third classification (name suggests the element ends immediately)
};
682
687 {
693 static element_span_t span(_In_ element_t code)
694 {
695 static element_span_t lookup[] = {
696 element_span_t::needs_end, // a
697 element_span_t::needs_end, // abbr
698 element_span_t::needs_end, // acronym
699 element_span_t::needs_end, // address
700 element_span_t::needs_end, // applet
701 element_span_t::immediate, // area
702 element_span_t::needs_end, // b
703 element_span_t::immediate, // base
704 element_span_t::immediate, // basefont
705 element_span_t::needs_end, // bdo
706 element_span_t::immediate, // bgsound
707 element_span_t::needs_end, // big
708 element_span_t::needs_end, // blink
709 element_span_t::needs_end, // blockquote
710 element_span_t::end_optional, // body
711 element_span_t::immediate, // br
712 element_span_t::needs_end, // button
713 element_span_t::needs_end, // caption
714 element_span_t::needs_end, // center
715 element_span_t::needs_end, // cite
716 element_span_t::needs_end, // code
717 element_span_t::immediate, // col
718 element_span_t::end_optional, // colgroup
719 element_span_t::needs_end, // comment
720 element_span_t::end_optional, // dd
721 element_span_t::needs_end, // del
722 element_span_t::needs_end, // dfn
723 element_span_t::needs_end, // dir
724 element_span_t::needs_end, // div
725 element_span_t::needs_end, // dl
726 element_span_t::end_optional, // dt
727 element_span_t::needs_end, // em
728 element_span_t::immediate, // embed
729 element_span_t::needs_end, // fieldset
730 element_span_t::needs_end, // font
731 element_span_t::needs_end, // form
732 element_span_t::immediate, // frame
733 element_span_t::needs_end, // frameset
734 element_span_t::needs_end, // h1
735 element_span_t::needs_end, // h2
736 element_span_t::needs_end, // h3
737 element_span_t::needs_end, // h4
738 element_span_t::needs_end, // h5
739 element_span_t::needs_end, // h6
740 element_span_t::end_optional, // head
741 element_span_t::immediate, // hr
742 element_span_t::end_optional, // html
743 element_span_t::needs_end, // i
744 element_span_t::needs_end, // iframe
745 element_span_t::immediate, // img
746 element_span_t::immediate, // input
747 element_span_t::needs_end, // ins
748 element_span_t::immediate, // isindex
749 element_span_t::needs_end, // kbd
750 element_span_t::needs_end, // label
751 element_span_t::needs_end, // legend
752 element_span_t::end_optional, // li
753 element_span_t::immediate, // link
754 element_span_t::needs_end, // listing
755 element_span_t::needs_end, // map
756 element_span_t::needs_end, // marquee
757 element_span_t::needs_end, // menu
758 element_span_t::immediate, // meta
759 element_span_t::immediate, // nextid
760 element_span_t::needs_end, // nobr
761 element_span_t::needs_end, // noembed
762 element_span_t::needs_end, // noframes
763 element_span_t::needs_end, // noscript
764 element_span_t::needs_end, // object
765 element_span_t::needs_end, // ol
766 element_span_t::needs_end, // optgroup
767 element_span_t::end_optional, // option
768 element_span_t::end_optional, // p
769 element_span_t::immediate, // param
770 element_span_t::end_optional, // plaintext
771 element_span_t::needs_end, // pre
772 element_span_t::needs_end, // q
773 element_span_t::immediate, // rt
774 element_span_t::needs_end, // ruby
775 element_span_t::needs_end, // s
776 element_span_t::needs_end, // samp
777 element_span_t::needs_end, // script
778 element_span_t::needs_end, // select
779 element_span_t::needs_end, // small
780 element_span_t::needs_end, // span
781 element_span_t::needs_end, // strike
782 element_span_t::needs_end, // strong
783 element_span_t::needs_end, // style
784 element_span_t::needs_end, // sub
785 element_span_t::needs_end, // sup
786 element_span_t::needs_end, // table
787 element_span_t::end_optional, // tbody
788 element_span_t::end_optional, // td
789 element_span_t::needs_end, // textarea
790 element_span_t::end_optional, // tfoot
791 element_span_t::end_optional, // th
792 element_span_t::end_optional, // thead
793 element_span_t::needs_end, // title
794 element_span_t::end_optional, // tr
795 element_span_t::needs_end, // tt
796 element_span_t::needs_end, // u
797 element_span_t::needs_end, // ul
798 element_span_t::needs_end, // var
799 element_span_t::immediate, // wbr
800 element_span_t::needs_end, // xmp
801 };
802 return element_t::a <= code && code <= element_t::xmp ?
803 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
804 element_span_t::needs_end;
805 }
806
812 static bool is_fontstyle(_In_ element_t code)
813 {
814 switch (code) {
815 case element_t::tt:
816 case element_t::i:
817 case element_t::b:
818 case element_t::u:
819 case element_t::s:
820 case element_t::strike:
821 case element_t::blink:
822 case element_t::big:
823 case element_t::small:
824 return true;
825 default:
826 return false;
827 };
828 }
829
835 static bool is_phrase(_In_ element_t code)
836 {
837 switch (code) {
838 case element_t::em:
839 case element_t::strong:
840 case element_t::dfn:
841 case element_t::code:
842 case element_t::samp:
843 case element_t::kbd:
844 case element_t::var:
845 case element_t::cite:
846 case element_t::abbr:
847 case element_t::acronym:
848 case element_t::xmp:
849 return true;
850 default:
851 return false;
852 };
853 }
854
860 static bool is_special(_In_ element_t code)
861 {
862 switch (code) {
863 case element_t::a:
864 case element_t::img:
865 case element_t::applet:
866 case element_t::object:
867 case element_t::embed:
868 case element_t::font:
869 case element_t::basefont:
870 case element_t::br:
871 case element_t::wbr:
872 case element_t::rt:
873 case element_t::script:
874 case element_t::map:
875 case element_t::q:
876 case element_t::sub:
877 case element_t::sup:
878 case element_t::ruby:
879 case element_t::span:
880 case element_t::bdo:
881 case element_t::iframe:
882 case element_t::nobr:
883 return true;
884 default:
885 return false;
886 };
887 }
888
894 static bool is_formctrl(_In_ element_t code)
895 {
896 switch (code) {
897 case element_t::input:
898 case element_t::select:
899 case element_t::textarea:
900 case element_t::label:
901 case element_t::button:
902 return true;
903 default:
904 return false;
905 };
906 }
907
913 static bool is_inline(_In_ element_t code)
914 {
915 return
916 code == element_t::PCDATA ||
917 is_fontstyle(code) ||
918 is_phrase(code) ||
919 is_special(code) ||
920 is_formctrl(code);
921 }
922
928 static bool is_heading(_In_ element_t code)
929 {
930 switch (code) {
931 case element_t::h1:
932 case element_t::h2:
933 case element_t::h3:
934 case element_t::h4:
935 case element_t::h5:
936 case element_t::h6:
937 return true;
938 default:
939 return false;
940 };
941 }
942
948 static bool is_list(_In_ element_t code)
949 {
950 switch (code) {
951 case element_t::ul:
952 case element_t::ol:
953 case element_t::dir:
954 case element_t::menu:
955 return true;
956 default:
957 return false;
958 };
959 }
960
966 static bool is_preformatted(_In_ element_t code)
967 {
968 switch (code) {
969 case element_t::pre:
970 case element_t::listing:
971 return true;
972 default:
973 return false;
974 }
975 }
976
982 static bool is_block(_In_ element_t code)
983 {
984 if (is_heading(code) ||
985 is_list(code) ||
986 is_preformatted(code)) return true;
987 switch (code) {
988 case element_t::p:
989 case element_t::dl:
990 case element_t::div:
991 case element_t::center:
992 case element_t::marquee:
993 case element_t::noscript:
994 case element_t::noframes:
995 case element_t::noembed:
996 case element_t::blockquote:
997 case element_t::form:
998 case element_t::isindex:
999 case element_t::hr:
1000 case element_t::table:
1001 case element_t::fieldset:
1002 case element_t::address:
1003 return true;
1004 default:
1005 return false;
1006 };
1007 }
1008
1014 static bool is_flow(_In_ element_t code)
1015 {
1016 return is_block(code) || is_inline(code);
1017 }
1018
1024 static bool is_head_content(_In_ element_t code)
1025 {
1026 switch (code) {
1027 case element_t::title:
1028 case element_t::isindex:
1029 case element_t::base:
1030 case element_t::nextid:
1031 return true;
1032 default:
1033 return false;
1034 };
1035 }
1036
1042 static bool is_head_misc(_In_ element_t code)
1043 {
1044 switch (code) {
1045 case element_t::script:
1046 case element_t::style:
1047 case element_t::meta:
1048 case element_t::link:
1049 case element_t::object:
1050 return true;
1051 default:
1052 return false;
1053 };
1054 }
1055
1061 static bool is_pre_exclusion(_In_ element_t code)
1062 {
1063 switch (code) {
1064 case element_t::img:
1065 case element_t::object:
1066 case element_t::applet:
1067 case element_t::embed:
1068 case element_t::big:
1069 case element_t::small:
1070 case element_t::sub:
1071 case element_t::sup:
1072 case element_t::ruby:
1073 case element_t::font:
1074 case element_t::basefont:
1075 case element_t::nobr:
1076 return true;
1077 default:
1078 return false;
1079 };
1080 }
1081
1087 static bool is_html_content(_In_ element_t code)
1088 {
1089 switch (code) {
1090 case element_t::head:
1091 case element_t::body:
1092 case element_t::frameset:
1093 return true;
1094 default:
1095 return false;
1096 };
1097 }
1098
1104 static bool is_group(_In_ element_t code)
1105 {
1106 if (is_block(code) ||
1107 is_html_content(code) ||
1108 is_head_content(code)) return true;
1109 switch (code) {
1110 case element_t::col:
1111 case element_t::colgroup:
1112 case element_t::dd:
1113 case element_t::dir:
1114 case element_t::dt:
1115 case element_t::frame:
1116 case element_t::iframe:
1117 case element_t::legend:
1118 case element_t::td:
1119 case element_t::th:
1120 case element_t::tr:
1121 return true;
1122 default:
1123 return false;
1124 };
1125 }
1126
1135 static bool may_contain(_In_ element_t parent, _In_ element_t child)
1136 {
1137 if (child == element_t::unknown || child == element_t::comment)
1138 return true;
1139 if (is_fontstyle(parent) || is_phrase(parent))
1140 return is_inline(child);
1141 if (is_heading(parent))
1142 return is_inline(child);
1143
1144 switch (parent) {
1145 case element_t::a: return is_inline(child) && child != element_t::a;
1146 case element_t::address: return is_inline(child) || child == element_t::p;
1147 case element_t::applet: return is_flow(child) || child == element_t::param;
1148 case element_t::area: return false;
1149 case element_t::base: return false;
1150 case element_t::basefont: return false;
1151 case element_t::bdo: return is_inline(child);
1152 case element_t::blockquote: return is_flow(child);
1153 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
1154 case element_t::br: return false;
1155 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1156 case element_t::caption: return is_inline(child);
1157 case element_t::center: return is_flow(child);
1158 case element_t::col: return false;
1159 case element_t::colgroup: return child == element_t::col;
1160 case element_t::comment: return child == element_t::CDATA;
1161 case element_t::dd: return is_flow(child);
1162 case element_t::del: return is_flow(child);
1163 case element_t::dir: return child == element_t::li;
1164 case element_t::div: return is_flow(child);
1165 case element_t::dl: return child == element_t::dt || child == element_t::dd;
1166 case element_t::dt: return is_inline(child);
1167 case element_t::embed: return is_flow(child) || child == element_t::param;
1168 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1169 case element_t::font: return is_inline(child);
1170 case element_t::form: return is_flow(child) && child != element_t::form;
1171 case element_t::frame: return false;
1172 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1173 case element_t::head: return is_head_content(child) || is_head_misc(child);
1174 case element_t::hr: return false;
1175 case element_t::html: return is_html_content(child);
1176 case element_t::iframe: return is_flow(child);
1177 case element_t::img: return false;
1178 case element_t::input: return false;
1179 case element_t::ins: return is_flow(child);
1180 case element_t::isindex: return false;
1181 case element_t::label: return is_inline(child) && child != element_t::label;
1182 case element_t::legend: return is_inline(child);
1183 case element_t::li: return is_flow(child);
1184 case element_t::link: return false;
1185 case element_t::listing: return child == element_t::CDATA;
1186 case element_t::map: return is_block(child) || child == element_t::area;
1187 case element_t::marquee: return is_flow(child);
1188 case element_t::menu: return child == element_t::li;
1189 case element_t::meta: return false;
1190 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1191 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1192 case element_t::noscript: return is_flow(child);
1193 case element_t::noembed: return is_flow(child);
1194 case element_t::object: return is_flow(child) || child == element_t::param;
1195 case element_t::ol: return child == element_t::li;
1196 case element_t::optgroup: return child == element_t::option;
1197 case element_t::option: return child == element_t::PCDATA;
1198 case element_t::p: return is_inline(child);
1199 case element_t::param: return false;
1200 case element_t::plaintext: return is_flow(child);
1201 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1202 case element_t::q: return is_inline(child);
1203 case element_t::rt: return false;
1204 case element_t::ruby: return is_inline(child);
1205 case element_t::script: return child == element_t::CDATA;
1206 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1207 case element_t::span: return is_inline(child);
1208 case element_t::style: return child == element_t::CDATA;
1209 case element_t::sub: return is_inline(child);
1210 case element_t::sup: return is_inline(child);
1211 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1212 case element_t::tbody: return child == element_t::tr;
1213 case element_t::td: return is_flow(child);
1214 case element_t::textarea: return child == element_t::PCDATA;
1215 case element_t::tfoot: return child == element_t::tr;
1216 case element_t::th: return is_flow(child);
1217 case element_t::thead: return child == element_t::tr;
1218 case element_t::title: return child == element_t::PCDATA;
1219 case element_t::tr: return child == element_t::td || child == element_t::th;
1220 case element_t::ul: return child == element_t::li;
1221 case element_t::wbr: return false;
1222 case element_t::unknown: return true;
1223 default: return false;
1224 }
1225 }
1226
1234 template <class T>
1235 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1236 {
1237 stdex_assert(attr_name || !num_chars);
1238 switch (code) {
1239 case element_t::a: return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
1240 case element_t::applet: return stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) == 0 ||
1241 stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) == 0 ||
1242 stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
1243 case element_t::area: return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
1244 case element_t::base: return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
1245 case element_t::bgsound: return stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
1246 case element_t::blockquote: return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
1247 case element_t::body: return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
1248 case element_t::comment: return stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) == 0;
1249 case element_t::del: return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
1250 case element_t::embed: return stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) == 0 ||
1251 stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
1252 case element_t::form: return stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX) == 0;
1253 case element_t::frame: return stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) == 0 ||
1254 stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
1255 case element_t::head: return stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX) == 0;
1256 case element_t::iframe: return stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) == 0 ||
1257 stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
1258 case element_t::img: return stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) == 0 ||
1259 stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) == 0 ||
1260 stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0 ||
1261 stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX) == 0;
1262 case element_t::input: return stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) == 0 ||
1263 stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0 ||
1264 stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX) == 0;
1265 case element_t::ins: return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
1266 case element_t::link: return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
1267 case element_t::object: return stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) == 0 ||
1268 stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) == 0 ||
1269 stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) == 0 ||
1270 stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) == 0 ||
1271 stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) == 0 ||
1272 stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX) == 0;
1273 case element_t::q: return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
1274 case element_t::script: return stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
1275 case element_t::table: return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
1276 case element_t::td: return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
1277 case element_t::th: return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
1278 default: return false;
1279 }
1280 }
1281
1289 template <class T>
1290 static bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1291 {
1292 stdex_assert(attr_name || !num_chars);
1293 if (stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX) == 0)
1294 return true;
1295 switch (code) {
1296 case element_t::applet: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
1297 case element_t::area: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
1298 case element_t::img: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
1299 case element_t::input: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
1300 case element_t::object: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
1301 case element_t::table: return stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX) == 0;
1302 case element_t::td: return stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX) == 0;
1303 case element_t::th: return stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX) == 0;
1304 default: return false;
1305 }
1306 }
1307 };
1308
1309 class sequence;
1310 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1311
1316 {
1317 public:
1318 stdex::parser::html_sequence_t type;
1321
1322 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1323 type(_type),
1324 interval(start, end),
1325 parent(_parent)
1326 {}
1327
1328 virtual ~sequence() {} // make polymorphic
1329 };
1330
1334 class element : public sequence
1335 {
1336 public:
1337 template <class T>
1338 element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1339 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1340 code(element_code(src + tag.name.start, tag.name.size())),
1341 name(std::move(tag.name)),
1342 attributes(std::move(tag.attributes))
1343 {}
1344
1345 template <class T>
1346 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1347 {
1348 static const struct {
1349 const char* name;
1350 element_t code;
1351 } mapping[] = {
1352 { "a", element_t::a, },
1353 { "abbr", element_t::abbr, },
1354 { "acronym", element_t::acronym, },
1355 { "address", element_t::address, },
1356 { "applet", element_t::applet, },
1357 { "area", element_t::area, },
1358 { "b", element_t::b, },
1359 { "base", element_t::base, },
1360 { "basefont", element_t::basefont, },
1361 { "bdo", element_t::bdo, },
1362 { "bgsound", element_t::bgsound, },
1363 { "big", element_t::big, },
1364 { "blink", element_t::blink, },
1365 { "blockquote", element_t::blockquote, },
1366 { "body", element_t::body, },
1367 { "br", element_t::br, },
1368 { "button", element_t::button, },
1369 { "caption", element_t::caption, },
1370 { "center", element_t::center, },
1371 { "cite", element_t::cite, },
1372 { "code", element_t::code, },
1373 { "col", element_t::col, },
1374 { "colgroup", element_t::colgroup, },
1375 { "comment", element_t::comment, },
1376 { "dd", element_t::dd, },
1377 { "del", element_t::del, },
1378 { "dfn", element_t::dfn, },
1379 { "dir", element_t::dir, },
1380 { "div", element_t::div, },
1381 { "dl", element_t::dl, },
1382 { "dt", element_t::dt, },
1383 { "em", element_t::em, },
1384 { "embed", element_t::embed, },
1385 { "fieldset", element_t::fieldset, },
1386 { "font", element_t::font, },
1387 { "form", element_t::form, },
1388 { "frame", element_t::frame, },
1389 { "frameset", element_t::frameset, },
1390 { "h1", element_t::h1, },
1391 { "h2", element_t::h2, },
1392 { "h3", element_t::h3, },
1393 { "h4", element_t::h4, },
1394 { "h5", element_t::h5, },
1395 { "h6", element_t::h6, },
1396 { "head", element_t::head, },
1397 { "hr", element_t::hr, },
1398 { "html", element_t::html, },
1399 { "i", element_t::i, },
1400 { "iframe", element_t::iframe, },
1401 { "img", element_t::img, },
1402 { "input", element_t::input, },
1403 { "ins", element_t::ins, },
1404 { "isindex", element_t::isindex, },
1405 { "kbd", element_t::kbd, },
1406 { "label", element_t::label, },
1407 { "legend", element_t::legend, },
1408 { "li", element_t::li, },
1409 { "link", element_t::link, },
1410 { "listing", element_t::listing, },
1411 { "map", element_t::map, },
1412 { "marquee", element_t::marquee, },
1413 { "menu", element_t::menu, },
1414 { "meta", element_t::meta, },
1415 { "nextid", element_t::nextid, },
1416 { "nobr", element_t::nobr, },
1417 { "noembed", element_t::noembed, },
1418 { "noframes", element_t::noframes, },
1419 { "noscript", element_t::noscript, },
1420 { "object", element_t::object, },
1421 { "ol", element_t::ol, },
1422 { "optgroup", element_t::optgroup, },
1423 { "option", element_t::option, },
1424 { "p", element_t::p, },
1425 { "param", element_t::param, },
1426 { "plaintext", element_t::plaintext, },
1427 { "pre", element_t::pre, },
1428 { "q", element_t::q, },
1429 { "rt", element_t::rt, },
1430 { "ruby", element_t::ruby, },
1431 { "s", element_t::s, },
1432 { "samp", element_t::samp, },
1433 { "script", element_t::script, },
1434 { "select", element_t::select, },
1435 { "small", element_t::small, },
1436 { "span", element_t::span, },
1437 { "strike", element_t::strike, },
1438 { "strong", element_t::strong, },
1439 { "style", element_t::style, },
1440 { "sub", element_t::sub, },
1441 { "sup", element_t::sup, },
1442 { "table", element_t::table, },
1443 { "tbody", element_t::tbody, },
1444 { "td", element_t::td, },
1445 { "textarea", element_t::textarea, },
1446 { "tfoot", element_t::tfoot, },
1447 { "th", element_t::th, },
1448 { "thead", element_t::thead, },
1449 { "title", element_t::title, },
1450 { "tr", element_t::tr, },
1451 { "tt", element_t::tt, },
1452 { "u", element_t::u, },
1453 { "ul", element_t::ul, },
1454 { "var", element_t::var, },
1455 { "wbr", element_t::wbr, },
1456 { "xmp", element_t::xmp, },
1457 };
1458#ifndef NDEBUG
1459 // The mapping table MUST be sorted and all names in lowercase.
1460 for (size_t i = 1; i < _countof(mapping); i++)
1461 stdex_assert(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1462 for (size_t i = 0; i < _countof(mapping); i++) {
1463 for (size_t j = 0; mapping[i].name[j]; j++)
1464 stdex_assert(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
1465 }
1466#endif
1467 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1468 size_t m = (i + j) / 2;
1469 int r = 0;
1470 for (size_t i1 = 0, i2 = 0;;) {
1471 if (!mapping[m].name[i1]) {
1472 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1473 break;
1474 }
1475 if (i2 >= num_chars || !name[i2]) {
1476 r = 1;
1477 break;
1478 }
1479
1480 auto chr = static_cast<char>(stdex::tolower(name[i2++]));
1481 if (mapping[m].name[i1] > chr) {
1482 r = 1;
1483 break;
1484 }
1485 if (mapping[m].name[i1] < chr) {
1486 r = -1;
1487 break;
1488 }
1489 i1++;
1490 }
1491
1492 if (r < 0)
1493 i = m + 1;
1494 else if (r > 0)
1495 j = m;
1496 else
1497 return mapping[m].code;
1498 }
1499 return element_t::unknown;
1500 }
1501
1502 public:
1503 element_t code;
1505 std::vector<stdex::parser::html_attribute> attributes;
1506 };
1507
1508 class element_end;
1509
1513 class element_start : public element
1514 {
1515 public:
1516 template <class T>
1517 element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1518 element(std::move(tag), src, parent),
1519 end(_end)
1520 {}
1521
1522 public:
1524 };
1525
1529 class element_end : public sequence
1530 {
1531 public:
1532 template <class T>
1533 element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1534 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1535 code(element::element_code(src + tag.name.start, tag.name.size())),
1536 name(std::move(tag.name)),
1537 start(_start)
1538 {}
1539
1540 public:
1541 element_t code;
1544 };
1545
1549 class declaration : public sequence
1550 {
1551 public:
1552 template <class T>
1553 declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1554 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1555 name(std::move(tag.name)),
1556 attributes(std::move(tag.attributes))
1557 {}
1558
1559 public:
1561 std::vector<stdex::parser::html_attribute> attributes;
1562 };
1563
1567 class comment : public sequence
1568 {
1569 public:
1570 template <class T>
1571 comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1572 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1573 content(std::move(tag.name))
1574 {}
1575
1576 public:
1578 };
1579
1583 class instruction : public sequence
1584 {
1585 public:
1586 template <class T>
1587 instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1588 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1589 content(std::move(tag.name))
1590 {}
1591
1592 public:
1594 };
1595
1599 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1600 struct entity
1601 {
1603 std::basic_string<T, TR, AX> value;
1604 };
1605
1609 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1610 class parser;
1611
1615 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1617 {
1618 public:
1619 document() :
1620 m_num_parsed(0),
1621 m_charset(stdex::charset_id::system),
1622
1623 // Declaration parsing data
1626 m_is_cdata(false),
1627 m_is_rcdata(false),
1628
1629 // Element parsing data
1631 {}
1632
1636 void clear()
1637 {
1638 m_source.clear();
1639 m_num_parsed = 0;
1640 m_charset = stdex::charset_id::system;
1641
1642 // Declaration parsing data
1644 m_is_cdata = m_is_rcdata = false;
1645 m_entities.clear();
1646
1647 // Element parsing data
1648 m_sequences.clear();
1649
1650 m_element_stack.clear();
1651 m_is_special_element = false;
1652 }
1653
1657 void append(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1658 {
1659 stdex_assert(source || !num_chars);
1660 m_source.append(source, stdex::strnlen(source, num_chars));
1661 source = m_source.data();
1662 num_chars = m_source.size();
1663
1664 for (size_t i = m_num_parsed; i < num_chars;) {
1665 if (m_is_cdata || m_is_rcdata) {
1666 if (m_condition_end.match(source, i, num_chars)) {
1667 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1668 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1669 m_num_parsed, i,
1670 active_element()))));
1671 m_is_cdata = m_is_rcdata = false;
1672 i = m_num_parsed = m_condition_end.interval.end;
1673 continue;
1674 }
1675 goto next_char;
1676 }
1677
1679 if (m_condition_end.match(source, i, num_chars)) {
1681 i = m_num_parsed = m_condition_end.interval.end;
1682 continue;
1683 }
1684 goto next_char;
1685 }
1686
1687 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1688 if (m_num_parsed < i)
1689 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1690
1692 i = m_num_parsed = m_condition_end.interval.end;
1693 continue;
1694 }
1695
1696 if (m_condition_start.match(source, i, num_chars)) {
1697 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1698 if (stdex::strncmp(condition_src.data(), condition_src.size(), "CDATA", SIZE_MAX) == 0)
1699 m_is_cdata = true;
1700 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "RCDATA", SIZE_MAX) == 0)
1701 m_is_rcdata = true;
1704 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "IGNORE", SIZE_MAX) == 0)
1706 else
1708
1709 i = m_num_parsed = m_condition_start.interval.end;
1710 continue;
1711 }
1712
1714 auto parent = active_element();
1715 stdex_assert(parent);
1716 if (m_tag.match(source, i, num_chars) &&
1717 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1718 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1719 {
1720 if (m_num_parsed < i)
1721 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1722 i = m_num_parsed = m_tag.interval.end;
1723 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1724 parent->end = e.get();
1725 m_sequences.push_back(std::move(e));
1726 m_element_stack.pop_back();
1727 m_is_special_element = false;
1728 continue;
1729 }
1730 goto next_char;
1731 }
1732
1733 if (m_tag.match(source, i, num_chars)) {
1734 if (m_num_parsed < i)
1735 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1736 i = m_num_parsed = m_tag.interval.end;
1737
1738 switch (m_tag.type) {
1739 case stdex::parser::html_sequence_t::element:
1740 case stdex::parser::html_sequence_t::element_start: {
1741 std::unique_ptr<element> e(
1742 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1743 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1744 nullptr);
1745
1746 // Does this tag end any of the started elements?
1747 for (size_t j = m_element_stack.size(); j--; ) {
1748 auto starting_tag = m_element_stack[j];
1749 stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1750 if (element_traits::may_contain(starting_tag->code, e->code)) {
1751 e->parent = starting_tag;
1752 break;
1753 }
1754 e->parent = starting_tag->parent;
1755 starting_tag->end = e.get();
1756 m_element_stack.resize(j);
1757 }
1758
1759 if (e->type == stdex::parser::html_sequence_t::element_start) {
1760 auto e_start = static_cast<element_start*>(e.get());
1761 if (element_traits::span(e->code) == element_span_t::immediate)
1762 e_start->end = e.get();
1763 else {
1764 m_element_stack.push_back(e_start);
1765 switch (e->code) {
1766 case element_t::code:
1767 case element_t::comment:
1768 case element_t::script:
1769 case element_t::style:
1770 m_is_special_element = true;
1771 break;
1772 default:;
1773 }
1774 }
1775 }
1776
1777 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1778 bool is_content_type = false;
1779 stdex::parser::html_attribute* content_attr = nullptr;
1780 for (auto& attr : e->attributes) {
1781 if (stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) == 0 &&
1782 stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX) == 0)
1783 is_content_type = true;
1784 else if (stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX) == 0)
1785 content_attr = &attr;
1786 }
1787 if (is_content_type && content_attr) {
1788 // <meta http-equiv="Content-Type" content="..."> found.
1790 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1791 content.charset)
1792 {
1793 std::string str;
1794 str.reserve(content.charset.size());
1795 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1796 str.push_back(static_cast<char>(source[j]));
1797 m_charset = stdex::charset_from_name(str);
1798 }
1799 }
1800 }
1801
1802 m_sequences.push_back(std::move(e));
1803 break;
1804 }
1805 case stdex::parser::html_sequence_t::element_end: {
1806 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1807
1808 for (size_t j = m_element_stack.size(); j--; ) {
1809 auto starting_tag = m_element_stack[j];
1810 stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1811 if (starting_tag->code == e->code ||
1812 (starting_tag->code == element_t::unknown && e->code == element_t::unknown && stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size()) == 0))
1813 {
1814 e->start = starting_tag;
1815 e->parent = starting_tag->parent;
1816 starting_tag->end = e.get();
1817 m_element_stack.resize(j);
1818 break;
1819 }
1820 }
1821
1822 m_sequences.push_back(std::move(e));
1823 break;
1824 }
1825 case stdex::parser::html_sequence_t::declaration:
1826 if (m_tag.attributes.size() > 3 &&
1827 stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX) == 0)
1828 {
1829 if (stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) == 0 &&
1830 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1831 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1832 {
1833 std::unique_ptr<entity<T, TR, AX>> e(new entity<T, TR, AX>());
1834 e->name = m_tag.attributes[2].name;
1835 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1836 m_entities.push_back(std::move(e));
1837 }
1838
1839 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1840 }
1841 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1842 break;
1843 case stdex::parser::html_sequence_t::comment:
1844 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1845 break;
1846 case stdex::parser::html_sequence_t::instruction:
1847 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1848 break;
1849 default:
1850 throw std::invalid_argument("unknown tag type");
1851 }
1852
1853 continue;
1854 }
1855
1856 next_char:
1857 if (m_any_char.match(source, i, num_chars)) {
1858 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1859 i = m_any_char.interval.end;
1860 }
1861 else
1862 break;
1863 }
1864 }
1865
1870 {
1871 size_t i = m_source.size();
1872 if (m_num_parsed < i)
1873 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1874 m_num_parsed = i;
1875 m_element_stack.clear();
1876 }
1877
1881 void assign(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1882 {
1883 clear();
1884 append(source, num_chars);
1885 finalize();
1886 }
1887
1891 const std::basic_string<T, TR, AX>& source() const { return m_source; }
1892
1893 friend class parser<T, TR, AX>;
1894
1895 protected:
1900 {
1901 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1902 }
1903
1907 std::basic_string<T, TR, AX> replace_entities(_In_reads_or_z_opt_(num_chars) const T* input, _In_ size_t num_chars) const
1908 {
1909 stdex_assert(input || !num_chars);
1910 const size_t num_entities = m_entities.size();
1911 const T* source = m_source.data();
1912 std::basic_string<T, TR, AX> output;
1913 for (size_t i = 0; i < num_chars && input[i];) {
1914 if (input[i] == '%') {
1915 for (size_t j = 0; j < num_entities; j++) {
1916 auto& e = m_entities[j];
1917 size_t entity_size = e->name.size();
1918 if (i + entity_size + 1 < num_chars &&
1919 stdex::strncmp(input + i + 1, source + e->name.start, entity_size) == 0 &&
1920 input[i + entity_size + 1] == ';')
1921 {
1922 output += e->value;
1923 i += entity_size + 2;
1924 goto next_char;
1925 }
1926 }
1927 throw std::runtime_error("undefined entity");
1928 }
1929 output += input[i++];
1930 next_char:;
1931 }
1932 return output;
1933 }
1934
1935 protected:
1936 std::basic_string<T, TR, AX> m_source;
1938 stdex::charset_id m_charset;
1939
1940 // Declaration parsing data
1948 std::vector<std::unique_ptr<entity<T, TR, AX>>> m_entities;
1949
1950 // Element parsing data
1952 sequence_store m_sequences;
1953 std::vector<element_start*> m_element_stack;
1955 };
1956
///
/// Token types
///
enum class token_t {
    root     = 0,
    complete = 1,
    starting = 2,
    ending   = 3,
    url      = 4,
};
1967
///
/// Maximum size of a token tag in characters: the token memory address in
/// hexadecimal, the leading and trailing delimiter, and a zero terminator
///
constexpr size_t token_tag_max = 2 * sizeof(void*) + 2 + 1;

///
/// Character marking the start of a token tag
///
constexpr char token_tag_start = '\x12';

///
/// Character marking the end of a token tag
///
constexpr char token_tag_end = '\x13';
1987
1991 class token
1992 {
1993 protected:
1994 token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1995 type(_type),
1996 sequence(_sequence),
1997 data(_data)
1998 {}
1999
2000 template<class T, class TR, class AX>
2001 friend class parser;
2002
2003 public:
2004 virtual ~token() {} // make polymorphic
2005
2013 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
2014 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str) const
2015 {
2016 size_t n = str.size();
2017 // Use %X instead of %p to omit leading zeros and save space.
2018 stdex::appendf(str, "%c%zX%c", stdex::locale_C, token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
2019 return str.size() - n;
2020 }
2021
2029 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
2030 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str) const
2031 {
2032 // Use %X instead of %p to omit leading zeros and save space.
2033 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C, static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
2034 }
2035
2036 template<class T>
2037 static token* parse_tag(const T* str, size_t& offset)
2038 {
2039 if (str[offset] != static_cast<T>(token_tag_start))
2040 return nullptr;
2041
2042 // Locate tag end.
2043 size_t end;
2044 for (end = offset + 1; ; end++) {
2045 if (!str[end])
2046 return nullptr;
2047 if (str[end] == token_tag_end)
2048 break;
2049 }
2050
2051 // Parse hexadecimal token memory address.
2052 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
2053 if (!t)
2054 throw std::invalid_argument("null token");
2055 offset = end + 1;
2056 return t;
2057 }
2058
2059 public:
2060 token_t type;
2062 uintptr_t data;
2063 };
2064
2065 using token_vector = std::vector<std::unique_ptr<token>>;
2066 using token_list = std::list<token*>;
2067
///
/// Text type bit flags; combined in text_token::text_type
///
enum text_type_flag_t : uint32_t {
    has_tokens = 0x1, // bit 0
    has_text   = 0x2, // bit 1
    is_title   = 0x4, // bit 2
    is_bullet  = 0x8, // bit 3
};
2077
2081 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2082 class text_token : public token
2083 {
2084 protected:
2085 text_token(
2086 _In_ token_t type = token_t::complete,
2087 _In_reads_or_z_opt_(num_chars) const T* _text = nullptr, _In_ size_t num_chars = 0,
2088 _In_ uint32_t _text_type = 0,
2089 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2091 text(_text, num_chars),
2092 text_type(_text_type)
2093 {}
2094
2095 friend class parser<T, TR, AX>;
2096
2097 public:
2098 std::basic_string<T, TR, AX> text;
2099 uint32_t text_type;
2100 stdex::mapping_vector<size_t> mapping;
2101 };
2102
2106 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2107 class starting_token : public text_token<T, TR, AX>
2108 {
2109 protected:
2111 _In_reads_or_z_opt_(num_chars_text) const T* _text = nullptr, _In_ size_t num_chars_text = 0,
2112 _In_reads_or_z_opt_(num_chars_name) const T* _name = nullptr, _In_ size_t num_chars_name = 0,
2113 _In_ uint32_t text_type = 0,
2114 _In_opt_ stdex::html::sequence* sequence = nullptr,
2115 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
2116 _In_ uintptr_t data = 0) :
2117 text_token<T, TR, AX>(token_t::starting, _text, num_chars_text, text_type, sequence, data),
2118 name(_name, num_chars_name),
2119 end_sequence(_end_sequence)
2120 {}
2121
2122 friend class parser<T, TR, AX>;
2123
2124 public:
2125 std::basic_string<T, TR, AX> name;
2127 };
2128
///
/// Encoding scheme of the URL stored in a URL token
///
enum class token_url_t {
    plain = 0, ///< URL is not using any particular encoding scheme (as-is)
    sgml  = 1, ///< URL is encoded using SGML entities
    css   = 2, ///< URL is encoded using CSS escaping scheme
};
2137
2141 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2142 class url_token : public token
2143 {
2144 protected:
2145 url_token(
2146 _In_reads_or_z_opt_(num_chars) const T* _url = nullptr, _In_ size_t num_chars = 0,
2147 token_url_t _encoding = token_url_t::plain,
2148 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2149 token(token_t::url, sequence, data),
2150 url(_url, num_chars),
2151 encoding(_encoding)
2152 {}
2153
2154 friend class parser<T, TR, AX>;
2155
2156 public:
2157 std::basic_string<T, TR, AX> url;
2158 token_url_t encoding;
2159 };
2160
2166 std::list<stdex::html::token*> active_tokens;
2167 size_t word_index;
2169 };
2170
2171 using inserted_token_list = std::list<inserted_token>;
2172
2173 template<class T, class TR, class AX>
2175 {
2176 public:
2177 parser(
2178 _In_ const document<T, TR, AX>& document,
2179 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
2180 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
2182 m_url(url, stdex::strnlen(url, num_chars)),
2183 m_parse_frames(parse_frames),
2185 m_source(nullptr)
2186 {}
2187
2192 {
2193 stdex_assert(m_tokens.empty());
2194
2195 if (m_progress) {
2196 m_progress->set_range(0, m_document.source().size());
2197 m_progress->set(0);
2198 }
2199
2200 m_source = m_document.source().data();
2202 return parse(m_document.m_sequences.end());
2203 }
2204
2211 static void link(_Inout_ std::basic_string<T, TR, AX>& source, _In_ const text_token<T, TR, AX>* t)
2212 {
2213 stdex_assert(t);
2214 stdex_assert(
2215 t->type == token_t::complete ||
2216 t->type == token_t::starting ||
2217 t->type == token_t::ending ||
2218 t->type == token_t::root);
2219
2220 if (t->text_type & has_tokens) {
2221 const T* root = t->text.data();
2222 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2223 stdex_assert(root[i] != token_tag_end);
2224 const token* t2 = token::parse_tag(root, i);
2225 if (t2) {
2226 switch (t2->type) {
2227 case token_t::complete:
2228 case token_t::starting:
2229 case token_t::ending:
2230 case token_t::root:
2231 link(source, dynamic_cast<const text_token<T, TR, AX>*>(t2));
2232 break;
2233 case token_t::url: {
2234 auto t2_url = dynamic_cast<const url_token<T, TR, AX>*>(t2);
2235 switch (t2_url->encoding) {
2236 case token_url_t::plain:
2237 source += t2_url->url;
2238 break;
2239 case token_url_t::sgml:
2240 escape(source, t2_url->url.data(), t2_url->url.size());
2241 break;
2242 case token_url_t::css:
2243 css_escape(source, t2_url->url.data(), t2_url->url.size());
2244 break;
2245 default:
2246 throw std::invalid_argument("unsupported URL encoding");
2247 }
2248 break;
2249 }
2250 default:
2251 throw std::invalid_argument("unsupported token type");
2252 }
2253 }
2254 else if (t->text_type & has_text) {
2255 escape_min(source, root[i]);
2256 i++;
2257 }
2258 else
2259 source += root[i++];
2260 }
2261 }
2262 else if (t->text_type & has_text) {
2263 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2264 escape_min(source, t->text.data(), t->text.size());
2265 }
2266 else
2267 source += t->text;
2268 }
2269
2278 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2279 {
2280 for (; from != new_tokens.cend(); ++from) {
2281 auto t = *from;
2282 t->append_tag(source);
2283 active_tokens.push_back(t);
2284 }
2285 }
2286
2296 token_list::const_iterator end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2297 {
2298 // Skip matching tokens in active_tokens and new_tokens.
2299 token_list::const_iterator i1, i2;
2300 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2301 if (i2 == new_tokens.cend() || *i1 != *i2) {
2302 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2303 // End tokens not relevant anymore in reverse order of starting.
2304 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2305 auto t1 = dynamic_cast<starting_token<T, TR, AX>*>(*(--i));
2306 stdex_assert(t1 && t1->type == token_t::starting);
2307
2308 std::unique_ptr<text_token<T, TR, AX>> t2(new text_token<T, TR, AX>(token_t::ending));
2309 t2->text.reserve(t1->name.size() + 3);
2310 t2->text += '<';
2311 t2->text += '/';
2312 t2->text += t1->name;
2313 t2->text += '>';
2314 append_token(std::move(t2), source);
2315
2316 // Pop the active token.
2317 if (i1 == i) {
2318 active_tokens.erase(i);
2319 break;
2320 }
2321 active_tokens.erase(i);
2322 i = active_tokens.cend();
2323 }
2324 break;
2325 }
2326 }
2327 return i2;
2328 }
2329
2339 void append_inserted_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ inserted_token_list& inserted_tokens,
2340 _In_ size_t word_index, _In_ bool after_word,
2341 _Inout_ token_list& active_tokens)
2342 {
2343 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2344 auto& t = *i;
2345 stdex_assert(t.token);
2346 if (t.word_index == word_index && t.after_word == after_word) {
2347 if (t.token->type != token_t::ending)
2348 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2349 t.token->append_tag(source);
2350 inserted_tokens.erase(i++);
2351 }
2352 else
2353 ++i;
2354 }
2355 }
2356
2363 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2364 {
2365 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2366 auto t2 = *i2;
2367 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2368 if (i1 == a.end()) {
2369 a.push_back(t2);
2370 break;
2371 }
2372 auto t1 = *i1;
2373 if (t1 == t2)
2374 break;
2375 }
2376 }
2377 }
2378
2382 void make_absolute_url(std::basic_string<T, TR, AX>& rel)
2383 {
2384 _Unreferenced_(rel);
2385
2386 if (m_url.empty())
2387 return;
2388
2389 // TODO: Implement!
2390 }
2391
2395 const token_vector& tokens() const { return m_tokens; }
2396
2397 protected:
2405 template <class T_token>
2406 T_token* append_token(_Inout_ std::unique_ptr<T_token>&& token)
2407 {
2408 if (!token)
2409 return nullptr;
2410 auto t = token.get();
2411 m_tokens.push_back(std::move(token));
2412 return t;
2413 }
2414
2423 template <class T_token>
2424 size_t append_token(_Inout_ std::unique_ptr<T_token>&& token, _Inout_ std::basic_string<T, TR, AX>& source)
2425 {
2426 if (!token)
2427 return 0;
2428 size_t n = token->append_tag(source);
2429 m_tokens.push_back(std::move(token));
2430 return n;
2431 }
2432
// Recursively parses HTML document.
//
// Consumes sequences from m_offset up to `end` and collects them into one new text token:
// text sequences are decoded with stdex::sgml2strcat() into the token text, every other kind
// of sequence becomes a separate token whose tag is appended to the token text.
//
// Parameters:
//   end        Iterator at which parsing stops.
//   text_type  Initial text type flags given to the created token (0 by default).
//
// Returns a non-owning pointer to the created token; the token itself is stored in m_tokens
// by append_token().
//
// Throws stdex::user_cancelled when m_progress reports cancellation, and
// std::invalid_argument when a <frameset> element is found while m_parse_frames is false.
//
// NOTE(review): this listing appears to have dropped some source lines: `rel` is used below
// without a visible declaration, and several `std::unique_ptr<...>(` initializers are missing
// their allocation expression. Check against the real header before editing.
2441 text_token<T, TR, AX>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2442 {
2444 std::unique_ptr<text_token<T, TR, AX>> token(new text_token<T, TR, AX>(
2445 token_t::complete,
2446 nullptr, 0,
2447 text_type,
2448 m_offset != end ? m_offset->get() : nullptr));
2449
2450 while (m_offset != end) {
2451 auto& s = *m_offset;
2452
// Honour a cancel request and report the current source position.
2453 if (m_progress) {
2454 if (m_progress->cancel())
2455 throw stdex::user_cancelled();
2456 m_progress->set(s->interval.start);
2457 }
2458
2459 // No token_tag_start and token_tag_end chars, please.
2460 stdex_assert(
2461 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_start)) == stdex::npos &&
2462 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_end)) == stdex::npos);
2463
2464 if (s->type == stdex::parser::html_sequence_t::text) {
// Plain text: decode into the token text and record the source-to-text mapping.
2465 rel.from = s->interval.start;
2466 token->mapping.push_back(rel);
2467 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2468 rel.to = token->text.size();
// Set has_text once the first non-blank text sequence is seen.
2469 if (!(token->text_type & has_text) &&
2470 !stdex::isblank(m_source + s->interval.start, s->interval.size()))
2471 token->text_type |= has_text;
2472 ++m_offset;
2473 }
2474 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2475 const element* s_el = static_cast<const element*>(s.get());
2476 stdex_assert(s_el);
// s_el_start stays nullptr for sequences of type element.
2477 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2478 if (s_el->code == element_t::frameset && !m_parse_frames)
2479 throw std::invalid_argument("<frameset> detected");
2480
2481 {
2482 size_t offset = s->interval.start;
// Sequences of type element and start tags with immediate span become complete text tokens;
// all other start tags become starting tokens that remember the element name and end sequence.
2483 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2484 new text_token<T, TR, AX>(token_t::complete, nullptr, 0, 0, s.get()) :
2485 new starting_token<T, TR, AX>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2486
2487 // Copy the tag contents, but mind any attributes containing localizable text.
2488 for (auto& a : s_el->attributes) {
// Attributes with an empty or blank value are left in the verbatim copy.
2489 if (a.value.empty() ||
2490 stdex::isblank(m_source + a.value.start, a.value.size()))
2491 continue;
2492
2493 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
// URI attribute: copy the tag up to the value, then replace the value with a url_token.
2494 t->text.append(m_source + offset, a.value.start - offset);
2495 std::unique_ptr<url_token<T, TR, AX>> t_url(new url_token<T, TR, AX>(
2496 nullptr, 0,
2497 token_url_t::sgml,
2498 s.get()));
2499 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2500 append_token(std::move(t_url), t->text);
2501 t->text_type |= has_tokens;
2502 offset = a.value.end;
2503 }
2504 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
// Localizable attribute: replace the value with a text token flagged has_text | is_title.
2505 t->text.append(m_source + offset, a.value.start - offset);
2506 std::unique_ptr<text_token<T, TR, AX>> t_value(new text_token<T, TR, AX>(
2507 token_t::complete,
2508 nullptr, 0,
2509 has_text | is_title,
2510 s.get()));
2511 stdex::mapping<size_t> rel_value(a.value.start, 0);
2512 t_value->mapping.push_back(rel_value);
2513 stdex::sgml2strcat(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2514 append_token(std::move(t_value), t->text);
2515 t->text_type |= has_tokens;
2516 offset = a.value.end;
2517 }
2518 }
2519
// Copy whatever remains of the tag after the last replaced attribute value.
2520 t->text.append(m_source + offset, s->interval.end - offset);
2521 rel.from = s->interval.start;
2522 token->mapping.push_back(rel);
2523 rel.to += append_token(std::move(t), token->text);
2524 token->text_type |= has_tokens;
2525 }
2526 ++m_offset;
2527
2528 if (s_el_start) {
2529 if (s_el_start->code == element_t::address ||
2530 s_el_start->code == element_t::code ||
2531 s_el_start->code == element_t::comment ||
2532 s_el_start->code == element_t::cite ||
2533 s_el_start->code == element_t::kbd ||
2534 s_el_start->code == element_t::samp ||
2535 s_el_start->code == element_t::script ||
2536 s_el_start->code == element_t::style)
2537 {
2538 // Non-localizable
2539 auto s_end = s_el_start->end;
2540 stdex_assert(s_end);
2541
// Only act when there is source between the start tag and the end sequence.
2542 if (s->interval.end < s_end->interval.start) {
2543 if (s_el_start->code != element_t::style) {
// Everything between the start tag and the end sequence becomes one complete token.
// NOTE(review): the allocation expression for this unique_ptr is not visible in this listing.
2544 rel.from = s->interval.start;
2545 token->mapping.push_back(rel);
2546 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2548 token_t::complete,
2549 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2550 0,
2551 m_offset->get()))),
2552 token->text);
2553 }
2554 else {
2555 // Partially parse CSS. It may contain URLs we need to make absolute.
2556 auto t = parse_css(s->interval.end, s_end->interval.start);
2557 stdex_assert(t);
2558 rel.from = s->interval.start;
2559 token->mapping.push_back(rel);
2560 rel.to += t->append_tag(token->text);
2561 }
2562 token->text_type |= has_tokens;
2563 }
// Skip the sequences up to the element's end sequence; the end sequence itself is handled by the next iteration.
2564 while (m_offset != end && m_offset->get() != s_end)
2565 ++m_offset;
2566 }
2567 else if (element_traits::is_group(s_el_start->code)) {
// Group element: locate its end sequence and parse the content recursively into a token of its own.
2568 auto limit = m_offset;
2569 while (limit != end && limit->get() != s_el_start->end)
2570 ++limit;
2571 auto t = parse(limit,
2572 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2573 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2574 rel.from = s->interval.start;
2575 token->mapping.push_back(rel);
2576 rel.to += t->append_tag(token->text);
2577 token->text_type |= has_tokens;
2578 }
2579 }
2580 }
2581 else if (s->type == stdex::parser::html_sequence_t::element_end) {
// Ending tag: copied verbatim into an ending token.
// NOTE(review): the allocation expression for this unique_ptr is not visible in this listing.
2582 rel.from = s->interval.start;
2583 token->mapping.push_back(rel);
2584 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2586 token_t::ending,
2587 m_source + s->interval.start, s->interval.size(),
2588 0,
2589 s.get()))),
2590 token->text);
2591 token->text_type |= has_tokens;
2592 ++m_offset;
2593 }
2594 else {
2595 // Declaration, instruction, (P)CDATA section, comment...
// NOTE(review): the allocation expression for this unique_ptr is not visible in this listing.
2596 rel.from = s->interval.start;
2597 token->mapping.push_back(rel);
2598 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2600 token_t::complete,
2601 m_source + s->interval.start, s->interval.size(),
2602 0,
2603 s.get()))),
2604 token->text);
2605 token->text_type |= has_tokens;
2606 ++m_offset;
2607 }
2608 }
2609
// Hand the finished token over to m_tokens and return a non-owning pointer to it.
2610 return append_token(std::move(token));
2611 }
2612
// Parses CSS.
//
// Walks the part of m_source delimited by `start` and `end` and collects it into one new
// text token. Matches of m_css_comment, m_css_cdo, m_css_cdc and m_any_char are copied
// verbatim. For a match of m_css_import or m_css_uri, the matched content interval is
// unescaped with css_unescape() into a url_token whose tag replaces the content in the
// token text, while the surrounding part of the match is copied verbatim.
//
// Parameters:
//   start  Source offset at which parsing begins.
//   end    Source offset passed to the matchers as the limit.
//
// Returns a non-owning pointer to the created token; the token itself is stored in m_tokens
// by append_token().
//
// NOTE(review): the allocation expressions of both std::unique_ptr initializers below are
// not visible in this listing, which appears to have dropped those lines.
2616 text_token<T, TR, AX>* parse_css(size_t start, size_t end)
2617 {
2618 stdex::interval<size_t> section, content;
2619 std::unique_ptr<text_token<T, TR, AX>> token(
2621 token_t::complete,
2622 nullptr, 0,
2623 0,
2624 m_offset->get()));
2625
// Each iteration consumes one match and advances start past it; the loop ends when nothing matches.
2626 for (;;) {
2627 if (m_css_comment.match(m_source, start, end)) {
2628 token->text.append(m_source + start, m_css_comment.interval.end - start);
2629 start = m_css_comment.interval.end;
2630 }
2631 else if (m_css_cdo.match(m_source, start, end)) {
2632 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2633 start = m_css_cdo.interval.end;
2634 }
2635 else if (m_css_cdc.match(m_source, start, end)) {
2636 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2637 start = m_css_cdc.interval.end;
2638 }
// Whichever matcher succeeds stores its whole-match and content intervals in section/content
// via the comma expressions, so the branch body can treat both cases alike.
2639 else if (
2640 (m_css_import.match(m_source, start, end) && ((void)(section = m_css_import.interval), (void)(content = m_css_import.content), true)) ||
2641 (m_css_uri.match(m_source, start, end) && ((void)(section = m_css_uri.interval), (void)(content = m_css_uri.content), true)))
2642 {
2643 std::unique_ptr<url_token<T, TR, AX>> t_url(
2645 nullptr, 0,
2646 token_url_t::css,
2647 m_offset->get()));
2648 css_unescape(t_url->url, m_source + content.start, content.size());
// Copy the part of the match before the content, insert the URL token tag, then copy the rest of the match.
2649 token->text.append(m_source + start, content.start - start);
2650 append_token(std::move(t_url), token->text);
2651 token->text.append(m_source + content.end, section.end - content.end);
2652 token->text_type |= has_tokens;
2653 start = section.end;
2654 }
2655 else if (m_any_char.match(m_source, start, end)) {
2656 token->text.append(m_source + start, m_any_char.interval.end - start);
2657 start = m_any_char.interval.end;
2658 }
2659 else
2660 break;
2661 }
2662
// Hand the finished token over to m_tokens and return a non-owning pointer to it.
2663 return append_token(std::move(token));
2664 }
2665
2666 protected:
// Absolute document URL; make_absolute_url() returns early when it is empty.
2668 const stdex::sstring m_url;
// Parse frames; when false, parse() throws std::invalid_argument on a <frameset> element.
2669 const bool m_parse_frames;
// HTML source code; sequence intervals are used as offsets into it.
2671 const T* m_source;
// HTML token storage; owns every token added through append_token().
2672 token_vector m_tokens;
// Index of active section: iterator to the sequence parse() processes next.
2673 sequence_store::const_iterator m_offset;
2674
2675 // For detecting URLs in CSS
2683 };
2684 }
2685}
HTML comment.
Definition html.hpp:1568
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1577
HTML declaration.
Definition html.hpp:1550
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1560
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1561
HTML document.
Definition html.hpp:1617
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1944
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1891
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1657
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1941
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1942
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1943
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1938
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1952
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1899
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1937
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1953
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1869
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1907
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1881
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1954
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1948
void clear()
Empties document.
Definition html.hpp:1636
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1936
Ending tag of an HTML element </...>
Definition html.hpp:1530
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1542
element_start * start
Corresponding starting tag.
Definition html.hpp:1543
element_t code
Element code.
Definition html.hpp:1541
Starting tag of an HTML element <...>
Definition html.hpp:1514
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1523
HTML element <.../>
Definition html.hpp:1335
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1504
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1505
element_t code
Element code.
Definition html.hpp:1503
HTML instruction.
Definition html.hpp:1584
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1593
HTML parser.
Definition html.hpp:2175
token_vector m_tokens
HTML token storage.
Definition html.hpp:2672
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2339
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2441
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2668
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2191
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2667
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2296
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2363
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2616
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2278
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2211
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2406
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2673
const T * m_source
HTML source code.
Definition html.hpp:2671
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2670
const bool m_parse_frames
Parse frames.
Definition html.hpp:2669
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2382
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2424
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2395
Base class for HTML sequences.
Definition html.hpp:1316
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1319
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1318
sequence * parent
Parent sequence.
Definition html.hpp:1320
Token representing start HTML tag.
Definition html.hpp:2108
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2126
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2125
Token representing part of HTML text.
Definition html.hpp:2083
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2100
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2099
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2098
HTML token base class.
Definition html.hpp:1992
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2061
uintptr_t data
Any user-supplied data.
Definition html.hpp:2062
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2030
token_t type
Token type.
Definition html.hpp:2060
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2014
HTML token representing a URL.
Definition html.hpp:2143
token_url_t encoding
URL encoding.
Definition html.hpp:2158
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2157
Test for any code unit.
Definition parser.hpp:216
Legacy CSS comment end -->
Definition parser.hpp:7451
Legacy CSS comment start <!--
Definition parser.hpp:7413
CSS comment.
Definition parser.hpp:7353
CSS import directive.
Definition parser.hpp:7665
CSS string.
Definition parser.hpp:7488
URI in CSS.
Definition parser.hpp:7555
End of condition ...]]>
Definition parser.hpp:8336
Start of condition <![condition[...
Definition parser.hpp:8270
Tag.
Definition parser.hpp:8034
MIME content type.
Definition parser.hpp:7749
stdex::interval< size_t > charset
charset position in source
Definition parser.hpp:7761
Progress indicator base class.
Definition progress.hpp:22
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:70
virtual void set(T value)
Set current progress.
Definition progress.hpp:52
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:42
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:687
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1104
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1014
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:928
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1024
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:812
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:982
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1042
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:948
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1235
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:966
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1290
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:860
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1061
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:913
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1087
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:894
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:835
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1135
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:693
HTML entity.
Definition html.hpp:1601
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1603
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1602
Inserted HTML token.
Definition html.hpp:2164
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2168
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2166
size_t word_index
Index of the word, token is anchored to.
Definition html.hpp:2167
token * token
Points to the token.
Definition html.hpp:2165
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:18
Tag attribute.
Definition parser.hpp:8024
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8026