10#include "exception.hpp"
11#include "interval.hpp"
14#include "progress.hpp"
///
/// Appends an HTML-escaped copy of narrow text to a string.
///
/// `&`, `;`, `"`, `'`, `<`, `>` and character code 0xA0 are replaced by entity
/// references; every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case ';': dst += "&semi;"; break;
		case '\"': dst += "&quot;"; break;
		case '\'': dst += "&#x27;"; break; // numeric reference: &apos; is not defined in HTML 4
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// NOTE(review): 0x00a0 can only match when plain char is unsigned; with a signed
		// char this label is unreachable. Confirm the intended source encoding.
		case 0x00a0: dst += "&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
///
/// Appends an HTML-escaped copy of wide text to a string.
///
/// `&`, `;`, `"`, `'`, `<`, `>` and U+00A0 are replaced by entity references;
/// every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape(
	_Inout_ std::basic_string<wchar_t, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'&': dst += L"&amp;"; break;
		case L';': dst += L"&semi;"; break;
		case L'\"': dst += L"&quot;"; break;
		case L'\'': dst += L"&#x27;"; break; // numeric reference: &apos; is not defined in HTML 4
		case L'<': dst += L"&lt;"; break;
		case L'>': dst += L"&gt;"; break;
		case L'\u00a0': dst += L"&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
// Overload taking a destination string `dst` and a fixed-size array `src` of N elements of T.
// NOTE(review): neither the function name nor the body is present in this listing. The other
// array overloads in this file forward (dst, src, N) to the pointer/length overload; presumably
// this one does the same - confirm against the full header.
96 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
98 _Inout_ std::basic_string<T, TR, AX>& dst,
99 _In_
const T (&src)[N])
///
/// Appends an HTML-escaped copy of a string to another string.
///
/// Forwards the source buffer and its length to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void escape(
	_Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
	_In_ const std::basic_string<T, TR_src, AX_src>& src)
{
	escape(dst, src.data(), src.size());
}
///
/// Appends one narrow character to a string, escaping only what HTML text requires.
///
/// `&`, `<`, `>` and character code 0xA0 are replaced by entity references;
/// quotes and every other character are appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to append
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_ char chr)
{
	switch (chr) {
	case '&': dst += "&amp;"; break;
	case '<': dst += "&lt;"; break;
	case '>': dst += "&gt;"; break;
	// NOTE(review): 0x00a0 can only match when plain char is unsigned; with a signed
	// char this label is unreachable. Confirm the intended source encoding.
	case 0x00a0: dst += "&nbsp;"; break;
	default: dst += chr; break;
	}
}
///
/// Appends one wide character to a string, escaping only what HTML text requires.
///
/// `&`, `<`, `>` and U+00A0 are replaced by entity references; quotes and
/// every other character are appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to append
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_ wchar_t chr)
{
	switch (chr) {
	case L'&': dst += L"&amp;"; break;
	case L'<': dst += L"&lt;"; break;
	case L'>': dst += L"&gt;"; break;
	case L'\u00a0': dst += L"&nbsp;"; break;
	default: dst += chr; break;
	}
}
///
/// Appends a minimally HTML-escaped copy of narrow text to a string.
///
/// `&`, `<`, `>` and character code 0xA0 are replaced by entity references;
/// quotes and every other character are appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// NOTE(review): 0x00a0 can only match when plain char is unsigned; with a signed
		// char this label is unreachable. Confirm the intended source encoding.
		case 0x00a0: dst += "&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
///
/// Appends a minimally HTML-escaped copy of wide text to a string.
///
/// `&`, `<`, `>` and U+00A0 are replaced by entity references; quotes and
/// every other character are appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(
	_Inout_ std::basic_string<wchar_t, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'&': dst += L"&amp;"; break;
		case L'<': dst += L"&lt;"; break;
		case L'>': dst += L"&gt;"; break;
		case L'\u00a0': dst += L"&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
///
/// Appends a minimally HTML-escaped copy of a fixed-size array to a string.
///
/// Forwards the array and its element count N to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void escape_min(
	_Inout_ std::basic_string<T, TR, AX>& dst,
	_In_ const T (&src)[N])
{
	escape_min(dst, src, N);
}
///
/// Appends a minimally HTML-escaped copy of a string to another string.
///
/// Forwards the source buffer and its length to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void escape_min(
	_Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
	_In_ const std::basic_string<T, TR_src, AX_src>& src)
{
	escape_min(dst, src.data(), src.size());
}
// Pointer/length routine that appends to `dst` while scanning `src`; the array and string_view
// overloads in this file call it as url_unescape(dst, src, count).
// The loop header has no increment: each branch advances `i` itself.
// NOTE(review): several listing lines of this body are absent here (the switch header, the
// declaration of `chr`, any other cases, the final append), so only the two-hex-digit decoding
// is documented below.
237 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
239 _Inout_ std::basic_string<char, TR, AX>& dst,
240 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
242 stdex_assert(src || !num_chars);
243 for (
size_t i = 0; i < num_chars && src[i];) {
// First hex digit: consumed and stored as the high nibble of `chr`.
// NOTE(review): src[i] is read here without an `i < num_chars` test in the visible lines;
// confirm the omitted lines guard this or that the buffer is always NUL-terminated.
253 if (
'0' <= src[i] && src[i] <=
'9') chr =
static_cast<char>((src[i++] -
'0') << 4);
254 else if (
'A' <= src[i] && src[i] <=
'F') chr =
static_cast<char>((src[i++] -
'A' + 10) << 4);
255 else if (
'a' <= src[i] && src[i] <=
'f') chr =
static_cast<char>((src[i++] -
'a' + 10) << 4);
// Not a hex digit: emit a literal '%' and let the loop re-examine the current character.
256 else { dst +=
'%';
continue; }
// Second hex digit: consumed and OR-ed in as the low nibble.
257 if (
'0' <= src[i] && src[i] <=
'9') chr |=
static_cast<char>((src[i++] -
'0'));
258 else if (
'A' <= src[i] && src[i] <=
'F') chr |=
static_cast<char>((src[i++] -
'A' + 10));
259 else if (
'a' <= src[i] && src[i] <=
'f') chr |=
static_cast<char>((src[i++] -
'a' + 10));
// Only one hex digit followed the '%': emit '%' plus that already-consumed digit unchanged.
260 else { dst +=
'%'; dst += src[i - 1];
continue; }
///
/// Appends the URL-unescaped form of a fixed-size character array to a string.
///
/// Forwards the array and its element count N to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source array
///
template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_unescape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_ const char (&src)[N])
{
	url_unescape(dst, src, N);
}
///
/// Appends the URL-unescaped form of a string view to a string.
///
/// Forwards the view's buffer and length to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source text
///
template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
void url_unescape(
	_Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
	_In_ const std::basic_string_view<char, std::char_traits<char>> src)
{
	url_unescape(dst, src.data(), src.size());
}
///
/// Appends a percent-encoded copy of narrow text to a string.
///
/// The listed delimiter and punctuation characters are replaced by fixed
/// %XX sequences. Any other byte strictly between 0x20 and 0x7f is appended
/// unchanged; everything else (control bytes, 0x7f, bytes >= 0x80) is written
/// as '%' followed by two uppercase hexadecimal digits.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case ' ': dst += "%20"; break;
		case '<': dst += "%3C"; break;
		case '>': dst += "%3E"; break;
		case '#': dst += "%23"; break;
		case '%': dst += "%25"; break;
		case '{': dst += "%7B"; break;
		case '}': dst += "%7D"; break;
		case '|': dst += "%7C"; break;
		case '\\': dst += "%5C"; break;
		case '^': dst += "%5E"; break;
		case '~': dst += "%7E"; break;
		case '[': dst += "%5B"; break;
		case ']': dst += "%5D"; break;
		case '`': dst += "%60"; break;
		case ';': dst += "%3B"; break;
		case '+': dst += "%2B"; break;
		case '/': dst += "%2F"; break;
		case '?': dst += "%3F"; break;
		case ':': dst += "%3A"; break;
		case '@': dst += "%40"; break;
		case '=': dst += "%3D"; break;
		case '&': dst += "%26"; break;
		case '$': dst += "%24"; break;
		default: {
			// Compare as unsigned so bytes >= 0x80 are not mistaken for small negative values.
			const uint8_t byte = static_cast<uint8_t>(src[i]);
			if (0x20 < byte && byte < 0x7f)
				dst += src[i];
			else {
				dst += '%';
				uint8_t n = (byte & 0xf0) >> 4;
				dst += n < 10 ? static_cast<char>('0' + n) : static_cast<char>('A' + n - 10);
				n = byte & 0x0f;
				dst += n < 10 ? static_cast<char>('0' + n) : static_cast<char>('A' + n - 10);
			}
			break;
		}
		}
	}
}
///
/// Appends a percent-encoded copy of a fixed-size character array to a string.
///
/// Forwards the array and its element count N to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source array
///
template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_ const char (&src)[N])
{
	url_escape(dst, src, N);
}
///
/// Appends a percent-encoded copy of a string view to a string.
///
/// Forwards the view's buffer and length to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source text
///
template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
void url_escape(
	_Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
	_In_ const std::basic_string_view<char, std::char_traits<char>> src)
{
	url_escape(dst, src.data(), src.size());
}
// Pointer/length routine that appends to `dst` while scanning `src`; the array and string
// overloads in this file call it as css_unescape(dst, src, count).
// The loop header has no increment: each branch advances `i` itself.
// NOTE(review): many listing lines of this body are absent here (the leading `if`, the switch
// header, most hex-digit case labels, the declaration of `chr`, closing braces), so the
// comments below describe only what the visible lines do.
387 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
389 _Inout_ std::basic_string<T, TR, AX>& dst,
390 _In_reads_or_z_opt_(num_chars)
const T* src, _In_
size_t num_chars)
392 stdex_assert(src || !num_chars);
393 for (
size_t i = 0; i < num_chars && src[i];) {
// Taken only when at least one more character follows the current position.
396 else if (i + 1 < num_chars) {
// Letter escapes: append the matching control character and step past the letter.
401 case 'n': dst +=
'\n'; i++;
break;
402 case 'r': dst +=
'\r'; i++;
break;
403 case 't': dst +=
'\t'; i++;
break;
// A newline at this position is consumed without appending anything.
406 case '\n': i++;
break;
424 case 'F':
case 'f': {
// Hexadecimal escape: examine at most 6 characters from `i`, never past num_chars.
426 size_t end = std::min(num_chars, i + 6);
// Accumulate the value in base 16, one digit per iteration.
428 for (; i < end; ++i) {
429 if (
'0' <= src[i] && src[i] <=
'9') chr = chr * 0x10 + src[i] -
'0';
430 else if (
'A' <= src[i] && src[i] <=
'F') chr = chr * 0x10 + src[i] -
'A' + 10;
431 else if (
'a' <= src[i] && src[i] <=
'f') chr = chr * 0x10 + src[i] -
'a' + 10;
// NOTE(review): the accumulated value is converted to T with no range check in the visible
// lines; a value wider than T would be truncated - confirm this is acceptable.
435 dst +=
static_cast<T
>(chr);
// A space directly after the digits, still inside the 6-character window, gets special handling.
437 if (i < end && src[i] ==
' ') {
// Anything else is appended as-is and skipped.
444 default: dst += src[i++];
///
/// Appends the CSS-unescaped form of a fixed-size array to a string.
///
/// Forwards the array and its element count N to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_unescape(
	_Inout_ std::basic_string<T, TR, AX>& dst,
	_In_ const T (&src)[N])
{
	css_unescape(dst, src, N);
}
///
/// Appends the CSS-unescaped form of a string to another string.
///
/// Forwards the source buffer and its length to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void css_unescape(
	_Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
	_In_ const std::basic_string<T, TR_src, AX_src>& src)
{
	css_unescape(dst, src.data(), src.size());
}
///
/// Appends a CSS-escaped copy of narrow text to a string.
///
/// Backslash, newline, carriage return, tab, double quote and apostrophe are
/// written as backslash escapes (`\\`, `\n`, `\r`, `\t`, `\"`, `\'`); every
/// other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void css_escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '\\': dst += "\\\\"; break;
		case '\n': dst += "\\n"; break;
		case '\r': dst += "\\r"; break;
		case '\t': dst += "\\t"; break;
		case '\"': dst += "\\\""; break;
		case '\'': dst += "\\'"; break;
		default: dst += src[i]; break;
		}
	}
}
///
/// Appends a CSS-escaped copy of wide text to a string.
///
/// Backslash, newline, carriage return, tab, double quote and apostrophe are
/// written as backslash escapes (`\\`, `\n`, `\r`, `\t`, `\"`, `\'`); every
/// other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source text; may be null only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first NUL
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void css_escape(
	_Inout_ std::basic_string<wchar_t, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'\\': dst += L"\\\\"; break;
		case L'\n': dst += L"\\n"; break;
		case L'\r': dst += L"\\r"; break;
		case L'\t': dst += L"\\t"; break;
		case L'\"': dst += L"\\\""; break;
		case L'\'': dst += L"\\'"; break;
		default: dst += src[i]; break;
		}
	}
}
///
/// Appends a CSS-escaped copy of a fixed-size array to a string.
///
/// Forwards the array and its element count N to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_escape(
	_Inout_ std::basic_string<T, TR, AX>& dst,
	_In_ const T (&src)[N])
{
	css_escape(dst, src, N);
}
///
/// Appends a CSS-escaped copy of a string to another string.
///
/// Forwards the source buffer and its length to the pointer/length overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void css_escape(
	_Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
	_In_ const std::basic_string<T, TR_src, AX_src>& src)
{
	css_escape(dst, src.data(), src.size());
}
561 enum class element_t {
677 enum class element_span_t {
693 static element_span_t
span(_In_ element_t code)
695 static element_span_t lookup[] = {
696 element_span_t::needs_end,
697 element_span_t::needs_end,
698 element_span_t::needs_end,
699 element_span_t::needs_end,
700 element_span_t::needs_end,
701 element_span_t::immediate,
702 element_span_t::needs_end,
703 element_span_t::immediate,
704 element_span_t::immediate,
705 element_span_t::needs_end,
706 element_span_t::immediate,
707 element_span_t::needs_end,
708 element_span_t::needs_end,
709 element_span_t::needs_end,
710 element_span_t::end_optional,
711 element_span_t::immediate,
712 element_span_t::needs_end,
713 element_span_t::needs_end,
714 element_span_t::needs_end,
715 element_span_t::needs_end,
716 element_span_t::needs_end,
717 element_span_t::immediate,
718 element_span_t::end_optional,
719 element_span_t::needs_end,
720 element_span_t::end_optional,
721 element_span_t::needs_end,
722 element_span_t::needs_end,
723 element_span_t::needs_end,
724 element_span_t::needs_end,
725 element_span_t::needs_end,
726 element_span_t::end_optional,
727 element_span_t::needs_end,
728 element_span_t::immediate,
729 element_span_t::needs_end,
730 element_span_t::needs_end,
731 element_span_t::needs_end,
732 element_span_t::immediate,
733 element_span_t::needs_end,
734 element_span_t::needs_end,
735 element_span_t::needs_end,
736 element_span_t::needs_end,
737 element_span_t::needs_end,
738 element_span_t::needs_end,
739 element_span_t::needs_end,
740 element_span_t::end_optional,
741 element_span_t::immediate,
742 element_span_t::end_optional,
743 element_span_t::needs_end,
744 element_span_t::needs_end,
745 element_span_t::immediate,
746 element_span_t::immediate,
747 element_span_t::needs_end,
748 element_span_t::immediate,
749 element_span_t::needs_end,
750 element_span_t::needs_end,
751 element_span_t::needs_end,
752 element_span_t::end_optional,
753 element_span_t::immediate,
754 element_span_t::needs_end,
755 element_span_t::needs_end,
756 element_span_t::needs_end,
757 element_span_t::needs_end,
758 element_span_t::immediate,
759 element_span_t::immediate,
760 element_span_t::needs_end,
761 element_span_t::needs_end,
762 element_span_t::needs_end,
763 element_span_t::needs_end,
764 element_span_t::needs_end,
765 element_span_t::needs_end,
766 element_span_t::needs_end,
767 element_span_t::end_optional,
768 element_span_t::end_optional,
769 element_span_t::immediate,
770 element_span_t::end_optional,
771 element_span_t::needs_end,
772 element_span_t::needs_end,
773 element_span_t::immediate,
774 element_span_t::needs_end,
775 element_span_t::needs_end,
776 element_span_t::needs_end,
777 element_span_t::needs_end,
778 element_span_t::needs_end,
779 element_span_t::needs_end,
780 element_span_t::needs_end,
781 element_span_t::needs_end,
782 element_span_t::needs_end,
783 element_span_t::needs_end,
784 element_span_t::needs_end,
785 element_span_t::needs_end,
786 element_span_t::needs_end,
787 element_span_t::end_optional,
788 element_span_t::end_optional,
789 element_span_t::needs_end,
790 element_span_t::end_optional,
791 element_span_t::end_optional,
792 element_span_t::end_optional,
793 element_span_t::needs_end,
794 element_span_t::end_optional,
795 element_span_t::needs_end,
796 element_span_t::needs_end,
797 element_span_t::needs_end,
798 element_span_t::needs_end,
799 element_span_t::immediate,
800 element_span_t::needs_end,
802 return element_t::a <= code && code <= element_t::xmp ?
803 lookup[
static_cast<size_t>(code) -
static_cast<size_t>(element_t::a)] :
804 element_span_t::needs_end;
820 case element_t::strike:
821 case element_t::blink:
823 case element_t::small:
839 case element_t::strong:
841 case element_t::code:
842 case element_t::samp:
845 case element_t::cite:
846 case element_t::abbr:
847 case element_t::acronym:
865 case element_t::applet:
866 case element_t::object:
867 case element_t::embed:
868 case element_t::font:
869 case element_t::basefont:
873 case element_t::script:
878 case element_t::ruby:
879 case element_t::span:
881 case element_t::iframe:
882 case element_t::nobr:
897 case element_t::input:
898 case element_t::select:
899 case element_t::textarea:
900 case element_t::label:
901 case element_t::button:
916 code == element_t::PCDATA ||
954 case element_t::menu:
970 case element_t::listing:
991 case element_t::center:
992 case element_t::marquee:
993 case element_t::noscript:
994 case element_t::noframes:
995 case element_t::noembed:
996 case element_t::blockquote:
997 case element_t::form:
998 case element_t::isindex:
1000 case element_t::table:
1001 case element_t::fieldset:
1002 case element_t::address:
1027 case element_t::title:
1028 case element_t::isindex:
1029 case element_t::base:
1030 case element_t::nextid:
1045 case element_t::script:
1046 case element_t::style:
1047 case element_t::meta:
1048 case element_t::link:
1049 case element_t::object:
1064 case element_t::img:
1065 case element_t::object:
1066 case element_t::applet:
1067 case element_t::embed:
1068 case element_t::big:
1069 case element_t::small:
1070 case element_t::sub:
1071 case element_t::sup:
1072 case element_t::ruby:
1073 case element_t::font:
1074 case element_t::basefont:
1075 case element_t::nobr:
1090 case element_t::head:
1091 case element_t::body:
1092 case element_t::frameset:
1110 case element_t::col:
1111 case element_t::colgroup:
1113 case element_t::dir:
1115 case element_t::frame:
1116 case element_t::iframe:
1117 case element_t::legend:
1137 if (child == element_t::unknown || child == element_t::comment)
1145 case element_t::a:
return is_inline(child) && child != element_t::a;
1146 case element_t::address:
return is_inline(child) || child == element_t::p;
1147 case element_t::applet:
return is_flow(child) || child == element_t::param;
1148 case element_t::area:
return false;
1149 case element_t::base:
return false;
1150 case element_t::basefont:
return false;
1151 case element_t::bdo:
return is_inline(child);
1152 case element_t::blockquote:
return is_flow(child);
1153 case element_t::body:
return is_flow(child) || child == element_t::ins || child == element_t::del;
1154 case element_t::br:
return false;
1155 case element_t::button:
return is_flow(child) && !
is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1156 case element_t::caption:
return is_inline(child);
1157 case element_t::center:
return is_flow(child);
1158 case element_t::col:
return false;
1159 case element_t::colgroup:
return child == element_t::col;
1160 case element_t::comment:
return child == element_t::CDATA;
1161 case element_t::dd:
return is_flow(child);
1162 case element_t::del:
return is_flow(child);
1163 case element_t::dir:
return child == element_t::li;
1164 case element_t::div:
return is_flow(child);
1165 case element_t::dl:
return child == element_t::dt || child == element_t::dd;
1166 case element_t::dt:
return is_inline(child);
1167 case element_t::embed:
return is_flow(child) || child == element_t::param;
1168 case element_t::fieldset:
return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1169 case element_t::font:
return is_inline(child);
1170 case element_t::form:
return is_flow(child) && child != element_t::form;
1171 case element_t::frame:
return false;
1172 case element_t::frameset:
return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1174 case element_t::hr:
return false;
1176 case element_t::iframe:
return is_flow(child);
1177 case element_t::img:
return false;
1178 case element_t::input:
return false;
1179 case element_t::ins:
return is_flow(child);
1180 case element_t::isindex:
return false;
1181 case element_t::label:
return is_inline(child) && child != element_t::label;
1182 case element_t::legend:
return is_inline(child);
1183 case element_t::li:
return is_flow(child);
1184 case element_t::link:
return false;
1185 case element_t::listing:
return child == element_t::CDATA;
1186 case element_t::map:
return is_block(child) || child == element_t::area;
1187 case element_t::marquee:
return is_flow(child);
1188 case element_t::menu:
return child == element_t::li;
1189 case element_t::meta:
return false;
1190 case element_t::nobr:
return is_inline(child) || child == element_t::wbr;
1191 case element_t::noframes:
return (
is_flow(child) || child == element_t::body) && child != element_t::noframes;
1192 case element_t::noscript:
return is_flow(child);
1193 case element_t::noembed:
return is_flow(child);
1194 case element_t::object:
return is_flow(child) || child == element_t::param;
1195 case element_t::ol:
return child == element_t::li;
1196 case element_t::optgroup:
return child == element_t::option;
1197 case element_t::option:
return child == element_t::PCDATA;
1198 case element_t::p:
return is_inline(child);
1199 case element_t::param:
return false;
1200 case element_t::plaintext:
return is_flow(child);
1202 case element_t::q:
return is_inline(child);
1203 case element_t::rt:
return false;
1204 case element_t::ruby:
return is_inline(child);
1205 case element_t::script:
return child == element_t::CDATA;
1206 case element_t::select:
return child == element_t::optgroup || child == element_t::option;
1207 case element_t::span:
return is_inline(child);
1208 case element_t::style:
return child == element_t::CDATA;
1209 case element_t::sub:
return is_inline(child);
1210 case element_t::sup:
return is_inline(child);
1211 case element_t::table:
return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1212 case element_t::tbody:
return child == element_t::tr;
1213 case element_t::td:
return is_flow(child);
1214 case element_t::textarea:
return child == element_t::PCDATA;
1215 case element_t::tfoot:
return child == element_t::tr;
1216 case element_t::th:
return is_flow(child);
1217 case element_t::thead:
return child == element_t::tr;
1218 case element_t::title:
return child == element_t::PCDATA;
1219 case element_t::tr:
return child == element_t::td || child == element_t::th;
1220 case element_t::ul:
return child == element_t::li;
1221 case element_t::wbr:
return false;
1222 case element_t::unknown:
return true;
1223 default:
return false;
1235 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars)
const T* attr_name, _In_
size_t num_chars)
1237 stdex_assert(attr_name || !num_chars);
1239 case element_t::a:
return stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX) == 0;
1240 case element_t::applet:
return stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) == 0 ||
1241 stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) == 0 ||
1242 stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0;
1243 case element_t::area:
return stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX) == 0;
1244 case element_t::base:
return stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX) == 0;
1245 case element_t::bgsound:
return stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0;
1246 case element_t::blockquote:
return stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX) == 0;
1247 case element_t::body:
return stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX) == 0;
1248 case element_t::comment:
return stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX) == 0;
1249 case element_t::del:
return stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX) == 0;
1250 case element_t::embed:
return stdex::strnicmp(attr_name, num_chars,
"pluginspage", SIZE_MAX) == 0 ||
1251 stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0;
1252 case element_t::form:
return stdex::strnicmp(attr_name, num_chars,
"action", SIZE_MAX) == 0;
1253 case element_t::frame:
return stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) == 0 ||
1254 stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0;
1255 case element_t::head:
return stdex::strnicmp(attr_name, num_chars,
"profile", SIZE_MAX) == 0;
1256 case element_t::iframe:
return stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) == 0 ||
1257 stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0;
1258 case element_t::img:
return stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) == 0 ||
1259 stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) == 0 ||
1260 stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0 ||
1261 stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX) == 0;
1262 case element_t::input:
return stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) == 0 ||
1263 stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0 ||
1264 stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX) == 0;
1265 case element_t::ins:
return stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX) == 0;
1266 case element_t::link:
return stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX) == 0;
1267 case element_t::object:
return stdex::strnicmp(attr_name, num_chars,
"basehref", SIZE_MAX) == 0 ||
1268 stdex::strnicmp(attr_name, num_chars,
"classid", SIZE_MAX) == 0 ||
1269 stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) == 0 ||
1270 stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) == 0 ||
1271 stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX) == 0 ||
1272 stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX) == 0;
1273 case element_t::q:
return stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX) == 0;
1274 case element_t::script:
return stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) == 0;
1275 case element_t::table:
return stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX) == 0;
1276 case element_t::td:
return stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX) == 0;
1277 case element_t::th:
return stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX) == 0;
1278 default:
return false;
1292 stdex_assert(attr_name || !num_chars);
1293 if (stdex::strnicmp(attr_name, num_chars,
"title", SIZE_MAX) == 0)
1296 case element_t::applet:
return stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX) == 0;
1297 case element_t::area:
return stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX) == 0;
1298 case element_t::img:
return stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX) == 0;
1299 case element_t::input:
return stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX) == 0;
1300 case element_t::object:
return stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX) == 0;
1301 case element_t::table:
return stdex::strnicmp(attr_name, num_chars,
"summary", SIZE_MAX) == 0;
1302 case element_t::td:
return stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX) == 0;
1303 case element_t::th:
return stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX) == 0;
1304 default:
return false;
1310 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1318 stdex::parser::html_sequence_t
type;
1322 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_
size_t start = 0,
size_t end = 0, _In_opt_
sequence* _parent =
nullptr) :
1339 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1340 code(element_code(src + tag.name.start, tag.name.size())),
1341 name(std::move(tag.name)),
1346 static element_t element_code(_In_reads_z_(num_chars)
const T*
name,
size_t num_chars)
1348 static const struct {
1352 {
"a", element_t::a, },
1353 {
"abbr", element_t::abbr, },
1354 {
"acronym", element_t::acronym, },
1355 {
"address", element_t::address, },
1356 {
"applet", element_t::applet, },
1357 {
"area", element_t::area, },
1358 {
"b", element_t::b, },
1359 {
"base", element_t::base, },
1360 {
"basefont", element_t::basefont, },
1361 {
"bdo", element_t::bdo, },
1362 {
"bgsound", element_t::bgsound, },
1363 {
"big", element_t::big, },
1364 {
"blink", element_t::blink, },
1365 {
"blockquote", element_t::blockquote, },
1366 {
"body", element_t::body, },
1367 {
"br", element_t::br, },
1368 {
"button", element_t::button, },
1369 {
"caption", element_t::caption, },
1370 {
"center", element_t::center, },
1371 {
"cite", element_t::cite, },
1372 {
"code", element_t::code, },
1373 {
"col", element_t::col, },
1374 {
"colgroup", element_t::colgroup, },
1375 {
"comment", element_t::comment, },
1376 {
"dd", element_t::dd, },
1377 {
"del", element_t::del, },
1378 {
"dfn", element_t::dfn, },
1379 {
"dir", element_t::dir, },
1380 {
"div", element_t::div, },
1381 {
"dl", element_t::dl, },
1382 {
"dt", element_t::dt, },
1383 {
"em", element_t::em, },
1384 {
"embed", element_t::embed, },
1385 {
"fieldset", element_t::fieldset, },
1386 {
"font", element_t::font, },
1387 {
"form", element_t::form, },
1388 {
"frame", element_t::frame, },
1389 {
"frameset", element_t::frameset, },
1390 {
"h1", element_t::h1, },
1391 {
"h2", element_t::h2, },
1392 {
"h3", element_t::h3, },
1393 {
"h4", element_t::h4, },
1394 {
"h5", element_t::h5, },
1395 {
"h6", element_t::h6, },
1396 {
"head", element_t::head, },
1397 {
"hr", element_t::hr, },
1398 {
"html", element_t::html, },
1399 {
"i", element_t::i, },
1400 {
"iframe", element_t::iframe, },
1401 {
"img", element_t::img, },
1402 {
"input", element_t::input, },
1403 {
"ins", element_t::ins, },
1404 {
"isindex", element_t::isindex, },
1405 {
"kbd", element_t::kbd, },
1406 {
"label", element_t::label, },
1407 {
"legend", element_t::legend, },
1408 {
"li", element_t::li, },
1409 {
"link", element_t::link, },
1410 {
"listing", element_t::listing, },
1411 {
"map", element_t::map, },
1412 {
"marquee", element_t::marquee, },
1413 {
"menu", element_t::menu, },
1414 {
"meta", element_t::meta, },
1415 {
"nextid", element_t::nextid, },
1416 {
"nobr", element_t::nobr, },
1417 {
"noembed", element_t::noembed, },
1418 {
"noframes", element_t::noframes, },
1419 {
"noscript", element_t::noscript, },
1420 {
"object", element_t::object, },
1421 {
"ol", element_t::ol, },
1422 {
"optgroup", element_t::optgroup, },
1423 {
"option", element_t::option, },
1424 {
"p", element_t::p, },
1425 {
"param", element_t::param, },
1426 {
"plaintext", element_t::plaintext, },
1427 {
"pre", element_t::pre, },
1428 {
"q", element_t::q, },
1429 {
"rt", element_t::rt, },
1430 {
"ruby", element_t::ruby, },
1431 {
"s", element_t::s, },
1432 {
"samp", element_t::samp, },
1433 {
"script", element_t::script, },
1434 {
"select", element_t::select, },
1435 {
"small", element_t::small, },
1436 {
"span", element_t::span, },
1437 {
"strike", element_t::strike, },
1438 {
"strong", element_t::strong, },
1439 {
"style", element_t::style, },
1440 {
"sub", element_t::sub, },
1441 {
"sup", element_t::sup, },
1442 {
"table", element_t::table, },
1443 {
"tbody", element_t::tbody, },
1444 {
"td", element_t::td, },
1445 {
"textarea", element_t::textarea, },
1446 {
"tfoot", element_t::tfoot, },
1447 {
"th", element_t::th, },
1448 {
"thead", element_t::thead, },
1449 {
"title", element_t::title, },
1450 {
"tr", element_t::tr, },
1451 {
"tt", element_t::tt, },
1452 {
"u", element_t::u, },
1453 {
"ul", element_t::ul, },
1454 {
"var", element_t::var, },
1455 {
"wbr", element_t::wbr, },
1456 {
"xmp", element_t::xmp, },
1460 for (
size_t i = 1; i < _countof(
mapping); i++)
1462 for (
size_t i = 0; i < _countof(
mapping); i++) {
1463 for (
size_t j = 0;
mapping[i].name[j]; j++)
1467 for (
size_t i = 0, j = _countof(
mapping); i < j; ) {
1468 size_t m = (i + j) / 2;
1470 for (
size_t i1 = 0, i2 = 0;;) {
1472 r = i2 >= num_chars || !
name[i2] ? 0 : -1;
1475 if (i2 >= num_chars || !
name[i2]) {
1480 auto chr =
static_cast<char>(stdex::tolower(
name[i2++]));
1499 return element_t::unknown;
1534 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1535 code(element::element_code(src + tag.name.start, tag.name.size())),
1536 name(std::move(tag.name)),
1554 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1555 name(std::move(tag.name)),
1572 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1588 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1599 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1609 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1615 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1657 void append(_In_reads_or_z_opt_(num_chars)
const T*
source, _In_
size_t num_chars)
1659 stdex_assert(
source || !num_chars);
1666 if (m_condition_end.match(
source, i, num_chars)) {
1668 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1679 if (m_condition_end.match(
source, i, num_chars)) {
1696 if (m_condition_start.match(
source, i, num_chars)) {
1697 auto condition_src(
replace_entities(
source + m_condition_start.condition.start, m_condition_start.condition.size()));
1698 if (stdex::strncmp(condition_src.data(), condition_src.size(),
"CDATA", SIZE_MAX) == 0)
1700 else if (stdex::strncmp(condition_src.data(), condition_src.size(),
"RCDATA", SIZE_MAX) == 0)
1704 else if (stdex::strncmp(condition_src.data(), condition_src.size(),
"IGNORE", SIZE_MAX) == 0)
1715 stdex_assert(parent);
1716 if (m_tag.match(
source, i, num_chars) &&
1717 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1718 element::element_code(
source + m_tag.name.start, m_tag.name.size()) == parent->code)
1723 std::unique_ptr<element_end> e(
new element_end(std::move(m_tag),
source, parent->parent, parent));
1724 parent->end = e.get();
1733 if (m_tag.match(
source, i, num_chars)) {
1738 switch (m_tag.type) {
1739 case stdex::parser::html_sequence_t::element:
1740 case stdex::parser::html_sequence_t::element_start: {
1741 std::unique_ptr<element> e(
1742 m_tag.type == stdex::parser::html_sequence_t::element ?
new element(std::move(m_tag),
source) :
1743 m_tag.type == stdex::parser::html_sequence_t::element_start ?
new element_start(std::move(m_tag),
source) :
1749 stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1751 e->parent = starting_tag;
1754 e->parent = starting_tag->parent;
1755 starting_tag->end = e.get();
1759 if (e->type == stdex::parser::html_sequence_t::element_start) {
1762 e_start->
end = e.get();
1766 case element_t::code:
1767 case element_t::comment:
1768 case element_t::script:
1769 case element_t::style:
1777 if (e->code == element_t::meta &&
m_charset == stdex::charset_id::system) {
1778 bool is_content_type =
false;
1780 for (
auto& attr : e->attributes) {
1781 if (stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"http-equiv", SIZE_MAX) == 0 &&
1782 stdex::strnicmp(
source + attr.value.start, attr.value.size(),
"content-type", SIZE_MAX) == 0)
1783 is_content_type =
true;
1784 else if (stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"content", SIZE_MAX) == 0)
1785 content_attr = &attr;
1787 if (is_content_type && content_attr) {
1796 str.push_back(
static_cast<char>(
source[j]));
1797 m_charset = stdex::charset_from_name(str);
1805 case stdex::parser::html_sequence_t::element_end: {
1810 stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1811 if (starting_tag->code == e->code ||
1812 (starting_tag->code == element_t::unknown && e->code == element_t::unknown && stdex::strnicmp(
source + starting_tag->name.start, starting_tag->name.size(),
source + e->name.start, e->name.size()) == 0))
1814 e->start = starting_tag;
1815 e->parent = starting_tag->parent;
1816 starting_tag->end = e.get();
1825 case stdex::parser::html_sequence_t::declaration:
1826 if (m_tag.attributes.size() > 3 &&
1827 stdex::strnicmp(
source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(),
"entity", SIZE_MAX) == 0)
1829 if (stdex::strncmp(
source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(),
"%", SIZE_MAX) == 0 &&
1830 stdex::strncmp(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(),
"SYSTEM", SIZE_MAX) &&
1831 stdex::strncmp(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(),
"PUBLIC", SIZE_MAX))
1834 e->name = m_tag.attributes[2].name;
1835 e->value = std::move(
replace_entities(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1843 case stdex::parser::html_sequence_t::comment:
1846 case stdex::parser::html_sequence_t::instruction:
1850 throw std::invalid_argument(
"unknown tag type");
1857 if (m_any_char.match(
source, i, num_chars)) {
1859 i = m_any_char.interval.end;
1881 void assign(_In_reads_or_z_opt_(num_chars)
const T*
source, _In_
size_t num_chars)
1893 friend class parser<T, TR, AX>;
1907 std::basic_string<T, TR, AX>
replace_entities(_In_reads_or_z_opt_(num_chars)
const T* input, _In_
size_t num_chars)
const
1909 stdex_assert(input || !num_chars);
1910 const size_t num_entities =
m_entities.size();
1912 std::basic_string<T, TR, AX> output;
1913 for (
size_t i = 0; i < num_chars && input[i];) {
1914 if (input[i] ==
'%') {
1915 for (
size_t j = 0; j < num_entities; j++) {
1917 size_t entity_size = e->name.size();
1918 if (i + entity_size + 1 < num_chars &&
1919 stdex::strncmp(input + i + 1,
source + e->name.start, entity_size) == 0 &&
1920 input[i + entity_size + 1] ==
';')
1923 i += entity_size + 2;
1927 throw std::runtime_error(
"undefined entity");
1929 output += input[i++];
1960 enum class token_t {
1971 constexpr size_t token_tag_max =
1980 constexpr char token_tag_start =
'\x12';
1986 constexpr char token_tag_end =
'\x13';
1994 token(_In_ token_t _type = token_t::root, _In_opt_
sequence* _sequence =
nullptr, _In_ uintptr_t _data = 0) :
2000 template<
class T,
class TR,
class AX>
2013 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
2014 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str)
const
2016 size_t n = str.size();
2018 stdex::appendf(str,
"%c%zX%c", stdex::locale_C, token_tag_start,
reinterpret_cast<uintptr_t
>(
this), token_tag_end);
2019 return str.size() - n;
2029 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
2030 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str)
const
2033 return stdex::appendf(str, L
"%c%zX%c", stdex::locale_C,
static_cast<wchar_t>(token_tag_start),
reinterpret_cast<uintptr_t
>(
this),
static_cast<wchar_t>(token_tag_end));
2037 static token* parse_tag(
const T* str,
size_t& offset)
2039 if (str[offset] !=
static_cast<T
>(token_tag_start))
2044 for (end = offset + 1; ; end++) {
2047 if (str[end] == token_tag_end)
2052 token* t =
reinterpret_cast<token*
>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1,
nullptr, 16));
2054 throw std::invalid_argument(
"null token");
2065 using token_vector = std::vector<std::unique_ptr<token>>;
2066 using token_list = std::list<token*>;
2071 enum text_type_flag_t : uint32_t {
2072 has_tokens = 1 << 0,
2081 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2086 _In_ token_t
type = token_t::complete,
2087 _In_reads_or_z_opt_(num_chars)
const T* _text =
nullptr, _In_
size_t num_chars = 0,
2088 _In_ uint32_t _text_type = 0,
2091 text(_text, num_chars),
2095 friend class parser<T, TR, AX>;
2106 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2111 _In_reads_or_z_opt_(num_chars_text)
const T* _text =
nullptr, _In_
size_t num_chars_text = 0,
2112 _In_reads_or_z_opt_(num_chars_name)
const T* _name =
nullptr, _In_
size_t num_chars_name = 0,
2116 _In_ uintptr_t
data = 0) :
2118 name(_name, num_chars_name),
2122 friend class parser<T, TR, AX>;
2132 enum class token_url_t {
2141 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2146 _In_reads_or_z_opt_(num_chars)
const T* _url =
nullptr, _In_
size_t num_chars = 0,
2147 token_url_t _encoding = token_url_t::plain,
2150 url(_url, num_chars),
2154 friend class parser<T, TR, AX>;
2157 std::basic_string<T, TR, AX>
url;
2171 using inserted_token_list = std::list<inserted_token>;
2173 template<
class T,
class TR,
class AX>
2179 _In_reads_or_z_opt_(num_chars)
const stdex::schar_t* url =
nullptr, _In_
size_t num_chars = 0,
2182 m_url(url, stdex::strnlen(url, num_chars)),
2215 t->type == token_t::complete ||
2216 t->type == token_t::starting ||
2217 t->type == token_t::ending ||
2218 t->type == token_t::root);
2220 if (t->text_type & has_tokens) {
2221 const T* root = t->text.data();
2222 for (
size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2223 stdex_assert(root[i] != token_tag_end);
2224 const token* t2 = token::parse_tag(root, i);
2227 case token_t::complete:
2228 case token_t::starting:
2229 case token_t::ending:
2233 case token_t::url: {
2235 switch (t2_url->encoding) {
2236 case token_url_t::plain:
2237 source += t2_url->url;
2239 case token_url_t::sgml:
2240 escape(source, t2_url->url.data(), t2_url->url.size());
2242 case token_url_t::css:
2243 css_escape(source, t2_url->url.data(), t2_url->url.size());
2246 throw std::invalid_argument(
"unsupported URL encoding");
2251 throw std::invalid_argument(
"unsupported token type");
2254 else if (t->text_type & has_text) {
2255 escape_min(source, root[i]);
2259 source += root[i++];
2262 else if (t->text_type & has_text) {
2264 escape_min(source, t->text.data(), t->text.size());
2278 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens, _In_ token_list::const_iterator from)
2280 for (; from != new_tokens.cend(); ++from) {
2282 t->append_tag(source);
2283 active_tokens.push_back(t);
2296 token_list::const_iterator
end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens)
2299 token_list::const_iterator i1, i2;
2300 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2301 if (i2 == new_tokens.cend() || *i1 != *i2) {
2304 for (
auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2306 stdex_assert(t1 && t1->type == token_t::starting);
2309 t2->text.reserve(t1->name.size() + 3);
2312 t2->text += t1->name;
2318 active_tokens.erase(i);
2321 active_tokens.erase(i);
2322 i = active_tokens.cend();
2340 _In_
size_t word_index, _In_
bool after_word,
2341 _Inout_ token_list& active_tokens)
2343 for (
auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2345 stdex_assert(t.token);
2346 if (t.word_index == word_index && t.after_word == after_word) {
2347 if (t.token->type != token_t::ending)
2348 start_tokens(source, active_tokens, t.active_tokens,
end_tokens(source, active_tokens, t.active_tokens));
2349 t.token->append_tag(source);
2350 inserted_tokens.erase(i++);
2363 static void merge(_Inout_ token_list& a, _In_
const token_list& b)
2365 for (
auto i2 = b.begin(); i2 != b.end(); ++i2) {
2367 for (
auto i1 = a.begin(); i1 != a.end(); ++i1) {
2368 if (i1 == a.end()) {
2384 _Unreferenced_(rel);
2405 template <
class T_token>
2410 auto t =
token.get();
2423 template <
class T_token>
2424 size_t append_token(_Inout_ std::unique_ptr<T_token>&&
token, _Inout_ std::basic_string<T, TR, AX>& source)
2461 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<T
>(token_tag_start)) == stdex::npos &&
2462 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<T
>(token_tag_end)) == stdex::npos);
2464 if (s->type == stdex::parser::html_sequence_t::text) {
2465 rel.from = s->interval.start;
2466 token->mapping.push_back(rel);
2467 stdex::sgml2strcat(
token->text,
m_source + s->interval.start, s->interval.size(), 0, rel, &
token->mapping);
2468 rel.to =
token->text.size();
2469 if (!(
token->text_type & has_text) &&
2470 !stdex::isblank(
m_source + s->interval.start, s->interval.size()))
2471 token->text_type |= has_text;
2474 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2477 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ?
static_cast<const element_start*
>(s.get()) :
nullptr;
2479 throw std::invalid_argument(
"<frameset> detected");
2482 size_t offset = s->interval.start;
2483 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element ||
element_traits::span(s_el_start->
code) == element_span_t::immediate ?
2489 if (a.value.empty() ||
2490 stdex::isblank(
m_source + a.value.start, a.value.size()))
2494 t->text.append(
m_source + offset, a.value.start - offset);
2499 stdex::sgml2strcat(t_url->url,
m_source + a.value.start, a.value.size());
2501 t->text_type |= has_tokens;
2502 offset = a.value.end;
2505 t->text.append(
m_source + offset, a.value.start - offset);
2509 has_text | is_title,
2512 t_value->mapping.push_back(rel_value);
2513 stdex::sgml2strcat(t_value->text,
m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2515 t->text_type |= has_tokens;
2516 offset = a.value.end;
2520 t->text.append(
m_source + offset, s->interval.end - offset);
2521 rel.from = s->interval.start;
2522 token->mapping.push_back(rel);
2524 token->text_type |= has_tokens;
2529 if (s_el_start->
code == element_t::address ||
2530 s_el_start->
code == element_t::code ||
2531 s_el_start->
code == element_t::comment ||
2532 s_el_start->
code == element_t::cite ||
2533 s_el_start->
code == element_t::kbd ||
2534 s_el_start->
code == element_t::samp ||
2535 s_el_start->
code == element_t::script ||
2536 s_el_start->
code == element_t::style)
2539 auto s_end = s_el_start->
end;
2540 stdex_assert(s_end);
2542 if (s->interval.end < s_end->interval.start) {
2543 if (s_el_start->
code != element_t::style) {
2544 rel.from = s->interval.start;
2545 token->mapping.push_back(rel);
2549 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2556 auto t =
parse_css(s->interval.end, s_end->interval.start);
2558 rel.from = s->interval.start;
2559 token->mapping.push_back(rel);
2560 rel.to += t->append_tag(
token->text);
2562 token->text_type |= has_tokens;
2569 while (limit != end && limit->get() != s_el_start->
end)
2571 auto t =
parse(limit,
2574 rel.from = s->interval.start;
2575 token->mapping.push_back(rel);
2576 rel.to += t->append_tag(
token->text);
2577 token->text_type |= has_tokens;
2581 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2582 rel.from = s->interval.start;
2583 token->mapping.push_back(rel);
2587 m_source + s->interval.start, s->interval.size(),
2591 token->text_type |= has_tokens;
2596 rel.from = s->interval.start;
2597 token->mapping.push_back(rel);
2601 m_source + s->interval.start, s->interval.size(),
2605 token->text_type |= has_tokens;
2619 std::unique_ptr<text_token<T, TR, AX>>
token(
2627 if (m_css_comment.match(
m_source, start, end)) {
2628 token->text.append(
m_source + start, m_css_comment.interval.end - start);
2629 start = m_css_comment.interval.end;
2631 else if (m_css_cdo.match(
m_source, start, end)) {
2632 token->text.append(
m_source + start, m_css_cdo.interval.end - start);
2633 start = m_css_cdo.interval.end;
2635 else if (m_css_cdc.match(
m_source, start, end)) {
2636 token->text.append(
m_source + start, m_css_cdc.interval.end - start);
2637 start = m_css_cdc.interval.end;
2640 (m_css_import.match(
m_source, start, end) && ((
void)(section = m_css_import.interval), (
void)(content = m_css_import.content),
true)) ||
2641 (m_css_uri.match(
m_source, start, end) && ((
void)(section = m_css_uri.interval), (
void)(content = m_css_uri.content),
true)))
2643 std::unique_ptr<url_token<T, TR, AX>> t_url(
2652 token->text_type |= has_tokens;
2653 start = section.
end;
2655 else if (m_any_char.match(
m_source, start, end)) {
2656 token->text.append(
m_source + start, m_any_char.interval.end - start);
2657 start = m_any_char.interval.end;
HTML declaration.
Definition html.hpp:1550
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1560
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1561
HTML document.
Definition html.hpp:1617
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1944
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1891
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1657
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1941
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1942
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1943
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1938
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1952
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1899
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1937
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1953
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1869
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1907
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1881
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1954
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1948
void clear()
Empties document.
Definition html.hpp:1636
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1936
Ending tag of an HTML element </...>
Definition html.hpp:1530
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1542
element_start * start
Corresponding starting tag.
Definition html.hpp:1543
element_t code
Element code.
Definition html.hpp:1541
Starting tag of an HTML element <...>
Definition html.hpp:1514
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1523
HTML element <.../>
Definition html.hpp:1335
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1504
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1505
element_t code
Element code.
Definition html.hpp:1503
HTML instruction.
Definition html.hpp:1584
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1593
HTML parser.
Definition html.hpp:2175
token_vector m_tokens
HTML token storage.
Definition html.hpp:2672
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2339
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2441
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2668
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2191
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2667
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2296
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2363
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2616
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2278
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2211
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2406
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2673
const T * m_source
HTML source code.
Definition html.hpp:2671
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2670
const bool m_parse_frames
Parse frames.
Definition html.hpp:2669
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2382
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2424
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2395
Base class for HTML sequences.
Definition html.hpp:1316
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1319
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1318
sequence * parent
Parent sequence.
Definition html.hpp:1320
Token representing a starting HTML tag.
Definition html.hpp:2108
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2126
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2125
Token representing part of HTML text.
Definition html.hpp:2083
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2100
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2099
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2098
HTML token base class.
Definition html.hpp:1992
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2061
uintptr_t data
Any user-supplied data.
Definition html.hpp:2062
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2030
token_t type
Token type.
Definition html.hpp:2060
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2014
HTML token representing a URL.
Definition html.hpp:2143
token_url_t encoding
URL encoding.
Definition html.hpp:2158
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2157
Test for any code unit.
Definition parser.hpp:216
Legacy CSS comment end -->
Definition parser.hpp:7451
Legacy CSS comment start <!--
Definition parser.hpp:7413
CSS import directive.
Definition parser.hpp:7665
CSS string.
Definition parser.hpp:7488
URI in CSS.
Definition parser.hpp:7555
End of condition ...]]>
Definition parser.hpp:8336
Start of condition <![condition[...
Definition parser.hpp:8270
Tag.
Definition parser.hpp:8034
MIME content type.
Definition parser.hpp:7749
stdex::interval< size_t > charset
charset position in source
Definition parser.hpp:7761
Progress indicator base class.
Definition progress.hpp:22
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:70
virtual void set(T value)
Set current progress.
Definition progress.hpp:52
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:42
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:687
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1104
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1014
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:928
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1024
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:812
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:982
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1042
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:948
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1235
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:966
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1290
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:860
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1061
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:913
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1087
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:894
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:835
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1135
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:693
HTML entity.
Definition html.hpp:1601
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1603
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1602
Inserted HTML token.
Definition html.hpp:2164
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2168
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2166
size_t word_index
Index of the word the token is anchored to.
Definition html.hpp:2167
token * token
Points to the token.
Definition html.hpp:2165
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:18
Tag attribute.
Definition parser.hpp:8024
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8026