stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023-2024 Amebis
4*/
5
6#pragma once
7
8#include "assert.hpp"
9#include "compat.hpp"
10#include "mapping.hpp"
11#include "sgml_unicode.hpp"
12#include "string.hpp"
13#include <string.h>
14#include <exception>
15#include <string>
16
17#if defined(__GNUC__)
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
20#endif
21
22namespace stdex
23{
25 template <class T>
26 const utf32_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count, utf32_t buf[2])
27 {
28 stdex_assert(entity && count);
29
30 if (count < 2 || entity[0] != '#') {
31 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
33 if (sgml_unicode[m].sgml[0] < entity[0])
34 i = m + 1;
35 else if (sgml_unicode[m].sgml[0] > entity[0])
36 j = m;
37 else {
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
39 if (r < 0)
40 i = m + 1;
41 else if (r > 0)
42 j = m;
43 else {
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return sgml_unicode[m].unicode;
46 }
47 }
48 }
49 return nullptr;
50 }
51
52 buf[0] = entity[1] == 'x' || entity[1] == 'X' ?
53 static_cast<utf32_t>(strtou32(&entity[2], count - 2, nullptr, 16)) :
54 static_cast<utf32_t>(strtou32(&entity[1], count - 1, nullptr, 10));
55 buf[1] = 0;
56 return buf;
57 }
58
59 inline const utf16_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf16_t* buf)
60 {
61 if (!str)
62 return nullptr;
63 for (size_t i = 0, j = 0;; ++i) {
64 if (!str[i]) {
65 buf[j] = 0;
66 return buf;
67 }
68 if (str[i] < 0x10000)
69 buf[j++] = static_cast<utf16_t>(str[i]);
70 else {
71 ucs4_to_surrogate_pair(&buf[j], str[i]);
72 j += 2;
73 }
74 }
75 }
76
77 inline const utf32_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf32_t* buf)
78 {
79 _Unreferenced_(buf);
80 return str;
81 }
82
83 template <class T>
84 const T* sgmlend(
85 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
86 {
87 stdex_assert(str || !count);
88 for (size_t i = 0; i < count; i++) {
89 if (str[i] == ';')
90 return str + i;
91 if (!str[i] || str[i] == '&' || isspace(str[i]))
92 break;
93 }
94 return nullptr;
95 }
97
// Bit flags shared by the SGML conversion functions.
//
// As the `skip` argument of sgml2str*(): entities that decode to a flagged character are left undecoded.
// As the `what` argument of str2sgml*(): flagged characters are always written as entities.

/// str2sgml*(): encode every character, no 7-bit ASCII pass-through; sgmlerr(): do not report non-ASCII characters
constexpr int sgml_full = 1 << 30;
/// Double quote `"`
constexpr int sgml_quot = 1 << 0;
/// Apostrophe `'`
constexpr int sgml_apos = 1 << 1;
/// Both quote characters
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
/// Ampersand `&` (str2sgml*() encodes the ampersand regardless of this flag)
constexpr int sgml_amp = 1 << 2;
/// Less-than `<` and greater-than `>`
constexpr int sgml_lt_gt = 1 << 3;
/// Backslash
constexpr int sgml_bsol = 1 << 4;
/// Dollar sign `$`
constexpr int sgml_dollar = 1 << 5;
/// Percent sign `%`
constexpr int sgml_percnt = 1 << 6;
/// Commercial at `@`
constexpr int sgml_commat = 1 << 7;
/// Number sign `#`
constexpr int sgml_num = 1 << 8;
/// Parentheses `(` and `)`
constexpr int sgml_lpar_rpar = 1 << 9;
/// Curly brackets `{` and `}`
constexpr int sgml_lcub_rcub = 1 << 10;
/// Square brackets `[` and `]`
constexpr int sgml_lsqb_rsqb = 1 << 11;
/// Combination: ampersand, less-than and greater-than
constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;
/// Combination: ampersand and both quote characters
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
/// Combination: ampersand, backslash and both quote characters
constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
116
126 template <class T_from>
127 size_t sgmlerr(
128 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
129 _In_ int what = 0)
130 {
131 stdex_assert(src || !count_src);
132
133 const bool
134 do_ascii = (what & sgml_full) == 0;
135
136 for (size_t i = 0; i < count_src && src[i];) {
137 if (src[i] == '&') {
138 auto end = sgmlend(&src[i + 1], count_src - i - 1);
139 if (end) {
140 utf32_t chr[2];
141 size_t n = end - src - i - 1;
142 auto entity_w = sgml2uni(&src[i + 1], n, chr);
143 if (entity_w) {
144 i = end - src + 1;
145 continue;
146 }
147
148 // Unknown entity.
149 return i;
150 }
151
152 // Unterminated entity.
153 return i;
154 }
155
156 if (do_ascii && !is7bit(src[i])) {
157 // Non-ASCII character
158 return i;
159 }
160 i++;
161 }
162
163 return npos;
164 }
165
174 template <class T_from, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
175 size_t sgmlerr(
176 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
177 _In_ int what = 0)
178 {
179 return sgmlerr(src.data(), src.size(), what);
180 }
181
192 template <class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
193 void sgml2strcat(
194 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
195 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
196 _In_ int skip = 0,
197 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
198 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
199 {
200 stdex_assert(src || !count_src);
201
202 const bool
203 skip_quot = (skip & sgml_quot) == 0,
204 skip_apos = (skip & sgml_apos) == 0,
205 skip_amp = (skip & sgml_amp) == 0,
206 skip_lt_gt = (skip & sgml_lt_gt) == 0,
207 skip_bsol = (skip & sgml_bsol) == 0,
208 skip_dollar = (skip & sgml_dollar) == 0,
209 skip_percnt = (skip & sgml_percnt) == 0,
210 skip_commat = (skip & sgml_commat) == 0,
211 skip_num = (skip & sgml_num) == 0,
212 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
213 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
214 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
215
216 count_src = strnlen(src, count_src);
217 dst.reserve(dst.size() + count_src);
218 for (size_t i = 0; i < count_src;) {
219 if (src[i] == '&') {
220 auto end = sgmlend(&src[i + 1], count_src - i - 1);
221 if (end) {
222 utf32_t chr32[2];
223 stdex_assert(&src[i + 1] <= end);
224 size_t n = static_cast<size_t>(end - src) - i - 1;
225 T_to chr[5];
226 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
227 if (entity_w &&
228 (skip_quot || (entity_w[0] != '"')) &&
229 (skip_apos || (entity_w[0] != '\'')) &&
230 (skip_amp || (entity_w[0] != '&')) &&
231 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
232 (skip_bsol || (entity_w[0] != '\\')) &&
233 (skip_dollar || (entity_w[0] != '$')) &&
234 (skip_percnt || (entity_w[0] != '%')) &&
235 (skip_commat || (entity_w[0] != '@')) &&
236 (skip_num || (entity_w[0] != '#')) &&
237 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
238 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
239 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
240 {
241 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
242 dst.append(entity_w);
243 stdex_assert(src <= end);
244 i = static_cast<size_t>(end - src) + 1;
245 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
246 continue;
247 }
248 }
249 }
250 dst.append(1, src[i++]);
251 }
252 }
253
263 template <class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
264 void sgml2strcat(
265 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
266 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
267 _In_ int skip = 0,
268 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
269 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
270 {
271 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
272 }
273
287 template <class T_to, class T_from>
288 size_t sgml2strcat(
289 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
290 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
291 _In_ int skip = 0,
292 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
293 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
294 {
295 stdex_assert(dst || !count_dst);
296 stdex_assert(src || !count_src);
297
298 static const std::invalid_argument buffer_overrun("buffer overrun");
299 const bool
300 skip_quot = (skip & sgml_quot) == 0,
301 skip_apos = (skip & sgml_apos) == 0,
302 skip_amp = (skip & sgml_amp) == 0,
303 skip_lt_gt = (skip & sgml_lt_gt) == 0,
304 skip_bsol = (skip & sgml_bsol) == 0,
305 skip_dollar = (skip & sgml_dollar) == 0,
306 skip_percnt = (skip & sgml_percnt) == 0,
307 skip_commat = (skip & sgml_commat) == 0,
308 skip_num = (skip & sgml_num) == 0,
309 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
310 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
311 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
312
313 size_t j = strnlen(dst, count_dst);
314 count_src = strnlen(src, count_src);
315 for (size_t i = 0; i < count_src;) {
316 if (src[i] == '&') {
317 auto end = sgmlend(&src[i + 1], count_src - i - 1);
318 if (end) {
319 utf32_t chr32[2];
320 T_to chr[5];
321 size_t n = end - src - i - 1;
322 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
323 if (entity_w &&
324 (skip_quot || (entity_w[0] != '"')) &&
325 (skip_apos || (entity_w[0] != '\'')) &&
326 (skip_amp || (entity_w[0] != '&')) &&
327 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
328 (skip_bsol || (entity_w[0] != '\\')) &&
329 (skip_dollar || (entity_w[0] != '$')) &&
330 (skip_percnt || (entity_w[0] != '%')) &&
331 (skip_commat || (entity_w[0] != '@')) &&
332 (skip_num || (entity_w[0] != '#')) &&
333 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
334 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
335 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
336 {
337 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
338 size_t m = strlen(entity_w);
339 if (j + m >= count_dst)
340 throw buffer_overrun;
341 memcpy(dst + j, entity_w, m * sizeof(*entity_w)); j += m;
342 i = end - src + 1;
343 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
344 continue;
345 }
346 }
347 }
348 if (j + 1 >= count_dst)
349 throw buffer_overrun;
350 dst[j++] = src[i++];
351 }
352 if (j >= count_dst)
353 throw buffer_overrun;
354 dst[j] = 0;
355 return j;
356 }
357
368 template <class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
369 void sgml2strcpy(
370 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
371 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
372 _In_ int skip = 0,
373 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
374 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
375 {
376 dst.clear();
377 if (map)
378 map->clear();
379 sgml2strcat(dst, src, count_src, skip, offset, map);
380 }
381
391 template<class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
392 void sgml2strcpy(
393 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
394 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
395 _In_ int skip = 0,
396 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
397 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
398 {
399 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
400 }
401
415 template <class T_to, class T_from>
416 size_t sgml2strcpy(
417 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
418 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
419 _In_ int skip = 0,
420 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
421 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
422 {
423 stdex_assert(dst || !count_dst);
424 if (count_dst)
425 dst[0] = 0;
426 if (map)
427 map->clear();
428 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
429 }
430
442 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
443 std::basic_string<T_to, TR_to, AX_to> sgml2str(
444 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
445 _In_ int skip = 0,
446 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
447 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
448 {
449 std::basic_string<T_to, TR_to, AX_to> dst;
450 sgml2strcat(dst, src, count_src, skip, offset, map);
451 return dst;
452 }
453
464 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
465 std::basic_string<T_to, TR_to, AX_to> sgml2str(
466 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
467 _In_ int skip = 0,
468 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
469 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
470 {
471 return sgml2str<T_to, T_from, TR_to, AX_to>(src.data(), src.size(), skip, offset, map);
472 }
473
475 inline const char* chr2sgml(_In_reads_or_z_(count) const utf16_t* entity, _In_ size_t count)
476 {
477 stdex_assert(entity && count);
478
479 utf32_t e2;
480 size_t offset;
481 if (count < 2 || !is_surrogate_pair(entity)) {
482 e2 = static_cast<utf32_t>(entity[0]);
483 offset = 1;
484 }
485 else {
486 e2 = surrogate_pair_to_ucs4(entity);
487 offset = 2;
488 }
489 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
490 size_t m = (i + j) / 2;
491 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
492 if (e1 < e2)
493 i = m + 1;
494 else if (e1 > e2)
495 j = m;
496 else {
497 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset);
498 if (r < 0)
499 i = m + 1;
500 else if (r > 0)
501 j = m;
502 else {
503 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset) == 0; m--);
504 return sgml_unicode[unicode_sgml[m]].sgml;
505 }
506 }
507 }
508 return nullptr;
509 }
510
511 inline const char* chr2sgml(_In_reads_or_z_(count) const utf32_t* entity, _In_ size_t count)
512 {
513 stdex_assert(entity && count);
514
515 utf32_t e2 = entity[0];
516 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
517 size_t m = (i + j) / 2;
518 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
519 if (e1 < e2)
520 i = m + 1;
521 else if (e1 > e2)
522 j = m;
523 else {
524 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
525 if (r < 0)
526 i = m + 1;
527 else if (r > 0)
528 j = m;
529 else {
530 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
531 return sgml_unicode[unicode_sgml[m]].sgml;
532 }
533 }
534 }
535 return nullptr;
536 }
537
538 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf16_t* src, _Inout_ size_t& i, _In_ size_t end)
539 {
540 stdex_assert(i < end);
541 if (i + 1 >= end || !is_surrogate_pair(src + i))
542 return src[i++];
543
544 utf32_t unicode = surrogate_pair_to_ucs4(src + i);
545 i += 2;
546 return unicode;
547 }
548
549 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf32_t* src, _Inout_ size_t& i, _In_ size_t end)
550 {
551 _Unreferenced_(end);
552 stdex_assert(i < end);
553 return src[i++];
554 }
556
565 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
566 void str2sgmlcat(
567 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
568 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
569 _In_ int what = 0)
570 {
571 stdex_assert(src || !count_src);
572
573 const bool
574 do_ascii = (what & sgml_full) == 0,
575 do_quot = (what & sgml_quot) == 0,
576 do_apos = (what & sgml_apos) == 0,
577 do_lt_gt = (what & sgml_lt_gt) == 0,
578 do_bsol = (what & sgml_bsol) == 0,
579 do_dollar = (what & sgml_dollar) == 0,
580 do_percnt = (what & sgml_percnt) == 0,
581 do_commat = (what & sgml_commat) == 0,
582 do_num = (what & sgml_num) == 0,
583 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
584 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
585 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
586
587 count_src = strnlen(src, count_src);
588 dst.reserve(dst.size() + count_src);
589 for (size_t i = 0; i < count_src;) {
590 size_t n = glyphlen(src + i, count_src - i);
591 if (n == 1 &&
592 do_ascii && is7bit(src[i]) &&
593 src[i] != '&' &&
594 (do_quot || (src[i] != '"')) &&
595 (do_apos || (src[i] != '\'')) &&
596 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
597 (do_bsol || (src[i] != '\\')) &&
598 (do_dollar || (src[i] != '$')) &&
599 (do_percnt || (src[i] != '%')) &&
600 (do_commat || (src[i] != '@')) &&
601 (do_num || (src[i] != '#')) &&
602 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
603 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
604 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
605 {
606 // 7-bit ASCII and no desire to encode it as an SGML entity.
607 dst.append(1, static_cast<char>(src[i++]));
608 }
609 else {
610 const char* entity = chr2sgml(src + i, n);
611 if (entity) {
612 dst.append(1, '&');
613 dst.append(entity);
614 dst.append(1, ';');
615 i += n;
616 }
617 else if (n == 1) {
618 // Trivial character (1 code unit, 1 glyph), no entity available.
619 if (is7bit(src[i]))
620 dst.append(1, static_cast<char>(src[i++]));
621 else {
622 char tmp[3 + 8 + 1 + 1];
623 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
624 dst.append(tmp);
625 }
626 }
627 else {
628 // Non-trivial character. Decompose.
629 const size_t end = i + n;
630 while (i < end) {
631 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
632 dst.append(1, '&');
633 dst.append(entity);
634 dst.append(1, ';');
635 i++;
636 }
637 else if (is7bit(src[i]))
638 dst.append(1, static_cast<char>(src[i++]));
639 else {
640 char tmp[3 + 8 + 1 + 1];
641 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
642 dst.append(tmp);
643 }
644 }
645 }
646 }
647 }
648 }
649
657 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
658 void str2sgmlcat(
659 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
660 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
661 _In_ int what = 0)
662 {
663 str2sgmlcat(dst, src.data(), src.size(), what);
664 }
665
677 template <class T_from>
678 size_t str2sgmlcat(
679 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
680 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
681 _In_ int what = 0)
682 {
683 stdex_assert(dst || !count_dst);
684 stdex_assert(src || !count_src);
685
686 static const std::invalid_argument buffer_overrun("buffer overrun");
687 const bool
688 do_ascii = (what & sgml_full) == 0,
689 do_quot = (what & sgml_quot) == 0,
690 do_apos = (what & sgml_apos) == 0,
691 do_lt_gt = (what & sgml_lt_gt) == 0,
692 do_bsol = (what & sgml_bsol) == 0,
693 do_dollar = (what & sgml_dollar) == 0,
694 do_percnt = (what & sgml_percnt) == 0,
695 do_commat = (what & sgml_commat) == 0,
696 do_num = (what & sgml_num) == 0,
697 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
698 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
699 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
700
701 size_t j = strnlen(dst, count_dst);
702 count_src = strnlen(src, count_src);
703 for (size_t i = 0; i < count_src;) {
704 size_t n = glyphlen(src + i, count_src - i);
705 if (n == 1 &&
706 do_ascii && is7bit(src[i]) &&
707 src[i] != '&' &&
708 (do_quot || (src[i] != '"')) &&
709 (do_apos || (src[i] != '\'')) &&
710 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
711 (do_bsol || (src[i] != '\\')) &&
712 (do_dollar || (src[i] != '$')) &&
713 (do_percnt || (src[i] != '%')) &&
714 (do_commat || (src[i] != '@')) &&
715 (do_num || (src[i] != '#')) &&
716 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
717 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
718 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
719 {
720 // 7-bit ASCII and no desire to encode it as an SGML entity.
721 if (j + 1 >= count_dst)
722 throw buffer_overrun;
723 dst[j++] = static_cast<char>(src[i++]);
724 }
725 else {
726 const char* entity = chr2sgml(src + i, n);
727 if (entity) {
728 size_t m = strlen(entity);
729 if (j + m + 2 >= count_dst)
730 throw buffer_overrun;
731 dst[j++] = '&';
732 memcpy(dst + j, entity, m * sizeof(char)); j += m;
733 dst[j++] = ';';
734 i += n;
735 }
736 else if (n == 1) {
737 // Trivial character (1 code unit, 1 glyph), no entity available.
738 if (is7bit(src[i])) {
739 if (j + 1 >= count_dst)
740 throw buffer_overrun;
741 dst[j++] = static_cast<char>(src[i++]);
742 }
743 else {
744 char tmp[3 + 8 + 1 + 1];
745 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
746 stdex_assert(m >= 0);
747 if (static_cast<size_t>(m) >= count_dst)
748 throw buffer_overrun;
749 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
750 j += static_cast<size_t>(m);
751 }
752 }
753 else {
754 // Non-trivial character. Decompose.
755 const size_t end = i + n;
756 while (i < end) {
757 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
758 size_t m = strlen(entity);
759 if (j + m + 2 >= count_dst)
760 throw buffer_overrun;
761 dst[j++] = '&';
762 memcpy(dst + j, entity, m * sizeof(char)); j += m;
763 dst[j++] = ';';
764 i++;
765 }
766 else if (is7bit(src[i])) {
767 if (j + 1 >= count_dst)
768 throw buffer_overrun;
769 dst[j++] = static_cast<char>(src[i++]);
770 }
771 else {
772 char tmp[3 + 8 + 1 + 1];
773 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
774 stdex_assert(m >= 0);
775 if (static_cast<size_t>(m) >= count_dst)
776 throw buffer_overrun;
777 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
778 j += static_cast<size_t>(m);
779 }
780 }
781 }
782 }
783 }
784 if (j >= count_dst)
785 throw buffer_overrun;
786 dst[j] = 0;
787 return j;
788 }
789
798 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
799 void str2sgmlcpy(
800 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
801 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
802 _In_ int what = 0)
803 {
804 dst.clear();
805 str2sgmlcat(dst, src, count_src, what);
806 }
807
815 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
816 void str2sgmlcpy(
817 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
818 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
819 _In_ int what = 0)
820 {
821 str2sgmlcpy(dst, src.data(), src.size(), what);
822 }
823
835 template <class T_from>
836 size_t str2sgmlcpy(
837 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
838 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
839 _In_ int what = 0)
840 {
841 stdex_assert(dst || !count_dst);
842 if (count_dst)
843 dst[0] = 0;
844 return str2sgmlcat(dst, count_dst, src, count_src, what);
845 }
846
856 template <class T_from>
857 std::string str2sgml(
858 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
859 _In_ int what = 0)
860 {
861 std::string dst;
862 str2sgmlcat(dst, src, count_src, what);
863 return dst;
864 }
865
874 template <class T_from, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
875 std::string str2sgml(
876 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
877 _In_ int what = 0)
878 {
879 return str2sgml(src.data(), src.size(), what);
880 }
881}
882
883#if defined(__GNUC__)
884#pragma GCC diagnostic pop
885#endif