From e4f8a7f8cf5eb14edbebb0636a5e87e357c6e784 Mon Sep 17 00:00:00 2001
From: Kiro
Date: Thu, 28 Nov 2024 12:07:16 +0100
Subject: [PATCH] improveParseUTF8Performance

---
 core/string/ustring.cpp         | 292 ++++++++++++++++----------------
 tests/core/string/test_string.h |  46 ++++-
 2 files changed, 182 insertions(+), 156 deletions(-)

diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 9e99fc3b2f5..868f7ae995a 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -2087,11 +2087,6 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 		return ERR_INVALID_DATA;
 	}
 
-	String aux;
-
-	int cstr_size = 0;
-	int str_size = 0;
-
 	/* HANDLE BOM (Byte Order Mark) */
 	if (p_len < 0 || p_len >= 3) {
 		bool has_bom = uint8_t(p_utf8[0]) == 0xef && uint8_t(p_utf8[1]) == 0xbb && uint8_t(p_utf8[2]) == 0xbf;
@@ -2104,162 +2099,163 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 		}
 	}
 
-	bool decode_error = false;
-	bool decode_failed = false;
-	{
-		const char *ptrtmp = p_utf8;
-		const char *ptrtmp_limit = p_len >= 0 ? &p_utf8[p_len] : nullptr;
-		int skip = 0;
-		uint8_t c_start = 0;
-		while (ptrtmp != ptrtmp_limit && *ptrtmp) {
-#if CHAR_MIN == 0
-			uint8_t c = *ptrtmp;
-#else
-			uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
-#endif
-
-			if (skip == 0) {
-				if (p_skip_cr && c == '\r') {
-					ptrtmp++;
-					continue;
-				}
-				/* Determine the number of characters in sequence */
-				if ((c & 0x80) == 0) {
-					skip = 0;
-				} else if ((c & 0xe0) == 0xc0) {
-					skip = 1;
-				} else if ((c & 0xf0) == 0xe0) {
-					skip = 2;
-				} else if ((c & 0xf8) == 0xf0) {
-					skip = 3;
-				} else if ((c & 0xfc) == 0xf8) {
-					skip = 4;
-				} else if ((c & 0xfe) == 0xfc) {
-					skip = 5;
-				} else {
-					skip = 0;
-					print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
-					decode_failed = true;
-				}
-				c_start = c;
-
-				if (skip == 1 && (c & 0x1e) == 0) {
-					print_unicode_error(vformat("Overlong encoding (%x ...)", c));
-					decode_error = true;
-				}
-				str_size++;
-			} else {
-				if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) {
-					print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c));
-					decode_error = true;
-				}
-				if (c < 0x80 || c > 0xbf) {
-					print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true);
-					decode_failed = true;
-					skip = 0;
-				} else {
-					--skip;
-				}
-			}
-
-			cstr_size++;
-			ptrtmp++;
-		}
-
-		if (skip) {
-			print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true);
-			decode_failed = true;
-		}
+	if (p_len < 0) {
+		p_len = strlen(p_utf8);
 	}
 
-	if (str_size == 0) {
-		clear();
-		return OK; // empty string
-	}
-
-	resize(str_size + 1);
+	// Every UTF-8 sequence yields at most one code point, so p_len (+1 for the null terminator) is an upper bound, reached when the input is pure ASCII.
+	resize(p_len + 1);
 	char32_t *dst = ptrw();
-	dst[str_size] = 0;
 
-	int skip = 0;
-	uint32_t unichar = 0;
-	while (cstr_size) {
-#if CHAR_MIN == 0
-		uint8_t c = *p_utf8;
-#else
-		uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
-#endif
+	Error result = Error::OK;
 
-		if (skip == 0) {
-			if (p_skip_cr && c == '\r') {
-				p_utf8++;
-				continue;
+	const uint8_t *ptrtmp = (const uint8_t *)p_utf8;
+	const uint8_t *ptr_limit = (const uint8_t *)p_utf8 + p_len;
+
+	while (ptrtmp < ptr_limit && *ptrtmp) {
+		uint8_t c = *ptrtmp;
+
+		if (p_skip_cr && c == '\r') {
+			++ptrtmp;
+			continue;
+		}
+		uint32_t unicode = _replacement_char;
+		uint32_t size = 1;
+
+		if ((c & 0b10000000) == 0) {
+			unicode = c;
+			if (unicode > 0x7F) {
+				print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+				unicode = _replacement_char;
+				result = Error::ERR_INVALID_DATA;
 			}
-			/* Determine the number of characters in sequence */
-			if ((c & 0x80) == 0) {
-				*(dst++) = c;
-				unichar = 0;
-				skip = 0;
-			} else if ((c & 0xe0) == 0xc0) {
-				unichar = (0xff >> 3) & c;
-				skip = 1;
-			} else if ((c & 0xf0) == 0xe0) {
-				unichar = (0xff >> 4) & c;
-				skip = 2;
-			} else if ((c & 0xf8) == 0xf0) {
-				unichar = (0xff >> 5) & c;
-				skip = 3;
-			} else if ((c & 0xfc) == 0xf8) {
-				unichar = (0xff >> 6) & c;
-				skip = 4;
-			} else if ((c & 0xfe) == 0xfc) {
-				unichar = (0xff >> 7) & c;
-				skip = 5;
+		} else if ((c & 0b11100000) == 0b11000000) {
+			if (ptrtmp + 1 >= ptr_limit) {
+				print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
+				result = Error::ERR_INVALID_DATA;
 			} else {
-				*(dst++) = _replacement_char;
-				unichar = 0;
-				skip = 0;
+				uint8_t c2 = *(ptrtmp + 1);
+
+				if ((c2 & 0b11000000) == 0b10000000) {
+					unicode = (uint32_t)((c & 0b00011111) << 6) | (uint32_t)(c2 & 0b00111111);
+
+					if (unicode < 0x80) {
+						unicode = _replacement_char;
+						print_unicode_error(vformat("Overlong encoding (%x %x)", c, c2));
+						result = Error::ERR_INVALID_DATA;
+					} else if (unicode > 0x7FF) {
+						print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+						unicode = _replacement_char;
+						result = Error::ERR_INVALID_DATA;
+					} else {
+						size = 2;
+					}
+				} else {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
+					result = Error::ERR_INVALID_DATA;
+				}
+			}
+		} else if ((c & 0b11110000) == 0b11100000) {
+			uint32_t range_min = (c == 0xE0) ? 0xA0 : 0x80;
+			uint32_t range_max = (c == 0xED) ? 0x9F : 0xBF;
+			uint8_t c2 = (ptrtmp + 1) < ptr_limit ? *(ptrtmp + 1) : 0;
+			uint8_t c3 = (ptrtmp + 2) < ptr_limit ? *(ptrtmp + 2) : 0;
+			bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
+			bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
+
+			if (c2_valid && c3_valid) {
+				unicode = (uint32_t)((c & 0b00001111) << 12) | (uint32_t)((c2 & 0b00111111) << 6) | (uint32_t)(c3 & 0b00111111);
+
+				if (unicode < 0x800) {
+					unicode = _replacement_char;
+					print_unicode_error(vformat("Overlong encoding (%x %x %x)", c, c2, c3));
+					result = Error::ERR_INVALID_DATA;
+				} else if (unicode > 0xFFFF) {
+					print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+					unicode = _replacement_char;
+					result = Error::ERR_INVALID_DATA;
+				} else {
+					size = 3;
+				}
+			} else {
+				// The Unicode Standard, section 3.9 "Unicode Encoding Forms" (conformance), states:
+				// "Only when a sequence of two or three bytes is a truncated version of a sequence which is
+				// otherwise well-formed to that point, is more than one byte replaced with a single U+FFFD".
+				// So a well-formed 2-byte prefix is consumed as one replacement char, even at the end of the input.
+				if (c2 == 0) {
+					print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
+				} else if (c2_valid == false) {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
+				} else if (c3 == 0) {
+					print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
+					size = 2;
+				} else {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
+					size = 2;
+				}
+
+				result = Error::ERR_INVALID_DATA;
+			}
+		} else if ((c & 0b11111000) == 0b11110000) {
+			uint32_t range_min = (c == 0xF0) ? 0x90 : 0x80;
+			uint32_t range_max = (c == 0xF4) ? 0x8F : 0xBF;
+
+			uint8_t c2 = ((ptrtmp + 1) < ptr_limit) ? *(ptrtmp + 1) : 0;
+			uint8_t c3 = ((ptrtmp + 2) < ptr_limit) ? *(ptrtmp + 2) : 0;
+			uint8_t c4 = ((ptrtmp + 3) < ptr_limit) ? *(ptrtmp + 3) : 0;
+
+			bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
+			bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
+			bool c4_valid = c4 && ((c4 & 0b11000000) == 0b10000000);
+
+			if (c2_valid && c3_valid && c4_valid) {
+				unicode = (uint32_t)((c & 0b00000111) << 18) | (uint32_t)((c2 & 0b00111111) << 12) | (uint32_t)((c3 & 0b00111111) << 6) | (uint32_t)(c4 & 0b00111111);
+
+				if (unicode < 0x10000) {
+					unicode = _replacement_char;
+					print_unicode_error(vformat("Overlong encoding (%x %x %x %x)", c, c2, c3, c4));
+					result = Error::ERR_INVALID_DATA;
+				} else if (unicode > 0x10FFFF) {
+					print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+					unicode = _replacement_char;
+					result = Error::ERR_INVALID_DATA;
+				} else {
+					size = 4;
+				}
+			} else {
+				if (c2 == 0) {
+					print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
+				} else if (c2_valid == false) {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
+				} else if (c3 == 0) {
+					print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
+					size = 2;
+				} else if (c3_valid == false) {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
+					size = 2;
+				} else if (c4 == 0) {
+					print_unicode_error(vformat("Missing %x %x %x UTF-8 continuation byte", c, c2, c3), true);
+					size = 3;
+				} else {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x %x", c4, c, c2, c3));
+					size = 3;
+				}
+
+				result = Error::ERR_INVALID_DATA;
 			}
 		} else {
-			if (c < 0x80 || c > 0xbf) {
-				*(dst++) = _replacement_char;
-				skip = 0;
-			} else {
-				unichar = (unichar << 6) | (c & 0x3f);
-				--skip;
-				if (skip == 0) {
-					if (unichar == 0) {
-						print_unicode_error("NUL character", true);
-						decode_failed = true;
-						unichar = _replacement_char;
-					} else if ((unichar & 0xfffff800) == 0xd800) {
-						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
-						decode_failed = true;
-						unichar = _replacement_char;
-					} else if (unichar > 0x10ffff) {
-						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
-						decode_failed = true;
-						unichar = _replacement_char;
-					}
-					*(dst++) = unichar;
-				}
-			}
+			print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
+			result = Error::ERR_INVALID_DATA;
 		}
 
-		cstr_size--;
-		p_utf8++;
-	}
-	if (skip) {
-		*(dst++) = 0x20;
+		(*dst++) = unicode;
+		ptrtmp += size;
 	}
 
-	if (decode_failed) {
-		return ERR_INVALID_DATA;
-	} else if (decode_error) {
-		return ERR_PARSE_ERROR;
-	} else {
-		return OK;
-	}
+	(*dst++) = 0;
+	resize(dst - ptr());
+
+	return result;
 }
 
 CharString String::utf8() const {
diff --git a/tests/core/string/test_string.h b/tests/core/string/test_string.h
index 9adc97e8450..d01c9c043ab 100644
--- a/tests/core/string/test_string.h
+++ b/tests/core/string/test_string.h
@@ -166,11 +166,11 @@ TEST_CASE("[String] UTF8 with CR") {
 	CHECK(no_cr == base.replace("\r", ""));
 }
 
-TEST_CASE("[String] Invalid UTF8 (non-standard)") {
+TEST_CASE("[String] Invalid UTF8 (non shortest form sequence)") {
 	ERR_PRINT_OFF
-	static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
-	// + +2 +2 +2 +3 overlong +3 unpaired +2
-	static const char32_t u32str[] = { 0x45, 0x304A, 0x3088, 0x3046, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.8.
+	static const uint8_t u8str[] = { 0xC0, 0xAF, 0xE0, 0x80, 0xBF, 0xF0, 0x81, 0x82, 0x41, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
 	String s;
 	Error err = s.parse_utf8((const char *)u8str);
 	CHECK(err == ERR_INVALID_DATA);
@@ -181,11 +181,41 @@ TEST_CASE("[String] Invalid UTF8 (non-standard)") {
 	ERR_PRINT_ON
 }
 
-TEST_CASE("[String] Invalid UTF8 (unrecoverable)") {
+TEST_CASE("[String] Invalid UTF8 (ill formed sequences for surrogates)") {
 	ERR_PRINT_OFF
-	static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0x8F, 0xE3, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xC0, 0x80, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
-	// + +2 inv +2 inv inv inv +2 +2 ovl NUL +1 +3 overlong +3 unpaired +2
-	static const char32_t u32str[] = { 0x45, 0x304A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x3088, 0x3046, 0xFFFD, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.9.
+	static const uint8_t u8str[] = { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF, 0xED, 0xAF, 0x41, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
+	String s;
+	Error err = s.parse_utf8((const char *)u8str);
+	CHECK(err == ERR_INVALID_DATA);
+	CHECK(s == u32str);
+
+	CharString cs = (const char *)u8str;
+	CHECK(String::utf8(cs) == s);
+	ERR_PRINT_ON
+}
+
+TEST_CASE("[String] Invalid UTF8 (other ill formed sequences)") {
+	ERR_PRINT_OFF
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.10.
+	static const uint8_t u8str[] = { 0xF4, 0x91, 0x92, 0x93, 0xFF, 0x41, 0x80, 0xBF, 0x42, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0xFFFD, 0xFFFD, 0x42, 0 };
+	String s;
+	Error err = s.parse_utf8((const char *)u8str);
+	CHECK(err == ERR_INVALID_DATA);
+	CHECK(s == u32str);
+
+	CharString cs = (const char *)u8str;
+	CHECK(String::utf8(cs) == s);
+	ERR_PRINT_ON
+}
+
+TEST_CASE("[String] Invalid UTF8 (truncated sequences)") {
+	ERR_PRINT_OFF
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.11.
+	static const uint8_t u8str[] = { 0xE1, 0x80, 0xE2, 0xF0, 0x91, 0x92, 0xF1, 0xBF, 0x41, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
 	String s;
 	Error err = s.parse_utf8((const char *)u8str);
 	CHECK(err == ERR_INVALID_DATA);