From e4f8a7f8cf5eb14edbebb0636a5e87e357c6e784 Mon Sep 17 00:00:00 2001
From: Kiro <mouton.guillaume88@gmail.com>
Date: Thu, 28 Nov 2024 12:07:16 +0100
Subject: [PATCH] improveParseUTF8Performance

---
 core/string/ustring.cpp         | 289 ++++++++++++++++----------------
 tests/core/string/test_string.h |  46 ++++-
 2 files changed, 179 insertions(+), 156 deletions(-)

diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 9e99fc3b2f5..868f7ae995a 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -2087,11 +2087,6 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 		return ERR_INVALID_DATA;
 	}
 
-	String aux;
-
-	int cstr_size = 0;
-	int str_size = 0;
-
 	/* HANDLE BOM (Byte Order Mark) */
 	if (p_len < 0 || p_len >= 3) {
 		bool has_bom = uint8_t(p_utf8[0]) == 0xef && uint8_t(p_utf8[1]) == 0xbb && uint8_t(p_utf8[2]) == 0xbf;
@@ -2104,162 +2099,160 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 		}
 	}
 
-	bool decode_error = false;
-	bool decode_failed = false;
-	{
-		const char *ptrtmp = p_utf8;
-		const char *ptrtmp_limit = p_len >= 0 ? &p_utf8[p_len] : nullptr;
-		int skip = 0;
-		uint8_t c_start = 0;
-		while (ptrtmp != ptrtmp_limit && *ptrtmp) {
-#if CHAR_MIN == 0
-			uint8_t c = *ptrtmp;
-#else
-			uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
-#endif
-
-			if (skip == 0) {
-				if (p_skip_cr && c == '\r') {
-					ptrtmp++;
-					continue;
-				}
-				/* Determine the number of characters in sequence */
-				if ((c & 0x80) == 0) {
-					skip = 0;
-				} else if ((c & 0xe0) == 0xc0) {
-					skip = 1;
-				} else if ((c & 0xf0) == 0xe0) {
-					skip = 2;
-				} else if ((c & 0xf8) == 0xf0) {
-					skip = 3;
-				} else if ((c & 0xfc) == 0xf8) {
-					skip = 4;
-				} else if ((c & 0xfe) == 0xfc) {
-					skip = 5;
-				} else {
-					skip = 0;
-					print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
-					decode_failed = true;
-				}
-				c_start = c;
-
-				if (skip == 1 && (c & 0x1e) == 0) {
-					print_unicode_error(vformat("Overlong encoding (%x ...)", c));
-					decode_error = true;
-				}
-				str_size++;
-			} else {
-				if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) {
-					print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c));
-					decode_error = true;
-				}
-				if (c < 0x80 || c > 0xbf) {
-					print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true);
-					decode_failed = true;
-					skip = 0;
-				} else {
-					--skip;
-				}
-			}
-
-			cstr_size++;
-			ptrtmp++;
-		}
-
-		if (skip) {
-			print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true);
-			decode_failed = true;
-		}
+	if (p_len < 0) {
+		p_len = strlen(p_utf8);
 	}
 
-	if (str_size == 0) {
-		clear();
-		return OK; // empty string
-	}
-
-	resize(str_size + 1);
+	// If all utf8 characters maps to ASCII, then the max size will be p_len, and we add +1 for the null termination.
+	resize(p_len + 1);
 	char32_t *dst = ptrw();
-	dst[str_size] = 0;
 
-	int skip = 0;
-	uint32_t unichar = 0;
-	while (cstr_size) {
-#if CHAR_MIN == 0
-		uint8_t c = *p_utf8;
-#else
-		uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
-#endif
+	Error result = Error::OK;
 
-		if (skip == 0) {
-			if (p_skip_cr && c == '\r') {
-				p_utf8++;
-				continue;
+	const uint8_t *ptrtmp = (uint8_t *)p_utf8;
+	const uint8_t *ptr_limit = (uint8_t *)p_utf8 + p_len;
+
+	while (ptrtmp < ptr_limit && *ptrtmp) {
+		uint8_t c = *ptrtmp;
+
+		if (p_skip_cr && c == '\r') {
+			++ptrtmp;
+			continue;
+		}
+		uint32_t unicode = _replacement_char;
+		uint32_t size = 1;
+
+		if ((c & 0b10000000) == 0) {
+			unicode = c;
+			if (unicode > 0x7F) {
+				unicode = _replacement_char;
+				print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+				result = Error::ERR_INVALID_DATA;
 			}
-			/* Determine the number of characters in sequence */
-			if ((c & 0x80) == 0) {
-				*(dst++) = c;
-				unichar = 0;
-				skip = 0;
-			} else if ((c & 0xe0) == 0xc0) {
-				unichar = (0xff >> 3) & c;
-				skip = 1;
-			} else if ((c & 0xf0) == 0xe0) {
-				unichar = (0xff >> 4) & c;
-				skip = 2;
-			} else if ((c & 0xf8) == 0xf0) {
-				unichar = (0xff >> 5) & c;
-				skip = 3;
-			} else if ((c & 0xfc) == 0xf8) {
-				unichar = (0xff >> 6) & c;
-				skip = 4;
-			} else if ((c & 0xfe) == 0xfc) {
-				unichar = (0xff >> 7) & c;
-				skip = 5;
+		} else if ((c & 0b11100000) == 0b11000000) {
+			if (ptrtmp + 1 >= ptr_limit) {
+				print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
+				result = Error::ERR_INVALID_DATA;
 			} else {
-				*(dst++) = _replacement_char;
-				unichar = 0;
-				skip = 0;
+				uint8_t c2 = *(ptrtmp + 1);
+
+				if ((c2 & 0b11000000) == 0b10000000) {
+					unicode = (uint32_t)((c & 0b00011111) << 6) | (uint32_t)(c2 & 0b00111111);
+
+					if (unicode < 0x80) {
+						unicode = _replacement_char;
+						print_unicode_error(vformat("Overlong encoding (%x %x)", c, c2));
+						result = Error::ERR_INVALID_DATA;
+					} else if (unicode > 0x7FF) {
+						unicode = _replacement_char;
+						print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+						result = Error::ERR_INVALID_DATA;
+					} else {
+						size = 2;
+					}
+				} else {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
+					result = Error::ERR_INVALID_DATA;
+				}
+			}
+		} else if ((c & 0b11110000) == 0b11100000) {
+			uint32_t range_min = (c == 0xE0) ? 0xA0 : 0x80;
+			uint32_t range_max = (c == 0xED) ? 0x9F : 0xBF;
+			uint8_t c2 = (ptrtmp + 1) < ptr_limit ? *(ptrtmp + 1) : 0;
+			uint8_t c3 = (ptrtmp + 2) < ptr_limit ? *(ptrtmp + 2) : 0;
+			bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
+			bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
+
+			if (c2_valid && c3_valid) {
+				unicode = (uint32_t)((c & 0b00001111) << 12) | (uint32_t)((c2 & 0b00111111) << 6) | (uint32_t)(c3 & 0b00111111);
+
+				if (unicode < 0x800) {
+					unicode = _replacement_char;
+					print_unicode_error(vformat("Overlong encoding (%x %x %x)", c, c2, c3));
+					result = Error::ERR_INVALID_DATA;
+				} else if (unicode > 0xFFFF) {
+					unicode = _replacement_char;
+					print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+					result = Error::ERR_INVALID_DATA;
+				} else {
+					size = 3;
+				}
+			} else {
+				if (c2 == 0) {
+					print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
+				} else if (c2_valid == false) {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
+				} else if (c3 == 0) {
+					print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
+				} else {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
+					// The unicode specification, in paragraphe 3.9 "Unicode Encoding Forms" Conformance
+					// state : "Only when a sequence of two or three bytes is a truncated version of a sequence which is
+					// otherwise well-formed to that point, is more than one byte replaced with a single U+FFFD"
+					// So here we replace the first 2 bytes with one single replacement_char.
+					size = 2;
+				}
+
+				result = Error::ERR_INVALID_DATA;
+			}
+		} else if ((c & 0b11111000) == 0b11110000) {
+			uint32_t range_min = (c == 0xF0) ? 0x90 : 0x80;
+			uint32_t range_max = (c == 0xF4) ? 0x8F : 0xBF;
+
+			uint8_t c2 = ((ptrtmp + 1) < ptr_limit) ? *(ptrtmp + 1) : 0;
+			uint8_t c3 = ((ptrtmp + 2) < ptr_limit) ? *(ptrtmp + 2) : 0;
+			uint8_t c4 = ((ptrtmp + 3) < ptr_limit) ? *(ptrtmp + 3) : 0;
+
+			bool c2_valid = c2 && (c2 >= range_min) && (c2 <= range_max);
+			bool c3_valid = c3 && ((c3 & 0b11000000) == 0b10000000);
+			bool c4_valid = c4 && ((c4 & 0b11000000) == 0b10000000);
+
+			if (c2_valid && c3_valid && c4_valid) {
+				unicode = (uint32_t)((c & 0b00000111) << 18) | (uint32_t)((c2 & 0b00111111) << 12) | (uint32_t)((c3 & 0b00111111) << 6) | (uint32_t)(c4 & 0b00111111);
+
+				if (unicode < 0x10000) {
+					unicode = _replacement_char;
+					print_unicode_error(vformat("Overlong encoding (%x %x %x %x)", c, c2, c3, c4));
+					result = Error::ERR_INVALID_DATA;
+				} else if (unicode > 0x10FFFF) {
+					unicode = _replacement_char;
+					print_unicode_error(vformat("Invalid unicode codepoint (%d)", unicode), true);
+					result = Error::ERR_INVALID_DATA;
+				} else {
+					size = 4;
+				}
+			} else {
+				if (c2 == 0) {
+					print_unicode_error(vformat("Missing %x UTF-8 continuation byte", c), true);
+				} else if (c2_valid == false) {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x", c2, c));
+				} else if (c3 == 0) {
+					print_unicode_error(vformat("Missing %x %x UTF-8 continuation byte", c, c2), true);
+				} else if (c3_valid == false) {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x", c3, c, c2));
+					size = 2;
+				} else if (c4 == 0) {
+					print_unicode_error(vformat("Missing %x %x %x UTF-8 continuation byte", c, c2, c3), true);
+				} else {
+					print_unicode_error(vformat("Byte %x is not a correct continuation byte after %x %x %x", c4, c, c2, c3));
+					size = 3;
+				}
+
+				result = Error::ERR_INVALID_DATA;
 			}
 		} else {
-			if (c < 0x80 || c > 0xbf) {
-				*(dst++) = _replacement_char;
-				skip = 0;
-			} else {
-				unichar = (unichar << 6) | (c & 0x3f);
-				--skip;
-				if (skip == 0) {
-					if (unichar == 0) {
-						print_unicode_error("NUL character", true);
-						decode_failed = true;
-						unichar = _replacement_char;
-					} else if ((unichar & 0xfffff800) == 0xd800) {
-						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
-						decode_failed = true;
-						unichar = _replacement_char;
-					} else if (unichar > 0x10ffff) {
-						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
-						decode_failed = true;
-						unichar = _replacement_char;
-					}
-					*(dst++) = unichar;
-				}
-			}
+			print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
+			result = Error::ERR_INVALID_DATA;
 		}
 
-		cstr_size--;
-		p_utf8++;
-	}
-	if (skip) {
-		*(dst++) = 0x20;
+		(*dst++) = unicode;
+		ptrtmp += size;
 	}
 
-	if (decode_failed) {
-		return ERR_INVALID_DATA;
-	} else if (decode_error) {
-		return ERR_PARSE_ERROR;
-	} else {
-		return OK;
-	}
+	(*dst++) = 0;
+	resize(dst - ptr());
+
+	return result;
 }
 
 CharString String::utf8() const {
diff --git a/tests/core/string/test_string.h b/tests/core/string/test_string.h
index 9adc97e8450..d01c9c043ab 100644
--- a/tests/core/string/test_string.h
+++ b/tests/core/string/test_string.h
@@ -166,11 +166,11 @@ TEST_CASE("[String] UTF8 with CR") {
 	CHECK(no_cr == base.replace("\r", ""));
 }
 
-TEST_CASE("[String] Invalid UTF8 (non-standard)") {
+TEST_CASE("[String] Invalid UTF8 (non shortest form sequence)") {
 	ERR_PRINT_OFF
-	static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
-	//                               +     +2                +2                +2                +3                      overlong +3             unpaired +2
-	static const char32_t u32str[] = { 0x45, 0x304A, 0x3088, 0x3046, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.8.
+	static const uint8_t u8str[] = { 0xC0, 0xAF, 0xE0, 0x80, 0xBF, 0xF0, 0x81, 0x82, 0x41, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
 	String s;
 	Error err = s.parse_utf8((const char *)u8str);
 	CHECK(err == ERR_INVALID_DATA);
@@ -181,11 +181,41 @@ TEST_CASE("[String] Invalid UTF8 (non-standard)") {
 	ERR_PRINT_ON
 }
 
-TEST_CASE("[String] Invalid UTF8 (unrecoverable)") {
+TEST_CASE("[String] Invalid UTF8 (ill formed sequences for surrogates)") {
 	ERR_PRINT_OFF
-	static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0x8F, 0xE3, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xC0, 0x80, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 };
-	//                               +     +2                inv   +2    inv   inv   inv   +2                +2                ovl NUL +1  +3                      overlong +3             unpaired +2
-	static const char32_t u32str[] = { 0x45, 0x304A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x3088, 0x3046, 0xFFFD, 0x1F3A4, 0x20AC, 0xFFFD, 0 };
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.9.
+	static const uint8_t u8str[] = { 0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF, 0xED, 0xAF, 0x41, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
+	String s;
+	Error err = s.parse_utf8((const char *)u8str);
+	CHECK(err == ERR_INVALID_DATA);
+	CHECK(s == u32str);
+
+	CharString cs = (const char *)u8str;
+	CHECK(String::utf8(cs) == s);
+	ERR_PRINT_ON
+}
+
+TEST_CASE("[String] Invalid UTF8 (other ill formed sequences)") {
+	ERR_PRINT_OFF
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.10.
+	static const uint8_t u8str[] = { 0xF4, 0x91, 0x92, 0x93, 0xFF, 0x41, 0x80, 0xBF, 0x42, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0xFFFD, 0xFFFD, 0x42, 0 };
+	String s;
+	Error err = s.parse_utf8((const char *)u8str);
+	CHECK(err == ERR_INVALID_DATA);
+	CHECK(s == u32str);
+
+	CharString cs = (const char *)u8str;
+	CHECK(String::utf8(cs) == s);
+	ERR_PRINT_ON
+}
+
+TEST_CASE("[String] Invalid UTF8 (truncated sequences)") {
+	ERR_PRINT_OFF
+	// Examples from the unicode standard : 3.9 Unicode Encoding Forms - Table 3.11.
+	static const uint8_t u8str[] = { 0xE1, 0x80, 0xE2, 0xF0, 0x91, 0x92, 0xF1, 0xBF, 0x41, 0 };
+	static const char32_t u32str[] = { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x41, 0 };
 	String s;
 	Error err = s.parse_utf8((const char *)u8str);
 	CHECK(err == ERR_INVALID_DATA);