Update the UTF-8 decoder (#1741)

This commit is contained in:
Rangi
2025-07-09 23:13:30 -04:00
committed by GitHub
parent 34cf959c9d
commit a40109e4e4
5 changed files with 62 additions and 52 deletions

View File

@@ -266,14 +266,15 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
} else if (inputIdx < input.length()) { // No match found, but there is some input left
size_t codepointLen = 0;
// This will write the codepoint's value to `output`, little-endian
-for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) {
-if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
+for (uint32_t state = UTF8_ACCEPT, codepoint = 0;
+inputIdx + codepointLen < input.length();) {
+if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == UTF8_REJECT) {
error("Input string is not valid UTF-8");
codepointLen = 1;
break;
}
codepointLen++;
-if (state == 0) {
+if (state == UTF8_ACCEPT) {
break;
}
break;
}
}