mirror of
https://github.com/gbdev/rgbds.git
synced 2025-11-20 18:22:07 +00:00
Update the UTF-8 decoder (#1741)
This commit is contained in:
3
include/extern/utf8decoder.hpp
vendored
3
include/extern/utf8decoder.hpp
vendored
@@ -5,6 +5,9 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define UTF8_ACCEPT 0
|
||||||
|
#define UTF8_REJECT 12
|
||||||
|
|
||||||
uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte);
|
uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte);
|
||||||
|
|
||||||
#endif // RGBDS_EXTERN_UTF8DECODER_HPP
|
#endif // RGBDS_EXTERN_UTF8DECODER_HPP
|
||||||
|
|||||||
@@ -266,14 +266,15 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
|
|||||||
} else if (inputIdx < input.length()) { // No match found, but there is some input left
|
} else if (inputIdx < input.length()) { // No match found, but there is some input left
|
||||||
size_t codepointLen = 0;
|
size_t codepointLen = 0;
|
||||||
// This will write the codepoint's value to `output`, little-endian
|
// This will write the codepoint's value to `output`, little-endian
|
||||||
for (uint32_t state = 0, codepoint = 0; inputIdx + codepointLen < input.length();) {
|
for (uint32_t state = UTF8_ACCEPT, codepoint = 0;
|
||||||
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
|
inputIdx + codepointLen < input.length();) {
|
||||||
|
if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == UTF8_REJECT) {
|
||||||
error("Input string is not valid UTF-8");
|
error("Input string is not valid UTF-8");
|
||||||
codepointLen = 1;
|
codepointLen = 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
codepointLen++;
|
codepointLen++;
|
||||||
if (state == 0) {
|
if (state == UTF8_ACCEPT) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2708,26 +2708,26 @@ static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName) {
|
|||||||
static size_t strlenUTF8(std::string const &str, bool printErrors) {
|
static size_t strlenUTF8(std::string const &str, bool printErrors) {
|
||||||
char const *ptr = str.c_str();
|
char const *ptr = str.c_str();
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
uint32_t state = 0;
|
uint32_t state = UTF8_ACCEPT;
|
||||||
|
|
||||||
for (uint32_t codepoint = 0; *ptr; ptr++) {
|
for (uint32_t codepoint = 0; *ptr; ptr++) {
|
||||||
uint8_t byte = *ptr;
|
uint8_t byte = *ptr;
|
||||||
|
|
||||||
switch (decode(&state, &codepoint, byte)) {
|
switch (decode(&state, &codepoint, byte)) {
|
||||||
case 1:
|
case UTF8_REJECT:
|
||||||
if (printErrors) {
|
if (printErrors) {
|
||||||
errorInvalidUTF8Byte(byte, "STRLEN");
|
errorInvalidUTF8Byte(byte, "STRLEN");
|
||||||
}
|
}
|
||||||
state = 0;
|
state = UTF8_ACCEPT;
|
||||||
// fallthrough
|
// fallthrough
|
||||||
case 0:
|
case UTF8_ACCEPT:
|
||||||
len++;
|
len++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for partial code point.
|
// Check for partial code point.
|
||||||
if (state != 0) {
|
if (state != UTF8_ACCEPT) {
|
||||||
if (printErrors) {
|
if (printErrors) {
|
||||||
error("STRLEN: Incomplete UTF-8 character");
|
error("STRLEN: Incomplete UTF-8 character");
|
||||||
}
|
}
|
||||||
@@ -2740,18 +2740,18 @@ static size_t strlenUTF8(std::string const &str, bool printErrors) {
|
|||||||
static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) {
|
static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) {
|
||||||
char const *ptr = str.c_str();
|
char const *ptr = str.c_str();
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
uint32_t state = 0;
|
uint32_t state = UTF8_ACCEPT;
|
||||||
uint32_t codepoint = 0;
|
uint32_t codepoint = 0;
|
||||||
uint32_t curIdx = 0;
|
uint32_t curIdx = 0;
|
||||||
|
|
||||||
// Advance to starting index in source string.
|
// Advance to starting index in source string.
|
||||||
while (ptr[index] && curIdx < start) {
|
while (ptr[index] && curIdx < start) {
|
||||||
switch (decode(&state, &codepoint, ptr[index])) {
|
switch (decode(&state, &codepoint, ptr[index])) {
|
||||||
case 1:
|
case UTF8_REJECT:
|
||||||
errorInvalidUTF8Byte(ptr[index], "STRSLICE");
|
errorInvalidUTF8Byte(ptr[index], "STRSLICE");
|
||||||
state = 0;
|
state = UTF8_ACCEPT;
|
||||||
// fallthrough
|
// fallthrough
|
||||||
case 0:
|
case UTF8_ACCEPT:
|
||||||
curIdx++;
|
curIdx++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -2773,11 +2773,11 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
|
|||||||
// Advance to ending index in source string.
|
// Advance to ending index in source string.
|
||||||
while (ptr[index] && curIdx < stop) {
|
while (ptr[index] && curIdx < stop) {
|
||||||
switch (decode(&state, &codepoint, ptr[index])) {
|
switch (decode(&state, &codepoint, ptr[index])) {
|
||||||
case 1:
|
case UTF8_REJECT:
|
||||||
errorInvalidUTF8Byte(ptr[index], "STRSLICE");
|
errorInvalidUTF8Byte(ptr[index], "STRSLICE");
|
||||||
state = 0;
|
state = UTF8_ACCEPT;
|
||||||
// fallthrough
|
// fallthrough
|
||||||
case 0:
|
case UTF8_ACCEPT:
|
||||||
curIdx++;
|
curIdx++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -2785,7 +2785,7 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check for partial code point.
|
// Check for partial code point.
|
||||||
if (state != 0) {
|
if (state != UTF8_ACCEPT) {
|
||||||
error("STRSLICE: Incomplete UTF-8 character");
|
error("STRSLICE: Incomplete UTF-8 character");
|
||||||
curIdx++;
|
curIdx++;
|
||||||
}
|
}
|
||||||
@@ -2804,18 +2804,18 @@ static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t
|
|||||||
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) {
|
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) {
|
||||||
char const *ptr = str.c_str();
|
char const *ptr = str.c_str();
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
uint32_t state = 0;
|
uint32_t state = UTF8_ACCEPT;
|
||||||
uint32_t codepoint = 0;
|
uint32_t codepoint = 0;
|
||||||
uint32_t curPos = 1;
|
uint32_t curPos = 1;
|
||||||
|
|
||||||
// Advance to starting position in source string.
|
// Advance to starting position in source string.
|
||||||
while (ptr[index] && curPos < pos) {
|
while (ptr[index] && curPos < pos) {
|
||||||
switch (decode(&state, &codepoint, ptr[index])) {
|
switch (decode(&state, &codepoint, ptr[index])) {
|
||||||
case 1:
|
case UTF8_REJECT:
|
||||||
errorInvalidUTF8Byte(ptr[index], "STRSUB");
|
errorInvalidUTF8Byte(ptr[index], "STRSUB");
|
||||||
state = 0;
|
state = UTF8_ACCEPT;
|
||||||
// fallthrough
|
// fallthrough
|
||||||
case 0:
|
case UTF8_ACCEPT:
|
||||||
curPos++;
|
curPos++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -2836,11 +2836,11 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
|
|||||||
// Compute the result length in bytes.
|
// Compute the result length in bytes.
|
||||||
while (ptr[index] && curLen < len) {
|
while (ptr[index] && curLen < len) {
|
||||||
switch (decode(&state, &codepoint, ptr[index])) {
|
switch (decode(&state, &codepoint, ptr[index])) {
|
||||||
case 1:
|
case UTF8_REJECT:
|
||||||
errorInvalidUTF8Byte(ptr[index], "STRSUB");
|
errorInvalidUTF8Byte(ptr[index], "STRSUB");
|
||||||
state = 0;
|
state = UTF8_ACCEPT;
|
||||||
// fallthrough
|
// fallthrough
|
||||||
case 0:
|
case UTF8_ACCEPT:
|
||||||
curLen++;
|
curLen++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -2848,7 +2848,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check for partial code point.
|
// Check for partial code point.
|
||||||
if (state != 0) {
|
if (state != UTF8_ACCEPT) {
|
||||||
error("STRSUB: Incomplete UTF-8 character");
|
error("STRSUB: Incomplete UTF-8 character");
|
||||||
curLen++;
|
curLen++;
|
||||||
}
|
}
|
||||||
|
|||||||
56
src/extern/utf8decoder.cpp
vendored
56
src/extern/utf8decoder.cpp
vendored
@@ -6,37 +6,43 @@
|
|||||||
|
|
||||||
#include "extern/utf8decoder.hpp"
|
#include "extern/utf8decoder.hpp"
|
||||||
|
|
||||||
|
// clang-format off: vertically align values
|
||||||
static uint8_t const utf8d[] = {
|
static uint8_t const utf8d[] = {
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
|
// The first part of the table maps bytes to character classes
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
|
// to reduce the size of the transition table and create bitmasks.
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
|
||||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
|
||||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
|
||||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
|
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
|
||||||
8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
|
||||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
|
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
|
||||||
|
8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
|
||||||
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
|
||||||
10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
|
10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
|
||||||
11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff
|
11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff
|
||||||
0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1, // s0
|
// The second part is a transition table that maps a combination
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s1
|
// of a state of the automaton and a character class to a state.
|
||||||
1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s2
|
0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // s0
|
||||||
1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, // s3
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s1
|
||||||
1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s4
|
12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // s2
|
||||||
1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, // s5
|
12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // s3
|
||||||
1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s6
|
12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // s4
|
||||||
1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s7
|
12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // s5
|
||||||
1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s8
|
12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s6
|
||||||
|
12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // s7
|
||||||
|
12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // s8
|
||||||
};
|
};
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte) {
|
uint32_t decode(uint32_t *state, uint32_t *codep, uint8_t byte) {
|
||||||
uint8_t type = utf8d[byte];
|
uint8_t type = utf8d[byte];
|
||||||
*codep = *state != 0 ? (byte & 0b111111) | (*codep << 6) : byte & (0xFF >> type);
|
*codep = *state != UTF8_ACCEPT ? (byte & 0b111111) | (*codep << 6) : (0xff >> type) & byte;
|
||||||
*state = utf8d[0x100 + *state * 0x10 + type];
|
*state = utf8d[0x100 + *state + type];
|
||||||
return *state;
|
return *state;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -272,11 +272,11 @@ static void writeSymName(std::string const &name, FILE *file) {
|
|||||||
} else {
|
} else {
|
||||||
// Output illegal characters using Unicode escapes ('\u' or '\U')
|
// Output illegal characters using Unicode escapes ('\u' or '\U')
|
||||||
// Decode the UTF-8 codepoint; or at least attempt to
|
// Decode the UTF-8 codepoint; or at least attempt to
|
||||||
uint32_t state = 0, codepoint;
|
uint32_t state = UTF8_ACCEPT, codepoint;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
decode(&state, &codepoint, *ptr);
|
decode(&state, &codepoint, *ptr);
|
||||||
if (state == 1) {
|
if (state == UTF8_REJECT) {
|
||||||
// This sequence was invalid; emit a U+FFFD, and recover
|
// This sequence was invalid; emit a U+FFFD, and recover
|
||||||
codepoint = 0xFFFD;
|
codepoint = 0xFFFD;
|
||||||
// Skip continuation bytes
|
// Skip continuation bytes
|
||||||
@@ -287,7 +287,7 @@ static void writeSymName(std::string const &name, FILE *file) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
++ptr;
|
++ptr;
|
||||||
} while (state != 0);
|
} while (state != UTF8_ACCEPT);
|
||||||
|
|
||||||
fprintf(file, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32, codepoint);
|
fprintf(file, codepoint <= 0xFFFF ? "\\u%04" PRIx32 : "\\U%08" PRIx32, codepoint);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user