Refactor readUTF8Char into charmap_ConvertNext

2026-03-04 12:03:04 +00:00 · 2025-01-28 00:04:49 -05:00
parent 34a9c8e083
commit e49291b7cf
4 changed files with 23 additions and 38 deletions
--- a/include/util.hpp
+++ b/include/util.hpp
@@ -3,13 +3,6 @@
 #ifndef RGBDS_UTIL_HPP
 #define RGBDS_UTIL_HPP
 #include <stddef.h>
 #include <stdint.h>
 #include <vector>
 char const *printChar(int c);
 // @return The number of bytes read, or 0 if invalid data was found
 size_t readUTF8Char(std::vector<int32_t> *dest, char const *src);
 #endif // RGBDS_UTIL_HPP
--- a/src/asm/charmap.cpp
+++ b/src/asm/charmap.cpp
@@ -11,6 +11,7 @@
 #include <unordered_map>
 #include <utility>
 #include "extern/utf8decoder.hpp"
 #include "helpers.hpp"
 #include "util.hpp"
@@ -224,16 +225,30 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
 		matchLen = value.size();
 	} else if (inputIdx < input.length()) { // No match found, but there is some input left
-		int firstChar = input[inputIdx];
+		size_t codepointLen = 0;
 		// This will write the codepoint's value to `output`, little-endian
-		size_t codepointLen = readUTF8Char(output, input.data() + inputIdx);
+		for (uint32_t state = 0, codepoint = 0;;) {
 			if (decode(&state, &codepoint, input[inputIdx + codepointLen]) == 1) {
 				codepointLen = 0;
 				break;
 			}
 			if (output) {
 				output->push_back(input[inputIdx + codepointLen]);
 			}
 			codepointLen++;
 			if (state == 0) {
 				break;
 			}
 		}
 		if (codepointLen == 0) {
 			error("Input string is not valid UTF-8\n");
 		}
 		// Warn if this character is not mapped but any others are
-		if (charmap.nodes.size() > 1) {
+		if (int firstChar = input[inputIdx]; charmap.nodes.size() > 1) {
 			warning(WARNING_UNMAPPED_CHAR_1, "Unmapped character %s\n", printChar(firstChar));
 		} else if (charmap.name != DEFAULT_CHARMAP_NAME) {
 			warning(
--- a/src/asm/parser.y
+++ b/src/asm/parser.y
@@ -2527,10 +2527,10 @@ static size_t strlenUTF8(std::string const &str) {
 	size_t len = 0;
 	uint32_t state = 0;
-	for (uint32_t codep = 0; *ptr; ptr++) {
+	for (uint32_t codepoint = 0; *ptr; ptr++) {
 		uint8_t byte = *ptr;
-		switch (decode(&state, &codep, byte)) {
+		switch (decode(&state, &codepoint, byte)) {
 		case 1:
 			errorInvalidUTF8Byte(byte, "STRLEN");
 			state = 0;
@@ -2553,12 +2553,12 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
 	char const *ptr = str.c_str();
 	size_t index = 0;
 	uint32_t state = 0;
-	uint32_t codep = 0;
+	uint32_t codepoint = 0;
 	uint32_t curPos = 1; // RGBASM strings are 1-indexed!
 	// Advance to starting position in source string.
 	while (ptr[index] && curPos < pos) {
-		switch (decode(&state, &codep, ptr[index])) {
+		switch (decode(&state, &codepoint, ptr[index])) {
 		case 1:
 			errorInvalidUTF8Byte(ptr[index], "STRSUB");
 			state = 0;
@@ -2583,7 +2583,7 @@ static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len
 	// Compute the result length in bytes.
 	while (ptr[index] && curLen < len) {
-		switch (decode(&state, &codep, ptr[index])) {
+		switch (decode(&state, &codepoint, ptr[index])) {
 		case 1:
 			errorInvalidUTF8Byte(ptr[index], "STRSUB");
 			state = 0;
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -5,9 +5,6 @@
 #include <ctype.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
 #include "extern/utf8decoder.hpp"
 char const *printChar(int c) {
 	// "'A'" + '\0': 4 bytes
@@ -53,23 +50,3 @@ char const *printChar(int c) {
 	buf[4] = '\0';
 	return buf;
 }
 // Reads one UTF-8 sequence starting at `src`, feeding it to `decode` one byte at a time.
 // Every byte that `decode` does not reject is appended to `dest` (when `dest` is non-null);
 // bytes appended before a rejected byte is encountered are NOT removed again.
 // @param dest Optional vector receiving the consumed bytes, one element per byte
 // @param src  Start of the sequence to read
 // @return The number of bytes read, or 0 if invalid data was found
 size_t readUTF8Char(std::vector<int32_t> *dest, char const *src) {
 	uint32_t state = 0;
 	uint32_t codepoint = 0;
 	size_t nbBytes = 0;
 
 	do {
 		char const byte = src[nbBytes];
 
 		// A result of 1 from the decoder is treated as "invalid data": give up immediately
 		if (decode(&state, &codepoint, byte) == 1) {
 			return 0;
 		}
 		if (dest != nullptr) {
 			dest->push_back(byte);
 		}
 		++nbBytes;
 	} while (state != 0); // Keep going until the decoder is back in its starting state
 
 	return nbBytes;
 }