Implement CHARLEN and CHARSUB

Fixes #786
2026-03-03 11:33:04 +00:00 · 2021-03-08 15:11:12 -05:00
parent d43408f4f3
commit 2005ed1df9
10 changed files with 120 additions and 32 deletions
--- a/include/asm/charmap.h
+++ b/include/asm/charmap.h
@@ -18,5 +18,6 @@ void charmap_Push(void);
 void charmap_Pop(void);
 void charmap_Add(char *mapping, uint8_t value);
 size_t charmap_Convert(char const *input, uint8_t *output);
+size_t charmap_ConvertNext(char const **input, uint8_t *output);

 #endif /* RGBDS_ASM_CHARMAP_H */
--- a/src/asm/charmap.c
+++ b/src/asm/charmap.c
@@ -57,7 +57,7 @@ struct CharmapStackEntry {

 struct CharmapStackEntry *charmapStack;

-static struct Charmap *charmap_Get(const char *name)
+static struct Charmap *charmap_Get(char const *name) /* Look `name` up in `charmaps` */
 {
 	return hash_GetElement(charmaps, name);
 }
@@ -192,6 +192,19 @@ void charmap_Add(char *mapping, uint8_t value)
 }

 size_t charmap_Convert(char const *input, uint8_t *output)
+{ /* Converts `input` piece by piece with charmap_ConvertNext; returns the summed lengths */
+	size_t outputLen = 0;
+
+	for (size_t charLen = charmap_ConvertNext(&input, output); charLen;
+	     charLen = charmap_ConvertNext(&input, output)) { /* A length of 0 ends the loop */
+		output += charLen; /* The next piece goes right after this one */
+		outputLen += charLen;
+	}
+
+	return outputLen;
+}
+
+size_t charmap_ConvertNext(char const **input, uint8_t *output)
 {
 	/*
 	 * The goal is to match the longest mapping possible.
@@ -199,7 +212,6 @@ size_t charmap_Convert(char const *input, uint8_t *output)
 	 * If that would lead to a dead end, rewind characters until the last match, and output.
 	 * If no match, read a UTF-8 codepoint and output that.
 	 */
-	size_t outputLen = 0;
 	struct Charmap const *charmap = *currentCharmap;
 	struct Charnode const *node = &charmap->nodes[0];
 	struct Charnode const *match = NULL;
@@ -207,10 +219,10 @@ size_t charmap_Convert(char const *input, uint8_t *output)

 	for (;;) {
 		/* We still want NULs to reach the `else` path, to give a chance to rewind */
-		uint8_t c = *input - 1;
+		uint8_t c = **input - 1;

-		if (*input && node->next[c]) {
-			input++; /* Consume that char */
+		if (**input && node->next[c]) {
+			(*input)++; /* Consume that char */
 			rewindDistance++;

 			node = &charmap->nodes[node->next[c]];
@@ -220,31 +232,32 @@ size_t charmap_Convert(char const *input, uint8_t *output)
 			}

 		} else {
-			input -= rewindDistance; /* Rewind */
+			*input -= rewindDistance; /* Rewind */
 			rewindDistance = 0;
 			node = &charmap->nodes[0];

 			if (match) { /* Arrived at a dead end with a match found */
-				*output++ = match->value;
-				outputLen++;
-				match = NULL; /* Reset match for next round */
+				if (output)
+					*output = match->value;

-			} else if (*input) { /* No match found */
-				size_t codepointLen = readUTF8Char(output, input);
+				return 1;

-				if (codepointLen == 0) {
+			} else if (**input) { /* No match found */
+				size_t codepointLen = readUTF8Char(output, *input);
+
+				if (codepointLen == 0)
 					error("Input string is not valid UTF-8!\n");
-					break;
-				}
-				input += codepointLen; /* OK because UTF-8 has no NUL in multi-byte chars */
-				output += codepointLen;
-				outputLen += codepointLen;
-			}

-			if (!*input)
-				break;
+				/* OK because UTF-8 has no NUL in multi-byte chars */
+				*input += codepointLen;
+
+				return codepointLen;
+
+			} else { /* End of input */
+				return 0;
+			}
 		}
 	}

-	return outputLen;
+	unreachable_();
 }
--- a/src/asm/lexer.c
+++ b/src/asm/lexer.c
@@ -210,6 +210,9 @@ static struct KeywordMapping {
 	{"STRRPL", T_OP_STRRPL},
 	{"STRFMT", T_OP_STRFMT},

+	{"CHARLEN", T_OP_CHARLEN},
+	{"CHARSUB", T_OP_CHARSUB},
+
 	{"INCLUDE", T_POP_INCLUDE},
 	{"PRINT", T_POP_PRINT},
 	{"PRINTLN", T_POP_PRINTLN},
@@ -589,7 +592,7 @@ struct KeywordDictNode {
 	uint16_t children[0x60 - ' '];
 	struct KeywordMapping const *keyword;
 /* Since the keyword structure is invariant, the min number of nodes is known at compile time */
-} keywordDict[351] = {0}; /* Make sure to keep this correct when adding keywords! */
+} keywordDict[357] = {0}; /* Make sure to keep this correct when adding keywords! */

 /* Convert a char into its index into the dict */
 static uint8_t dictIndex(char c)
--- a/src/asm/parser.y
+++ b/src/asm/parser.y
@@ -82,13 +82,12 @@ static char *strrstr(char *s1, char *s2)
 	return NULL;
 }

-static size_t strlenUTF8(const char *s)
+static size_t strlenUTF8(char const *s)
 {
 	size_t len = 0;
 	uint32_t state = 0;
-	uint32_t codep = 0;

-	while (*s) {
+	for (uint32_t codep = 0; *s; s++) {
 		switch (decode(&state, &codep, *s)) {
 		case 1:
 			fatalerror("STRLEN: Invalid UTF-8 character\n");
@@ -97,7 +96,6 @@ static size_t strlenUTF8(const char *s)
 			len++;
 			break;
 		}
-		s++;
 	}

 	/* Check for partial code point. */
@@ -107,13 +105,12 @@ static size_t strlenUTF8(const char *s)
 	return len;
 }

-static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos, uint32_t len)
+static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos, uint32_t len)
 {
 	size_t srcIndex = 0;
 	size_t destIndex = 0;
 	uint32_t state = 0;
 	uint32_t codep = 0;
-	uint32_t curPos = 1;
 	uint32_t curLen = 0;

 	if (pos < 1) {
@@ -122,7 +119,7 @@ static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos
 	}

 	/* Advance to starting position in source string. */
-	while (src[srcIndex] && curPos < pos) {
+	for (uint32_t curPos = 1; src[srcIndex] && curPos < pos; srcIndex++) {
 		switch (decode(&state, &codep, src[srcIndex])) {
 		case 1:
 			fatalerror("STRSUB: Invalid UTF-8 character\n");
@@ -131,7 +128,6 @@ static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos
 			curPos++;
 			break;
 		}
-		srcIndex++;
 	}

 	if (!src[srcIndex] && len)
@@ -162,6 +158,42 @@ static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos
 	dest[destIndex] = '\0';
 }

+static size_t charlenUTF8(char const *s)
+{ /* Counts how many times charmap_ConvertNext succeeds on `s` before returning 0 */
+	size_t len;
+
+	for (len = 0; charmap_ConvertNext(&s, NULL); len++) /* NULL output: count only */
+		; /* All the work happens in the loop header */
+
+	return len;
+}
+
+static void charsubUTF8(char *dest, char const *src, uint32_t pos)
+{ /* Copies the source bytes of the `pos`-th (1-based) charmap_ConvertNext unit into `dest` */
+	size_t charLen = 1; /* Non-zero so that the skip loop below can start */
+
+	if (pos < 1) {
+		warning(WARNING_BUILTIN_ARG, "CHARSUB: Position starts at 1\n");
+		pos = 1; /* Fall back to the first unit */
+	}
+
+	/* Skip the `pos - 1` units that precede the requested one. */
+	for (uint32_t curPos = 1; charLen && curPos < pos; curPos++)
+		charLen = charmap_ConvertNext(&src, NULL); /* 0 stops the skipping early */
+
+	char const *start = src; /* First byte of the requested unit */
+
+	if (!charmap_ConvertNext(&src, NULL)) /* 0: nothing could be converted at `pos` */
+		warning(WARNING_BUILTIN_ARG,
+			"CHARSUB: Position %lu is past the end of the string\n",
+			(unsigned long)pos);
+
+	/* Copy [start, src); no size is passed, so `dest` is assumed big enough (TODO confirm). */
+	memcpy(dest, start, src - start);
+
+	dest[src - start] = '\0';
+}
+
 static void strrpl(char *dest, size_t destLen, char const *src, char const *old, char const *new)
 {
 	size_t oldLen = strlen(old);
@@ -503,6 +535,9 @@ enum {
 %token	T_OP_STRRPL "STRRPL"
 %token	T_OP_STRFMT "STRFMT"

+%token	T_OP_CHARLEN "CHARLEN"
+%token	T_OP_CHARSUB "CHARSUB"
+
 %token	<tzSym> T_LABEL "label"
 %token	<tzSym> T_ID "identifier"
 %token	<tzSym> T_LOCAL_ID "local identifier"
@@ -1451,6 +1486,9 @@ relocexpr_no_str : scoped_anon_id	{ rpn_Symbol(&$$, $1); }
 		| T_OP_STRLEN T_LPAREN string T_RPAREN {
 			rpn_Number(&$$, strlenUTF8($3));
 		}
+		| T_OP_CHARLEN T_LPAREN string T_RPAREN {
+			rpn_Number(&$$, charlenUTF8($3));
+		}
 		| T_LPAREN relocexpr T_RPAREN	{ $$ = $2; }
 ;

@@ -1488,6 +1526,9 @@ string		: T_STRING
 		| T_OP_STRSUB T_LPAREN string T_COMMA uconst T_COMMA uconst T_RPAREN {
 			strsubUTF8($$, sizeof($$), $3, $5, $7);
 		}
+		| T_OP_CHARSUB T_LPAREN string T_COMMA uconst T_RPAREN {
+			charsubUTF8($$, $3, $5);
+		}
 		| T_OP_STRCAT T_LPAREN T_RPAREN {
 			$$[0] = '\0';
 		}
--- a/src/asm/rgbasm.5
+++ b/src/asm/rgbasm.5
@@ -394,11 +394,13 @@ Most of them return a string, however some of these functions actually return an
 .It Fn STRCMP str1 str2 Ta Returns -1 if Ar str1 No is alphabetically lower than Ar str2 No , zero if they match, 1 if Ar str1 No is greater than Ar str2 .
 .It Fn STRIN str1 str2 Ta Returns the first position of Ar str2 No in Ar str1 No or zero if it's not present Pq first character is position 1 .
 .It Fn STRRIN str1 str2 Ta Returns the last position of Ar str2 No in Ar str1 No or zero if it's not present Pq first character is position 1 .
-.It Fn STRSUB str pos len Ta Returns a substring from Ar str No starting at Ar pos Po first character is position 1 Pc and Ar len No characters long.
+.It Fn STRSUB str pos len Ta Returns a substring from Ar str No starting at Ar pos No (first character is position 1) and Ar len No characters long.
 .It Fn STRUPR str Ta Returns Ar str No with all letters in uppercase.
 .It Fn STRLWR str Ta Returns Ar str No with all letters in lowercase.
 .It Fn STRRPL str old new Ta Returns Ar str No with each non-overlapping occurrence of the substring Ar old No replaced with Ar new .
+.It Fn CHARLEN str Ta Returns the number of charmap entries in Ar str No with the current charmap.
+.It Fn CHARSUB str pos Ta Returns the substring for the charmap entry at Ar pos No in Ar str No (first character is position 1) with the current charmap.
 .It Fn STRFMT fmt args... Ta Returns the string Ar fmt No with each
 .Ql %spec
 pattern replaced by interpolating the format
 .Ar spec
--- a/src/asm/util.c
+++ b/src/asm/util.c
@@ -67,6 +67,7 @@ size_t readUTF8Char(uint8_t *dest, char const *src)
 		if (decode(&state, &codep, src[i]) == 1)
 			return 0;

+		if (dest)
 			dest[i] = src[i];
 		i++;

--- a/test/asm/charlen-charsub.asm
+++ b/test/asm/charlen-charsub.asm
@@ -0,0 +1,25 @@
+	charmap "<NULL>", $00
+	charmap "A", $10
+	charmap "B", $20
+	charmap "C", $30
+	charmap "Bold", $88 ; multi-character mapping that starts with the same letter as "B"
+
+SECTION "test", ROM0
+
+S EQUS "XBold<NULL>ABC"
+
+	assert CHARLEN("{S}") == 6 ; X, Bold, <NULL>, A, B, C
+	println CHARSUB("{S}", 2)
+	assert !STRCMP(CHARSUB("{S}", 2), "Bold")
+	assert CHARSUB("{S}", 2) == "Bold" && "Bold" == $88
+	assert CHARSUB("{S}", 1) == $58 ; ASCII "X"
+	db "{S}"
+
+	newcharmap ascii
+
+	assert CHARLEN("{S}") == 14 ; one entry per byte of S: 1 + 4 + 6 + 3
+	println CHARSUB("{S}", 2)
+	assert !STRCMP(CHARSUB("{S}", 2), "B")
+	assert CHARSUB("{S}", 2) == "B" && "B" == $42 ; ASCII "B"
+	assert CHARSUB("{S}", 1) == $58 ; ASCII "X"
+	db "{S}"
--- a/test/asm/charlen-charsub.err
+++ b/test/asm/charlen-charsub.err
--- a/test/asm/charlen-charsub.out
+++ b/test/asm/charlen-charsub.out
@@ -0,0 +1,2 @@
+Bold
+B
--- a/test/asm/charlen-charsub.out.bin
+++ b/test/asm/charlen-charsub.out.bin