From 2005ed1df9a6abfbb693889129af15f78569282e Mon Sep 17 00:00:00 2001 From: Rangi Date: Mon, 8 Mar 2021 15:11:12 -0500 Subject: [PATCH] Implement CHARLEN and CHARSUB Fixes #786 --- include/asm/charmap.h | 1 + src/asm/charmap.c | 55 +++++++++++++++++------------ src/asm/lexer.c | 5 ++- src/asm/parser.y | 57 ++++++++++++++++++++++++++----- src/asm/rgbasm.5 | 4 ++- src/asm/util.c | 3 +- test/asm/charlen-charsub.asm | 25 ++++++++++++++ test/asm/charlen-charsub.err | 0 test/asm/charlen-charsub.out | 2 ++ test/asm/charlen-charsub.out.bin | Bin 0 -> 20 bytes 10 files changed, 120 insertions(+), 32 deletions(-) create mode 100644 test/asm/charlen-charsub.asm create mode 100644 test/asm/charlen-charsub.err create mode 100644 test/asm/charlen-charsub.out create mode 100644 test/asm/charlen-charsub.out.bin diff --git a/include/asm/charmap.h b/include/asm/charmap.h index 4f8c2ac9..b6b14b91 100644 --- a/include/asm/charmap.h +++ b/include/asm/charmap.h @@ -18,5 +18,6 @@ void charmap_Push(void); void charmap_Pop(void); void charmap_Add(char *mapping, uint8_t value); size_t charmap_Convert(char const *input, uint8_t *output); +size_t charmap_ConvertNext(char const **input, uint8_t *output); #endif /* RGBDS_ASM_CHARMAP_H */ diff --git a/src/asm/charmap.c b/src/asm/charmap.c index ab474c65..12cc3c40 100644 --- a/src/asm/charmap.c +++ b/src/asm/charmap.c @@ -57,7 +57,7 @@ struct CharmapStackEntry { struct CharmapStackEntry *charmapStack; -static struct Charmap *charmap_Get(const char *name) +static struct Charmap *charmap_Get(char const *name) { return hash_GetElement(charmaps, name); } @@ -192,6 +192,19 @@ void charmap_Add(char *mapping, uint8_t value) } size_t charmap_Convert(char const *input, uint8_t *output) +{ + size_t outputLen = 0; + + for (size_t charLen = charmap_ConvertNext(&input, output); charLen; + charLen = charmap_ConvertNext(&input, output)) { + output += charLen; + outputLen += charLen; + } + + return outputLen; +} + +size_t charmap_ConvertNext(char const **input, 
uint8_t *output) { /* * The goal is to match the longest mapping possible. @@ -199,7 +212,6 @@ size_t charmap_Convert(char const *input, uint8_t *output) * If that would lead to a dead end, rewind characters until the last match, and output. * If no match, read a UTF-8 codepoint and output that. */ - size_t outputLen = 0; struct Charmap const *charmap = *currentCharmap; struct Charnode const *node = &charmap->nodes[0]; struct Charnode const *match = NULL; @@ -207,10 +219,10 @@ size_t charmap_Convert(char const *input, uint8_t *output) for (;;) { /* We still want NULs to reach the `else` path, to give a chance to rewind */ - uint8_t c = *input - 1; + uint8_t c = **input - 1; - if (*input && node->next[c]) { - input++; /* Consume that char */ + if (**input && node->next[c]) { + (*input)++; /* Consume that char */ rewindDistance++; node = &charmap->nodes[node->next[c]]; @@ -220,31 +232,32 @@ size_t charmap_Convert(char const *input, uint8_t *output) } } else { - input -= rewindDistance; /* Rewind */ + *input -= rewindDistance; /* Rewind */ rewindDistance = 0; node = &charmap->nodes[0]; if (match) { /* Arrived at a dead end with a match found */ - *output++ = match->value; - outputLen++; - match = NULL; /* Reset match for next round */ + if (output) + *output = match->value; - } else if (*input) { /* No match found */ - size_t codepointLen = readUTF8Char(output, input); + return 1; - if (codepointLen == 0) { + } else if (**input) { /* No match found */ + size_t codepointLen = readUTF8Char(output, *input); + + if (codepointLen == 0) error("Input string is not valid UTF-8!\n"); - break; - } - input += codepointLen; /* OK because UTF-8 has no NUL in multi-byte chars */ - output += codepointLen; - outputLen += codepointLen; - } - if (!*input) - break; + /* OK because UTF-8 has no NUL in multi-byte chars */ + *input += codepointLen; + + return codepointLen; + + } else { /* End of input */ + return 0; + } } } - return outputLen; + unreachable_(); } diff --git 
a/src/asm/lexer.c b/src/asm/lexer.c index c066b0c6..27d59229 100644 --- a/src/asm/lexer.c +++ b/src/asm/lexer.c @@ -210,6 +210,9 @@ static struct KeywordMapping { {"STRRPL", T_OP_STRRPL}, {"STRFMT", T_OP_STRFMT}, + {"CHARLEN", T_OP_CHARLEN}, + {"CHARSUB", T_OP_CHARSUB}, + {"INCLUDE", T_POP_INCLUDE}, {"PRINT", T_POP_PRINT}, {"PRINTLN", T_POP_PRINTLN}, @@ -589,7 +592,7 @@ struct KeywordDictNode { uint16_t children[0x60 - ' ']; struct KeywordMapping const *keyword; /* Since the keyword structure is invariant, the min number of nodes is known at compile time */ -} keywordDict[351] = {0}; /* Make sure to keep this correct when adding keywords! */ +} keywordDict[357] = {0}; /* Make sure to keep this correct when adding keywords! */ /* Convert a char into its index into the dict */ static uint8_t dictIndex(char c) diff --git a/src/asm/parser.y b/src/asm/parser.y index 2198d7e9..1b156af6 100644 --- a/src/asm/parser.y +++ b/src/asm/parser.y @@ -82,13 +82,12 @@ static char *strrstr(char *s1, char *s2) return NULL; } -static size_t strlenUTF8(const char *s) +static size_t strlenUTF8(char const *s) { size_t len = 0; uint32_t state = 0; - uint32_t codep = 0; - while (*s) { + for (uint32_t codep = 0; *s; s++) { switch (decode(&state, &codep, *s)) { case 1: fatalerror("STRLEN: Invalid UTF-8 character\n"); @@ -97,7 +96,6 @@ static size_t strlenUTF8(const char *s) len++; break; } - s++; } /* Check for partial code point. */ @@ -107,13 +105,12 @@ static size_t strlenUTF8(const char *s) return len; } -static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos, uint32_t len) +static void strsubUTF8(char *dest, size_t destLen, char const *src, uint32_t pos, uint32_t len) { size_t srcIndex = 0; size_t destIndex = 0; uint32_t state = 0; uint32_t codep = 0; - uint32_t curPos = 1; uint32_t curLen = 0; if (pos < 1) { @@ -122,7 +119,7 @@ static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos } /* Advance to starting position in source string. 
*/ - while (src[srcIndex] && curPos < pos) { + for (uint32_t curPos = 1; src[srcIndex] && curPos < pos; srcIndex++) { switch (decode(&state, &codep, src[srcIndex])) { case 1: fatalerror("STRSUB: Invalid UTF-8 character\n"); @@ -131,7 +128,6 @@ static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos curPos++; break; } - srcIndex++; } if (!src[srcIndex] && len) @@ -162,6 +158,42 @@ static void strsubUTF8(char *dest, size_t destLen, const char *src, uint32_t pos dest[destIndex] = '\0'; } +static size_t charlenUTF8(char const *s) +{ + size_t len; + + for (len = 0; charmap_ConvertNext(&s, NULL); len++) + ; + + return len; +} + +static void charsubUTF8(char *dest, char const *src, uint32_t pos) +{ + size_t charLen = 1; + + if (pos < 1) { + warning(WARNING_BUILTIN_ARG, "CHARSUB: Position starts at 1\n"); + pos = 1; + } + + /* Advance to starting position in source string. */ + for (uint32_t curPos = 1; charLen && curPos < pos; curPos++) + charLen = charmap_ConvertNext(&src, NULL); + + char const *start = src; + + if (!charmap_ConvertNext(&src, NULL)) + warning(WARNING_BUILTIN_ARG, + "CHARSUB: Position %lu is past the end of the string\n", + (unsigned long)pos); + + /* Copy from source to destination. 
*/ + memcpy(dest, start, src - start); + + dest[src - start] = '\0'; +} + static void strrpl(char *dest, size_t destLen, char const *src, char const *old, char const *new) { size_t oldLen = strlen(old); @@ -503,6 +535,9 @@ enum { %token T_OP_STRRPL "STRRPL" %token T_OP_STRFMT "STRFMT" +%token T_OP_CHARLEN "CHARLEN" +%token T_OP_CHARSUB "CHARSUB" + %token T_LABEL "label" %token T_ID "identifier" %token T_LOCAL_ID "local identifier" @@ -1451,6 +1486,9 @@ relocexpr_no_str : scoped_anon_id { rpn_Symbol(&$$, $1); } | T_OP_STRLEN T_LPAREN string T_RPAREN { rpn_Number(&$$, strlenUTF8($3)); } + | T_OP_CHARLEN T_LPAREN string T_RPAREN { + rpn_Number(&$$, charlenUTF8($3)); + } | T_LPAREN relocexpr T_RPAREN { $$ = $2; } ; @@ -1488,6 +1526,9 @@ string : T_STRING | T_OP_STRSUB T_LPAREN string T_COMMA uconst T_COMMA uconst T_RPAREN { strsubUTF8($$, sizeof($$), $3, $5, $7); } + | T_OP_CHARSUB T_LPAREN string T_COMMA uconst T_RPAREN { + charsubUTF8($$, $3, $5); + } | T_OP_STRCAT T_LPAREN T_RPAREN { $$[0] = '\0'; } diff --git a/src/asm/rgbasm.5 b/src/asm/rgbasm.5 index 9fcee20d..178ce9ba 100644 --- a/src/asm/rgbasm.5 +++ b/src/asm/rgbasm.5 @@ -394,11 +394,13 @@ Most of them return a string, however some of these functions actually return an .It Fn STRCMP str1 str2 Ta Returns -1 if Ar str1 No is alphabetically lower than Ar str2 No , zero if they match, 1 if Ar str1 No is greater than Ar str2 . .It Fn STRIN str1 str2 Ta Returns the first position of Ar str2 No in Ar str1 No or zero if it's not present Pq first character is position 1 . .It Fn STRRIN str1 str2 Ta Returns the last position of Ar str2 No in Ar str1 No or zero if it's not present Pq first character is position 1 . -.It Fn STRSUB str pos len Ta Returns a substring from Ar str No starting at Ar pos Po first character is position 1 Pc and Ar len No characters long. +.It Fn STRSUB str pos len Ta Returns a substring from Ar str No starting at Ar pos No (first character is position 1) and Ar len No characters long. 
.It Fn STRUPR str Ta Returns Ar str No with all letters in uppercase. .It Fn STRLWR str Ta Returns Ar str No with all letters in lowercase. .It Fn STRRPL str old new Ta Returns Ar str No with each non-overlapping occurrence of the substring Ar old No replaced with Ar new . .It Fn STRFMT fmt args... Ta Returns the string Ar fmt No with each +.It Fn CHARLEN str Ta Returns the number of charmap entries in Ar str No with the current charmap. +.It Fn CHARSUB str pos Ta Returns the substring for the charmap entry at Ar pos No in Ar str No (first character is position 1) with the current charmap. .Ql %spec pattern replaced by interpolating the format .Ar spec diff --git a/src/asm/util.c b/src/asm/util.c index e368121d..dbbd89eb 100644 --- a/src/asm/util.c +++ b/src/asm/util.c @@ -67,7 +67,8 @@ size_t readUTF8Char(uint8_t *dest, char const *src) if (decode(&state, &codep, src[i]) == 1) return 0; - dest[i] = src[i]; + if (dest) + dest[i] = src[i]; i++; if (state == 0) diff --git a/test/asm/charlen-charsub.asm b/test/asm/charlen-charsub.asm new file mode 100644 index 00000000..06792966 --- /dev/null +++ b/test/asm/charlen-charsub.asm @@ -0,0 +1,25 @@ + charmap "<NULL>", $00 + charmap "A", $10 + charmap "B", $20 + charmap "C", $30 + charmap "Bold", $88 + +SECTION "test", ROM0 + +S EQUS "XBold<NULL>ABC" + + assert CHARLEN("{S}") == 6 + println CHARSUB("{S}", 2) + assert !STRCMP(CHARSUB("{S}", 2), "Bold") + assert CHARSUB("{S}", 2) == "Bold" && "Bold" == $88 + assert CHARSUB("{S}", 1) == $58 ; ASCII "X" + db "{S}" + + newcharmap ascii + + assert CHARLEN("{S}") == 14 + println CHARSUB("{S}", 2) + assert !STRCMP(CHARSUB("{S}", 2), "B") + assert CHARSUB("{S}", 2) == "B" && "B" == $42 ; ASCII "B" + assert CHARSUB("{S}", 1) == $58 ; ASCII "X" + db "{S}" diff --git a/test/asm/charlen-charsub.err b/test/asm/charlen-charsub.err new file mode 100644 index 00000000..e69de29b diff --git a/test/asm/charlen-charsub.out b/test/asm/charlen-charsub.out new file mode 100644 index 00000000..2f534ea5 --- 
/dev/null +++ b/test/asm/charlen-charsub.out @@ -0,0 +1,2 @@ +Bold +B diff --git a/test/asm/charlen-charsub.out.bin b/test/asm/charlen-charsub.out.bin new file mode 100644 index 0000000000000000000000000000000000000000..fe254d69faa4640ca2fed6e32c71069a86e15cc2 GIT binary patch literal 20 bcma#(U=UC+h;Yi!NwM(@_3^QDbaDm&IeP__ literal 0 HcmV?d00001