From 975f85260da45ee2e12b55204bfd7434e15960ac Mon Sep 17 00:00:00 2001
From: dbrotz <43593771+dbrotz@users.noreply.github.com>
Date: Sun, 2 Jun 2019 16:10:34 -0700
Subject: [PATCH] Use code points instead of bytes for STRSUB/STRLEN

---
Notes (commentary only; nothing in the diff below is altered):

STRLEN and STRSUB now count UTF-8 code points rather than bytes.

* strlenUTF8(s) feeds each byte of s to decode() and increments the count
  whenever decode() returns 0. A return of 1, or a non-zero decoder state
  once the terminating NUL is reached (a partial code point), is reported
  through fatalerror().
* strsubUTF8(dest, src, pos, len) treats pos as 1-based. A pos below 1 is
  warned about and replaced by 1. The first loop skips bytes until pos - 1
  code points have been decoded; if src is exhausted once that loop stops,
  the "past the end of the string" warning is issued (this includes a pos
  exactly one past the last code point, as the recorded test output for
  "ABC", 4, 0 shows). The second loop copies bytes to dest until len code
  points are complete, src ends, or MAXSTRLEN bytes have been written.
  Copying fewer than len code points issues the "Length too big" warning;
  an invalid or partial sequence goes to fatalerror(). On normal return
  dest is NUL-terminated.
* Both STRLEN grammar actions (relocconst and const) call strlenUTF8()
  instead of strlen(). The STRSUB action drops the snprintf()-based byte
  slicing, together with its "String too long" warning, and calls
  strsubUTF8().
* The new tests print lengths and substrings for ASCII and multi-byte
  input. Per strsub.out, the last test string comes back as "g" for
  length 1 and in full for length 2, so counting is per code point
  (presumably a base letter plus a combining mark, i.e. not per grapheme).

decode() comes from extern/utf8decoder.h, which is not part of this patch;
the meaning of its return values described above is read off the case
labels and error messages here and should be confirmed against that header.

 src/asm/asmy.y      | 94 ++++++++++++++++++++++++++++++++++++++++-----
 test/asm/strlen.asm |  9 +++++
 test/asm/strlen.out |  2 +
 test/asm/strsub.asm | 22 +++++++++++
 test/asm/strsub.out | 31 +++++++++++++++
 5 files changed, 148 insertions(+), 10 deletions(-)
 create mode 100644 test/asm/strlen.asm
 create mode 100644 test/asm/strlen.out
 create mode 100644 test/asm/strsub.asm
 create mode 100644 test/asm/strsub.out

diff --git a/src/asm/asmy.y b/src/asm/asmy.y
index d212673f..b03aeac9 100644
--- a/src/asm/asmy.y
+++ b/src/asm/asmy.y
@@ -26,6 +26,8 @@
 #include "asm/rpn.h"
 #include "asm/symbol.h"
 
+#include "extern/utf8decoder.h"
+
 #include "common.h"
 #include "linkdefs.h"
 
@@ -431,6 +433,85 @@ static void updateUnion(void)
 	pPCSymbol->nValue = unionStart[unionIndex];
 }
 
+static size_t strlenUTF8(const char *s)
+{
+	size_t len = 0;
+	uint32_t state = 0;
+	uint32_t codep = 0;
+
+	while (*s) {
+		switch (decode(&state, &codep, (uint8_t)*s)) {
+		case 1:
+			fatalerror("STRLEN: Invalid UTF-8 character");
+			break;
+		case 0:
+			len++;
+			break;
+		}
+		s++;
+	}
+
+	/* Check for partial code point. */
+	if (state != 0)
+		fatalerror("STRLEN: Invalid UTF-8 character");
+
+	return len;
+}
+
+static void strsubUTF8(char *dest, const char *src, uint32_t pos, uint32_t len)
+{
+	size_t srcIndex = 0;
+	size_t destIndex = 0;
+	uint32_t state = 0;
+	uint32_t codep = 0;
+	uint32_t curPos = 1;
+	uint32_t curLen = 0;
+
+	if (pos < 1) {
+		warning("STRSUB: Position starts at 1");
+		pos = 1;
+	}
+
+	/* Advance to starting position in source string. */
+	while (src[srcIndex] && curPos < pos) {
+		switch (decode(&state, &codep, (uint8_t)src[srcIndex])) {
+		case 1:
+			fatalerror("STRSUB: Invalid UTF-8 character");
+			break;
+		case 0:
+			curPos++;
+			break;
+		}
+		srcIndex++;
+	}
+
+	if (!src[srcIndex])
+		warning("STRSUB: Position %lu is past the end of the string",
+			(unsigned long)pos);
+
+	/* Copy from source to destination. */
+	while (src[srcIndex] && destIndex < MAXSTRLEN && curLen < len) {
+		switch (decode(&state, &codep, (uint8_t)src[srcIndex])) {
+		case 1:
+			fatalerror("STRSUB: Invalid UTF-8 character");
+			break;
+		case 0:
+			curLen++;
+			break;
+		}
+		dest[destIndex++] = src[srcIndex++];
+	}
+
+	if (curLen < len)
+		warning("STRSUB: Length too big: %lu", (unsigned long)len);
+
+	/* Check for partial code point. */
+	if (state != 0)
+		fatalerror("STRSUB: Invalid UTF-8 character");
+
+	dest[destIndex] = 0;
+}
+
 %}
 
 %union
@@ -1249,7 +1330,7 @@ relocconst : T_ID
 			else
 				rpn_Number(&$$, 0);
 		}
-		| T_OP_STRLEN '(' string ')'	{ rpn_Number(&$$, strlen($3)); }
+		| T_OP_STRLEN '(' string ')'	{ rpn_Number(&$$, strlenUTF8($3)); }
 		| '(' relocconst ')'	{ $$ = $2; }
 ;
 
@@ -1327,7 +1408,7 @@ const : T_ID { constexpr_Symbol(&$$, $1); }
 			else
 				constexpr_Number(&$$, 0);
 		}
-		| T_OP_STRLEN '(' string ')'	{ constexpr_Number(&$$, strlen($3)); }
+		| T_OP_STRLEN '(' string ')'	{ constexpr_Number(&$$, strlenUTF8($3)); }
 		| '(' const ')'	{ $$ = $2; }
 ;
 
@@ -1338,14 +1419,7 @@ string : T_STRING
 		}
 		| T_OP_STRSUB '(' string comma uconst comma uconst ')'
 		{
-			uint32_t len = $7;
-			if (len > MAXSTRLEN) {
-				warning("STRSUB: Length too big: %u", len);
-				len = MAXSTRLEN;
-			}
-
-			if (snprintf($$, len + 1, "%s", $3 + $5 - 1) > MAXSTRLEN)
-				warning("STRSUB: String too long '%s'", $$);
+			strsubUTF8($$, $3, $5, $7);
 		}
 		| T_OP_STRCAT '(' string comma string ')'
 		{
diff --git a/test/asm/strlen.asm b/test/asm/strlen.asm
new file mode 100644
index 00000000..37d9241d
--- /dev/null
+++ b/test/asm/strlen.asm
@@ -0,0 +1,9 @@
+SECTION "sec", ROM0
+
+xstrlen: MACRO
+	PRINTV STRLEN(\1)
+	PRINTT "\n"
+ENDM
+
+	xstrlen "ABC"
+	xstrlen "カタカナ"
diff --git a/test/asm/strlen.out b/test/asm/strlen.out
new file mode 100644
index 00000000..18620f97
--- /dev/null
+++ b/test/asm/strlen.out
@@ -0,0 +1,2 @@
+$3
+$4
diff --git a/test/asm/strsub.asm b/test/asm/strsub.asm
new file mode 100644
index 00000000..23fb8bf5
--- /dev/null
+++ b/test/asm/strsub.asm
@@ -0,0 +1,22 @@
+SECTION "sec", ROM0
+
+xstrsub: MACRO
+	PRINTT STRSUB(\1, \2, \3)
+	PRINTT "\n"
+ENDM
+
+	xstrsub "ABC", 1, 1
+	xstrsub "ABC", 2, 1
+	xstrsub "ABC", 3, 1
+	xstrsub "ABC", 1, 2
+	xstrsub "ABC", 2, 2
+	xstrsub "ABC", 2, 32
+	xstrsub "ABC", 2, 300
+	xstrsub "ABC", 0, 300
+	xstrsub "ABC", 4, 0
+	xstrsub "ABC", 4, 1
+	xstrsub "カタカナ", 1, 2
+	xstrsub "カタカナ", 3, 2
+	xstrsub "カタカナ", 3, 10
+	xstrsub "g̈", 1, 1
+	xstrsub "g̈", 1, 2
diff --git a/test/asm/strsub.out b/test/asm/strsub.out
new file mode 100644
index 00000000..79eea030
--- /dev/null
+++ b/test/asm/strsub.out
@@ -0,0 +1,31 @@
+warning: strsub.asm(13) -> xstrsub(1):
+    STRSUB: Length too big: 32
+warning: strsub.asm(14) -> xstrsub(1):
+    STRSUB: Length too big: 300
+warning: strsub.asm(15) -> xstrsub(1):
+    STRSUB: Position starts at 1
+warning: strsub.asm(15) -> xstrsub(1):
+    STRSUB: Length too big: 300
+warning: strsub.asm(16) -> xstrsub(1):
+    STRSUB: Position 4 is past the end of the string
+warning: strsub.asm(17) -> xstrsub(1):
+    STRSUB: Position 4 is past the end of the string
+warning: strsub.asm(17) -> xstrsub(1):
+    STRSUB: Length too big: 1
+warning: strsub.asm(20) -> xstrsub(1):
+    STRSUB: Length too big: 10
+A
+B
+C
+AB
+BC
+BC
+BC
+ABC
+
+
+カタ
+カナ
+カナ
+g
+g̈