Use code points instead of bytes for STRSUB/STRLEN

This commit is contained in:
dbrotz
2019-06-02 16:10:34 -07:00
parent f29d768989
commit 975f85260d
5 changed files with 148 additions and 10 deletions

View File

@@ -26,6 +26,8 @@
#include "asm/rpn.h"
#include "asm/symbol.h"
#include "extern/utf8decoder.h"
#include "common.h"
#include "linkdefs.h"
@@ -431,6 +433,85 @@ static void updateUnion(void)
pPCSymbol->nValue = unionStart[unionIndex];
}
/*
 * Count the code points (not the bytes) in a NUL-terminated UTF-8 string.
 *
 * Each byte is fed to decode(): a result of 0 is counted as one finished
 * code point, a result of 1 is reported through fatalerror() as an invalid
 * sequence, and any other result leaves the count untouched.
 *
 * A string that ends while the decoder state is non-zero (i.e. in the
 * middle of a multi-byte sequence) is reported as invalid as well.
 *
 * Returns the number of code points counted.
 */
static size_t strlenUTF8(const char *s)
{
	uint32_t decoderState = 0;
	uint32_t codePoint = 0;
	size_t count = 0;
	const char *cursor;

	for (cursor = s; *cursor != '\0'; cursor++) {
		switch (decode(&decoderState, &codePoint, (uint8_t)*cursor)) {
		case 0:
			/* A whole code point has just been consumed. */
			count++;
			break;
		case 1:
			fatalerror("STRLEN: Invalid UTF-8 character");
			break;
		default:
			/* Still inside a multi-byte sequence; nothing to count yet. */
			break;
		}
	}

	/* The input must not stop part-way through a code point. */
	if (decoderState != 0)
		fatalerror("STRLEN: Invalid UTF-8 character");

	return count;
}
/*
 * Copy a substring of a UTF-8 string, with position and length measured in
 * code points rather than bytes.
 *
 * dest: output buffer; at most MAXSTRLEN bytes are stored, followed by a
 *       terminating NUL, so it must have room for MAXSTRLEN + 1 bytes
 * src:  NUL-terminated source string
 * pos:  1-based index of the first code point to copy; 0 triggers a
 *       warning and is treated as 1
 * len:  number of code points to copy
 *
 * Warnings are issued when the source is exhausted before any copying
 * starts and when fewer than `len` code points could be copied. An invalid
 * byte sequence, or a result that stops in the middle of a code point, is
 * reported through fatalerror().
 */
static void strsubUTF8(char *dest, const char *src, uint32_t pos, uint32_t len)
{
	uint32_t decoderState = 0;
	uint32_t codePoint = 0;
	size_t in = 0;
	size_t out = 0;
	uint32_t position;
	uint32_t copied;

	if (pos < 1) {
		warning("STRSUB: Position starts at 1");
		pos = 1;
	}

	/* Skip whole code points until the requested start is reached. */
	for (position = 1; src[in] && position < pos; in++) {
		switch (decode(&decoderState, &codePoint, (uint8_t)src[in])) {
		case 0:
			position++;
			break;
		case 1:
			fatalerror("STRSUB: Invalid UTF-8 character");
			break;
		default:
			/* Mid-sequence byte: keep skipping. */
			break;
		}
	}

	/* Nothing is left to copy once the terminator has been reached. */
	if (src[in] == '\0')
		warning("STRSUB: Position %lu is past the end of the string",
			(unsigned long)pos);

	/*
	 * Copy byte by byte; `copied` only advances when a code point is
	 * complete, so the loop stops right after the last byte of the
	 * final requested code point.
	 */
	for (copied = 0; src[in] && out < MAXSTRLEN && copied < len; in++, out++) {
		switch (decode(&decoderState, &codePoint, (uint8_t)src[in])) {
		case 0:
			copied++;
			break;
		case 1:
			fatalerror("STRSUB: Invalid UTF-8 character");
			break;
		default:
			/* Mid-sequence byte: copied below, not yet counted. */
			break;
		}
		dest[out] = src[in];
	}

	if (copied < len)
		warning("STRSUB: Length too big: %lu", (unsigned long)len);

	/* The result must not stop part-way through a code point. */
	if (decoderState != 0)
		fatalerror("STRSUB: Invalid UTF-8 character");

	dest[out] = '\0';
}
%}
%union
@@ -1249,7 +1330,7 @@ relocconst : T_ID
else
rpn_Number(&$$, 0);
}
| T_OP_STRLEN '(' string ')' { rpn_Number(&$$, strlen($3)); }
| T_OP_STRLEN '(' string ')' { rpn_Number(&$$, strlenUTF8($3)); }
| '(' relocconst ')' { $$ = $2; }
;
@@ -1327,7 +1408,7 @@ const : T_ID { constexpr_Symbol(&$$, $1); }
else
constexpr_Number(&$$, 0);
}
| T_OP_STRLEN '(' string ')' { constexpr_Number(&$$, strlen($3)); }
| T_OP_STRLEN '(' string ')' { constexpr_Number(&$$, strlenUTF8($3)); }
| '(' const ')' { $$ = $2; }
;
@@ -1338,14 +1419,7 @@ string : T_STRING
}
| T_OP_STRSUB '(' string comma uconst comma uconst ')'
{
uint32_t len = $7;
if (len > MAXSTRLEN) {
warning("STRSUB: Length too big: %u", len);
len = MAXSTRLEN;
}
if (snprintf($$, len + 1, "%s", $3 + $5 - 1) > MAXSTRLEN)
warning("STRSUB: String too long '%s'", $$);
strsubUTF8($$, $3, $5, $7);
}
| T_OP_STRCAT '(' string comma string ')'
{

9
test/asm/strlen.asm Normal file
View File

@@ -0,0 +1,9 @@
SECTION "sec", ROM0
xstrlen: MACRO
PRINTV STRLEN(\1) ; print the length of the string argument, counted in code points
PRINTT "\n" ; finish the output line
ENDM
xstrlen "ABC"
xstrlen "カタカナ"

2
test/asm/strlen.out Normal file
View File

@@ -0,0 +1,2 @@
$3
$4

22
test/asm/strsub.asm Normal file
View File

@@ -0,0 +1,22 @@
SECTION "sec", ROM0
xstrsub: MACRO
PRINTT STRSUB(\1, \2, \3) ; print the substring of the string argument: start position, then length, both in code points
PRINTT "\n" ; finish the output line
ENDM
xstrsub "ABC", 1, 1
xstrsub "ABC", 2, 1
xstrsub "ABC", 3, 1
xstrsub "ABC", 1, 2
xstrsub "ABC", 2, 2
xstrsub "ABC", 2, 32
xstrsub "ABC", 2, 300
xstrsub "ABC", 0, 300
xstrsub "ABC", 4, 0
xstrsub "ABC", 4, 1
xstrsub "カタカナ", 1, 2
xstrsub "カタカナ", 3, 2
xstrsub "カタカナ", 3, 10
xstrsub "g̈", 1, 1
xstrsub "g̈", 1, 2

31
test/asm/strsub.out Normal file
View File

@@ -0,0 +1,31 @@
warning: strsub.asm(13) -> xstrsub(1):
STRSUB: Length too big: 32
warning: strsub.asm(14) -> xstrsub(1):
STRSUB: Length too big: 300
warning: strsub.asm(15) -> xstrsub(1):
STRSUB: Position starts at 1
warning: strsub.asm(15) -> xstrsub(1):
STRSUB: Length too big: 300
warning: strsub.asm(16) -> xstrsub(1):
STRSUB: Position 4 is past the end of the string
warning: strsub.asm(17) -> xstrsub(1):
STRSUB: Position 4 is past the end of the string
warning: strsub.asm(17) -> xstrsub(1):
STRSUB: Length too big: 1
warning: strsub.asm(20) -> xstrsub(1):
STRSUB: Length too big: 10
A
B
C
AB
BC
BC
BC
ABC
カタ
カナ
カナ
g