diff --git a/include/asm/charmap.h b/include/asm/charmap.h index a1a13004..4f8c2ac9 100644 --- a/include/asm/charmap.h +++ b/include/asm/charmap.h @@ -11,38 +11,12 @@ #include -#include "asm/symbol.h" - -#define MAXCHARMAPS 512 -#define CHARMAPLENGTH 16 -#define MAXCHARNODES (MAXCHARMAPS * CHARMAPLENGTH + 1) - -/* - * A node for trie structure. - */ -struct Charnode { - uint8_t code; /* the value in a key-value pair. */ - uint8_t isCode; /* has 1 if it's a code node, not just a bridge node. */ - struct Charnode *next[256]; /* each index representing the next possible - * character from its current state. - */ -}; - -struct Charmap { - char name[MAXSYMLEN + 1]; - int32_t charCount; /* user-side count. */ - int32_t nodeCount; /* node-side count. */ - struct Charnode nodes[MAXCHARNODES]; /* first node is reserved for the - * root node in charmap. - */ -}; - -void charmap_InitMain(void); struct Charmap *charmap_New(const char *name, const char *baseName); +void charmap_Delete(struct Charmap *charmap); void charmap_Set(const char *name); void charmap_Push(void); void charmap_Pop(void); -int32_t charmap_Add(char *input, uint8_t output); -int32_t charmap_Convert(char **input); +void charmap_Add(char *mapping, uint8_t value); +size_t charmap_Convert(char const *input, uint8_t *output); #endif /* RGBDS_ASM_CHARMAP_H */ diff --git a/include/asm/util.h b/include/asm/util.h index 6e40fa83..c03281f5 100644 --- a/include/asm/util.h +++ b/include/asm/util.h @@ -12,6 +12,6 @@ #include uint32_t calchash(const char *s); -int32_t readUTF8Char(char *dest, char *src); +size_t readUTF8Char(uint8_t *dest, char const *src); #endif /* RGBDS_UTIL_H */ diff --git a/include/asm/warning.h b/include/asm/warning.h index ac2932d5..485fdefb 100644 --- a/include/asm/warning.h +++ b/include/asm/warning.h @@ -14,19 +14,19 @@ extern unsigned int nbErrors; enum WarningID { - WARNING_ASSERT, /* Assertions */ - WARNING_BUILTIN_ARG, /* Invalid args to builtins */ - WARNING_DIV, /* Division undefined behavior */ - WARNING_EMPTY_DATA_DIRECTIVE, - /* `db`, `dw` or `dl` with no directive in ROM */ - WARNING_EMPTY_ENTRY, /* Empty entry in `db`, `dw` or `dl` */ - WARNING_LARGE_CONSTANT, /* Constants too large */ - WARNING_LONG_STR, /* String too long for internal buffers */ - WARNING_OBSOLETE, /* Obsolete things */ - WARNING_SHIFT, /* Shifting undefined behavior */ - WARNING_SHIFT_AMOUNT, /* Strange shift amount */ - WARNING_TRUNCATION, /* Implicit truncation loses some bits */ - WARNING_USER, /* User warnings */ + WARNING_ASSERT, /* Assertions */ + WARNING_BUILTIN_ARG, /* Invalid args to builtins */ + WARNING_CHARMAP_REDEF, /* Charmap entry re-definition */ + WARNING_DIV, /* Division undefined behavior */ + WARNING_EMPTY_DATA_DIRECTIVE, /* `db`, `dw` or `dl` with no directive in ROM */ + WARNING_EMPTY_ENTRY, /* Empty entry in `db`, `dw` or `dl` */ + WARNING_LARGE_CONSTANT, /* Constants too large */ + WARNING_LONG_STR, /* String too long for internal buffers */ + WARNING_OBSOLETE, /* Obsolete things */ + WARNING_SHIFT, /* Shifting undefined behavior */ + WARNING_SHIFT_AMOUNT, /* Strange shift amount */ + WARNING_TRUNCATION, /* Implicit truncation loses some bits */ + WARNING_USER, /* User warnings */ NB_WARNINGS, diff --git a/src/asm/asmy.y b/src/asm/asmy.y index 50ef200a..6a1694f7 100644 --- a/src/asm/asmy.y +++ b/src/asm/asmy.y @@ -96,7 +96,7 @@ size_t symvaluetostring(char *dest, size_t maxLength, char *symName, return length; } -static uint32_t str2int2(char *s, int32_t length) +static uint32_t str2int2(uint8_t *s, int32_t length) { int32_t i; uint32_t r = 0; @@ -104,7 +104,7 @@ static uint32_t str2int2(char *s, int32_t length) i = length < 4 ? 0 : length - 4; while (i < length) { r <<= 8; - r |= (uint8_t)s[i]; + r |= s[i]; i++; } @@ -1023,9 +1023,7 @@ incbin : T_POP_INCBIN string { charmap : T_POP_CHARMAP string ',' const { if ($4 < INT8_MIN || $4 > UINT8_MAX) warning(WARNING_TRUNCATION, "Expression must be 8-bit\n"); - - if (charmap_Add($2, (uint8_t)$4) == -1) - error("Error adding new charmap mapping: %s\n", strerror(errno)); + charmap_Add($2, (uint8_t)$4); } ; @@ -1132,11 +1130,11 @@ constlist_8bit_entry : /* empty */ { } | reloc_8bit_no_str { out_RelByte(&$1); } | string { - char *s = $1; - int32_t length = charmap_Convert(&s); + uint8_t *output = malloc(strlen($1)); /* Cannot be larger than that */ + int32_t length = charmap_Convert($1, output); - out_AbsByteGroup((uint8_t*)s, length); - free(s); + out_AbsByteGroup(output, length); + free(output); } ; @@ -1189,11 +1187,11 @@ reloc_16bit : relocexpr { relocexpr : relocexpr_no_str | string { - char *s = $1; - int32_t length = charmap_Convert(&s); - uint32_t r = str2int2(s, length); + uint8_t *output = malloc(strlen($1)); /* Cannot be longer than that */ + int32_t length = charmap_Convert($1, output); + uint32_t r = str2int2(output, length); - free(s); + free(output); rpn_Number(&$$, r); } ; diff --git a/src/asm/charmap.c b/src/asm/charmap.c index 146211d0..c8cfca72 100644 --- a/src/asm/charmap.c +++ b/src/asm/charmap.c @@ -22,18 +22,36 @@ #include "hashmap.h" -#define CHARMAP_HASH_SIZE (1 << 9) +/* + * Charmaps are stored using a structure known as "trie". + * Essentially a tree, where each nodes stores a single character's worth of info: + * whether there exists a mapping that ends at the current character, + */ +struct Charnode { + bool isTerminal; /* Whether there exists a mapping that ends here */ + uint8_t value; /* If the above is true, its corresponding value */ + /* This MUST be indexes and not pointers, because pointers get invalidated by `realloc`!! */ + size_t next[255]; /* Indexes of where to go next, 0 = nowhere */ +}; + +#define INITIAL_CAPACITY 32 + +struct Charmap { + char *name; + size_t usedNodes; /* How many nodes are being used */ + size_t capacity; /* How many nodes have been allocated */ + struct Charnode nodes[]; /* first node is reserved for the root node */ +}; + +static HashMap charmaps; + +static struct Charmap *currentCharmap; struct CharmapStackEntry { struct Charmap *charmap; struct CharmapStackEntry *next; }; -static HashMap charmaps; - -static struct Charmap *mainCharmap; -static struct Charmap *currentCharmap; - struct CharmapStackEntry *charmapStack; static inline struct Charmap *charmap_Get(const char *name) @@ -41,16 +59,21 @@ static inline struct Charmap *charmap_Get(const char *name) return hash_GetElement(charmaps, name); } -static void CopyNode(struct Charmap *dest, - const struct Charmap *src, - int nodeIdx) +static inline struct Charmap *resizeCharmap(struct Charmap *map, size_t capacity) { - dest->nodes[nodeIdx].code = src->nodes[nodeIdx].code; - dest->nodes[nodeIdx].isCode = src->nodes[nodeIdx].isCode; - for (int i = 0; i < 256; i++) - if (src->nodes[nodeIdx].next[i]) - dest->nodes[nodeIdx].next[i] = dest->nodes + - (src->nodes[nodeIdx].next[i] - src->nodes); + struct Charmap *new = realloc(map, sizeof(*map) + sizeof(*map->nodes) * capacity); + + if (!new) + fatalerror("Failed to %s charmap: %s\n", + map ? "create" : "resize", strerror(errno)); + new->capacity = capacity; + return new; +} + +static inline void initNode(struct Charnode *node) +{ + node->isTerminal = false; + memset(node->next, 0, sizeof(node->next)); } struct Charmap *charmap_New(const char *name, const char *baseName) @@ -66,28 +89,23 @@ struct Charmap *charmap_New(const char *name, const char *baseName) struct Charmap *charmap = charmap_Get(name); - if (charmap != NULL) { + if (charmap) { error("Charmap '%s' already exists\n", name); - return NULL; + return charmap; } - charmap = malloc(sizeof(*charmap)); - if (charmap == NULL) - fatalerror("Failed to create charmap: %s\n", strerror(errno)); - /* Init the new charmap's fields */ - snprintf(charmap->name, sizeof(charmap->name), "%s", name); - if (base != NULL) { - charmap->charCount = base->charCount; - charmap->nodeCount = base->nodeCount; + if (base) { + charmap = resizeCharmap(NULL, base->capacity); + charmap->usedNodes = base->usedNodes; - for (int i = 0; i < MAXCHARNODES; i++) - CopyNode(charmap, base, i); + memcpy(charmap->nodes, base->nodes, sizeof(base->nodes[0]) * charmap->usedNodes); } else { - charmap->charCount = 0; - charmap->nodeCount = 0; - memset(charmap->nodes, 0, sizeof(charmap->nodes)); + charmap = resizeCharmap(NULL, INITIAL_CAPACITY); + charmap->usedNodes = 1; + initNode(&charmap->nodes[0]); /* Init the root node */ } + charmap->name = strdup(name); hash_AddElement(charmaps, charmap->name, charmap); currentCharmap = charmap; @@ -95,6 +113,12 @@ struct Charmap *charmap_New(const char *name, const char *baseName) return charmap; } +void charmap_Delete(struct Charmap *charmap) +{ + free(charmap->name); + free(charmap); +} + void charmap_Set(const char *name) { struct Charmap *charmap = charmap_Get(name); @@ -109,9 +133,9 @@ void charmap_Push(void) { struct CharmapStackEntry *stackEntry; - stackEntry = malloc(sizeof(struct CharmapStackEntry)); + stackEntry = malloc(sizeof(*stackEntry)); if (stackEntry == NULL) - fatalerror("No memory for charmap stack\n"); + fatalerror("Failed to alloc charmap stack entry: %s\n", strerror(errno)); stackEntry->charmap = currentCharmap; stackEntry->next = charmapStack; @@ -121,8 +145,10 @@ void charmap_Push(void) void charmap_Pop(void) { - if (charmapStack == NULL) - fatalerror("No entries in the charmap stack\n"); + if (charmapStack == NULL) { + error("No entries in the charmap stack\n"); + return; + } struct CharmapStackEntry *top = charmapStack; @@ -131,109 +157,86 @@ void charmap_Pop(void) free(top); } -void charmap_InitMain(void) +void charmap_Add(char *mapping, uint8_t value) { - mainCharmap = charmap_New("main", NULL); -} + struct Charnode *node = ¤tCharmap->nodes[0]; -int32_t charmap_Add(char *input, uint8_t output) -{ - int32_t i; - uint8_t v; + for (uint8_t c; *mapping; mapping++) { + c = *mapping - 1; - struct Charmap *charmap = currentCharmap; - struct Charnode *curr_node, *temp_node; - - curr_node = &charmap->nodes[0]; - - for (i = 0; (v = (uint8_t)input[i]); i++) { - if (curr_node->next[v]) { - curr_node = curr_node->next[v]; + if (node->next[c]) { + node = ¤tCharmap->nodes[node->next[c]]; } else { - temp_node = &charmap->nodes[charmap->nodeCount + 1]; + /* Register next available node */ + node->next[c] = currentCharmap->usedNodes; + /* If no more nodes are available, get new ones */ + if (currentCharmap->usedNodes == currentCharmap->capacity) { + currentCharmap->capacity *= 2; + currentCharmap = resizeCharmap(currentCharmap, currentCharmap->capacity); + } - curr_node->next[v] = temp_node; - curr_node = temp_node; - - ++charmap->nodeCount; + /* Switch to and init new node */ + node = ¤tCharmap->nodes[currentCharmap->usedNodes++]; + initNode(node); } } - /* prevent duplicated keys by accepting only first key-value pair. */ - if (curr_node->isCode) - return charmap->charCount; + if (node->isTerminal) + warning(WARNING_CHARMAP_REDEF, "Overriding charmap mapping"); - curr_node->code = output; - curr_node->isCode = 1; - - return ++charmap->charCount; + node->isTerminal = true; + node->value = value; } -int32_t charmap_Convert(char **input) +size_t charmap_Convert(char const *input, uint8_t *output) { - struct Charmap *charmap = currentCharmap; - struct Charnode *charnode; + /* + * The goal is to match the longest mapping possible. + * For that, advance through the trie with each character read. + * If that would lead to a dead end, rewind characters until the last match, and output. + * If no match, read a UTF-8 codepoint and output that. + */ + size_t outputLen = 0; + struct Charnode const *node = ¤tCharmap->nodes[0]; + struct Charnode const *match = NULL; + size_t rewindDistance = 0; - char *output; - char outchar[8]; + for (;;) { + /* We still want NULs to reach the `else` path, to give a chance to rewind */ + uint8_t c = *input - 1; - int32_t i, match, length; - uint8_t v, foundCode; + if (*input && node->next[c]) { + input++; /* Consume that char */ + rewindDistance++; - output = malloc(strlen(*input)); - if (output == NULL) - fatalerror("Not enough memory for charmap conversion buffer: %s\n", - strerror(errno)); + node = ¤tCharmap->nodes[node->next[c]]; + if (node->isTerminal) { + match = node; + rewindDistance = 0; /* Rewind from after the match */ + } - length = 0; + } else { + input -= rewindDistance; /* Rewind */ + rewindDistance = 0; + node = ¤tCharmap->nodes[0]; - while (**input) { - charnode = &charmap->nodes[0]; + if (match) { /* Arrived at a dead end with a match found */ + *output++ = match->value; + outputLen++; + match = NULL; /* Reset match for next round */ - /* - * Find the longest valid match which has been registered in - * charmap, possibly yielding multiple or no matches. - * The longest match is taken, meaning partial matches shorter - * than the longest one are ignored. - */ - for (i = match = 0; (v = (*input)[i]);) { - if (!charnode->next[v]) + } else if (*input) { /* No match found */ + size_t codepointLen = readUTF8Char(output, input); + + input += codepointLen; /* OK because UTF-8 has no NUL in multi-byte chars */ + output += codepointLen; + outputLen += codepointLen; + } + + if (!*input) break; - - charnode = charnode->next[v]; - i++; - - if (charnode->isCode) { - match = i; - foundCode = charnode->code; - } } - - if (match) { - output[length] = foundCode; - - length++; - } else { - /* - * put a utf-8 character - * if failed to find a match. - */ - match = readUTF8Char(outchar, *input); - - if (match) { - memcpy(output + length, *input, match); - } else { - output[length] = 0; - match = 1; - } - - length += match; - } - - *input += match; } - *input = output; - - return length; + return outputLen; } diff --git a/src/asm/main.c b/src/asm/main.c index 60e70c55..2e94fe4c 100644 --- a/src/asm/main.c +++ b/src/asm/main.c @@ -538,7 +538,7 @@ int main(int argc, char *argv[]) sym_SetExportAll(exportall); fstk_Init(tzMainfile); opt_ParseDefines(); - charmap_InitMain(); + charmap_New("main", NULL); yy_set_state(LEX_STATE_NORMAL); opt_SetCurrentOptions(&DefaultOptions); diff --git a/src/asm/rgbasm.1 b/src/asm/rgbasm.1 index b8b92ef4..cd05c970 100644 --- a/src/asm/rgbasm.1 +++ b/src/asm/rgbasm.1 @@ -203,6 +203,10 @@ Warn about incorrect arguments to built-in functions, such as with indexes outside of the string's bounds. This warning is enabled by .Fl Wall . +.It Fl Wcharmap-redef +Warn when re-defining a charmap mapping. +This warning is enabled by +.Fl Wall . .It Fl Wdiv Warn when dividing the smallest negative integer by -1, which yields itself due to integer overflow. .It Fl Wempty-entry diff --git a/src/asm/util.c b/src/asm/util.c index 3ab9804d..51f88ba8 100644 --- a/src/asm/util.c +++ b/src/asm/util.c @@ -27,21 +27,20 @@ uint32_t calchash(const char *s) return hash; } -int32_t readUTF8Char(char *dest, char *src) +size_t readUTF8Char(uint8_t *dest, char const *src) { - uint32_t state; + uint32_t state = 0; uint32_t codep; - int32_t i; + size_t i = 0; - for (i = 0, state = 0;; i++) { + for (;;) { if (decode(&state, &codep, (uint8_t)src[i]) == 1) fatalerror("invalid UTF-8 character\n"); dest[i] = src[i]; + i++; - if (state == 0) { - dest[++i] = '\0'; + if (state == 0) return i; - } } } diff --git a/src/asm/warning.c b/src/asm/warning.c index fd69ad24..f1fe00a1 100644 --- a/src/asm/warning.c +++ b/src/asm/warning.c @@ -30,6 +30,7 @@ enum WarningState { static enum WarningState const defaultWarnings[NB_WARNINGS] = { [WARNING_ASSERT] = WARNING_ENABLED, [WARNING_BUILTIN_ARG] = WARNING_DISABLED, + [WARNING_CHARMAP_REDEF] = WARNING_DISABLED, [WARNING_DIV] = WARNING_DISABLED, [WARNING_EMPTY_DATA_DIRECTIVE] = WARNING_DISABLED, [WARNING_EMPTY_ENTRY] = WARNING_DISABLED, @@ -68,6 +69,7 @@ static enum WarningState warningState(enum WarningID id) static char const *warningFlags[NB_WARNINGS_ALL] = { "assert", "builtin-args", + "charmap-redef", "div", "empty-data-directive", "empty-entry", @@ -92,6 +94,7 @@ enum MetaWarningCommand { /* Warnings that probably indicate an error */ static uint8_t const _wallCommands[] = { WARNING_BUILTIN_ARG, + WARNING_CHARMAP_REDEF, WARNING_EMPTY_DATA_DIRECTIVE, WARNING_LARGE_CONSTANT, WARNING_LONG_STR, diff --git a/test/asm/multiple-charmaps.err b/test/asm/multiple-charmaps.err index 9649ac4e..13053523 100644 --- a/test/asm/multiple-charmaps.err +++ b/test/asm/multiple-charmaps.err @@ -4,3 +4,4 @@ ERROR: multiple-charmaps.asm(102) -> multiple-charmaps.asm::set_(13): Charmap 'map5' doesn't exist ERROR: multiple-charmaps.asm(104) -> multiple-charmaps.asm::pop_(23): No entries in the charmap stack +error: Assembly aborted (3 errors)!