diff --git a/src/asm/lexer.cpp b/src/asm/lexer.cpp index 6247c974..a6d4d89f 100644 --- a/src/asm/lexer.cpp +++ b/src/asm/lexer.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #ifndef _MSC_VER #include #endif @@ -77,17 +78,30 @@ } while (0) #endif // !( defined(_MSC_VER) || defined(__MINGW32__) ) +struct CaseInsensitive { + // FNV-1a hash of an uppercased string + size_t operator()(std::string const &str) const { + size_t hash = 0x811C9DC5; + + for (char const &c : str) + hash = (hash ^ toupper(c)) * 16777619; + return hash; + } + + // Compare two strings without case-sensitivity (by converting to uppercase) + bool operator()(std::string const &str1, std::string const &str2) const { + return std::equal(RANGE(str1), RANGE(str2), [](char c1, char c2) { + return toupper(c1) == toupper(c2); + }); + } +}; + // Identifiers that are also keywords are listed here. This ONLY applies to ones // that would normally be matched as identifiers! Check out `yylex_NORMAL` to // see how this is used. // Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch. -static struct KeywordMapping { - char const *name; - int token; -} const keywords[] = { - // CAUTION when editing this: adding keywords will probably require extra nodes in the - // `keywordDict` array. If you forget to, you will probably trip up an assertion, anyways. - // Also, all entries in this array must be in uppercase for the dict to build correctly. +// This assumes that no two keywords have the same name. +static std::unordered_map keywordDict = { {"ADC", T_Z80_ADC}, {"ADD", T_Z80_ADD}, {"AND", T_Z80_AND}, @@ -458,65 +472,6 @@ void lexer_DeleteState(struct LexerState &state) munmap(state.mmap.ptr, state.mmap.size); } -struct KeywordDictNode { - // The identifier charset is (currently) 44 characters big. By storing entries for the - // entire printable ASCII charset, minus lower-case due to case-insensitivity, - // we only waste (0x60 - 0x20) - 70 = 20 entries per node, which should be acceptable. - // In turn, this allows greatly simplifying checking an index into this array, - // which should help speed up the lexer. - uint16_t children[0x60 - ' ']; - struct KeywordMapping const *keyword; -// Since the keyword structure is invariant, the min number of nodes is known at compile time -} keywordDict[377] = {}; // Make sure to keep this correct when adding keywords! - -// Convert a char into its index into the dict -static uint8_t dictIndex(char c) -{ - // Translate uppercase to lowercase (roughly) - if (c > 0x60) - c = c - ('a' - 'A'); - return c - ' '; -} - -void lexer_Init(void) -{ - // Build the dictionary of keywords. This could be done at compile time instead, however: - // - Doing so manually is a task nobody wants to undertake - // - It would be massively hard to read - // - Doing it within CC or CPP would be quite non-trivial - // - Doing it externally would require some extra work to use only POSIX tools - // - The startup overhead isn't much compared to the program's - uint16_t usedNodes = 1; - - for (size_t i = 0; i < ARRAY_SIZE(keywords); i++) { - uint16_t nodeID = 0; - - // Walk the dictionary, creating intermediate nodes for the keyword - for (char const *ptr = keywords[i].name; *ptr; ptr++) { - // We should be able to assume all entries are well-formed - if (keywordDict[nodeID].children[*ptr - ' '] == 0) { - // If this gets tripped up, set the size of keywordDict to - // something high, compile with `-DPRINT_NODE_COUNT` (see below), - // and set the size to that. - assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict)); - - // There is no node at that location, grab one from the pool - keywordDict[nodeID].children[*ptr - ' '] = usedNodes; - usedNodes++; - } - nodeID = keywordDict[nodeID].children[*ptr - ' ']; - } - - // This assumes that no two keywords have the same name - keywordDict[nodeID].keyword = &keywords[i]; - } - -#ifdef PRINT_NODE_COUNT // For the maintainer to check how many nodes are needed - printf("Lexer keyword dictionary: %zu keywords in %u nodes (pool size %zu)\n", - ARRAY_SIZE(keywords), usedNodes, ARRAY_SIZE(keywordDict)); -#endif -} - void lexer_SetMode(enum LexerMode mode) { lexerState->mode = mode; @@ -1191,7 +1146,6 @@ static int readIdentifier(char firstChar) { // Lex while checking for a keyword yylval.symName[0] = firstChar; - uint16_t nodeID = keywordDict[0].children[dictIndex(firstChar)]; int tokenType = firstChar == '.' ? T_LOCAL_ID : T_ID; size_t i = 1; @@ -1206,10 +1160,6 @@ static int readIdentifier(char firstChar) // If the char was a dot, mark the identifier as local if (c == '.') tokenType = T_LOCAL_ID; - - // Attempt to traverse the tree to check for a keyword - if (nodeID) // Do nothing if matching already failed - nodeID = keywordDict[nodeID].children[dictIndex(c)]; } } @@ -1219,10 +1169,10 @@ static int readIdentifier(char firstChar) } yylval.symName[i] = '\0'; // Terminate the string - if (keywordDict[nodeID].keyword) - return keywordDict[nodeID].keyword->token; + // Attempt to check for a keyword + auto search = keywordDict.find(yylval.symName); - return tokenType; + return search != keywordDict.end() ? search->second : tokenType; } // Functions to read strings diff --git a/src/asm/main.cpp b/src/asm/main.cpp index 8f0e50d5..ee60689d 100644 --- a/src/asm/main.cpp +++ b/src/asm/main.cpp @@ -395,7 +395,6 @@ int main(int argc, char *argv[]) charmap_New(DEFAULT_CHARMAP_NAME, NULL); // Init lexer and file stack, providing file info - lexer_Init(); fstk_Init(mainFileName, maxDepth); // Perform parse (yyparse is auto-generated from `parser.y`)