Use std::unordered_map for the keyword dict

This commit is contained in:
Rangi42
2024-02-27 21:58:36 -05:00
committed by Sylvie
parent 962398969b
commit beb1997378
2 changed files with 24 additions and 75 deletions

View File

@@ -14,6 +14,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unordered_map>
#ifndef _MSC_VER
#include <unistd.h>
#endif
@@ -77,17 +78,30 @@
} while (0)
#endif // !( defined(_MSC_VER) || defined(__MINGW32__) )
struct CaseInsensitive {
// FNV-1a hash of an uppercased string
size_t operator()(std::string const &str) const {
size_t hash = 0x811C9DC5;
for (char const &c : str)
hash = (hash ^ toupper(c)) * 16777619;
return hash;
}
// Compare two strings without case-sensitivity (by converting to uppercase)
bool operator()(std::string const &str1, std::string const &str2) const {
return std::equal(RANGE(str1), RANGE(str2), [](char c1, char c2) {
return toupper(c1) == toupper(c2);
});
}
};
// Identifiers that are also keywords are listed here. This ONLY applies to ones
// that would normally be matched as identifiers! Check out `yylex_NORMAL` to
// see how this is used.
// Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.
static struct KeywordMapping {
char const *name;
int token;
} const keywords[] = {
// CAUTION when editing this: adding keywords will probably require extra nodes in the
// `keywordDict` array. If you forget to, you will probably trip up an assertion, anyways.
// Also, all entries in this array must be in uppercase for the dict to build correctly.
// This assumes that no two keywords have the same name.
static std::unordered_map<std::string, int, CaseInsensitive, CaseInsensitive> keywordDict = {
{"ADC", T_Z80_ADC},
{"ADD", T_Z80_ADD},
{"AND", T_Z80_AND},
@@ -458,65 +472,6 @@ void lexer_DeleteState(struct LexerState &state)
munmap(state.mmap.ptr, state.mmap.size);
}
struct KeywordDictNode {
// The identifier charset is (currently) 44 characters big. By storing entries for the
// entire printable ASCII charset, minus lower-case due to case-insensitivity,
// we only waste (0x60 - 0x20) - 70 = 20 entries per node, which should be acceptable.
// In turn, this allows greatly simplifying checking an index into this array,
// which should help speed up the lexer.
uint16_t children[0x60 - ' '];
struct KeywordMapping const *keyword;
// Since the keyword structure is invariant, the min number of nodes is known at compile time
} keywordDict[377] = {}; // Make sure to keep this correct when adding keywords!
// Convert a char into its index into the dict
static uint8_t dictIndex(char c)
{
// Translate uppercase to lowercase (roughly)
if (c > 0x60)
c = c - ('a' - 'A');
return c - ' ';
}
void lexer_Init(void)
{
// Build the dictionary of keywords. This could be done at compile time instead, however:
// - Doing so manually is a task nobody wants to undertake
// - It would be massively hard to read
// - Doing it within CC or CPP would be quite non-trivial
// - Doing it externally would require some extra work to use only POSIX tools
// - The startup overhead isn't much compared to the program's
uint16_t usedNodes = 1;
for (size_t i = 0; i < ARRAY_SIZE(keywords); i++) {
uint16_t nodeID = 0;
// Walk the dictionary, creating intermediate nodes for the keyword
for (char const *ptr = keywords[i].name; *ptr; ptr++) {
// We should be able to assume all entries are well-formed
if (keywordDict[nodeID].children[*ptr - ' '] == 0) {
// If this gets tripped up, set the size of keywordDict to
// something high, compile with `-DPRINT_NODE_COUNT` (see below),
// and set the size to that.
assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
// There is no node at that location, grab one from the pool
keywordDict[nodeID].children[*ptr - ' '] = usedNodes;
usedNodes++;
}
nodeID = keywordDict[nodeID].children[*ptr - ' '];
}
// This assumes that no two keywords have the same name
keywordDict[nodeID].keyword = &keywords[i];
}
#ifdef PRINT_NODE_COUNT // For the maintainer to check how many nodes are needed
printf("Lexer keyword dictionary: %zu keywords in %u nodes (pool size %zu)\n",
ARRAY_SIZE(keywords), usedNodes, ARRAY_SIZE(keywordDict));
#endif
}
void lexer_SetMode(enum LexerMode mode)
{
lexerState->mode = mode;
@@ -1191,7 +1146,6 @@ static int readIdentifier(char firstChar)
{
// Lex while checking for a keyword
yylval.symName[0] = firstChar;
uint16_t nodeID = keywordDict[0].children[dictIndex(firstChar)];
int tokenType = firstChar == '.' ? T_LOCAL_ID : T_ID;
size_t i = 1;
@@ -1206,10 +1160,6 @@ static int readIdentifier(char firstChar)
// If the char was a dot, mark the identifier as local
if (c == '.')
tokenType = T_LOCAL_ID;
// Attempt to traverse the tree to check for a keyword
if (nodeID) // Do nothing if matching already failed
nodeID = keywordDict[nodeID].children[dictIndex(c)];
}
}
@@ -1219,10 +1169,10 @@ static int readIdentifier(char firstChar)
}
yylval.symName[i] = '\0'; // Terminate the string
if (keywordDict[nodeID].keyword)
return keywordDict[nodeID].keyword->token;
// Attempt to check for a keyword
auto search = keywordDict.find(yylval.symName);
return tokenType;
return search != keywordDict.end() ? search->second : tokenType;
}
// Functions to read strings

View File

@@ -395,7 +395,6 @@ int main(int argc, char *argv[])
charmap_New(DEFAULT_CHARMAP_NAME, NULL);
// Init lexer and file stack, providing file info
lexer_Init();
fstk_Init(mainFileName, maxDepth);
// Perform parse (yyparse is auto-generated from `parser.y`)