mirror of
https://github.com/gbdev/rgbds.git
synced 2025-11-21 10:42:07 +00:00
Use std::unordered_map for the keyword dict
This commit is contained in:
@@ -14,6 +14,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <unordered_map>
|
||||||
#ifndef _MSC_VER
|
#ifndef _MSC_VER
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
@@ -77,17 +78,30 @@
|
|||||||
} while (0)
|
} while (0)
|
||||||
#endif // !( defined(_MSC_VER) || defined(__MINGW32__) )
|
#endif // !( defined(_MSC_VER) || defined(__MINGW32__) )
|
||||||
|
|
||||||
|
struct CaseInsensitive {
|
||||||
|
// FNV-1a hash of an uppercased string
|
||||||
|
size_t operator()(std::string const &str) const {
|
||||||
|
size_t hash = 0x811C9DC5;
|
||||||
|
|
||||||
|
for (char const &c : str)
|
||||||
|
hash = (hash ^ toupper(c)) * 16777619;
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare two strings without case-sensitivity (by converting to uppercase)
|
||||||
|
bool operator()(std::string const &str1, std::string const &str2) const {
|
||||||
|
return std::equal(RANGE(str1), RANGE(str2), [](char c1, char c2) {
|
||||||
|
return toupper(c1) == toupper(c2);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// Identifiers that are also keywords are listed here. This ONLY applies to ones
|
// Identifiers that are also keywords are listed here. This ONLY applies to ones
|
||||||
// that would normally be matched as identifiers! Check out `yylex_NORMAL` to
|
// that would normally be matched as identifiers! Check out `yylex_NORMAL` to
|
||||||
// see how this is used.
|
// see how this is used.
|
||||||
// Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.
|
// Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.
|
||||||
static struct KeywordMapping {
|
// This assumes that no two keywords have the same name.
|
||||||
char const *name;
|
static std::unordered_map<std::string, int, CaseInsensitive, CaseInsensitive> keywordDict = {
|
||||||
int token;
|
|
||||||
} const keywords[] = {
|
|
||||||
// CAUTION when editing this: adding keywords will probably require extra nodes in the
|
|
||||||
// `keywordDict` array. If you forget to, you will probably trip up an assertion, anyways.
|
|
||||||
// Also, all entries in this array must be in uppercase for the dict to build correctly.
|
|
||||||
{"ADC", T_Z80_ADC},
|
{"ADC", T_Z80_ADC},
|
||||||
{"ADD", T_Z80_ADD},
|
{"ADD", T_Z80_ADD},
|
||||||
{"AND", T_Z80_AND},
|
{"AND", T_Z80_AND},
|
||||||
@@ -458,65 +472,6 @@ void lexer_DeleteState(struct LexerState &state)
|
|||||||
munmap(state.mmap.ptr, state.mmap.size);
|
munmap(state.mmap.ptr, state.mmap.size);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct KeywordDictNode {
|
|
||||||
// The identifier charset is (currently) 44 characters big. By storing entries for the
|
|
||||||
// entire printable ASCII charset, minus lower-case due to case-insensitivity,
|
|
||||||
// we only waste (0x60 - 0x20) - 70 = 20 entries per node, which should be acceptable.
|
|
||||||
// In turn, this allows greatly simplifying checking an index into this array,
|
|
||||||
// which should help speed up the lexer.
|
|
||||||
uint16_t children[0x60 - ' '];
|
|
||||||
struct KeywordMapping const *keyword;
|
|
||||||
// Since the keyword structure is invariant, the min number of nodes is known at compile time
|
|
||||||
} keywordDict[377] = {}; // Make sure to keep this correct when adding keywords!
|
|
||||||
|
|
||||||
// Convert a char into its index into the dict
|
|
||||||
static uint8_t dictIndex(char c)
|
|
||||||
{
|
|
||||||
// Translate uppercase to lowercase (roughly)
|
|
||||||
if (c > 0x60)
|
|
||||||
c = c - ('a' - 'A');
|
|
||||||
return c - ' ';
|
|
||||||
}
|
|
||||||
|
|
||||||
void lexer_Init(void)
|
|
||||||
{
|
|
||||||
// Build the dictionary of keywords. This could be done at compile time instead, however:
|
|
||||||
// - Doing so manually is a task nobody wants to undertake
|
|
||||||
// - It would be massively hard to read
|
|
||||||
// - Doing it within CC or CPP would be quite non-trivial
|
|
||||||
// - Doing it externally would require some extra work to use only POSIX tools
|
|
||||||
// - The startup overhead isn't much compared to the program's
|
|
||||||
uint16_t usedNodes = 1;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ARRAY_SIZE(keywords); i++) {
|
|
||||||
uint16_t nodeID = 0;
|
|
||||||
|
|
||||||
// Walk the dictionary, creating intermediate nodes for the keyword
|
|
||||||
for (char const *ptr = keywords[i].name; *ptr; ptr++) {
|
|
||||||
// We should be able to assume all entries are well-formed
|
|
||||||
if (keywordDict[nodeID].children[*ptr - ' '] == 0) {
|
|
||||||
// If this gets tripped up, set the size of keywordDict to
|
|
||||||
// something high, compile with `-DPRINT_NODE_COUNT` (see below),
|
|
||||||
// and set the size to that.
|
|
||||||
assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
|
|
||||||
|
|
||||||
// There is no node at that location, grab one from the pool
|
|
||||||
keywordDict[nodeID].children[*ptr - ' '] = usedNodes;
|
|
||||||
usedNodes++;
|
|
||||||
}
|
|
||||||
nodeID = keywordDict[nodeID].children[*ptr - ' '];
|
|
||||||
}
|
|
||||||
|
|
||||||
// This assumes that no two keywords have the same name
|
|
||||||
keywordDict[nodeID].keyword = &keywords[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef PRINT_NODE_COUNT // For the maintainer to check how many nodes are needed
|
|
||||||
printf("Lexer keyword dictionary: %zu keywords in %u nodes (pool size %zu)\n",
|
|
||||||
ARRAY_SIZE(keywords), usedNodes, ARRAY_SIZE(keywordDict));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
void lexer_SetMode(enum LexerMode mode)
|
void lexer_SetMode(enum LexerMode mode)
|
||||||
{
|
{
|
||||||
lexerState->mode = mode;
|
lexerState->mode = mode;
|
||||||
@@ -1191,7 +1146,6 @@ static int readIdentifier(char firstChar)
|
|||||||
{
|
{
|
||||||
// Lex while checking for a keyword
|
// Lex while checking for a keyword
|
||||||
yylval.symName[0] = firstChar;
|
yylval.symName[0] = firstChar;
|
||||||
uint16_t nodeID = keywordDict[0].children[dictIndex(firstChar)];
|
|
||||||
int tokenType = firstChar == '.' ? T_LOCAL_ID : T_ID;
|
int tokenType = firstChar == '.' ? T_LOCAL_ID : T_ID;
|
||||||
size_t i = 1;
|
size_t i = 1;
|
||||||
|
|
||||||
@@ -1206,10 +1160,6 @@ static int readIdentifier(char firstChar)
|
|||||||
// If the char was a dot, mark the identifier as local
|
// If the char was a dot, mark the identifier as local
|
||||||
if (c == '.')
|
if (c == '.')
|
||||||
tokenType = T_LOCAL_ID;
|
tokenType = T_LOCAL_ID;
|
||||||
|
|
||||||
// Attempt to traverse the tree to check for a keyword
|
|
||||||
if (nodeID) // Do nothing if matching already failed
|
|
||||||
nodeID = keywordDict[nodeID].children[dictIndex(c)];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1219,10 +1169,10 @@ static int readIdentifier(char firstChar)
|
|||||||
}
|
}
|
||||||
yylval.symName[i] = '\0'; // Terminate the string
|
yylval.symName[i] = '\0'; // Terminate the string
|
||||||
|
|
||||||
if (keywordDict[nodeID].keyword)
|
// Attempt to check for a keyword
|
||||||
return keywordDict[nodeID].keyword->token;
|
auto search = keywordDict.find(yylval.symName);
|
||||||
|
|
||||||
return tokenType;
|
return search != keywordDict.end() ? search->second : tokenType;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Functions to read strings
|
// Functions to read strings
|
||||||
|
|||||||
@@ -395,7 +395,6 @@ int main(int argc, char *argv[])
|
|||||||
charmap_New(DEFAULT_CHARMAP_NAME, NULL);
|
charmap_New(DEFAULT_CHARMAP_NAME, NULL);
|
||||||
|
|
||||||
// Init lexer and file stack, providing file info
|
// Init lexer and file stack, providing file info
|
||||||
lexer_Init();
|
|
||||||
fstk_Init(mainFileName, maxDepth);
|
fstk_Init(mainFileName, maxDepth);
|
||||||
|
|
||||||
// Perform parse (yyparse is auto-generated from `parser.y`)
|
// Perform parse (yyparse is auto-generated from `parser.y`)
|
||||||
|
|||||||
Reference in New Issue
Block a user