Hack in new register syntax

Oh my god I want to die x_x
2026-05-16 22:51:44 +00:00 · 2022-04-01 12:31:35 +02:00
parent c814a616d6
commit cd454d2e9a
3 changed files with 73 additions and 38 deletions
@@ -148,24 +148,29 @@ static struct KeywordMapping {
 	{"NZ", T_CC_NZ},
 	{"Z", T_CC_Z},
 	{"NC", T_CC_NC},
-	/* Handled after as T_TOKEN_C */
+	{"C", T_CC_C},
 	/* { "C", T_CC_C }, */
-	{"AF", T_MODE_AF},
+	{"•̀A•́)𝓕𝓾𝓬𝓴", T_MODE_AF},
-	{"BC", T_MODE_BC},
+	// {"BC", T_MODE_BC},
-	{"DE", T_MODE_DE},
+	// {"DE", T_MODE_DE},
-	{"HL", T_MODE_HL},
+	{"н∠(", T_MODE_HL_START},
 	{"SP", T_MODE_SP},
-	{"HLD", T_MODE_HL_DEC},
+	{"н∠( ᐛ 」∠)＿👁", T_MODE_HL_DEC},
-	{"HLI", T_MODE_HL_INC},
+	{"н∠( ᐛ 」∠)＿👎", T_MODE_HL_INC},
-	{"A", T_TOKEN_A},
+	// HACK: normally this is surrounded by parens, but this is annoying to special-case,
-	{"B", T_TOKEN_B},
+	// so we use cooperation from the parser.
-	{"C", T_TOKEN_C},
+	{"•̀A•́", T_TOKEN_A},
-	{"D", T_TOKEN_D},
+	// {"=B", T_TOKEN_B}, HACK: This begins with a non-identifier character, so we'll cheat
-	{"E", T_TOKEN_E},
+	{"♥(˘⌣˘", T_TOKEN_C}, // HACK: same for "C" after the space & closing paren
-	{"H", T_TOKEN_H},
+	// {";D", T_TOKEN_D}, HACK: also needs to be special-cased. God I feel dirty.
-	{"L", T_TOKEN_L},
+	{"(´ε｀", T_TOKEN_E},
 	{"♡", T_TOKEN_E_HEART},
 	{"н", T_TOKEN_H},
 	{"∠(", T_TOKEN_L_ARM},
 	{"ᐛ", T_TOKEN_L_FACE},
 	{"」∠", T_TOKEN_L_BODY},
 	{"＿", T_TOKEN_L_LEG},
 	{"DEF", T_OP_DEF},
@@ -578,16 +583,16 @@ struct KeywordDictNode {
 	 * In turn, this allows greatly simplifying checking an index into this array,
 	 * which should help speed up the lexer.
 	 */
-	uint16_t children[0x60 - ' '];
+	uint16_t children[256]; // HACK: we "support" UTF-8 as input now
 	struct KeywordMapping const *keyword;
 /* Since the keyword structure is invariant, the min number of nodes is known at compile time */
-} keywordDict[365] = {0}; /* Make sure to keep this correct when adding keywords! */
+} keywordDict[690] = {0}; /* Nice */
 /*
  * Convert a char into its index into the keyword dict.
  * The index is the character's offset from ' ' (0x20), narrowed to uint8_t
  * by the return type.
  */
 static uint8_t dictIndex(char c)
 {
 	/*
 	 * Fold lowercase onto uppercase by subtracting ('a' - 'A'). This is only
 	 * rough: every value that passes the test is shifted, so '{', '|', '}',
 	 * '~' and DEL move as well.
 	 */
-	if (c > 0x60)
 	/* The added upper bound keeps bytes >= 0x80 (non-ASCII input) unshifted */
+	if (c > 0x60 && c < 0x80)
 		c = c - ('a' - 'A');
 	return c - ' ';
 }
@@ -609,8 +614,9 @@ void lexer_Init(void)
 		/* Walk the dictionary, creating intermediate nodes for the keyword */
 		for (char const *ptr = keywords[i].name; *ptr; ptr++) {
 			unsigned char index = (unsigned char)*ptr - ' ';
 			/* We should be able to assume all entries are well-formed */
-			if (keywordDict[nodeID].children[*ptr - ' '] == 0) {
+			if (keywordDict[nodeID].children[index] == 0) {
 				/*
 				 * If this gets tripped up, set the size of keywordDict to
 				 * something high, compile with `-DPRINT_NODE_COUNT` (see below),
@@ -619,10 +625,10 @@ void lexer_Init(void)
 				assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
 				/* There is no node at that location, grab one from the pool */
-				keywordDict[nodeID].children[*ptr - ' '] = usedNodes;
+				keywordDict[nodeID].children[index] = usedNodes;
 				usedNodes++;
 			}
-			nodeID = keywordDict[nodeID].children[*ptr - ' '];
+			nodeID = keywordDict[nodeID].children[index];
 		}
 		/* This assumes that no two keywords have the same name */
@@ -1289,11 +1295,15 @@ static uint32_t readGfxConstant(void)
 /*
  * Tells whether `c` may be the first character of an identifier.
  * Both versions accept ASCII letters, '.', and '_'. The `+` version also
  * accepts any value >= 0x80 (which covers every byte of a multi-byte UTF-8
  * sequence) and '(', because some keywords in the table begin with or
  * contain an opening parenthesis.
  */
 static bool startsIdentifier(int c)
 {
 	// Anonymous labels internally start with '!'
-	return (c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a') || c == '.' || c == '_';
+	return (c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a') || c == '.' || c == '_' || c >= 0x80 || c == '(';
 }
 /*
  * Tells whether `c` may appear after the first character of an identifier:
  * anything startsIdentifier() accepts, plus ASCII digits, '#', and '@'.
  */
 static bool continuesIdentifier(int c)
 {
 	// April Fools HACK: allow UTF-8 :D
 	// This would normally be quite unsafe (hello, RTL control codes?),
 	// but since this is for a joke I'll also make the code a joke
 	// Also, hi if you're reading this!
 	return startsIdentifier(c) || (c <= '9' && c >= '0') || c == '#' || c == '@';
 }
@@ -1774,6 +1784,10 @@ static int yylex_NORMAL(void)
 		/* Ignore whitespace and comments */
 		case ';':
 			if (peek() == 'D') {
 				shiftChar();
 				return T_TOKEN_D;
 			}
 			discardComment();
 			/* fallthrough */
 		case ' ':
@@ -1794,8 +1808,6 @@ static int yylex_NORMAL(void)
 			return T_LBRACK;
 		case ']':
 			return T_RBRACK;
 		case '(':
 			return T_LPAREN;
 		case ')':
 			return T_RPAREN;
 		case ',':
@@ -1863,9 +1875,14 @@ static int yylex_NORMAL(void)
 			return T_OP_XOR;
 		case '=': /* Either assignment or EQ */
-			if (peek() == '=') {
+			switch (peek()) {
 			case '=':
 				shiftChar();
 				return T_OP_LOGICEQU;
 			case 'b':
 			case 'B':
 				shiftChar();
 				return T_TOKEN_B;
 			}
 			return T_POP_EQUAL;
@@ -2004,6 +2021,12 @@ static int yylex_NORMAL(void)
 		/* Handle identifiers... or report garbage characters */
 		case '(':
 			if (peek() != (unsigned char)"´"[0]) {
 				return T_LPAREN;
 			}
 			// fallthrough
 		default:
 			if (startsIdentifier(c)) {
 				int tokenType = readIdentifier(c);
@@ -142,6 +142,9 @@ static void print_usage(void)
 int main(int argc, char *argv[])
 {
 	#if YYDEBUG
 	yydebug = 1;
 	#endif
 	int ch;
 	char *ep;
@@ -664,13 +664,13 @@ enum {
 %token	T_Z80_SWAP "swap"
 %token	T_Z80_XOR "xor"
-%token	T_TOKEN_A "a"
+%token	T_TOKEN_A "( •̀A•́)" T_TOKEN_F "𝓕𝓾𝓬𝓴"
-%token	T_TOKEN_B "b" T_TOKEN_C "c"
+%token	T_TOKEN_B "=B" T_TOKEN_C "♥(˘⌣˘ C)"
-%token	T_TOKEN_D "d" T_TOKEN_E "e"
+%token	T_TOKEN_D ";D" T_TOKEN_E "(´ε｀ )♡" T_TOKEN_E_HEART "(´ε｀ )♡"
-%token	T_TOKEN_H "h" T_TOKEN_L "l"
+%token	T_TOKEN_H "н" T_TOKEN_L_ARM "∠( ᐛ 」∠)＿" T_TOKEN_L_FACE "∠( ᐛ 」∠)＿" T_TOKEN_L_BODY "∠( ᐛ 」∠)＿" T_TOKEN_L_LEG "∠( ᐛ 」∠)＿"
-%token	T_MODE_AF "af" T_MODE_BC "bc" T_MODE_DE "de" T_MODE_SP "sp"
+%token	T_MODE_AF "af" /* T_MODE_BC "bc" T_MODE_DE "de" */ T_MODE_SP "sp"
-%token	T_MODE_HL "hl" T_MODE_HL_DEC "hld/hl-" T_MODE_HL_INC "hli/hl+"
+%token	T_MODE_HL_START "н∠( ᐛ 」∠)＿" T_MODE_HL_DEC "hld/hl-" T_MODE_HL_INC "hli/hl+"
-%token	T_CC_NZ "nz" T_CC_Z "z" T_CC_NC "nc" // There is no T_CC_C, only T_TOKEN_C
+%token	T_CC_NZ "nz" T_CC_Z "z" T_CC_NC "nc" T_CC_C "c"
 %type	<constValue>	reg_r
 %type	<constValue>	reg_ss
@@ -2177,7 +2177,7 @@ op_a_n		: reloc_8bit
 		| T_MODE_A T_COMMA reloc_8bit { $$ = $3; }
 ;
 /*
  * Register A. The `+` alternative wraps T_TOKEN_A (keyword "•̀A•́") in
  * parentheses, i.e. the surface form `(•̀A•́)`; HIGH(af) is kept as-is.
  */
-T_MODE_A	: T_TOKEN_A
+T_MODE_A	: T_LPAREN T_TOKEN_A T_RPAREN
 		| T_OP_HIGH T_LPAREN T_MODE_AF T_RPAREN
 ;
@@ -2185,7 +2185,7 @@ T_MODE_B	: T_TOKEN_B
 		| T_OP_HIGH T_LPAREN T_MODE_BC T_RPAREN
 ;
 /*
  * Register C. The `+` alternative is T_TOKEN_C (keyword "♥(˘⌣˘"), then the
  * "C" keyword (T_CC_C), then ')': the surface form `♥(˘⌣˘ C)`.
  * LOW(bc) is kept as-is.
  */
-T_MODE_C	: T_TOKEN_C
+T_MODE_C	: T_TOKEN_C T_CC_C T_RPAREN
 		| T_OP_LOW T_LPAREN T_MODE_BC T_RPAREN
 ;
@@ -2193,7 +2193,7 @@ T_MODE_D	: T_TOKEN_D
 		| T_OP_HIGH T_LPAREN T_MODE_DE T_RPAREN
 ;
 /*
  * Register E. The `+` alternative is T_TOKEN_E (keyword "(´ε｀"), then ')',
  * then T_TOKEN_E_HEART (keyword "♡"): the surface form `(´ε｀ )♡`.
  * LOW(de) is kept as-is.
  */
-T_MODE_E	: T_TOKEN_E
+T_MODE_E	: T_TOKEN_E T_RPAREN T_TOKEN_E_HEART
 		| T_OP_LOW T_LPAREN T_MODE_DE T_RPAREN
 ;
@@ -2201,10 +2201,19 @@ T_MODE_H	: T_TOKEN_H
 		| T_OP_HIGH T_LPAREN T_MODE_HL T_RPAREN
 ;
 /*
  * Register L. The `+` alternative chains the keywords "∠(" (arm), "ᐛ" (face),
  * "」∠" (body), a ')' and "＿" (leg): the surface form `∠( ᐛ 」∠)＿`.
  * LOW(hl) is kept as-is.
  */
-T_MODE_L	: T_TOKEN_L
+T_MODE_L	: T_TOKEN_L_ARM T_TOKEN_L_FACE T_TOKEN_L_BODY T_RPAREN T_TOKEN_L_LEG
 		| T_OP_LOW T_LPAREN T_MODE_HL T_RPAREN
 ;
 /*
  * BC is now a nonterminal instead of a lexer keyword: T_TOKEN_B followed by
  * the same three tokens that make up the `+` form of T_MODE_C.
  */
 T_MODE_BC	: T_TOKEN_B T_TOKEN_C T_CC_C T_RPAREN
 ;
 /*
  * DE is now a nonterminal instead of a lexer keyword: T_TOKEN_D followed by
  * the same three tokens that make up the `+` form of T_MODE_E.
  */
 T_MODE_DE	: T_TOKEN_D T_TOKEN_E T_RPAREN T_TOKEN_E_HEART
 ;
 /*
  * HL as a nonterminal. T_MODE_HL_START (keyword "н∠(") stands in for the H
  * token plus the L arm; the rest matches the tail of the `+` form of T_MODE_L.
  */
 T_MODE_HL	: T_MODE_HL_START T_TOKEN_L_FACE T_TOKEN_L_BODY T_RPAREN T_TOKEN_L_LEG
 ;
 ccode_expr	: ccode
 		| T_OP_LOGICNOT ccode_expr {
 			$$ = $2 ^ 1;
@@ -2214,7 +2223,7 @@ ccode_expr	: ccode
 /*
  * Condition codes. The carry condition switches from the shared T_TOKEN_C
  * token to the dedicated T_CC_C token.
  */
 ccode		: T_CC_NZ { $$ = CC_NZ; }
 		| T_CC_Z { $$ = CC_Z; }
 		| T_CC_NC { $$ = CC_NC; }
-		| T_TOKEN_C { $$ = CC_C; }
+		| T_CC_C { $$ = CC_C; }
 ;
 reg_r		: T_MODE_B { $$ = REG_B; }
@@ -2230,7 +2239,7 @@ reg_r		: T_MODE_B { $$ = REG_B; }
 /*
  * 16-bit register operand yielding REG_BC, REG_DE, REG_HL or REG_AF.
  * The AF alternative is changed from the single T_MODE_AF token to
  * '(' T_TOKEN_A ')' followed by T_TOKEN_F.
  */
 reg_tt		: T_MODE_BC { $$ = REG_BC; }
 		| T_MODE_DE { $$ = REG_DE; }
 		| T_MODE_HL { $$ = REG_HL; }
-		| T_MODE_AF { $$ = REG_AF; }
+		| T_LPAREN T_TOKEN_A T_RPAREN T_TOKEN_F { $$ = REG_AF; }
 ;
 reg_ss		: T_MODE_BC { $$ = REG_BC; }