diff --git a/TODO b/TODO index 8cb1a309..81c93dfe 100644 --- a/TODO +++ b/TODO @@ -657,6 +657,17 @@ happen with yy_start: stmt | expr). Then adjust the skeletons so that this initial token (YY_START_STMT, YY_START_EXPR) be shifted first in the corresponding parse function. +*** Number of useless symbols +AT_TEST( +[[%start exp; +exp: exp;]], +[[input.y: warning: 2 nonterminals useless in grammar [-Wother] +input.y: warning: 2 rules useless in grammar [-Wother] +input.y:2.8-10: error: start symbol exp does not derive any sentence]]) + +We should say "1 nonterminal": the other one is $accept, which should not +participate in the count. + ** %include This is a popular demand. We already made many changes in the parser that should make this reasonably easy to implement. diff --git a/src/parse-gram.y b/src/parse-gram.y index c849eb80..598639b4 100644 --- a/src/parse-gram.y +++ b/src/parse-gram.y @@ -381,9 +381,9 @@ params: grammar_declaration: symbol_declaration -| "%start" symbol +| "%start" symbols.1 { - grammar_start_symbol_set ($2, @2); + grammar_start_symbols_set ($2); } | code_props_type "{...}" generic_symlist { diff --git a/src/reader.c b/src/reader.c index 6932decd..81955719 100644 --- a/src/reader.c +++ b/src/reader.c @@ -21,7 +21,9 @@ #include #include "system.h" +#include #include +#include #include "complain.h" #include "conflicts.h" @@ -40,7 +42,7 @@ static void prepare_percent_define_front_end_variables (void); static void check_and_convert_grammar (void); static symbol_list *grammar = NULL; -static bool start_flag = false; +symbol_list *start_symbols = NULL; merger_list *merge_functions; /* Was %union seen? 
*/ @@ -54,16 +56,9 @@ bool default_prec = true; `-----------------------*/ void -grammar_start_symbol_set (symbol *sym, location loc) +grammar_start_symbols_set (symbol_list *syms) { - if (start_flag) - complain (&loc, complaint, _("multiple %s declarations"), "%start"); - else - { - start_flag = true; - startsymbol = sym; - startsymbol_loc = loc; - } + start_symbols = symbol_list_append (start_symbols, syms); } @@ -791,6 +786,95 @@ create_start_rule (symbol *swtok, symbol *start) grammar = initial_rule; } +/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo". + + We don't use the simple "YY_FOO" because (i) we might get clashes + with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing + introduces possible clashes between terminal FOO and nonterminal + foo. */ +symbol * +switching_token (const symbol *start) +{ + char buf[100]; + size_t len = sizeof buf; + char *name + = asnprintf (buf, &len, + "YY_PARSE_%s", start->alias ? start->alias->tag : start->tag); + if (!name) + xalloc_die (); + // Setting the location ensures deterministic symbol numbers. + symbol *res = symbol_get (name, start->location); + if (name != buf) + free (name); + symbol_class_set (res, token_sym, start->location, false); + return res; +} + +/* Create the start rules in reverse order, since they are inserted at + the top of the grammar. That way the rules follow the order of + declaration to %start. */ + +static void +create_multiple_start_rules (symbol_list *start_syms) +{ + if (start_syms) + { + create_multiple_start_rules (start_syms->next); + assert (start_syms->content_type == SYMLIST_SYMBOL); + symbol *start = start_syms->content.sym; + symbol *swtok = switching_token (start); + create_start_rule (swtok, start); + } +} + +/* For each start symbol "foo", create the rule "$accept: YY_PARSE_foo + foo $end". 
*/ +static void +create_start_rules (void) +{ + if (!start_symbols) + { + symbol *start = find_start_symbol (); + start_symbols = symbol_list_sym_new (start, start->location); + } + + const bool several = start_symbols->next; + if (several) + create_multiple_start_rules (start_symbols); + else + { + symbol *start = start_symbols->content.sym; + create_start_rule (NULL, start); + } +} + +static void +check_start_symbols (void) +{ + // Sanity checks on the start symbols. + for (symbol_list *list = start_symbols; list; list = list->next) + { + const symbol *start = list->content.sym; + if (start->content->class == unknown_sym) + { + complain (&start->location, complaint, + _("the start symbol %s is undefined"), + start->tag); + // I claim this situation is unreachable. This is caught + // before, and we get "symbol 'foo' is used, but is not + // defined as a token and has no rules". + abort (); + } + if (start->content->class == token_sym) + complain (&start->location, complaint, + _("the start symbol %s is a token"), + start->tag); + } + if (complaint_status == status_complaint) + exit (EXIT_FAILURE); +} + + /*-------------------------------------------------------------. | Check the grammar that has just been read, and convert it to | | internal form. | @@ -818,19 +902,12 @@ check_and_convert_grammar (void) } } + /* Insert the initial rule(s). */ + create_start_rules (); + /* Report any undefined symbols and consider them nonterminals. */ symbols_check_defined (); - /* Find the start symbol if no %start. */ - if (!start_flag) - { - symbol *start = find_start_symbol (); - grammar_start_symbol_set (start, start->location); - } - - /* Insert the initial rule. */ - create_start_rule (NULL, startsymbol); - if (SYMBOL_NUMBER_MAXIMUM - nnterms < ntokens) complain (NULL, fatal, "too many symbols in input grammar (limit is %d)", SYMBOL_NUMBER_MAXIMUM); @@ -840,6 +917,8 @@ check_and_convert_grammar (void) /* Assign the symbols their symbol numbers. 
*/ symbols_pack (); + check_start_symbols (); + /* Scan rule actions after invoking symbol_check_alias_consistency (in symbols_pack above) so that token types are set correctly before the rule action type checking. diff --git a/src/reader.h b/src/reader.h index 39ede8c3..b3431dca 100644 --- a/src/reader.h +++ b/src/reader.h @@ -38,7 +38,18 @@ typedef struct merger_list void free_merger_functions (void); extern merger_list *merge_functions; -void grammar_start_symbol_set (symbol *sym, location loc); +/* List of the start symbols. */ +extern symbol_list *start_symbols; + +/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo". + + We don't use the simple "YY_FOO" because (i) we might get clashes + with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing + introduces possible clashes between terminal FOO and nonterminal + foo. */ +symbol *switching_token (const symbol *start); + +void grammar_start_symbols_set (symbol_list *syms); void grammar_current_rule_begin (symbol *lhs, location loc, named_ref *lhs_named_ref); diff --git a/src/reduce.c b/src/reduce.c index 0061b687..c9979e0a 100644 --- a/src/reduce.c +++ b/src/reduce.c @@ -275,7 +275,8 @@ nonterminals_reduce (void) if (!bitset_test (V, i)) { nterm_map[i - ntokens] = n++; - if (symbols[i]->content->status != used) + if (symbols[i]->content->status != used + && symbols[i] != acceptsymbol) complain (&symbols[i]->location, Wother, _("nonterminal useless in grammar: %s"), symbols[i]->tag); @@ -381,10 +382,18 @@ reduce_grammar (void) { reduce_print (); - if (!bitset_test (N, acceptsymbol->content->number - ntokens)) - complain (&startsymbol_loc, fatal, - _("start symbol %s does not derive any sentence"), - startsymbol->tag); + // Check that start symbols have non-empty languages. 
+ bool failure = false; + for (symbol_list *list = start_symbols; list; list = list->next) + if (!bitset_test (N, list->content.sym->content->number - ntokens)) + { + failure = true; + complain (&list->sym_loc, complaint, + _("start symbol %s does not derive any sentence"), + list->content.sym->tag); + } + if (failure) + exit (EXIT_FAILURE); /* First reduce the nonterminals, as they renumber themselves in the whole grammar. If you change the order, nonterms would be diff --git a/src/symtab.c b/src/symtab.c index b5556715..31a3c048 100644 --- a/src/symtab.c +++ b/src/symtab.c @@ -60,8 +60,6 @@ symbol *errtoken = NULL; symbol *undeftoken = NULL; symbol *eoftoken = NULL; symbol *acceptsymbol = NULL; -symbol *startsymbol = NULL; -location startsymbol_loc; /* Precedence relation graph. */ static symgraph **prec_nodes; @@ -1146,15 +1144,6 @@ symbols_pack (void) symbols_token_translations_init (); - if (startsymbol->content->class == unknown_sym) - complain (&startsymbol_loc, fatal, - _("the start symbol %s is undefined"), - startsymbol->tag); - else if (startsymbol->content->class == token_sym) - complain (&startsymbol_loc, fatal, - _("the start symbol %s is a token"), - startsymbol->tag); - // If some user tokens are internationalized, the internal ones // should be too. if (has_translations ()) diff --git a/src/symtab.h b/src/symtab.h index e85e5468..1ec8042b 100644 --- a/src/symtab.h +++ b/src/symtab.h @@ -247,11 +247,6 @@ extern symbol *eoftoken; $accept: start-symbol $end */ extern symbol *acceptsymbol; -/** The user start symbol. */ -extern symbol *startsymbol; -/** The location of the \c \%start declaration. */ -extern location startsymbol_loc; - /** Whether a symbol declared with a type tag. */ extern bool tag_seen; diff --git a/tests/reduce.at b/tests/reduce.at index b561100e..c1af62ee 100644 --- a/tests/reduce.at +++ b/tests/reduce.at @@ -445,23 +445,69 @@ AT_CLEANUP -## ---------------- ## -## Empty Language. 
## -## ---------------- ## +## ------------------- ## +## Bad start symbols. ## +## ------------------- ## -AT_SETUP([Empty Language]) +AT_SETUP([Bad start symbols]) +m4_pushdef([AT_TEST], +[ AT_DATA([[input.y]], -[[%output "input.c" -%% -exp: exp; -]]) +[%% +$1 +]) AT_BISON_CHECK([[input.y]], 1, [], +[$2 +]) +]) + +AT_TEST( +[[exp: exp;]], [[input.y: warning: 2 nonterminals useless in grammar [-Wother] input.y: warning: 2 rules useless in grammar [-Wother] -input.y:3.1-3: fatal error: start symbol exp does not derive any sentence -]]) +input.y:2.1-3: error: start symbol exp does not derive any sentence]]) + +AT_TEST( +[[%start exp; +exp: exp;]], +[[input.y: warning: 2 nonterminals useless in grammar [-Wother] +input.y: warning: 2 rules useless in grammar [-Wother] +input.y:2.8-10: error: start symbol exp does not derive any sentence]]) + +AT_TEST( +[[%start exp stmt; +exp: exp; +stmt: "stmt"]], +[[input.y: warning: 1 nonterminal useless in grammar [-Wother] +input.y: warning: 2 rules useless in grammar [-Wother] +input.y:2.8-10: error: start symbol exp does not derive any sentence]]) + +AT_TEST( +[[%start exp stmt; +exp: exp; +stmt: stmt]], +[[input.y: warning: 3 nonterminals useless in grammar [-Wother] +input.y: warning: 4 rules useless in grammar [-Wother] +input.y:2.8-10: error: start symbol exp does not derive any sentence +input.y:2.12-15: error: start symbol stmt does not derive any sentence]]) + +AT_TEST( +[[%start exp; +stmt: stmt]], +[[input.y:2.8-10: warning: symbol 'exp' is used, but is not defined as a token and has no rules [-Wother] +input.y: warning: 3 nonterminals useless in grammar [-Wother] +input.y: warning: 2 rules useless in grammar [-Wother] +input.y:2.8-10: error: start symbol exp does not derive any sentence]]) + +AT_TEST( +[[%token FOO; +%start FOO; +stmt: FOO]], +[[input.y:2.8-10: error: the start symbol FOO is a token]]) + +m4_popdef([AT_TEST]) AT_CLEANUP