multistart: turn start symbols into rules on $accept

Now that the parser can read several start symbols, let's process
them, and create the corresponding rules.

* src/parse-gram.y (grammar_declaration): Accept a list of start symbols.
* src/reader.h, src/reader.c (grammar_start_symbol_set): Rename as...
(grammar_start_symbols_set): this.

* src/reader.h, src/reader.c (start_flag): Replace with...
(start_symbols): this.
* src/reader.c (grammar_start_symbols_set): Build a list of start
symbols.
(switching_token, create_start_rules): New.
(check_and_convert_grammar): Use them to turn the list of start
symbols into a set of rules.
* src/reduce.c (nonterminals_reduce): Don't complain about $accept,
it's an internal detail.
(reduce_grammar): Complain about all the start symbols that don't
derive sentences.

* src/symtab.c (startsymbol, startsymbol_loc): Remove, replaced by
start_symbols.
(symbols_pack): Move the check about the start symbols
to...
* src/reader.c (check_start_symbols): here.
Adjust to multiple start symbols.
* tests/reduce.at (Empty Language): Generalize into...
(Bad start symbols): this.
This commit is contained in:
Akim Demaille
2020-02-20 18:11:29 +01:00
parent db68f61595
commit 8eaddf326b
8 changed files with 194 additions and 54 deletions

11
TODO
View File

@@ -657,6 +657,17 @@ happen with yy_start: stmt | expr). Then adjust the skeletons so that this
initial token (YY_START_STMT, YY_START_EXPR) be shifted first in the
corresponding parse function.
*** Number of useless symbols
AT_TEST(
[[%start exp;
exp: exp;]],
[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
input.y: warning: 2 rules useless in grammar [-Wother]
input.y:2.8-10: error: start symbol exp does not derive any sentence]])
We should say "1 nonterminal": the other one is $accept, which should not
participate in the count.
** %include
This is a popular demand. We already made many changes in the parser that
should make this reasonably easy to implement.

View File

@@ -381,9 +381,9 @@ params:
grammar_declaration:
symbol_declaration
| "%start" symbol
| "%start" symbols.1
{
grammar_start_symbol_set ($2, @2);
grammar_start_symbols_set ($2);
}
| code_props_type "{...}" generic_symlist
{

View File

@@ -21,7 +21,9 @@
#include <config.h>
#include "system.h"
#include <c-ctype.h>
#include <quote.h>
#include <vasnprintf.h>
#include "complain.h"
#include "conflicts.h"
@@ -40,7 +42,7 @@ static void prepare_percent_define_front_end_variables (void);
static void check_and_convert_grammar (void);
static symbol_list *grammar = NULL;
static bool start_flag = false;
symbol_list *start_symbols = NULL;
merger_list *merge_functions;
/* Was %union seen? */
@@ -54,16 +56,9 @@ bool default_prec = true;
`-----------------------*/
void
grammar_start_symbol_set (symbol *sym, location loc)
grammar_start_symbols_set (symbol_list *syms)
{
if (start_flag)
complain (&loc, complaint, _("multiple %s declarations"), "%start");
else
{
start_flag = true;
startsymbol = sym;
startsymbol_loc = loc;
}
start_symbols = symbol_list_append (start_symbols, syms);
}
@@ -791,6 +786,95 @@ create_start_rule (symbol *swtok, symbol *start)
grammar = initial_rule;
}
/* Return the token "YY_PARSE_foo" associated with start symbol "foo",
   fetching it from the symbol table or creating it there.  When START
   has an alias, the alias's tag is used in place of START's own tag.

   The plain "YY_FOO" spelling is avoided on purpose: (i) it could
   clash with some of our own symbols (e.g., cast => YY_CAST), and
   (ii) upcasing could make terminal FOO and nonterminal foo
   collide.  */
symbol *
switching_token (const symbol *start)
{
  const char *tag = start->alias ? start->alias->tag : start->tag;

  /* Format into a stack buffer first.  The result is released below
     only when asnprintf handed back something other than that
     buffer.  */
  char stack_buf[100];
  size_t size = sizeof stack_buf;
  char *tok_name = asnprintf (stack_buf, &size, "YY_PARSE_%s", tag);
  if (!tok_name)
    xalloc_die ();

  // Setting the location ensures deterministic symbol numbers.
  symbol *tok = symbol_get (tok_name, start->location);
  if (tok_name != stack_buf)
    free (tok_name);

  symbol_class_set (tok, token_sym, start->location, false);
  return tok;
}
/* Create one start rule per symbol of START_SYMS, each guarded by its
   own switching token.

   The tail of the list is processed before its head: start rules are
   inserted at the top of the grammar, so working backwards leaves
   the rules in the same order as the symbols were declared in
   %start.  */
static void
create_multiple_start_rules (symbol_list *start_syms)
{
  if (!start_syms)
    return;

  create_multiple_start_rules (start_syms->next);

  assert (start_syms->content_type == SYMLIST_SYMBOL);
  symbol *sym = start_syms->content.sym;
  create_start_rule (switching_token (sym), sym);
}
/* Create the initial rule(s) on $accept.

   If no start symbol was recorded in START_SYMBOLS, fall back to the
   one returned by find_start_symbol and record it.  With several
   start symbols, each "foo" gets its own rule introduced by its
   switching token "YY_PARSE_foo"; with a single start symbol, the
   rule is created with no switching token (NULL).  */
static void
create_start_rules (void)
{
  if (!start_symbols)
    {
      symbol *deflt = find_start_symbol ();
      start_symbols = symbol_list_sym_new (deflt, deflt->location);
    }

  if (start_symbols->next)
    create_multiple_start_rules (start_symbols);
  else
    create_start_rule (NULL, start_symbols->content.sym);
}
/* Sanity checks on the start symbols: none of them may be undefined,
   and none of them may be a token.  Exit with failure if any
   complaint has been recorded by the time the checks are done.  */
static void
check_start_symbols (void)
{
  for (symbol_list *l = start_symbols; l; l = l->next)
    {
      const symbol *sym = l->content.sym;
      if (sym->content->class == unknown_sym)
        {
          complain (&sym->location, complaint,
                    _("the start symbol %s is undefined"),
                    sym->tag);
          // I claim this situation is unreachable.  This is caught
          // before, and we get "symbol 'foo' is used, but is not
          // defined as a token and has no rules".
          abort ();
        }
      else if (sym->content->class == token_sym)
        complain (&sym->location, complaint,
                  _("the start symbol %s is a token"),
                  sym->tag);
    }

  if (complaint_status == status_complaint)
    exit (EXIT_FAILURE);
}
/*-------------------------------------------------------------.
| Check the grammar that has just been read, and convert it to |
| internal form. |
@@ -818,19 +902,12 @@ check_and_convert_grammar (void)
}
}
/* Insert the initial rule(s). */
create_start_rules ();
/* Report any undefined symbols and consider them nonterminals. */
symbols_check_defined ();
/* Find the start symbol if no %start. */
if (!start_flag)
{
symbol *start = find_start_symbol ();
grammar_start_symbol_set (start, start->location);
}
/* Insert the initial rule. */
create_start_rule (NULL, startsymbol);
if (SYMBOL_NUMBER_MAXIMUM - nnterms < ntokens)
complain (NULL, fatal, "too many symbols in input grammar (limit is %d)",
SYMBOL_NUMBER_MAXIMUM);
@@ -840,6 +917,8 @@ check_and_convert_grammar (void)
/* Assign the symbols their symbol numbers. */
symbols_pack ();
check_start_symbols ();
/* Scan rule actions after invoking symbol_check_alias_consistency
(in symbols_pack above) so that token types are set correctly
before the rule action type checking.

View File

@@ -38,7 +38,18 @@ typedef struct merger_list
void free_merger_functions (void);
extern merger_list *merge_functions;
void grammar_start_symbol_set (symbol *sym, location loc);
/* List of the start symbols. */
extern symbol_list *start_symbols;
/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo".
We don't use the simple "YY_FOO" because (i) we might get clashes
with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing
introduces possible clashes between terminal FOO and nonterminal
foo. */
symbol *switching_token (const symbol *start);
void grammar_start_symbols_set (symbol_list *syms);
void grammar_current_rule_begin (symbol *lhs, location loc,
named_ref *lhs_named_ref);

View File

@@ -275,7 +275,8 @@ nonterminals_reduce (void)
if (!bitset_test (V, i))
{
nterm_map[i - ntokens] = n++;
if (symbols[i]->content->status != used)
if (symbols[i]->content->status != used
&& symbols[i] != acceptsymbol)
complain (&symbols[i]->location, Wother,
_("nonterminal useless in grammar: %s"),
symbols[i]->tag);
@@ -381,10 +382,18 @@ reduce_grammar (void)
{
reduce_print ();
if (!bitset_test (N, acceptsymbol->content->number - ntokens))
complain (&startsymbol_loc, fatal,
_("start symbol %s does not derive any sentence"),
startsymbol->tag);
// Check that start symbols have non-empty languages.
bool failure = false;
for (symbol_list *list = start_symbols; list; list = list->next)
if (!bitset_test (N, list->content.sym->content->number - ntokens))
{
failure = true;
complain (&list->sym_loc, complaint,
_("start symbol %s does not derive any sentence"),
list->content.sym->tag);
}
if (failure)
exit (EXIT_FAILURE);
/* First reduce the nonterminals, as they renumber themselves in the
whole grammar. If you change the order, nonterms would be

View File

@@ -60,8 +60,6 @@ symbol *errtoken = NULL;
symbol *undeftoken = NULL;
symbol *eoftoken = NULL;
symbol *acceptsymbol = NULL;
symbol *startsymbol = NULL;
location startsymbol_loc;
/* Precedence relation graph. */
static symgraph **prec_nodes;
@@ -1146,15 +1144,6 @@ symbols_pack (void)
symbols_token_translations_init ();
if (startsymbol->content->class == unknown_sym)
complain (&startsymbol_loc, fatal,
_("the start symbol %s is undefined"),
startsymbol->tag);
else if (startsymbol->content->class == token_sym)
complain (&startsymbol_loc, fatal,
_("the start symbol %s is a token"),
startsymbol->tag);
// If some user tokens are internationalized, the internal ones
// should be too.
if (has_translations ())

View File

@@ -247,11 +247,6 @@ extern symbol *eoftoken;
$accept: start-symbol $end */
extern symbol *acceptsymbol;
/** The user start symbol. */
extern symbol *startsymbol;
/** The location of the \c \%start declaration. */
extern location startsymbol_loc;
/** Whether a symbol declared with a type tag. */
extern bool tag_seen;

View File

@@ -445,23 +445,69 @@ AT_CLEANUP
## ---------------- ##
## Empty Language. ##
## ---------------- ##
## ------------------- ##
## Bad start symbols. ##
## ------------------- ##
AT_SETUP([Empty Language])
AT_SETUP([Bad start symbols])
m4_pushdef([AT_TEST],
[
AT_DATA([[input.y]],
[[%output "input.c"
%%
exp: exp;
]])
[%%
$1
])
AT_BISON_CHECK([[input.y]], 1, [],
[$2
])
])
AT_TEST(
[[exp: exp;]],
[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
input.y: warning: 2 rules useless in grammar [-Wother]
input.y:3.1-3: fatal error: start symbol exp does not derive any sentence
]])
input.y:2.1-3: error: start symbol exp does not derive any sentence]])
AT_TEST(
[[%start exp;
exp: exp;]],
[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
input.y: warning: 2 rules useless in grammar [-Wother]
input.y:2.8-10: error: start symbol exp does not derive any sentence]])
AT_TEST(
[[%start exp stmt;
exp: exp;
stmt: "stmt"]],
[[input.y: warning: 1 nonterminal useless in grammar [-Wother]
input.y: warning: 2 rules useless in grammar [-Wother]
input.y:2.8-10: error: start symbol exp does not derive any sentence]])
AT_TEST(
[[%start exp stmt;
exp: exp;
stmt: stmt]],
[[input.y: warning: 3 nonterminals useless in grammar [-Wother]
input.y: warning: 4 rules useless in grammar [-Wother]
input.y:2.8-10: error: start symbol exp does not derive any sentence
input.y:2.12-15: error: start symbol stmt does not derive any sentence]])
AT_TEST(
[[%start exp;
stmt: stmt]],
[[input.y:2.8-10: warning: symbol 'exp' is used, but is not defined as a token and has no rules [-Wother]
input.y: warning: 3 nonterminals useless in grammar [-Wother]
input.y: warning: 2 rules useless in grammar [-Wother]
input.y:2.8-10: error: start symbol exp does not derive any sentence]])
AT_TEST(
[[%token FOO;
%start FOO;
stmt: FOO]],
[[input.y:2.8-10: error: the start symbol FOO is a token]])
m4_popdef([AT_TEST])
AT_CLEANUP