multistart: turn start symbols into rules on $accept

Now that the parser can read several start symbols, let's process them, and create the corresponding rules. * src/parse-gram.y (grammar_declaration): Accept a list of start symbols. * src/reader.h, src/reader.c (grammar_start_symbol_set): Rename as... (grammar_start_symbols_set): this. * src/reader.h, src/reader.c (start_flag): Replace with... (start_symbols): this. * src/reader.c (grammar_start_symbols_set): Build a list of start symbols. (switching_token, create_start_rules): New. (check_and_convert_grammar): Use them to turn the list of start symbols into a set of rules. * src/reduce.c (nonterminals_reduce): Don't complain about $accept, it's an internal detail. (reduce_grammar): Complain about all the start symbols that don't derive sentences. * src/symtab.c (startsymbol, startsymbol_loc): Remove, replaced by start_symbols. (symbols_pack): Move the check about the start symbols to... * src/reader.c (check_start_symbols): here. Adjust to multiple start symbols. * tests/reduce.at (Empty Language): Generalize into... (Bad start symbols): this.
2026-06-14 03:32:12 +00:00 · 2020-02-20 18:11:29 +01:00
parent db68f61595
commit 8eaddf326b
8 changed files with 194 additions and 54 deletions
@@ -657,6 +657,17 @@ happen with yy_start: stmt | expr).  Then adjust the skeletons so that this
 initial token (YY_START_STMT, YY_START_EXPR) be shifted first in the
 corresponding parse function.
 *** Number of useless symbols
 AT_TEST(
 [[%start exp;
 exp: exp;]],
 [[input.y: warning: 2 nonterminals useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
 input.y:2.8-10: error: start symbol exp does not derive any sentence]])
 We should say "1 nonterminal": the other one is $accept, which should not
 participate in the count.
 ** %include
 This is a popular demand.  We already made many changes in the parser that
 should make this reasonably easy to implement.
@@ -381,9 +381,9 @@ params:
 grammar_declaration:
  symbol_declaration
-| "%start" symbol
+| "%start" symbols.1
    {
-      grammar_start_symbol_set ($2, @2);
+      grammar_start_symbols_set ($2);
    }
 | code_props_type "{...}" generic_symlist
    {
@@ -21,7 +21,9 @@
 #include <config.h>
 #include "system.h"
 #include <c-ctype.h>
 #include <quote.h>
 #include <vasnprintf.h>
 #include "complain.h"
 #include "conflicts.h"
@@ -40,7 +42,7 @@ static void prepare_percent_define_front_end_variables (void);
 static void check_and_convert_grammar (void);
 static symbol_list *grammar = NULL;
-static bool start_flag = false;
+symbol_list *start_symbols = NULL;
 merger_list *merge_functions;
 /* Was %union seen?  */
@@ -54,16 +56,9 @@ bool default_prec = true;
 `-----------------------*/
 void
-grammar_start_symbol_set (symbol *sym, location loc)
+grammar_start_symbols_set (symbol_list *syms)
 {
-  if (start_flag)
+  start_symbols = symbol_list_append (start_symbols, syms);
    complain (&loc, complaint, _("multiple %s declarations"), "%start");
  else
    {
      start_flag = true;
      startsymbol = sym;
      startsymbol_loc = loc;
    }
 }
@@ -791,6 +786,95 @@ create_start_rule (symbol *swtok, symbol *start)
  grammar = initial_rule;
 }
 /* Return the token "YY_PARSE_foo" associated with start symbol "foo",
    creating it if needed.  The plain "YY_FOO" spelling is avoided
    because (i) it might clash with some of our own symbols (e.g.,
    cast => YY_CAST), and (ii) upcasing could make terminal FOO and
    nonterminal foo collide.  */
 symbol *
 switching_token (const symbol *start)
 {
  /* Name the token after the alias when the start symbol has one.  */
  const char *base = start->alias ? start->alias->tag : start->tag;

  /* The name is built in STACK_BUF when it fits; otherwise asnprintf
     hands back storage that we must release ourselves.  */
  char stack_buf[100];
  size_t size = sizeof stack_buf;
  char *tok_name = asnprintf (stack_buf, &size, "YY_PARSE_%s", base);
  if (tok_name == NULL)
    xalloc_die ();

  // Setting the location ensures deterministic symbol numbers.
  symbol *tok = symbol_get (tok_name, start->location);
  if (tok_name != stack_buf)
    free (tok_name);

  symbol_class_set (tok, token_sym, start->location, false);
  return tok;
 }
 /* Create one start rule per symbol of START_SYMS, each guarded by
    its switching token.  Rules are inserted at the top of the
    grammar, so the tail of the list is handled first (by recursing
    before treating the head): that way the rules follow the order of
    declaration to %start.  */
 static void
 create_multiple_start_rules (symbol_list *start_syms)
 {
  if (!start_syms)
    return;

  // Handle the rest of the list before this element.
  create_multiple_start_rules (start_syms->next);

  assert (start_syms->content_type == SYMLIST_SYMBOL);
  symbol *sym = start_syms->content.sym;
  symbol *guard = switching_token (sym);
  create_start_rule (guard, sym);
 }
 /* Insert the initial rule(s) for the start symbols.  When there are
    several start symbols, each "foo" gets its own rule introduced by
    its switching token (see switching_token, "YY_PARSE_foo").  With
    a single start symbol, the rule is created with no switching
    token (NULL).  */
 static void
 create_start_rules (void)
 {
  /* No start symbol was recorded: ask find_start_symbol for one and
     make it the sole element of the list.  */
  if (start_symbols == NULL)
    {
      symbol *found = find_start_symbol ();
      start_symbols = symbol_list_sym_new (found, found->location);
    }

  if (start_symbols->next != NULL)
    {
      create_multiple_start_rules (start_symbols);
      return;
    }

  create_start_rule (NULL, start_symbols->content.sym);
 }
 /* Sanity checks on the start symbols: complain about each one that
    is a token, then exit with EXIT_FAILURE if complaint_status is
    status_complaint.  */
 static void
 check_start_symbols (void)
 {
  symbol_list *node = start_symbols;
  while (node)
    {
      const symbol *sym = node->content.sym;
      if (sym->content->class == unknown_sym)
        {
          complain (&sym->location, complaint,
                    _("the start symbol %s is undefined"),
                    sym->tag);
          // Believed to be unreachable: an undefined symbol is
          // expected to have been diagnosed earlier with "symbol
          // 'foo' is used, but is not defined as a token and has no
          // rules".
          abort ();
        }
      else if (sym->content->class == token_sym)
        complain (&sym->location, complaint,
                  _("the start symbol %s is a token"),
                  sym->tag);
      node = node->next;
    }

  if (complaint_status == status_complaint)
    exit (EXIT_FAILURE);
 }
 /*-------------------------------------------------------------.
 | Check the grammar that has just been read, and convert it to |
 | internal form.                                               |
@@ -818,19 +902,12 @@ check_and_convert_grammar (void)
      }
    }
  /* Insert the initial rule(s).  */
  create_start_rules ();
  /* Report any undefined symbols and consider them nonterminals.  */
  symbols_check_defined ();
  /* Find the start symbol if no %start.  */
  if (!start_flag)
    {
      symbol *start = find_start_symbol ();
      grammar_start_symbol_set (start, start->location);
    }
  /* Insert the initial rule.  */
  create_start_rule (NULL, startsymbol);
  if (SYMBOL_NUMBER_MAXIMUM - nnterms < ntokens)
    complain (NULL, fatal, "too many symbols in input grammar (limit is %d)",
              SYMBOL_NUMBER_MAXIMUM);
@@ -840,6 +917,8 @@ check_and_convert_grammar (void)
  /* Assign the symbols their symbol numbers.  */
  symbols_pack ();
  check_start_symbols ();
  /* Scan rule actions after invoking symbol_check_alias_consistency
     (in symbols_pack above) so that token types are set correctly
     before the rule action type checking.
@@ -38,7 +38,18 @@ typedef struct merger_list
 void free_merger_functions (void);
 extern merger_list *merge_functions;
-void grammar_start_symbol_set (symbol *sym, location loc);
+/* List of the start symbols.  */
 extern symbol_list *start_symbols;
 /* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo".
   We don't use the simple "YY_FOO" because (i) we might get clashes
   with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing
   introduces possible clashes between terminal FOO and nonterminal
   foo.  */
 symbol *switching_token (const symbol *start);
 void grammar_start_symbols_set (symbol_list *syms);
 void grammar_current_rule_begin (symbol *lhs, location loc,
                                 named_ref *lhs_named_ref);
@@ -275,7 +275,8 @@ nonterminals_reduce (void)
      if (!bitset_test (V, i))
        {
          nterm_map[i - ntokens] = n++;
-          if (symbols[i]->content->status != used)
+          if (symbols[i]->content->status != used
              && symbols[i] != acceptsymbol)
            complain (&symbols[i]->location, Wother,
                      _("nonterminal useless in grammar: %s"),
                      symbols[i]->tag);
@@ -381,10 +382,18 @@ reduce_grammar (void)
    {
      reduce_print ();
-      if (!bitset_test (N, acceptsymbol->content->number - ntokens))
+      // Check that start symbols have non-empty languages.
-        complain (&startsymbol_loc, fatal,
+      bool failure = false;
-                  _("start symbol %s does not derive any sentence"),
+      for (symbol_list *list = start_symbols; list; list = list->next)
-                  startsymbol->tag);
+        if (!bitset_test (N, list->content.sym->content->number - ntokens))
          {
            failure = true;
            complain (&list->sym_loc, complaint,
                      _("start symbol %s does not derive any sentence"),
                      list->content.sym->tag);
          }
      if (failure)
        exit (EXIT_FAILURE);
      /* First reduce the nonterminals, as they renumber themselves in the
         whole grammar.  If you change the order, nonterms would be
@@ -60,8 +60,6 @@ symbol *errtoken = NULL;
 symbol *undeftoken = NULL;
 symbol *eoftoken = NULL;
 symbol *acceptsymbol = NULL;
 symbol *startsymbol = NULL;
 location startsymbol_loc;
 /* Precedence relation graph. */
 static symgraph **prec_nodes;
@@ -1146,15 +1144,6 @@ symbols_pack (void)
  symbols_token_translations_init ();
  if (startsymbol->content->class == unknown_sym)
    complain (&startsymbol_loc, fatal,
              _("the start symbol %s is undefined"),
              startsymbol->tag);
  else if (startsymbol->content->class == token_sym)
    complain (&startsymbol_loc, fatal,
              _("the start symbol %s is a token"),
              startsymbol->tag);
  // If some user tokens are internationalized, the internal ones
  // should be too.
  if (has_translations ())
@@ -247,11 +247,6 @@ extern symbol *eoftoken;
   $accept: start-symbol $end */
 extern symbol *acceptsymbol;
 /** The user start symbol. */
 extern symbol *startsymbol;
 /** The location of the \c \%start declaration.  */
 extern location startsymbol_loc;
 /** Whether a symbol declared with a type tag.  */
 extern bool tag_seen;
@@ -445,23 +445,69 @@ AT_CLEANUP
-## ---------------- ##
+## ------------------- ##
-## Empty Language.  ##
+## Bad start symbols.  ##
-## ---------------- ##
+## ------------------- ##
-AT_SETUP([Empty Language])
+AT_SETUP([Bad start symbols])
 m4_pushdef([AT_TEST],
 [
 AT_DATA([[input.y]],
-[[%output "input.c"
+[%%
-%%
+$1
-exp: exp;
+])
 ]])
 AT_BISON_CHECK([[input.y]], 1, [],
 [$2
 ])
 ])
 AT_TEST(
 [[exp: exp;]],
 [[input.y: warning: 2 nonterminals useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
-input.y:3.1-3: fatal error: start symbol exp does not derive any sentence
+input.y:2.1-3: error: start symbol exp does not derive any sentence]])
-]])
+
 AT_TEST(
 [[%start exp;
 exp: exp;]],
 [[input.y: warning: 2 nonterminals useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
 input.y:2.8-10: error: start symbol exp does not derive any sentence]])
 AT_TEST(
 [[%start exp stmt;
 exp: exp;
 stmt: "stmt"]],
 [[input.y: warning: 1 nonterminal useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
 input.y:2.8-10: error: start symbol exp does not derive any sentence]])
 AT_TEST(
 [[%start exp stmt;
 exp: exp;
 stmt: stmt]],
 [[input.y: warning: 3 nonterminals useless in grammar [-Wother]
 input.y: warning: 4 rules useless in grammar [-Wother]
 input.y:2.8-10: error: start symbol exp does not derive any sentence
 input.y:2.12-15: error: start symbol stmt does not derive any sentence]])
 AT_TEST(
 [[%start exp;
 stmt: stmt]],
 [[input.y:2.8-10: warning: symbol 'exp' is used, but is not defined as a token and has no rules [-Wother]
 input.y: warning: 3 nonterminals useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
 input.y:2.8-10: error: start symbol exp does not derive any sentence]])
 AT_TEST(
 [[%token FOO;
 %start FOO;
 stmt: FOO]],
 [[input.y:2.8-10: error: the start symbol FOO is a token]])
 m4_popdef([AT_TEST])
 AT_CLEANUP