multistart: turn start symbols into rules on $accept

Now that the parser can read several start symbols, let's process them, and create the corresponding rules. * src/parse-gram.y (grammar_declaration): Accept a list of start symbols. * src/reader.h, src/reader.c (grammar_start_symbol_set): Rename as... (grammar_start_symbols_set): this. * src/reader.h, src/reader.c (start_flag): Replace with... (start_symbols): this. * src/reader.c (grammar_start_symbols_set): Build a list of start symbols. (switching_token, create_start_rules): New. (check_and_convert_grammar): Use them to turn the list of start symbols into a set of rules. * src/reduce.c (nonterminals_reduce): Don't complain about $accept, it's an internal detail. (reduce_grammar): Complain about all the start symbols that don't derive sentences. * src/symtab.c (startsymbol, startsymbol_loc): Remove, replaced by start_symbols. (symbols_pack): Move the check about the start symbols to... * src/reader.c (check_start_symbols): here. Adjust to multiple start symbols. * tests/reduce.at (Empty Language): Generalize into... (Bad start symbols): this.
2026-04-23 18:19:38 +00:00 · 2020-02-20 18:11:29 +01:00
parent db68f61595
commit 8eaddf326b
8 changed files with 194 additions and 54 deletions
@@ -657,6 +657,17 @@ happen with yy_start: stmt | expr).  Then adjust the skeletons so that this
 initial token (YY_START_STMT, YY_START_EXPR) be shifted first in the
 corresponding parse function.

+*** Number of useless symbols
+AT_TEST(
+[[%start exp;
+exp: exp;]],
+[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+We should say "1 nonterminal": the other one is $accept, which should not
+participate in the count.
+
 ** %include
 This is a popular demand.  We already made many changes in the parser that
 should make this reasonably easy to implement.
@@ -381,9 +381,9 @@ params:

 grammar_declaration:
  symbol_declaration
-| "%start" symbol
+| "%start" symbols.1
    {
-      grammar_start_symbol_set ($2, @2);
+      grammar_start_symbols_set ($2);
    }
 | code_props_type "{...}" generic_symlist
    {
@@ -21,7 +21,9 @@
 #include <config.h>
 #include "system.h"

+#include <c-ctype.h>
 #include <quote.h>
+#include <vasnprintf.h>

 #include "complain.h"
 #include "conflicts.h"
@@ -40,7 +42,7 @@ static void prepare_percent_define_front_end_variables (void);
 static void check_and_convert_grammar (void);

 static symbol_list *grammar = NULL;
-static bool start_flag = false;
+symbol_list *start_symbols = NULL;
 merger_list *merge_functions;

 /* Was %union seen?  */
@@ -54,16 +56,9 @@ bool default_prec = true;
 `-----------------------*/

 void
-grammar_start_symbol_set (symbol *sym, location loc)
+grammar_start_symbols_set (symbol_list *syms)
 {
-  if (start_flag)
-    complain (&loc, complaint, _("multiple %s declarations"), "%start");
-  else
-    {
-      start_flag = true;
-      startsymbol = sym;
-      startsymbol_loc = loc;
-    }
+  start_symbols = symbol_list_append (start_symbols, syms);
 }


@@ -791,6 +786,95 @@ create_start_rule (symbol *swtok, symbol *start)
  grammar = initial_rule;
 }

+/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo".
+
+   We don't use the simple "YY_FOO" because (i) we might get clashes
+   with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing
+   introduces possible clashes between terminal FOO and nonterminal
+   foo.  */
+symbol *
+switching_token (const symbol *start)
+{
+  char buf[100];
+  size_t len = sizeof buf;
+  char *name
+    = asnprintf (buf, &len,
+                 "YY_PARSE_%s", start->alias ? start->alias->tag : start->tag);
+  if (!name)
+    xalloc_die ();
+  // Setting the location ensures deterministic symbol numbers.
+  symbol *res = symbol_get (name, start->location);
+  if (name != buf)
+    free (name);
+  symbol_class_set (res, token_sym, start->location, false);
+  return res;
+}
+
+/* Create the start rules in reverse order, since they are inserted at
+   the top of the grammar.  That way the rules follow the order in
+   which the symbols were given to %start.  */
+
+static void
+create_multiple_start_rules (symbol_list *start_syms)
+{
+  if (start_syms)
+    {
+      create_multiple_start_rules (start_syms->next);
+      assert (start_syms->content_type == SYMLIST_SYMBOL);
+      symbol *start = start_syms->content.sym;
+      symbol *swtok = switching_token (start);
+      create_start_rule (swtok, start);
+    }
+}
+
+/* For each start symbol "foo", create the rule "$accept: YY_PARSE_foo
+   foo $end" (a lone start symbol gets no switching token).  */
+static void
+create_start_rules (void)
+{
+  if (!start_symbols)
+    {
+      symbol *start = find_start_symbol ();
+      start_symbols = symbol_list_sym_new (start, start->location);
+    }
+
+  const bool several = start_symbols->next;
+  if (several)
+    create_multiple_start_rules (start_symbols);
+  else
+    {
+      symbol *start = start_symbols->content.sym;
+      create_start_rule (NULL, start);
+    }
+}
+
+static void
+check_start_symbols (void)
+{
+  // Sanity checks on the start symbols.
+  for (symbol_list *list = start_symbols; list; list = list->next)
+    {
+      const symbol *start = list->content.sym;
+      if (start->content->class == unknown_sym)
+        {
+          complain (&start->location, complaint,
+                    _("the start symbol %s is undefined"),
+                    start->tag);
+          // I claim this situation is unreachable.  This is caught
+          // before, and we get "symbol 'foo' is used, but is not
+          // defined as a token and has no rules".
+          abort ();
+        }
+      if (start->content->class == token_sym)
+        complain (&start->location, complaint,
+                  _("the start symbol %s is a token"),
+                  start->tag);
+    }
+  if (complaint_status == status_complaint)
+    exit (EXIT_FAILURE);
+}
+
+
 /*-------------------------------------------------------------.
 | Check the grammar that has just been read, and convert it to |
 | internal form.                                               |
@@ -818,19 +902,12 @@ check_and_convert_grammar (void)
      }
    }

+  /* Insert the initial rule(s).  */
+  create_start_rules ();
+
  /* Report any undefined symbols and consider them nonterminals.  */
  symbols_check_defined ();

-  /* Find the start symbol if no %start.  */
-  if (!start_flag)
-    {
-      symbol *start = find_start_symbol ();
-      grammar_start_symbol_set (start, start->location);
-    }
-
-  /* Insert the initial rule.  */
-  create_start_rule (NULL, startsymbol);
-
  if (SYMBOL_NUMBER_MAXIMUM - nnterms < ntokens)
    complain (NULL, fatal, "too many symbols in input grammar (limit is %d)",
              SYMBOL_NUMBER_MAXIMUM);
@@ -840,6 +917,8 @@ check_and_convert_grammar (void)
  /* Assign the symbols their symbol numbers.  */
  symbols_pack ();

+  check_start_symbols ();
+
  /* Scan rule actions after invoking symbol_check_alias_consistency
     (in symbols_pack above) so that token types are set correctly
     before the rule action type checking.
@@ -38,7 +38,18 @@ typedef struct merger_list
 void free_merger_functions (void);
 extern merger_list *merge_functions;

-void grammar_start_symbol_set (symbol *sym, location loc);
+/* List of the start symbols.  */
+extern symbol_list *start_symbols;
+
+/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo".
+
+   We don't use the simple "YY_FOO" because (i) we might get clashes
+   with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing
+   introduces possible clashes between terminal FOO and nonterminal
+   foo.  */
+symbol *switching_token (const symbol *start);
+
+void grammar_start_symbols_set (symbol_list *syms);

 void grammar_current_rule_begin (symbol *lhs, location loc,
                                 named_ref *lhs_named_ref);
@@ -275,7 +275,8 @@ nonterminals_reduce (void)
      if (!bitset_test (V, i))
        {
          nterm_map[i - ntokens] = n++;
-          if (symbols[i]->content->status != used)
+          if (symbols[i]->content->status != used
+              && symbols[i] != acceptsymbol)
            complain (&symbols[i]->location, Wother,
                      _("nonterminal useless in grammar: %s"),
                      symbols[i]->tag);
@@ -381,10 +382,18 @@ reduce_grammar (void)
    {
      reduce_print ();

-      if (!bitset_test (N, acceptsymbol->content->number - ntokens))
-        complain (&startsymbol_loc, fatal,
-                  _("start symbol %s does not derive any sentence"),
-                  startsymbol->tag);
+      // Check that start symbols have non-empty languages.
+      bool failure = false;
+      for (symbol_list *list = start_symbols; list; list = list->next)
+        if (!bitset_test (N, list->content.sym->content->number - ntokens))
+          {
+            failure = true;
+            complain (&list->sym_loc, complaint,
+                      _("start symbol %s does not derive any sentence"),
+                      list->content.sym->tag);
+          }
+      if (failure)
+        exit (EXIT_FAILURE);

      /* First reduce the nonterminals, as they renumber themselves in the
         whole grammar.  If you change the order, nonterms would be
@@ -60,8 +60,6 @@ symbol *errtoken = NULL;
 symbol *undeftoken = NULL;
 symbol *eoftoken = NULL;
 symbol *acceptsymbol = NULL;
-symbol *startsymbol = NULL;
-location startsymbol_loc;

 /* Precedence relation graph. */
 static symgraph **prec_nodes;
@@ -1146,15 +1144,6 @@ symbols_pack (void)

  symbols_token_translations_init ();

-  if (startsymbol->content->class == unknown_sym)
-    complain (&startsymbol_loc, fatal,
-              _("the start symbol %s is undefined"),
-              startsymbol->tag);
-  else if (startsymbol->content->class == token_sym)
-    complain (&startsymbol_loc, fatal,
-              _("the start symbol %s is a token"),
-              startsymbol->tag);
-
  // If some user tokens are internationalized, the internal ones
  // should be too.
  if (has_translations ())
@@ -247,11 +247,6 @@ extern symbol *eoftoken;
   $accept: start-symbol $end */
 extern symbol *acceptsymbol;

-/** The user start symbol. */
-extern symbol *startsymbol;
-/** The location of the \c \%start declaration.  */
-extern location startsymbol_loc;
-
 /** Whether a symbol declared with a type tag.  */
 extern bool tag_seen;

@@ -445,23 +445,69 @@ AT_CLEANUP



-## ---------------- ##
-## Empty Language.  ##
-## ---------------- ##
+## ------------------- ##
+## Bad start symbols.  ##
+## ------------------- ##

-AT_SETUP([Empty Language])
+AT_SETUP([Bad start symbols])

+m4_pushdef([AT_TEST],
+[
 AT_DATA([[input.y]],
-[[%output "input.c"
-%%
-exp: exp;
-]])
+[%%
+$1
+])

 AT_BISON_CHECK([[input.y]], 1, [],
+[$2
+])
+])
+
+AT_TEST(
+[[exp: exp;]],
 [[input.y: warning: 2 nonterminals useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
-input.y:3.1-3: fatal error: start symbol exp does not derive any sentence
-]])
+input.y:2.1-3: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp;
+exp: exp;]],
+[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp stmt;
+exp: exp;
+stmt: "stmt"]],
+[[input.y: warning: 1 nonterminal useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp stmt;
+exp: exp;
+stmt: stmt]],
+[[input.y: warning: 3 nonterminals useless in grammar [-Wother]
+input.y: warning: 4 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence
+input.y:2.12-15: error: start symbol stmt does not derive any sentence]])
+
+AT_TEST(
+[[%start exp;
+stmt: stmt]],
+[[input.y:2.8-10: warning: symbol 'exp' is used, but is not defined as a token and has no rules [-Wother]
+input.y: warning: 3 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%token FOO;
+%start FOO;
+stmt: FOO]],
+[[input.y:2.8-10: error: the start symbol FOO is a token]])
+
+m4_popdef([AT_TEST])

 AT_CLEANUP