input: stop treating lone CRs as end-of-lines

We used to treat lone CRs (\r, aka ^M) as regular NLs (\n), probably to please Classic MacOS. As of today, it makes more sense to treat \r like a plain white space character. https://lists.gnu.org/archive/html/bison-patches/2019-09/msg00027.html * src/scan-gram.l (no_cr_read): Remove. Instead, use... (eol): this new abbreviation denoting end-of-line. * src/location.c (caret_getc): New. (location_caret): Use it. * tests/diagnostics.at (Carriage return): Adjust expectations. (CR NL): New.
2026-07-06 15:50:03 +00:00 · 2019-09-10 18:51:25 +02:00
parent 5e4133175d
commit 19da501e06
4 changed files with 76 additions and 71 deletions
@@ -2,6 +2,12 @@ GNU Bison NEWS

 * Noteworthy changes in release ?.? (????-??-??) [?]

+** Backward incompatible changes
+
+  Lone carriage-return characters (aka \r or ^M) in the grammar files are no
+  longer treated as end-of-lines.  This changes the diagnostics, and in
+  particular their locations.
+
 ** Bug fixes

  In Java, %define api.prefix was ignored.  It now behaves as expected.
@@ -169,7 +169,7 @@ static struct
 } caret_info;

 void
-caret_free ()
+caret_free (void)
 {
  if (caret_info.source)
    {
@@ -178,6 +178,23 @@ caret_free ()
    }
 }

+/* Like getc on the caret source, but return a \r\n pair as a lone \n.
+   A \r that is not followed by \n is returned as is.  */
+static int
+caret_getc (void)
+{
+  FILE *src = caret_info.source;
+  int ch = getc (src);
+  int next;
+  if (ch != '\r')
+    return ch;
+  next = getc (src);
+  if (next == '\n')
+    return '\n';
+  ungetc (next, src);  /* Not part of an EOL: put it back.  */
+  return '\r';
+}
+
 void
 location_caret (location loc, const char *style, FILE *out)
 {
@@ -230,7 +247,7 @@ location_caret (location loc, const char *style, FILE *out)
  /* Advance to the line's position, keeping track of the offset.  */
  while (caret_info.line < loc.start.line)
    {
-      int c = getc (caret_info.source);
+      int c = caret_getc ();
      if (c == EOF)
        /* Something is wrong, that line number does not exist.  */
        return;
@@ -241,7 +258,7 @@ location_caret (location loc, const char *style, FILE *out)
  /* Read the actual line.  Don't update the offset, so that we keep a pointer
     to the start of the line.  */
  {
-    int c = getc (caret_info.source);
+    int c = caret_getc ();
    if (c != EOF)
      {
        bool single_line = loc.start.line == loc.end.line;
@@ -268,7 +285,7 @@ location_caret (location loc, const char *style, FILE *out)
                  opened = true;
                }
              fputc (c, out);
-              c = getc (caret_info.source);
+              c = caret_getc ();
              ++byte;
              if (opened
                  && (single_line
@@ -49,9 +49,6 @@ static boundary scanner_cursor;

 #define YY_USER_ACTION  location_compute (loc, &scanner_cursor, yytext, yyleng);

-static size_t no_cr_read (FILE *, char *, size_t);
-#define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
-
 /* Report that yytext is an extension, and evaluate to its token type.  */
 #define BISON_DIRECTIVE(Directive)                              \
  (bison_directive (loc, yytext), PERCENT_ ## Directive)
@@ -139,12 +136,14 @@ id        {letter}({letter}|[-0-9])*
 int       [0-9]+
 xint      0[xX][0-9abcdefABCDEF]+

+/* An end-of-line: either a lone \n or a \r\n pair.  */
+eol       \n|\r\n
+
 /* UTF-8 Encoded Unicode Code Point, from Flex's documentation. */
 mbchar    [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2})

 /* Zero or more instances of backslash-newline.  Following GCC, allow
   white space between the backslash and the newline.  */
-splice   (\\[ \f\t\v]*\n)*
+splice   (\\[ \f\t\v]*{eol})*

 /* An equal sign, with optional leading whitespaces. This is used in some
   deprecated constructs. */
@@ -193,7 +192,7 @@ eqopt    ({sp}=)?
  "," {
     complain (loc, Wother, _("stray ',' treated as white space"));
  }
-  [ \f\n\t\v]  |
+  [ \f\t\v\r]|{eol}  |
  "//".*       continue;
  "/*" {
    token_start = loc->start;
@@ -201,9 +200,7 @@ eqopt    ({sp}=)?
    BEGIN SC_YACC_COMMENT;
  }

-  /* #line directives are not documented, and may be withdrawn or
-     modified in future versions of Bison.  */
-  ^"#line "{int}(" \"".*"\"")?"\n" {
+  ^"#line "{int}(" \"".*"\"")?{eol} {
    handle_syncline (yytext + sizeof "#line " - 1, *loc);
  }
 }
@@ -329,7 +326,7 @@ eqopt    ({sp}=)?
  }

  /* Semantic predicate. */
-  "%?"[ \f\n\t\v]*"{" {
+  "%?"([ \f\t\v]|{eol})*"{" {
    nesting = 0;
    code_start = loc->start;
    BEGIN SC_PREDICATE;
@@ -358,7 +355,7 @@ eqopt    ({sp}=)?
    BEGIN SC_BRACKETED_ID;
  }

-  [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\n\t\v]+|. {
+  [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\r\n\t\v]+|. {
    complain (loc, complaint, "%s: %s",
              ngettext ("invalid character", "invalid characters", yyleng),
              quote_mem (yytext, yyleng));
@@ -457,7 +454,7 @@ eqopt    ({sp}=)?
      complain (loc, complaint, _("an identifier expected"));
  }

-  [^\].A-Za-z0-9_/ \f\n\t\v]+|. {
+  [^\].A-Za-z0-9_/ \f\r\n\t\v]+|. {
    complain (loc, complaint, "%s: %s",
              ngettext ("invalid character in bracketed name",
                        "invalid characters in bracketed name", yyleng),
@@ -490,7 +487,7 @@ eqopt    ({sp}=)?
 <SC_YACC_COMMENT>
 {
  "*/"     BEGIN context_state;
-  .|\n     continue;
+  .|{eol}  continue;
  <<EOF>>  unexpected_eof (token_start, "*/"); BEGIN context_state;
 }

@@ -512,7 +509,7 @@ eqopt    ({sp}=)?

 <SC_LINE_COMMENT>
 {
-  "\n"           STRING_GROW; BEGIN context_state;
+  {eol}          STRING_GROW; BEGIN context_state;
  {splice}       STRING_GROW;
  <<EOF>>        BEGIN context_state;
 }
@@ -534,7 +531,7 @@ eqopt    ({sp}=)?
    RETURN_VALUE (STRING, last_string);
  }
  <<EOF>>   unexpected_eof (token_start, "\"");
-  "\n"      unexpected_newline (token_start, "\"");
+  {eol}     unexpected_newline (token_start, "\"");
 }

  /*----------------------------------------------------------.
@@ -563,7 +560,7 @@ eqopt    ({sp}=)?
    BEGIN INITIAL;
    return CHAR;
  }
-  "\n"      unexpected_newline (token_start, "'");
+  {eol}     unexpected_newline (token_start, "'");
  <<EOF>>   unexpected_eof (token_start, "'");
 }

@@ -640,7 +637,7 @@ eqopt    ({sp}=)?
    else
      obstack_1grow (&obstack_for_string, c);
  }
-  \\(.|\n)      {
+  \\(.|{eol})      {
    char const *p = yytext + 1;
    /* Quote only if escaping won't make the character visible.  */
    if (c_isspace ((unsigned char) *p) && c_isprint ((unsigned char) *p))
@@ -664,14 +661,14 @@ eqopt    ({sp}=)?
 <SC_CHARACTER>
 {
  "'"           STRING_GROW; BEGIN context_state;
-  \n            unexpected_newline (token_start, "'");
+  {eol}         unexpected_newline (token_start, "'");
  <<EOF>>       unexpected_eof (token_start, "'");
 }

 <SC_STRING>
 {
  "\""          STRING_GROW; BEGIN context_state;
-  \n            unexpected_newline (token_start, "\"");
+  {eol}         unexpected_newline (token_start, "\"");
  <<EOF>>       unexpected_eof (token_start, "\"");
 }

@@ -808,53 +805,6 @@ eqopt    ({sp}=)?

 %%

-/* Read bytes from FP into buffer BUF of size SIZE.  Return the
-   number of bytes read.  Remove '\r' from input, treating \r\n
-   and isolated \r as \n.
-
-   BUF is compacted in place, so the value returned (the number of
-   bytes left in BUF) can be smaller than what fread delivered.  */
-
-static size_t
-no_cr_read (FILE *fp, char *buf, size_t size)
-{
-  size_t bytes_read = fread (buf, 1, size, fp);
-  if (bytes_read)
-    {
-      /* W is the write cursor.  It starts on the first '\r', or is
-         NULL when there is nothing to rewrite.  */
-      char *w = memchr (buf, '\r', bytes_read);
-      if (w)
-        {
-          /* R is the read cursor, just past the '\r'.  W is advanced
-             past it too, hence the w[-1] below.  LIM is the end of
-             the data read.  */
-          char const *r = ++w;
-          char const *lim = buf + bytes_read;
-
-          for (;;)
-            {
-              /* Found an '\r'.  Treat it like '\n', but ignore any
-                 '\n' that immediately follows.  */
-              w[-1] = '\n';
-              if (r == lim)
-                {
-                  /* The '\r' was the last byte read: look in the
-                     stream for a following '\n' to swallow.  Anything
-                     else is handed back with ungetc; stop if that
-                     fails.  */
-                  int ch = getc (fp);
-                  if (ch != '\n' && ungetc (ch, fp) != ch)
-                    break;
-                }
-              else if (*r == '\n')
-                r++;
-
-              /* Copy until the next '\r'.  */
-              do
-                {
-                  if (r == lim)
-                    /* Input exhausted: W - BUF bytes remain in BUF.  */
-                    return w - buf;
-                }
-              while ((*w++ = *r++) != '\r');
-            }
-
-          return w - buf;
-        }
-    }
-
-  return bytes_read;
-}
-
-

 /*------------------------------------------------------.
 | Scan NUMBER for a base-BASE integer at location LOC.  |
@@ -274,11 +274,43 @@ AT_TEST([[Carriage return]],
 %%
 ]],
 [1],
-[[input.y:37.8-38.0: <error>error:</error> missing '"' at end of line
-input.y:37.8-38.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag>
+[[input.y:10.8-11.0: <error>error:</error> missing '"' at end of line
+   10 | %token <error>"</error>
+      |        <error>^</error>
+input.y:10.8-11.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag>
+   10 | %token <error>"</error>
+      |        <error>^</error>
 ]])


+## ------- ##
+## CR NL.  ##
+## ------- ##
+
+# Check Windows EOLs.
+
+AT_TEST([[CR NL]],
+[[^M
+%token ^M FOO^M
+%token ^M FOO^M
+%%^M
+exp:^M
+]],
+[0],
+[[input.y:11.9-11: <warning>warning:</warning> symbol FOO redeclared [<warning>-Wother</warning>]
+   11 | %token 
+ <warning>FOO</warning>
+      |         <warning>^~~</warning>
+input.y:10.9-11: previous declaration
+   10 | %token 
+ <note>FOO</note>
+      |         <note>^~~</note>
+input.y:13.5: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
+   13 | exp:
+      |     <warning>^</warning>
+input.y: <warning>warning:</warning> fix-its can be applied.  Rerun with option '--update'. [<warning>-Wother</warning>]
+]])
+