input: stop treating lone CRs as end-of-lines

We used to treat lone CRs (\r, aka ^M) as regular NLs (\n), probably to please Classic MacOS. As of today, it makes more sense to treat \r like a plain white space character. https://lists.gnu.org/archive/html/bison-patches/2019-09/msg00027.html * src/scan-gram.l (no_cr_read): Remove. Instead, use... (eol): this new abbreviation denoting end-of-line. * src/location.c (caret_getc): New. (location_caret): Use it. * tests/diagnostics.at (Carriage return): Adjust expectations. (CR NL): New.
2026-06-16 12:42:13 +00:00 · 2019-09-10 18:51:25 +02:00
parent 5e4133175d
commit 19da501e06
4 changed files with 76 additions and 71 deletions
@@ -2,6 +2,12 @@ GNU Bison NEWS
 * Noteworthy changes in release ?.? (????-??-??) [?]
 ** Backward incompatible changes
  Lone carriage-return characters (aka \r or ^M) in the grammar files are no
  longer treated as end-of-lines.  This changes the diagnostics, and in
  particular their locations.
 ** Bug fixes
  In Java, %define api.prefix was ignored.  It now behaves as expected.
@@ -169,7 +169,7 @@ static struct
 } caret_info;
 void
-caret_free ()
+caret_free (void)
 {
  if (caret_info.source)
    {
@@ -178,6 +178,23 @@ caret_free ()
    }
 }
 /* Read the next character of the caret source file, folding a "\r\n"
    pair into a single '\n'.  A '\r' that is not followed by '\n' is
    returned unchanged, and the character after it stays available
    for the next read.  */
 static int
 caret_getc (void)
 {
  FILE *src = caret_info.source;
  int first = getc (src);
  int next;
  if (first != '\r')
    return first;
  /* Look one character past the carriage return.  */
  next = getc (src);
  if (next == '\n')
    return next;
  /* Not a CR-NL pair: put the look-ahead back and report the CR.  */
  ungetc (next, src);
  return first;
 }
 void
 location_caret (location loc, const char *style, FILE *out)
 {
@@ -230,7 +247,7 @@ location_caret (location loc, const char *style, FILE *out)
  /* Advance to the line's position, keeping track of the offset.  */
  while (caret_info.line < loc.start.line)
    {
-      int c = getc (caret_info.source);
+      int c = caret_getc ();
      if (c == EOF)
        /* Something is wrong, that line number does not exist.  */
        return;
@@ -241,7 +258,7 @@ location_caret (location loc, const char *style, FILE *out)
  /* Read the actual line.  Don't update the offset, so that we keep a pointer
     to the start of the line.  */
  {
-    int c = getc (caret_info.source);
+    int c = caret_getc ();
    if (c != EOF)
      {
        bool single_line = loc.start.line == loc.end.line;
@@ -268,7 +285,7 @@ location_caret (location loc, const char *style, FILE *out)
                  opened = true;
                }
              fputc (c, out);
-              c = getc (caret_info.source);
+              c = caret_getc ();
              ++byte;
              if (opened
                  && (single_line
@@ -49,9 +49,6 @@ static boundary scanner_cursor;
 #define YY_USER_ACTION  location_compute (loc, &scanner_cursor, yytext, yyleng);
 static size_t no_cr_read (FILE *, char *, size_t);
 #define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
 /* Report that yytext is an extension, and evaluate to its token type.  */
 #define BISON_DIRECTIVE(Directive)                              \
  (bison_directive (loc, yytext), PERCENT_ ## Directive)
@@ -139,12 +136,14 @@ id        {letter}({letter}|[-0-9])*
 int       [0-9]+
 xint      0[xX][0-9abcdefABCDEF]+
 eol       \n|\r\n
 /* UTF-8 Encoded Unicode Code Point, from Flex's documentation.
    One alternative per lead-byte range: tab/LF/CR and printable ASCII,
    two-byte sequences (C2-DF), three-byte sequences (E0, E1-EC/EE/EF,
    ED) and four-byte sequences (F0, F1-F3, F4).  After an F0 lead
    byte the second byte must lie in 90-BF.  */
 mbchar    [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2})
 /* Zero or more instances of backslash-newline.  Following GCC, allow
   white space between the backslash and the newline.  */
-splice   (\\[ \f\t\v]*\n)*
+splice   (\\[ \f\t\v]*{eol})*
 /* An equal sign, with optional leading whitespaces. This is used in some
   deprecated constructs. */
@@ -193,7 +192,7 @@ eqopt    ({sp}=)?
  "," {
     complain (loc, Wother, _("stray ',' treated as white space"));
  }
-  [ \f\n\t\v]  |
+  [ \f\t\v\r]|{eol}  |
  "//".*       continue;
  "/*" {
    token_start = loc->start;
@@ -201,9 +200,7 @@ eqopt    ({sp}=)?
    BEGIN SC_YACC_COMMENT;
  }
-  /* #line directives are not documented, and may be withdrawn or
+  ^"#line "{int}(" \"".*"\"")?{eol} {
     modified in future versions of Bison.  */
  ^"#line "{int}(" \"".*"\"")?"\n" {
    handle_syncline (yytext + sizeof "#line " - 1, *loc);
  }
 }
@@ -329,7 +326,7 @@ eqopt    ({sp}=)?
  }
  /* Semantic predicate. */
-  "%?"[ \f\n\t\v]*"{" {
+  "%?"([ \f\t\v]|{eol})*"{" {
    nesting = 0;
    code_start = loc->start;
    BEGIN SC_PREDICATE;
@@ -358,7 +355,7 @@ eqopt    ({sp}=)?
    BEGIN SC_BRACKETED_ID;
  }
-  [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\n\t\v]+|. {
+  [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\r\n\t\v]+|. {
    complain (loc, complaint, "%s: %s",
              ngettext ("invalid character", "invalid characters", yyleng),
              quote_mem (yytext, yyleng));
@@ -457,7 +454,7 @@ eqopt    ({sp}=)?
      complain (loc, complaint, _("an identifier expected"));
  }
-  [^\].A-Za-z0-9_/ \f\n\t\v]+|. {
+  [^\].A-Za-z0-9_/ \f\r\n\t\v]+|. {
    complain (loc, complaint, "%s: %s",
              ngettext ("invalid character in bracketed name",
                        "invalid characters in bracketed name", yyleng),
@@ -490,7 +487,7 @@ eqopt    ({sp}=)?
 <SC_YACC_COMMENT>
 {
  "*/"     BEGIN context_state;
-  .|\n     continue;
+  .|{eol}  continue;
  <<EOF>>  unexpected_eof (token_start, "*/"); BEGIN context_state;
 }
@@ -512,7 +509,7 @@ eqopt    ({sp}=)?
 <SC_LINE_COMMENT>
 {
-  "\n"           STRING_GROW; BEGIN context_state;
+  {eol}          STRING_GROW; BEGIN context_state;
  {splice}       STRING_GROW;
  <<EOF>>        BEGIN context_state;
 }
@@ -534,7 +531,7 @@ eqopt    ({sp}=)?
    RETURN_VALUE (STRING, last_string);
  }
  <<EOF>>   unexpected_eof (token_start, "\"");
-  "\n"      unexpected_newline (token_start, "\"");
+  {eol}     unexpected_newline (token_start, "\"");
 }
  /*----------------------------------------------------------.
@@ -563,7 +560,7 @@ eqopt    ({sp}=)?
    BEGIN INITIAL;
    return CHAR;
  }
-  "\n"      unexpected_newline (token_start, "'");
+  {eol}     unexpected_newline (token_start, "'");
  <<EOF>>   unexpected_eof (token_start, "'");
 }
@@ -640,7 +637,7 @@ eqopt    ({sp}=)?
    else
      obstack_1grow (&obstack_for_string, c);
  }
-  \\(.|\n)      {
+  \\(.|{eol})      {
    char const *p = yytext + 1;
    /* Quote only if escaping won't make the character visible.  */
    if (c_isspace ((unsigned char) *p) && c_isprint ((unsigned char) *p))
@@ -664,14 +661,14 @@ eqopt    ({sp}=)?
 <SC_CHARACTER>
 {
  "'"           STRING_GROW; BEGIN context_state;
-  \n            unexpected_newline (token_start, "'");
+  {eol}         unexpected_newline (token_start, "'");
  <<EOF>>       unexpected_eof (token_start, "'");
 }
 <SC_STRING>
 {
  "\""          STRING_GROW; BEGIN context_state;
-  \n            unexpected_newline (token_start, "\"");
+  {eol}         unexpected_newline (token_start, "\"");
  <<EOF>>       unexpected_eof (token_start, "\"");
 }
@@ -808,53 +805,6 @@ eqopt    ({sp}=)?
 %%
 /* Read bytes from FP into buffer BUF of size SIZE.  Return the
   number of bytes left in BUF, which can be smaller than the number
   of bytes read from FP.  Remove '\r' from input, treating \r\n
   and isolated \r as \n.  BUF is rewritten in place.  */
 static size_t
 no_cr_read (FILE *fp, char *buf, size_t size)
 {
  size_t bytes_read = fread (buf, 1, size, fp);
  if (bytes_read)
    {
      /* Without any '\r' in this chunk there is nothing to rewrite.  */
      char *w = memchr (buf, '\r', bytes_read);
      if (w)
        {
          /* W is the write cursor and R the read cursor; both start
             just past the first '\r'.  LIM is the end of the data.  */
          char const *r = ++w;
          char const *lim = buf + bytes_read;
          for (;;)
            {
              /* Found an '\r'.  Treat it like '\n', but ignore any
                 '\n' that immediately follows.  */
              w[-1] = '\n';
              if (r == lim)
                {
                  /* The '\r' was the last byte of this chunk, so look
                     at the next character of FP: swallow it if it is
                     '\n', otherwise push it back.  Leave the loop if
                     the push-back does not return that character.  */
                  int ch = getc (fp);
                  if (ch != '\n' && ungetc (ch, fp) != ch)
                    break;
                }
              else if (*r == '\n')
                r++;
              /* Copy until the next '\r'.  Return the compacted length
                 once the chunk is exhausted; otherwise stop right
                 after copying an '\r', which the next iteration
                 overwrites with '\n'.  */
              do
                {
                  if (r == lim)
                    return w - buf;
                }
              while ((*w++ = *r++) != '\r');
            }
          /* Only reached through the break above.  */
          return w - buf;
        }
    }
  return bytes_read;
 }
 /*------------------------------------------------------.
 | Scan NUMBER for a base-BASE integer at location LOC.  |
@@ -274,11 +274,43 @@ AT_TEST([[Carriage return]],
 %%
 ]],
 [1],
-[[input.y:37.8-38.0: <error>error:</error> missing '"' at end of line
+[[input.y:10.8-11.0: <error>error:</error> missing '"' at end of line
-input.y:37.8-38.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag>
+   10 | %token <error>"</error>
      |        <error>^</error>
 input.y:10.8-11.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag>
   10 | %token <error>"</error>
      |        <error>^</error>
 ]])
 ## ------- ##
 ## CR NL.  ##
 ## ------- ##
 # Check Windows EOLs.
 AT_TEST([[CR NL]],
 [[^M
 %token ^M FOO^M
 %token ^M FOO^M
 %%^M
 exp:^M
 ]],
 [0],
 [[input.y:11.9-11: <warning>warning:</warning> symbol FOO redeclared [<warning>-Wother</warning>]
   11 | %token 
 <warning>FOO</warning>
      |         <warning>^~~</warning>
 input.y:10.9-11: previous declaration
   10 | %token 
 <note>FOO</note>
      |         <note>^~~</note>
 input.y:13.5: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
   13 | exp:
      |     <warning>^</warning>
 input.y: <warning>warning:</warning> fix-its can be applied.  Rerun with option '--update'. [<warning>-Wother</warning>]
 ]])