input: stop treating lone CRs as end-of-lines

We used to treat lone CRs (\r, aka ^M) as regular NLs (\n), probably
to please Classic Mac OS.  Nowadays it makes more sense to treat a lone
\r like a plain white space character: only \n and \r\n end a line.

https://lists.gnu.org/archive/html/bison-patches/2019-09/msg00027.html

* src/scan-gram.l (no_cr_read): Remove.  Instead, use...
(eol): this new abbreviation denoting end-of-line.
* src/location.c (caret_getc): New.
(location_caret): Use it.
* tests/diagnostics.at (Carriage return): Adjust expectations.
(CR NL): New.
This commit is contained in:
Akim Demaille
2019-09-10 18:51:25 +02:00
parent 5e4133175d
commit 19da501e06
4 changed files with 76 additions and 71 deletions

6
NEWS
View File

@@ -2,6 +2,12 @@ GNU Bison NEWS
* Noteworthy changes in release ?.? (????-??-??) [?] * Noteworthy changes in release ?.? (????-??-??) [?]
** Backward incompatible changes
Lone carriage-return characters (aka \r or ^M) in grammar files are no
longer treated as line terminators; they are now handled like plain white
space.  This changes the diagnostics, in particular the locations they report.
** Bug fixes ** Bug fixes
In Java, %define api.prefix was ignored. It now behaves as expected. In Java, %define api.prefix was ignored. It now behaves as expected.

View File

@@ -169,7 +169,7 @@ static struct
} caret_info; } caret_info;
void void
caret_free () caret_free (void)
{ {
if (caret_info.source) if (caret_info.source)
{ {
@@ -178,6 +178,23 @@ caret_free ()
} }
} }
/* Read the next character from caret_info.source, like getc, except
   that a "\r\n" pair is delivered as a single '\n'.  A '\r' that is
   not followed by '\n' is returned unchanged, and the character that
   was read after it is pushed back onto the stream.  Any other result
   of getc (EOF included) is returned as is.  */
static int
caret_getc (void)
{
  FILE *src = caret_info.source;
  int ch = getc (src);
  if (ch != '\r')
    return ch;

  /* Look one character ahead to tell "\r\n" from a lone '\r'.  */
  int next = getc (src);
  if (next == '\n')
    return next;

  /* Lone '\r': give the look-ahead back and report the '\r' itself.  */
  ungetc (next, src);
  return ch;
}
void void
location_caret (location loc, const char *style, FILE *out) location_caret (location loc, const char *style, FILE *out)
{ {
@@ -230,7 +247,7 @@ location_caret (location loc, const char *style, FILE *out)
/* Advance to the line's position, keeping track of the offset. */ /* Advance to the line's position, keeping track of the offset. */
while (caret_info.line < loc.start.line) while (caret_info.line < loc.start.line)
{ {
int c = getc (caret_info.source); int c = caret_getc ();
if (c == EOF) if (c == EOF)
/* Something is wrong, that line number does not exist. */ /* Something is wrong, that line number does not exist. */
return; return;
@@ -241,7 +258,7 @@ location_caret (location loc, const char *style, FILE *out)
/* Read the actual line. Don't update the offset, so that we keep a pointer /* Read the actual line. Don't update the offset, so that we keep a pointer
to the start of the line. */ to the start of the line. */
{ {
int c = getc (caret_info.source); int c = caret_getc ();
if (c != EOF) if (c != EOF)
{ {
bool single_line = loc.start.line == loc.end.line; bool single_line = loc.start.line == loc.end.line;
@@ -268,7 +285,7 @@ location_caret (location loc, const char *style, FILE *out)
opened = true; opened = true;
} }
fputc (c, out); fputc (c, out);
c = getc (caret_info.source); c = caret_getc ();
++byte; ++byte;
if (opened if (opened
&& (single_line && (single_line

View File

@@ -49,9 +49,6 @@ static boundary scanner_cursor;
#define YY_USER_ACTION location_compute (loc, &scanner_cursor, yytext, yyleng); #define YY_USER_ACTION location_compute (loc, &scanner_cursor, yytext, yyleng);
static size_t no_cr_read (FILE *, char *, size_t);
#define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
/* Report that yytext is an extension, and evaluate to its token type. */ /* Report that yytext is an extension, and evaluate to its token type. */
#define BISON_DIRECTIVE(Directive) \ #define BISON_DIRECTIVE(Directive) \
(bison_directive (loc, yytext), PERCENT_ ## Directive) (bison_directive (loc, yytext), PERCENT_ ## Directive)
@@ -139,12 +136,14 @@ id {letter}({letter}|[-0-9])*
int [0-9]+ int [0-9]+
xint 0[xX][0-9abcdefABCDEF]+ xint 0[xX][0-9abcdefABCDEF]+
eol \n|\r\n
/* UTF-8 Encoded Unicode Code Point, from Flex's documentation. */ /* UTF-8 Encoded Unicode Code Point, from Flex's documentation. */
mbchar [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2}) mbchar [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2})
/* Zero or more instances of backslash-newline. Following GCC, allow /* Zero or more instances of backslash-newline. Following GCC, allow
white space between the backslash and the newline. */ white space between the backslash and the newline. */
splice (\\[ \f\t\v]*\n)* splice (\\[ \f\t\v]*{eol})*
/* An equal sign, with optional leading whitespaces. This is used in some /* An equal sign, with optional leading whitespaces. This is used in some
deprecated constructs. */ deprecated constructs. */
@@ -193,7 +192,7 @@ eqopt ({sp}=)?
"," { "," {
complain (loc, Wother, _("stray ',' treated as white space")); complain (loc, Wother, _("stray ',' treated as white space"));
} }
[ \f\n\t\v] | [ \f\t\v\r]|{eol} |
"//".* continue; "//".* continue;
"/*" { "/*" {
token_start = loc->start; token_start = loc->start;
@@ -201,9 +200,7 @@ eqopt ({sp}=)?
BEGIN SC_YACC_COMMENT; BEGIN SC_YACC_COMMENT;
} }
/* #line directives are not documented, and may be withdrawn or ^"#line "{int}(" \"".*"\"")?{eol} {
modified in future versions of Bison. */
^"#line "{int}(" \"".*"\"")?"\n" {
handle_syncline (yytext + sizeof "#line " - 1, *loc); handle_syncline (yytext + sizeof "#line " - 1, *loc);
} }
} }
@@ -329,7 +326,7 @@ eqopt ({sp}=)?
} }
/* Semantic predicate. */ /* Semantic predicate. */
"%?"[ \f\n\t\v]*"{" { "%?"([ \f\t\v]|{eol})*"{" {
nesting = 0; nesting = 0;
code_start = loc->start; code_start = loc->start;
BEGIN SC_PREDICATE; BEGIN SC_PREDICATE;
@@ -358,7 +355,7 @@ eqopt ({sp}=)?
BEGIN SC_BRACKETED_ID; BEGIN SC_BRACKETED_ID;
} }
[^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\n\t\v]+|. { [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\r\n\t\v]+|. {
complain (loc, complaint, "%s: %s", complain (loc, complaint, "%s: %s",
ngettext ("invalid character", "invalid characters", yyleng), ngettext ("invalid character", "invalid characters", yyleng),
quote_mem (yytext, yyleng)); quote_mem (yytext, yyleng));
@@ -457,7 +454,7 @@ eqopt ({sp}=)?
complain (loc, complaint, _("an identifier expected")); complain (loc, complaint, _("an identifier expected"));
} }
[^\].A-Za-z0-9_/ \f\n\t\v]+|. { [^\].A-Za-z0-9_/ \f\r\n\t\v]+|. {
complain (loc, complaint, "%s: %s", complain (loc, complaint, "%s: %s",
ngettext ("invalid character in bracketed name", ngettext ("invalid character in bracketed name",
"invalid characters in bracketed name", yyleng), "invalid characters in bracketed name", yyleng),
@@ -490,7 +487,7 @@ eqopt ({sp}=)?
<SC_YACC_COMMENT> <SC_YACC_COMMENT>
{ {
"*/" BEGIN context_state; "*/" BEGIN context_state;
.|\n continue; .|{eol} continue;
<<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state; <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
} }
@@ -512,7 +509,7 @@ eqopt ({sp}=)?
<SC_LINE_COMMENT> <SC_LINE_COMMENT>
{ {
"\n" STRING_GROW; BEGIN context_state; {eol} STRING_GROW; BEGIN context_state;
{splice} STRING_GROW; {splice} STRING_GROW;
<<EOF>> BEGIN context_state; <<EOF>> BEGIN context_state;
} }
@@ -534,7 +531,7 @@ eqopt ({sp}=)?
RETURN_VALUE (STRING, last_string); RETURN_VALUE (STRING, last_string);
} }
<<EOF>> unexpected_eof (token_start, "\""); <<EOF>> unexpected_eof (token_start, "\"");
"\n" unexpected_newline (token_start, "\""); {eol} unexpected_newline (token_start, "\"");
} }
/*----------------------------------------------------------. /*----------------------------------------------------------.
@@ -563,7 +560,7 @@ eqopt ({sp}=)?
BEGIN INITIAL; BEGIN INITIAL;
return CHAR; return CHAR;
} }
"\n" unexpected_newline (token_start, "'"); {eol} unexpected_newline (token_start, "'");
<<EOF>> unexpected_eof (token_start, "'"); <<EOF>> unexpected_eof (token_start, "'");
} }
@@ -640,7 +637,7 @@ eqopt ({sp}=)?
else else
obstack_1grow (&obstack_for_string, c); obstack_1grow (&obstack_for_string, c);
} }
\\(.|\n) { \\(.|{eol}) {
char const *p = yytext + 1; char const *p = yytext + 1;
/* Quote only if escaping won't make the character visible. */ /* Quote only if escaping won't make the character visible. */
if (c_isspace ((unsigned char) *p) && c_isprint ((unsigned char) *p)) if (c_isspace ((unsigned char) *p) && c_isprint ((unsigned char) *p))
@@ -664,14 +661,14 @@ eqopt ({sp}=)?
<SC_CHARACTER> <SC_CHARACTER>
{ {
"'" STRING_GROW; BEGIN context_state; "'" STRING_GROW; BEGIN context_state;
\n unexpected_newline (token_start, "'"); {eol} unexpected_newline (token_start, "'");
<<EOF>> unexpected_eof (token_start, "'"); <<EOF>> unexpected_eof (token_start, "'");
} }
<SC_STRING> <SC_STRING>
{ {
"\"" STRING_GROW; BEGIN context_state; "\"" STRING_GROW; BEGIN context_state;
\n unexpected_newline (token_start, "\""); {eol} unexpected_newline (token_start, "\"");
<<EOF>> unexpected_eof (token_start, "\""); <<EOF>> unexpected_eof (token_start, "\"");
} }
@@ -808,53 +805,6 @@ eqopt ({sp}=)?
%% %%
/* Read bytes from FP into buffer BUF of size SIZE, converting line
   endings in place: each '\r' becomes '\n', and a '\n' that directly
   follows a '\r' is dropped, so both "\r\n" and an isolated '\r' are
   delivered as a single '\n'.

   Return the number of bytes left in BUF after the conversion, which
   can be smaller than the number of bytes fetched from FP; return 0
   when fread delivers nothing.  Used as the scanner's YY_INPUT.  */
static size_t
no_cr_read (FILE *fp, char *buf, size_t size)
{
size_t bytes_read = fread (buf, 1, size, fp);
if (bytes_read)
{
/* Fast path: with no '\r' in the chunk, BUF is returned untouched.  */
char *w = memchr (buf, '\r', bytes_read);
if (w)
{
/* W is the write cursor (one past the '\r' just located), R the
   read cursor, and LIM the end of the data fetched by fread.  */
char const *r = ++w;
char const *lim = buf + bytes_read;
for (;;)
{
/* Found an '\r'. Treat it like '\n', but ignore any
'\n' that immediately follows. */
w[-1] = '\n';
if (r == lim)
{
/* The '\r' was the last byte of the chunk: peek into FP to
   see whether a '\n' follows.  A '\n' is consumed here;
   anything else is pushed back, and the loop is left if that
   pushback fails.  */
int ch = getc (fp);
if (ch != '\n' && ungetc (ch, fp) != ch)
break;
}
else if (*r == '\n')
r++;
/* Copy until the next '\r'. */
do
{
/* Input exhausted: W - BUF bytes have been kept.  */
if (r == lim)
return w - buf;
}
while ((*w++ = *r++) != '\r');
}
return w - buf;
}
}
return bytes_read;
}
/*------------------------------------------------------. /*------------------------------------------------------.
| Scan NUMBER for a base-BASE integer at location LOC. | | Scan NUMBER for a base-BASE integer at location LOC. |

View File

@@ -274,11 +274,43 @@ AT_TEST([[Carriage return]],
%% %%
]], ]],
[1], [1],
[[input.y:37.8-38.0: <error>error:</error> missing '"' at end of line [[input.y:10.8-11.0: <error>error:</error> missing '"' at end of line
input.y:37.8-38.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag> 10 | %token <error>"</error>
| <error>^</error>
input.y:10.8-11.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag>
10 | %token <error>"</error>
| <error>^</error>
]]) ]])
## ------- ##
## CR NL. ##
## ------- ##
# Check Windows EOLs.
AT_TEST([[CR NL]],
[[^M
%token ^M FOO^M
%token ^M FOO^M
%%^M
exp:^M
]],
[0],
[[input.y:11.9-11: <warning>warning:</warning> symbol FOO redeclared [<warning>-Wother</warning>]
11 | %token
<warning>FOO</warning>
| <warning>^~~</warning>
input.y:10.9-11: previous declaration
10 | %token
<note>FOO</note>
| <note>^~~</note>
input.y:13.5: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>]
13 | exp:
| <warning>^</warning>
input.y: <warning>warning:</warning> fix-its can be applied. Rerun with option '--update'. [<warning>-Wother</warning>]
]])