Reuse isWhitespace and isNewline, and refactor readAtFile

2026-06-26 19:18:06 +00:00 · 2025-08-05 13:46:53 -04:00
parent 98c5c7f776
commit 504a45a4ed
5 changed files with 34 additions and 61 deletions
@@ -253,14 +253,6 @@ static UpperMap<int> const keywordDict{

 static auto ldio = keywordDict.find("LDIO");

-static bool isWhitespace(int c) {
-	return c == ' ' || c == '\t';
-}
-
-static bool isNewline(int c) {
-	return c == '\r' || c == '\n';
-}
-
 static LexerState *lexerState = nullptr;
 static LexerState *lexerStateEOL = nullptr;

@@ -20,6 +20,7 @@
 #include "platform.hpp"
 #include "style.hpp"
 #include "usage.hpp"
+#include "util.hpp"
 #include "verbosity.hpp"
 #include "version.hpp"

@@ -210,68 +211,44 @@ static std::vector<size_t> readAtFile(std::string const &path, std::vector<char>
 		fatal("Error reading @%s: %s", file.c_str(path), strerror(errno));
 	}

-	// We only filter out `EOF`, but calling `isblank()` on anything else is UB!
-	static_assert(
-	    std::streambuf::traits_type::eof() == EOF,
-	    "isblank(std::streambuf::traits_type::eof()) is UB!"
-	);
-	std::vector<size_t> argvOfs;
-
-	for (;;) {
-		int c;
+	for (std::vector<size_t> argvOfs;;) {
+		int c = file->sbumpc();

 		// First, discard any leading whitespace
-		do {
+		while (isWhitespace(c)) {
 			c = file->sbumpc();
-			if (c == EOF) {
-				return argvOfs;
-			}
-		} while (isblank(c));
+		}

-		switch (c) {
-		case '#': // If it's a comment, discard everything until EOL
-			while ((c = file->sbumpc()) != '\n') {
-				if (c == EOF) {
-					return argvOfs;
-				}
+		// If it's a comment, discard everything until EOL
+		if (c == '#') {
+			c = file->sbumpc();
+			while (c != EOF && !isNewline(c)) {
+				c = file->sbumpc();
 			}
-			continue; // Start processing the next line
-		// If it's an empty line, ignore it
-		case '\r':          // Assuming CRLF here
-			file->sbumpc(); // Discard the upcoming '\n'
-			[[fallthrough]];
-		case '\n':
+		}
+
+		if (c == EOF) {
+			return argvOfs;
+		} else if (isNewline(c)) {
 			continue; // Start processing the next line
 		}

 		// Alright, now we can parse the line
 		do {
+			argvOfs.push_back(argPool.size());
+
 			// Read one argument (until the next whitespace char).
 			// We know there is one because we already have its first character in `c`.
-			argvOfs.push_back(argPool.size());
-			// Reading and appending characters one at a time may be inefficient, but I'm counting
-			// on `vector` and `sbumpc` to do the right thing here.
-			argPool.push_back(c); // Push the character we've already read
-			for (;;) {
-				c = file->sbumpc();
-				if (c == EOF || c == '\n' || isblank(c)) {
-					break;
-				} else if (c == '\r') {
-					file->sbumpc(); // Discard the '\n'
-					break;
-				}
+			for (; c != EOF && !isNewline(c) && !isWhitespace(c); c = file->sbumpc()) {
 				argPool.push_back(c);
 			}
 			argPool.push_back('\0');

 			// Discard whitespace until the next argument (candidate)
-			while (isblank(c)) {
+			while (isWhitespace(c)) {
 				c = file->sbumpc();
 			}
-			if (c == '\r') {
-				c = file->sbumpc(); // Skip the '\n'
-			}
-		} while (c != '\n' && c != EOF); // End if we reached EOL
+		} while (c != EOF && !isNewline(c)); // End if we reached EOL
 	}
 }

@@ -60,14 +60,6 @@ void lexer_IncLineNo() {
 	++lexerStack.back().lineNo;
 }

-static bool isWhiteSpace(int c) {
-	return c == ' ' || c == '\t';
-}
-
-static bool isNewline(int c) {
-	return c == '\r' || c == '\n';
-}
-
 yy::parser::symbol_type yylex(); // Forward declaration for `yywrap`

 static yy::parser::symbol_type yywrap() {
@@ -268,7 +260,7 @@ yy::parser::symbol_type yylex() {
 	int c = context.file.sbumpc();

 	// First, skip leading whitespace.
-	while (isWhiteSpace(c)) {
+	while (isWhitespace(c)) {
 		c = context.file.sbumpc();
 	}
 	// Then, skip a comment if applicable.
@@ -5,6 +5,14 @@
 #include <stdint.h>
 #include <stdio.h>

+bool isWhitespace(int c) {
+	return c == ' ' || c == '\t';
+}
+
+bool isNewline(int c) {
+	return c == '\r' || c == '\n';
+}
+
 bool startsIdentifier(int c) {
 	// This returns false for anonymous labels, which internally start with a '!',
 	// and for section fragment literal labels, which internally start with a '$'.
@@ -25,7 +33,8 @@ char const *printChar(int c) {
 		return "EOF";
 	}

-	if (isprint(c)) {
+	// Handle printable ASCII characters
+	if (c >= ' ' && c <= '~') {
 		buf[0] = '\'';
 		buf[1] = c;
 		buf[2] = '\'';