Simplify the lexer by removing BufferedContent and inlining ViewedContent (#1981)

Instead of reading 64 characters at a time into a rolling buffer
as `shiftChar()` is called, we read 8192 characters at a time into
a complete buffer before any `peek()`/`shiftChar()` operations.
This commit is contained in:
Rangi
2026-05-25 21:05:59 -04:00
committed by GitHub
parent cfa0adf295
commit 55db252a8f
3 changed files with 77 additions and 166 deletions
+3 -45
View File
@@ -9,21 +9,10 @@
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <string> #include <string>
#include <variant>
#include <vector> #include <vector>
#include "platform.hpp" // SSIZE_MAX
#include "asm/intern.hpp" #include "asm/intern.hpp"
// This value is a compromise between `LexerState` allocation performance when reading the entire
// file works, and buffering performance when it doesn't (e.g. when piping a file into RGBASM).
static constexpr size_t LEXER_BUF_SIZE = 64;
// The buffer needs to be large enough for the maximum `lexerState->peek()` lookahead distance
static_assert(LEXER_BUF_SIZE > 1, "Lexer buffer size is too small");
// This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB
static_assert(LEXER_BUF_SIZE <= SSIZE_MAX, "Lexer buffer size is too large");
enum LexerMode { enum LexerMode {
LEXER_NORMAL, LEXER_NORMAL,
LEXER_RAW, LEXER_RAW,
@@ -47,34 +36,6 @@ struct ContentSpan {
size_t size; size_t size;
}; };
// A read cursor over a span of chars that is already held in memory.
struct ViewedContent {
ContentSpan span; // Span of chars
size_t offset = 0; // Cursor into `span.ptr`
// Views an existing span; the cursor starts at 0.
ViewedContent(ContentSpan const &span_) : span(span_) {}
// Builds the span from a shared buffer pointer and its length; the cursor starts at 0.
ViewedContent(std::shared_ptr<char[]> ptr, size_t size) : span({.ptr = ptr, .size = size}) {}
// Returns a pointer to the char at the cursor which shares ownership with `span.ptr`
// (aliasing `std::shared_ptr` constructor), so the underlying buffer stays alive for as
// long as the returned pointer does.
std::shared_ptr<char[]> makeSharedContentPtr() const {
return std::shared_ptr<char[]>(span.ptr, &span.ptr[offset]);
}
};
// Chars read piecewise from a file descriptor through a fixed-size circular buffer
// of `LEXER_BUF_SIZE` chars.
struct BufferedContent {
int fd; // File from which to read chars
char buf[LEXER_BUF_SIZE] = {}; // Circular buffer of chars
size_t offset = 0; // Cursor into `buf`
size_t size = 0; // Number of "fresh" chars in `buf`
// Only stores `fd_`; the constructor itself reads nothing.
BufferedContent(int fd_) : fd(fd_) {}
~BufferedContent(); // Closes `fd`
void advance(); // Increment `offset` circularly, decrement `size`
void refill(); // Read from `fd` to fill `buf`
private:
// Reads up to `nbChars` chars from `fd` into `buf` starting at `startIndex`,
// and returns how many chars were actually read.
size_t readMore(size_t startIndex, size_t nbChars);
};
struct IfStackEntry { struct IfStackEntry {
bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already
bool reachedElseBlock; // Whether an ELSE block ran already bool reachedElseBlock; // Whether an ELSE block ran already
@@ -100,20 +61,17 @@ struct LexerState {
size_t expansionScanDistance; // Max distance already scanned for expansions size_t expansionScanDistance; // Max distance already scanned for expansions
std::deque<Expansion> expansionStack; // Front is the innermost current expansion std::deque<Expansion> expansionStack; // Front is the innermost current expansion
std::variant<std::monostate, ViewedContent, BufferedContent> content; ContentSpan content; // Span of chars
size_t offset = 0; // Cursor into `content.ptr`
~LexerState(); ~LexerState();
int peekChar(); int peekChar();
int peekCharAhead(); int peekCharAhead();
std::shared_ptr<char[]> makeSharedCaptureBufPtr() const {
return std::shared_ptr<char[]>(captureBuf, captureBuf->data());
}
void setAsCurrentState(); void setAsCurrentState();
void setFileAsNextState(std::string const &filePath, bool updateStateNow); void setFileAsNextState(std::string const &filePath, bool updateStateNow);
void setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_); void setViewAsNextState(char const *name, ContentSpan const &content_, uint32_t lineNo_);
void clear(uint32_t lineNo_); void clear(uint32_t lineNo_);
}; };
+73 -120
View File
@@ -351,9 +351,11 @@ void LexerState::setAsCurrentState() {
} }
void LexerState::setFileAsNextState(std::string const &filePath, bool updateStateNow) { void LexerState::setFileAsNextState(std::string const &filePath, bool updateStateNow) {
int fd = -1;
if (filePath == "-") { if (filePath == "-") {
path = "<stdin>"; path = "<stdin>";
content.emplace<BufferedContent>(STDIN_FILENO); fd = STDIN_FILENO;
verbosePrint(VERB_INFO, "Opening stdin\n"); // LCOV_EXCL_LINE verbosePrint(VERB_INFO, "Opening stdin\n"); // LCOV_EXCL_LINE
} else { } else {
struct stat statBuf; struct stat statBuf;
@@ -366,20 +368,20 @@ void LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
if (std::streamsize size = statBuf.st_size; statBuf.st_size > 0) { if (std::streamsize size = statBuf.st_size; statBuf.st_size > 0) {
// Read the entire file for better performance // Read the entire file for better performance
// Ideally we'd use C++20 `auto ptr = std::make_shared<char[]>(size)`, // Ideally we'd use C++20 `content.ptr = std::make_shared<char[]>(size)`,
// but it has insufficient compiler support // but it has insufficient compiler support
auto ptr = std::shared_ptr<char[]>(new char[size]); content.ptr = std::shared_ptr<char[]>(new char[size]);
content.size = static_cast<size_t>(size);
if (std::ifstream fs(path, std::ios::binary); !fs) { if (std::ifstream fs(path, std::ios::binary); !fs) {
// LCOV_EXCL_START // LCOV_EXCL_START
fatal("Failed to open file \"%s\": %s", path.c_str(), strerror(errno)); fatal("Failed to open file \"%s\": %s", path.c_str(), strerror(errno));
// LCOV_EXCL_STOP // LCOV_EXCL_STOP
} else if (!fs.read(ptr.get(), size) || fs.gcount() != size) { } else if (!fs.read(content.ptr.get(), size) || fs.gcount() != size) {
// LCOV_EXCL_START // LCOV_EXCL_START
fatal("Failed to read file \"%s\": %s", path.c_str(), strerror(errno)); fatal("Failed to read file \"%s\": %s", path.c_str(), strerror(errno));
// LCOV_EXCL_STOP // LCOV_EXCL_STOP
} }
content.emplace<ViewedContent>(ptr, static_cast<size_t>(size));
// LCOV_EXCL_START // LCOV_EXCL_START
verbosePrint(VERB_INFO, "File \"%s\" is fully read\n", path.c_str()); verbosePrint(VERB_INFO, "File \"%s\" is fully read\n", path.c_str());
@@ -395,19 +397,56 @@ void LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
} }
// LCOV_EXCL_STOP // LCOV_EXCL_STOP
// Have a fallback if reading the file failed // Have a fallback if measuring the file size failed
int fd = open(path.c_str(), O_RDONLY); fd = open(path.c_str(), O_RDONLY);
if (fd < 0) { if (fd < 0) {
// LCOV_EXCL_START // LCOV_EXCL_START
fatal("Failed to open file \"%s\": %s", path.c_str(), strerror(errno)); fatal("Failed to open file \"%s\": %s", path.c_str(), strerror(errno));
// LCOV_EXCL_STOP // LCOV_EXCL_STOP
} }
content.emplace<BufferedContent>(fd);
verbosePrint(VERB_INFO, "File \"%s\" is opened\n", path.c_str()); // LCOV_EXCL_LINE verbosePrint(VERB_INFO, "File \"%s\" is opened\n", path.c_str()); // LCOV_EXCL_LINE
} }
} }
if (fd >= 0) {
// If the file is stdin, or if measuring its size failed, read it in pieces
Defer closeFile{[&] {
if (fd != STDIN_FILENO) {
close(fd);
}
}};
// Reasonably large buffer size for `read` performance
char buf[8192];
// POSIX specifies that lengths greater than SSIZE_MAX yield implementation-defined results
static_assert(sizeof(buf) <= SSIZE_MAX, "Lexer buffer size is too large");
auto vec = std::make_shared<std::vector<char>>();
for (;;) {
ssize_t ret = read(fd, buf, sizeof(buf));
// Exit on errors, unless we only were interrupted
if (ret == -1 && errno != EINTR) {
// LCOV_EXCL_START
fatal("Failed to read file \"%s\": %s", path.c_str(), strerror(errno));
// LCOV_EXCL_STOP
}
// EOF reached
if (ret == 0) {
break;
}
// If anything was read, accumulate it, and continue
if (ret != -1) {
vec->insert(vec->end(), buf, buf + ret);
}
}
content.ptr = std::shared_ptr<char[]>(vec, vec->data());
content.size = vec->size();
verbosePrint(VERB_INFO, "File \"%s\" is fully read\n", path.c_str()); // LCOV_EXCL_LINE
}
offset = 0;
clear(0); clear(0);
if (updateStateNow) { if (updateStateNow) {
lexerState = this; lexerState = this;
@@ -416,17 +455,18 @@ void LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
} }
} }
void LexerState::setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_) { void LexerState::setViewAsNextState(
char const *name, ContentSpan const &content_, uint32_t lineNo_
) {
path = name; // Used to report read errors in `.peek()` path = name; // Used to report read errors in `.peek()`
content.emplace<ViewedContent>(span); content = content_;
offset = 0;
clear(lineNo_); clear(lineNo_);
lexerStateEOL = this; lexerStateEOL = this;
} }
void lexer_RestartRept(uint32_t lineNo) { void lexer_RestartRept(uint32_t lineNo) {
if (std::holds_alternative<ViewedContent>(lexerState->content)) { lexerState->offset = 0;
std::get<ViewedContent>(lexerState->content).offset = 0;
}
lexerState->clear(lineNo); lexerState->clear(lineNo);
} }
@@ -450,66 +490,6 @@ bool Expansion::advance() {
return ++offset > size(); return ++offset > size();
} }
// Closes the file descriptor this content was reading from.
// NOTE(review): `fd` is closed unconditionally, so an instance constructed with
// `STDIN_FILENO` closes stdin when it is destroyed -- confirm that this is intended.
BufferedContent::~BufferedContent() {
close(fd);
}
// Consumes one char: steps the cursor forward by one (wrapping to the start of `buf`)
// and removes one char from the count of fresh chars.
void BufferedContent::advance() {
assume(offset < std::size(buf));
if (++offset == std::size(buf)) {
offset = 0; // Wrap around if necessary
}
// `size` is unsigned, so only decrement it while fresh chars remain
if (size > 0) {
--size;
}
}
// Tries to top `buf` up to capacity with chars read from `fd`.
// Since `buf` is circular, the free region may wrap past the end of the array; in that
// case it is filled with up to two reads: first the tail of `buf`, then the part that
// starts again at index 0.
void BufferedContent::refill() {
assume(size <= std::size(buf));
size_t target = std::size(buf) - size; // Aim: making the buf full
// Compute the index we'll start writing to
size_t startIndex = (offset + size) % std::size(buf);
// If the range to fill passes over the buffer wrapping point, we need two reads
if (startIndex + target > std::size(buf)) {
// First read: from `startIndex` up to the end of `buf`
size_t nbExpectedChars = std::size(buf) - startIndex;
size_t nbReadChars = readMore(startIndex, nbExpectedChars);
startIndex += nbReadChars;
if (startIndex == std::size(buf)) {
startIndex = 0;
}
// If the read was incomplete, don't perform a second read
target -= nbReadChars;
if (nbReadChars < nbExpectedChars) {
target = 0;
}
}
// The only read in the non-wrapping case, or the second read after a complete first one
if (target != 0) {
readMore(startIndex, target);
}
}
// Performs one `read` of up to `nbChars` chars from `fd` into `buf` at `startIndex`,
// adds the number of chars read to `size`, and returns that number. A short count
// (including 0) is returned as-is; the caller must keep the range within `buf`.
size_t BufferedContent::readMore(size_t startIndex, size_t nbChars) {
// This buffer overflow made me lose WEEKS of my life. Never again.
assume(startIndex + nbChars <= std::size(buf));
ssize_t nbReadChars = read(fd, &buf[startIndex], nbChars);
// NOTE(review): every `read` failure takes this path, including `EINTR`;
// presumably `fatal` does not return -- confirm.
if (nbReadChars == -1) {
// LCOV_EXCL_START
fatal("Error reading file \"%s\": %s", lexerState->path.c_str(), strerror(errno));
// LCOV_EXCL_STOP
}
size += nbReadChars;
assume(size <= std::size(buf));
// `nbReadChars` cannot be negative, so it's fine to cast to `size_t`
return static_cast<size_t>(nbReadChars);
}
void lexer_SetMode(LexerMode mode) { void lexer_SetMode(LexerMode mode) {
lexerState->mode = mode; lexerState->mode = mode;
} }
@@ -683,20 +663,8 @@ int LexerState::peekChar() {
} }
} }
if (std::holds_alternative<ViewedContent>(content)) { if (offset < content.size) {
auto &view = std::get<ViewedContent>(content); return static_cast<uint8_t>(content.ptr[offset]);
if (view.offset < view.span.size) {
return static_cast<uint8_t>(view.span.ptr[view.offset]);
}
} else {
auto &cbuf = std::get<BufferedContent>(content);
if (cbuf.size == 0) {
cbuf.refill();
}
assume(cbuf.offset < std::size(cbuf.buf));
if (cbuf.size > 0) {
return static_cast<uint8_t>(cbuf.buf[cbuf.offset]);
}
} }
// If there aren't enough chars, give up // If there aren't enough chars, give up
@@ -719,20 +687,8 @@ int LexerState::peekCharAhead() {
distance -= exp.size() - exp.offset; distance -= exp.size() - exp.offset;
} }
if (std::holds_alternative<ViewedContent>(content)) { if (offset + distance < content.size) {
auto &view = std::get<ViewedContent>(content); return static_cast<uint8_t>(content.ptr[offset + distance]);
if (view.offset + distance < view.span.size) {
return static_cast<uint8_t>(view.span.ptr[view.offset + distance]);
}
} else {
auto &cbuf = std::get<BufferedContent>(content);
assume(distance < std::size(cbuf.buf));
if (cbuf.size <= distance) {
cbuf.refill();
}
if (cbuf.size > distance) {
return static_cast<uint8_t>(cbuf.buf[(cbuf.offset + distance) % std::size(cbuf.buf)]);
}
} }
// If there aren't enough chars, give up // If there aren't enough chars, give up
@@ -808,11 +764,7 @@ static void shiftChar() {
} }
} else { } else {
// Advance within the file contents // Advance within the file contents
if (std::holds_alternative<ViewedContent>(lexerState->content)) { ++lexerState->offset;
++std::get<ViewedContent>(lexerState->content).offset;
} else {
std::get<BufferedContent>(lexerState->content).advance();
}
} }
return; return;
} }
@@ -2165,13 +2117,13 @@ static Token skipToLeadingKeyword(
static Token skipToLeadingKeyword() { static Token skipToLeadingKeyword() {
assume(!lexerState->enableExpansions); assume(!lexerState->enableExpansions);
if (std::holds_alternative<ViewedContent>(lexerState->content) if (lexerState->expansionStack.empty()) {
&& lexerState->expansionStack.empty()) { // Optimize the common case (no ongoing expansions) to avoid
// Optimize the common case (a fully-read assembly file without ongoing // the bookkeeping of `peek` and `shiftChar`.
// expansions) to avoid the bookkeeping of `peek` and `shiftChar`. char const *ptr = lexerState->content.ptr.get();
auto &view = std::get<ViewedContent>(lexerState->content); auto quickPeek = [&]() {
char const *ptr = view.span.ptr.get(); return lexerState->offset < lexerState->content.size ? ptr[lexerState->offset] : EOF;
auto quickPeek = [&]() { return view.offset < view.span.size ? ptr[view.offset] : EOF; }; };
auto quickNextLine = []() { ++lexerState->lineNo; }; auto quickNextLine = []() { ++lexerState->lineNo; };
auto quickFinalize = []() { auto quickFinalize = []() {
// When `skipToLeadingKeyword` returns a token, there has been one more // When `skipToLeadingKeyword` returns a token, there has been one more
@@ -2185,14 +2137,14 @@ static Token skipToLeadingKeyword() {
if (lexerState->capturing) { if (lexerState->capturing) {
assume(lexerState->captureBuf == nullptr); assume(lexerState->captureBuf == nullptr);
auto quickCaptureShiftChar = [&]() { auto quickCaptureShiftChar = [&]() {
++view.offset; ++lexerState->offset;
++lexerState->captureSize; ++lexerState->captureSize;
}; };
return skipToLeadingKeyword( return skipToLeadingKeyword(
quickPeek, quickCaptureShiftChar, quickNextLine, quickFinalize quickPeek, quickCaptureShiftChar, quickNextLine, quickFinalize
); );
} else { } else {
auto quickShiftChar = [&]() { ++view.offset; }; auto quickShiftChar = [&]() { ++lexerState->offset; };
return skipToLeadingKeyword(quickPeek, quickShiftChar, quickNextLine, quickFinalize); return skipToLeadingKeyword(quickPeek, quickShiftChar, quickNextLine, quickFinalize);
} }
} else { } else {
@@ -2389,10 +2341,10 @@ static Capture makeCapture(char const *name, InvocableR<int, int> auto callback)
Capture capture = { Capture capture = {
.lineNo = lexer_GetLineNo(), .span = {.ptr = nullptr, .size = 0} .lineNo = lexer_GetLineNo(), .span = {.ptr = nullptr, .size = 0}
}; };
if (std::holds_alternative<ViewedContent>(lexerState->content) if (lexerState->expansionStack.empty()) {
&& lexerState->expansionStack.empty()) { capture.span.ptr = std::shared_ptr<char[]>(
auto &view = std::get<ViewedContent>(lexerState->content); lexerState->content.ptr, &lexerState->content.ptr[lexerState->offset]
capture.span.ptr = view.makeSharedContentPtr(); );
} else { } else {
assume(lexerState->captureBuf == nullptr); assume(lexerState->captureBuf == nullptr);
lexerState->captureBuf = std::make_shared<std::vector<char>>(); lexerState->captureBuf = std::make_shared<std::vector<char>>();
@@ -2411,7 +2363,8 @@ static Capture makeCapture(char const *name, InvocableR<int, int> auto callback)
} else if (size_t endTokenLength = callback(token.type); endTokenLength > 0) { } else if (size_t endTokenLength = callback(token.type); endTokenLength > 0) {
if (!capture.span.ptr) { if (!capture.span.ptr) {
// Retrieve the capture buffer now that we're done capturing // Retrieve the capture buffer now that we're done capturing
capture.span.ptr = lexerState->makeSharedCaptureBufPtr(); capture.span.ptr =
std::shared_ptr<char[]>(lexerState->captureBuf, lexerState->captureBuf->data());
} }
// Subtract the length of the ending token; we know we have read it exactly, // Subtract the length of the ending token; we know we have read it exactly,
// not e.g. an interpolation or EQUS expansion, since those are disabled. // not e.g. an interpolation or EQUS expansion, since those are disabled.
+1 -1
View File
@@ -1,5 +1,5 @@
SECTION "test", ROM0[0] SECTION "test", ROM0[0]
ld [ $ff00 + c ], a ld [ $ff00 + c ], a
; 257 spaces exceeds both LEXER_BUF_SIZE (64) and uint8_t limit (255) ; 257 spaces exceeds the uint8_t limit (255)
ld [ $ff00 + c ], a ld [ $ff00 + c ], a
ld [ $ff00 + c ], a ld [ $ff00 + c ], a