mirror of
https://github.com/gbdev/rgbds.git
synced 2026-06-09 18:22:35 +00:00
Simplify the lexer by removing BufferedContent and inlining ViewedContent (#1981)
Instead of reading 64 characters at a time into a rolling buffer as `shiftChar()` is called, we read 8192 characters at a time into a complete buffer before any `peek()`/`shiftChar()` operations.
This commit is contained in:
+3
-45
@@ -9,21 +9,10 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
#include "platform.hpp" // SSIZE_MAX
|
||||
|
||||
#include "asm/intern.hpp"
|
||||
|
||||
// This value is a compromise between `LexerState` allocation performance when reading the entire
|
||||
// file works, and buffering performance when it doesn't (e.g. when piping a file into RGBASM).
|
||||
static constexpr size_t LEXER_BUF_SIZE = 64;
|
||||
// The buffer needs to be large enough for the maximum `lexerState->peek()` lookahead distance
|
||||
static_assert(LEXER_BUF_SIZE > 1, "Lexer buffer size is too small");
|
||||
// This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB
|
||||
static_assert(LEXER_BUF_SIZE <= SSIZE_MAX, "Lexer buffer size is too large");
|
||||
|
||||
enum LexerMode {
|
||||
LEXER_NORMAL,
|
||||
LEXER_RAW,
|
||||
@@ -47,34 +36,6 @@ struct ContentSpan {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// A read cursor over a span of chars that is already fully in memory.
// The chars themselves are reached through `span.ptr`; `offset` is the only
// state that changes as the content is consumed.
struct ViewedContent {
|
||||
ContentSpan span; // Span of chars
|
||||
size_t offset = 0; // Cursor into `span.ptr`
|
||||
|
||||
// Views an existing span, starting at its first char.
ViewedContent(ContentSpan const &span_) : span(span_) {}
|
||||
// Views `size` chars reachable through `ptr`, starting at the first char.
ViewedContent(std::shared_ptr<char[]> ptr, size_t size) : span({.ptr = ptr, .size = size}) {}
|
||||
|
||||
// Returns a pointer to the char at the current `offset`, built from `span.ptr`
// so that (with the two-argument `std::shared_ptr` constructor) it shares
// ownership of the underlying chars instead of copying them.
std::shared_ptr<char[]> makeSharedContentPtr() const {
|
||||
return std::shared_ptr<char[]>(span.ptr, &span.ptr[offset]);
|
||||
}
|
||||
};
|
||||
|
||||
// Chars read incrementally from a file descriptor through a fixed-size
// circular buffer of `LEXER_BUF_SIZE` chars.
// `offset` is the index of the next char to consume, and `size` counts how many
// chars starting there (wrapping around the end of `buf`) have been read but
// not yet consumed.
struct BufferedContent {
|
||||
int fd; // File from which to read chars
|
||||
char buf[LEXER_BUF_SIZE] = {}; // Circular buffer of chars
|
||||
size_t offset = 0; // Cursor into `buf`
|
||||
size_t size = 0; // Number of "fresh" chars in `buf`
|
||||
|
||||
// Starts with an empty buffer; nothing is read from `fd_` at construction.
BufferedContent(int fd_) : fd(fd_) {}
|
||||
// Defined out of line.
~BufferedContent();
|
||||
|
||||
void advance(); // Increment `offset` circularly, decrement `size`
|
||||
void refill(); // Read from `fd` to fill `buf`
|
||||
|
||||
private:
|
||||
// Reads up to `nbChars` chars from `fd` into `buf` starting at `startIndex`,
// and returns how many chars were read.
size_t readMore(size_t startIndex, size_t nbChars);
|
||||
};
|
||||
|
||||
struct IfStackEntry {
|
||||
bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already
|
||||
bool reachedElseBlock; // Whether an ELSE block ran already
|
||||
@@ -100,20 +61,17 @@ struct LexerState {
|
||||
size_t expansionScanDistance; // Max distance already scanned for expansions
|
||||
std::deque<Expansion> expansionStack; // Front is the innermost current expansion
|
||||
|
||||
std::variant<std::monostate, ViewedContent, BufferedContent> content;
|
||||
ContentSpan content; // Span of chars
|
||||
size_t offset = 0; // Cursor into `content.ptr`
|
||||
|
||||
~LexerState();
|
||||
|
||||
int peekChar();
|
||||
int peekCharAhead();
|
||||
|
||||
std::shared_ptr<char[]> makeSharedCaptureBufPtr() const {
|
||||
return std::shared_ptr<char[]>(captureBuf, captureBuf->data());
|
||||
}
|
||||
|
||||
void setAsCurrentState();
|
||||
void setFileAsNextState(std::string const &filePath, bool updateStateNow);
|
||||
void setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_);
|
||||
void setViewAsNextState(char const *name, ContentSpan const &content_, uint32_t lineNo_);
|
||||
|
||||
void clear(uint32_t lineNo_);
|
||||
};
|
||||
|
||||
+73
-120
@@ -351,9 +351,11 @@ void LexerState::setAsCurrentState() {
|
||||
}
|
||||
|
||||
void LexerState::setFileAsNextState(std::string const &filePath, bool updateStateNow) {
|
||||
int fd = -1;
|
||||
|
||||
if (filePath == "-") {
|
||||
path = "<stdin>";
|
||||
content.emplace<BufferedContent>(STDIN_FILENO);
|
||||
fd = STDIN_FILENO;
|
||||
verbosePrint(VERB_INFO, "Opening stdin\n"); // LCOV_EXCL_LINE
|
||||
} else {
|
||||
struct stat statBuf;
|
||||
@@ -366,20 +368,20 @@ void LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
|
||||
|
||||
if (std::streamsize size = statBuf.st_size; statBuf.st_size > 0) {
|
||||
// Read the entire file for better performance
|
||||
// Ideally we'd use C++20 `auto ptr = std::make_shared<char[]>(size)`,
|
||||
// Ideally we'd use C++20 `content.ptr = std::make_shared<char[]>(size)`,
|
||||
// but it has insufficient compiler support
|
||||
auto ptr = std::shared_ptr<char[]>(new char[size]);
|
||||
content.ptr = std::shared_ptr<char[]>(new char[size]);
|
||||
content.size = static_cast<size_t>(size);
|
||||
|
||||
if (std::ifstream fs(path, std::ios::binary); !fs) {
|
||||
// LCOV_EXCL_START
|
||||
fatal("Failed to open file \"%s\": %s", path.c_str(), strerror(errno));
|
||||
// LCOV_EXCL_STOP
|
||||
} else if (!fs.read(ptr.get(), size) || fs.gcount() != size) {
|
||||
} else if (!fs.read(content.ptr.get(), size) || fs.gcount() != size) {
|
||||
// LCOV_EXCL_START
|
||||
fatal("Failed to read file \"%s\": %s", path.c_str(), strerror(errno));
|
||||
// LCOV_EXCL_STOP
|
||||
}
|
||||
content.emplace<ViewedContent>(ptr, static_cast<size_t>(size));
|
||||
|
||||
// LCOV_EXCL_START
|
||||
verbosePrint(VERB_INFO, "File \"%s\" is fully read\n", path.c_str());
|
||||
@@ -395,19 +397,56 @@ void LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
|
||||
}
|
||||
// LCOV_EXCL_STOP
|
||||
|
||||
// Have a fallback if reading the file failed
|
||||
int fd = open(path.c_str(), O_RDONLY);
|
||||
// Have a fallback if measuring the file size failed
|
||||
fd = open(path.c_str(), O_RDONLY);
|
||||
if (fd < 0) {
|
||||
// LCOV_EXCL_START
|
||||
fatal("Failed to open file \"%s\": %s", path.c_str(), strerror(errno));
|
||||
// LCOV_EXCL_STOP
|
||||
}
|
||||
content.emplace<BufferedContent>(fd);
|
||||
|
||||
verbosePrint(VERB_INFO, "File \"%s\" is opened\n", path.c_str()); // LCOV_EXCL_LINE
|
||||
}
|
||||
}
|
||||
|
||||
if (fd >= 0) {
|
||||
// If the file is stdin, or if measuring its size failed, read it in pieces
|
||||
Defer closeFile{[&] {
|
||||
if (fd != STDIN_FILENO) {
|
||||
close(fd);
|
||||
}
|
||||
}};
|
||||
|
||||
// Reasonably large buffer size for `read` performance
|
||||
char buf[8192];
|
||||
// POSIX specifies that lengths greater than SSIZE_MAX yield implementation-defined results
|
||||
static_assert(sizeof(buf) <= SSIZE_MAX, "Lexer buffer size is too large");
|
||||
|
||||
auto vec = std::make_shared<std::vector<char>>();
|
||||
for (;;) {
|
||||
ssize_t ret = read(fd, buf, sizeof(buf));
|
||||
// Exit on errors, unless we only were interrupted
|
||||
if (ret == -1 && errno != EINTR) {
|
||||
// LCOV_EXCL_START
|
||||
fatal("Failed to read file \"%s\": %s", path.c_str(), strerror(errno));
|
||||
// LCOV_EXCL_STOP
|
||||
}
|
||||
// EOF reached
|
||||
if (ret == 0) {
|
||||
break;
|
||||
}
|
||||
// If anything was read, accumulate it, and continue
|
||||
if (ret != -1) {
|
||||
vec->insert(vec->end(), buf, buf + ret);
|
||||
}
|
||||
}
|
||||
content.ptr = std::shared_ptr<char[]>(vec, vec->data());
|
||||
content.size = vec->size();
|
||||
|
||||
verbosePrint(VERB_INFO, "File \"%s\" is fully read\n", path.c_str()); // LCOV_EXCL_LINE
|
||||
}
|
||||
|
||||
offset = 0;
|
||||
clear(0);
|
||||
if (updateStateNow) {
|
||||
lexerState = this;
|
||||
@@ -416,17 +455,18 @@ void LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
|
||||
}
|
||||
}
|
||||
|
||||
void LexerState::setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_) {
|
||||
void LexerState::setViewAsNextState(
|
||||
char const *name, ContentSpan const &content_, uint32_t lineNo_
|
||||
) {
|
||||
path = name; // Used to report read errors in `.peek()`
|
||||
content.emplace<ViewedContent>(span);
|
||||
content = content_;
|
||||
offset = 0;
|
||||
clear(lineNo_);
|
||||
lexerStateEOL = this;
|
||||
}
|
||||
|
||||
void lexer_RestartRept(uint32_t lineNo) {
|
||||
if (std::holds_alternative<ViewedContent>(lexerState->content)) {
|
||||
std::get<ViewedContent>(lexerState->content).offset = 0;
|
||||
}
|
||||
lexerState->offset = 0;
|
||||
lexerState->clear(lineNo);
|
||||
}
|
||||
|
||||
@@ -450,66 +490,6 @@ bool Expansion::advance() {
|
||||
return ++offset > size();
|
||||
}
|
||||
|
||||
// Closes the file descriptor that this buffered content was reading from.
// NOTE(review): the descriptor is closed unconditionally, whichever one was
// passed to the constructor.
BufferedContent::~BufferedContent() {
|
||||
close(fd);
|
||||
}
|
||||
|
||||
// Consumes one char: moves the cursor one slot forward in the circular buffer
// and counts one fewer fresh char.
void BufferedContent::advance() {
|
||||
assume(offset < std::size(buf));
|
||||
if (++offset == std::size(buf)) {
|
||||
offset = 0; // Wrap around if necessary
|
||||
}
|
||||
// `size` is unsigned, so it is only decremented while chars remain;
// with an empty buffer only the cursor moves.
if (size > 0) {
|
||||
--size;
|
||||
}
|
||||
}
|
||||
|
||||
// Reads more chars from the file, aiming to make the circular buffer full.
// New chars are written right after the fresh ones already in `buf`. Because
// each read needs a contiguous destination, a range that crosses the end of
// `buf` is filled with two reads: one up to the end, then one from index 0.
void BufferedContent::refill() {
|
||||
assume(size <= std::size(buf));
|
||||
size_t target = std::size(buf) - size; // Aim: making the buf full
|
||||
|
||||
// Compute the index we'll start writing to
|
||||
size_t startIndex = (offset + size) % std::size(buf);
|
||||
|
||||
// If the range to fill passes over the buffer wrapping point, we need two reads
|
||||
if (startIndex + target > std::size(buf)) {
|
||||
// First read: only the part that fits before the end of `buf`
size_t nbExpectedChars = std::size(buf) - startIndex;
|
||||
size_t nbReadChars = readMore(startIndex, nbExpectedChars);
|
||||
|
||||
// Continue after what was just read, wrapping to the start if the end was reached
startIndex += nbReadChars;
|
||||
if (startIndex == std::size(buf)) {
|
||||
startIndex = 0;
|
||||
}
|
||||
|
||||
// If the read was incomplete, don't perform a second read
|
||||
target -= nbReadChars;
|
||||
if (nbReadChars < nbExpectedChars) {
|
||||
target = 0;
|
||||
}
|
||||
}
|
||||
// Single read for the non-wrapping case, or second read after wrapping
if (target != 0) {
|
||||
readMore(startIndex, target);
|
||||
}
|
||||
}
|
||||
|
||||
// Performs one `read` of up to `nbChars` chars from `fd` into `buf`, starting
// at `startIndex`, and adds the number of chars read to `size`.
// The destination range must lie entirely within `buf`; the caller is
// responsible for splitting ranges that wrap around.
// Returns the number of chars actually read, which may be fewer than `nbChars`.
// A read error is reported through `fatal`, naming the current lexer state's path.
size_t BufferedContent::readMore(size_t startIndex, size_t nbChars) {
|
||||
// This buffer overflow made me lose WEEKS of my life. Never again.
|
||||
assume(startIndex + nbChars <= std::size(buf));
|
||||
ssize_t nbReadChars = read(fd, &buf[startIndex], nbChars);
|
||||
|
||||
if (nbReadChars == -1) {
|
||||
// LCOV_EXCL_START
|
||||
fatal("Error reading file \"%s\": %s", lexerState->path.c_str(), strerror(errno));
|
||||
// LCOV_EXCL_STOP
|
||||
}
|
||||
|
||||
// Account for the newly read chars as fresh
size += nbReadChars;
|
||||
assume(size <= std::size(buf));
|
||||
|
||||
// `nbReadChars` cannot be negative, so it's fine to cast to `size_t`
|
||||
return static_cast<size_t>(nbReadChars);
|
||||
}
|
||||
|
||||
// Sets the mode of the current lexer state (`lexerState`) to `mode`.
void lexer_SetMode(LexerMode mode) {
|
||||
lexerState->mode = mode;
|
||||
}
|
||||
@@ -683,20 +663,8 @@ int LexerState::peekChar() {
|
||||
}
|
||||
}
|
||||
|
||||
if (std::holds_alternative<ViewedContent>(content)) {
|
||||
auto &view = std::get<ViewedContent>(content);
|
||||
if (view.offset < view.span.size) {
|
||||
return static_cast<uint8_t>(view.span.ptr[view.offset]);
|
||||
}
|
||||
} else {
|
||||
auto &cbuf = std::get<BufferedContent>(content);
|
||||
if (cbuf.size == 0) {
|
||||
cbuf.refill();
|
||||
}
|
||||
assume(cbuf.offset < std::size(cbuf.buf));
|
||||
if (cbuf.size > 0) {
|
||||
return static_cast<uint8_t>(cbuf.buf[cbuf.offset]);
|
||||
}
|
||||
if (offset < content.size) {
|
||||
return static_cast<uint8_t>(content.ptr[offset]);
|
||||
}
|
||||
|
||||
// If there aren't enough chars, give up
|
||||
@@ -719,20 +687,8 @@ int LexerState::peekCharAhead() {
|
||||
distance -= exp.size() - exp.offset;
|
||||
}
|
||||
|
||||
if (std::holds_alternative<ViewedContent>(content)) {
|
||||
auto &view = std::get<ViewedContent>(content);
|
||||
if (view.offset + distance < view.span.size) {
|
||||
return static_cast<uint8_t>(view.span.ptr[view.offset + distance]);
|
||||
}
|
||||
} else {
|
||||
auto &cbuf = std::get<BufferedContent>(content);
|
||||
assume(distance < std::size(cbuf.buf));
|
||||
if (cbuf.size <= distance) {
|
||||
cbuf.refill();
|
||||
}
|
||||
if (cbuf.size > distance) {
|
||||
return static_cast<uint8_t>(cbuf.buf[(cbuf.offset + distance) % std::size(cbuf.buf)]);
|
||||
}
|
||||
if (offset + distance < content.size) {
|
||||
return static_cast<uint8_t>(content.ptr[offset + distance]);
|
||||
}
|
||||
|
||||
// If there aren't enough chars, give up
|
||||
@@ -808,11 +764,7 @@ static void shiftChar() {
|
||||
}
|
||||
} else {
|
||||
// Advance within the file contents
|
||||
if (std::holds_alternative<ViewedContent>(lexerState->content)) {
|
||||
++std::get<ViewedContent>(lexerState->content).offset;
|
||||
} else {
|
||||
std::get<BufferedContent>(lexerState->content).advance();
|
||||
}
|
||||
++lexerState->offset;
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -2165,13 +2117,13 @@ static Token skipToLeadingKeyword(
|
||||
static Token skipToLeadingKeyword() {
|
||||
assume(!lexerState->enableExpansions);
|
||||
|
||||
if (std::holds_alternative<ViewedContent>(lexerState->content)
|
||||
&& lexerState->expansionStack.empty()) {
|
||||
// Optimize the common case (a fully-read assembly file without ongoing
|
||||
// expansions) to avoid the bookkeeping of `peek` and `shiftChar`.
|
||||
auto &view = std::get<ViewedContent>(lexerState->content);
|
||||
char const *ptr = view.span.ptr.get();
|
||||
auto quickPeek = [&]() { return view.offset < view.span.size ? ptr[view.offset] : EOF; };
|
||||
if (lexerState->expansionStack.empty()) {
|
||||
// Optimize the common case (no ongoing expansions) to avoid
|
||||
// the bookkeeping of `peek` and `shiftChar`.
|
||||
char const *ptr = lexerState->content.ptr.get();
|
||||
auto quickPeek = [&]() {
|
||||
return lexerState->offset < lexerState->content.size ? ptr[lexerState->offset] : EOF;
|
||||
};
|
||||
auto quickNextLine = []() { ++lexerState->lineNo; };
|
||||
auto quickFinalize = []() {
|
||||
// When `skipToLeadingKeyword` returns a token, there has been one more
|
||||
@@ -2185,14 +2137,14 @@ static Token skipToLeadingKeyword() {
|
||||
if (lexerState->capturing) {
|
||||
assume(lexerState->captureBuf == nullptr);
|
||||
auto quickCaptureShiftChar = [&]() {
|
||||
++view.offset;
|
||||
++lexerState->offset;
|
||||
++lexerState->captureSize;
|
||||
};
|
||||
return skipToLeadingKeyword(
|
||||
quickPeek, quickCaptureShiftChar, quickNextLine, quickFinalize
|
||||
);
|
||||
} else {
|
||||
auto quickShiftChar = [&]() { ++view.offset; };
|
||||
auto quickShiftChar = [&]() { ++lexerState->offset; };
|
||||
return skipToLeadingKeyword(quickPeek, quickShiftChar, quickNextLine, quickFinalize);
|
||||
}
|
||||
} else {
|
||||
@@ -2389,10 +2341,10 @@ static Capture makeCapture(char const *name, InvocableR<int, int> auto callback)
|
||||
Capture capture = {
|
||||
.lineNo = lexer_GetLineNo(), .span = {.ptr = nullptr, .size = 0}
|
||||
};
|
||||
if (std::holds_alternative<ViewedContent>(lexerState->content)
|
||||
&& lexerState->expansionStack.empty()) {
|
||||
auto &view = std::get<ViewedContent>(lexerState->content);
|
||||
capture.span.ptr = view.makeSharedContentPtr();
|
||||
if (lexerState->expansionStack.empty()) {
|
||||
capture.span.ptr = std::shared_ptr<char[]>(
|
||||
lexerState->content.ptr, &lexerState->content.ptr[lexerState->offset]
|
||||
);
|
||||
} else {
|
||||
assume(lexerState->captureBuf == nullptr);
|
||||
lexerState->captureBuf = std::make_shared<std::vector<char>>();
|
||||
@@ -2411,7 +2363,8 @@ static Capture makeCapture(char const *name, InvocableR<int, int> auto callback)
|
||||
} else if (size_t endTokenLength = callback(token.type); endTokenLength > 0) {
|
||||
if (!capture.span.ptr) {
|
||||
// Retrieve the capture buffer now that we're done capturing
|
||||
capture.span.ptr = lexerState->makeSharedCaptureBufPtr();
|
||||
capture.span.ptr =
|
||||
std::shared_ptr<char[]>(lexerState->captureBuf, lexerState->captureBuf->data());
|
||||
}
|
||||
// Subtract the length of the ending token; we know we have read it exactly,
|
||||
// not e.g. an interpolation or EQUS expansion, since those are disabled.
|
||||
|
||||
+1
-1
@@ -1,5 +1,5 @@
|
||||
SECTION "test", ROM0[0]
|
||||
ld [ $ff00 + c ], a
|
||||
; 257 spaces exceeds both LEXER_BUF_SIZE (64) and uint8_t limit (255)
|
||||
; 257 spaces exceeds the uint8_t limit (255)
|
||||
ld [ $ff00 + c ], a
|
||||
ld [ $ff00 + c ], a
|
||||
|
||||
Reference in New Issue
Block a user