Refactor BufferedContent and Expansion to have methods

Use a buffer size that is a power of two for fast modulus
This commit is contained in:
Rangi42
2024-03-28 11:20:32 -04:00
parent cf7bdb19b6
commit 4172d330b9
2 changed files with 115 additions and 105 deletions

View File

@@ -13,7 +13,7 @@
#include "platform.hpp" // SSIZE_MAX #include "platform.hpp" // SSIZE_MAX
#define LEXER_BUF_SIZE 42 // TODO: determine a sane value for this #define LEXER_BUF_SIZE 128
// The buffer needs to be large enough for the maximum `peekInternal` lookahead distance // The buffer needs to be large enough for the maximum `peekInternal` lookahead distance
static_assert(LEXER_BUF_SIZE > 1, "Lexer buffer size is too small"); static_assert(LEXER_BUF_SIZE > 1, "Lexer buffer size is too small");
// This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB // This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB
@@ -31,14 +31,12 @@ enum LexerMode {
struct Expansion { struct Expansion {
std::optional<std::string> name; std::optional<std::string> name;
std::shared_ptr<std::string> contents; std::shared_ptr<std::string> contents;
size_t offset; // Cursor into the contents size_t offset; // Cursor into `contents`
size_t size() const { return contents->size(); } size_t size() const { return contents->size(); }
}; bool canPeek(uint8_t distance) const { return offset + distance < contents->size(); }
uint8_t peek(uint8_t distance) const { return (*contents)[offset + distance]; }
struct IfStackEntry { bool advance(); // Increment `offset`; return whether it then exceeds `contents`
bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already
bool reachedElseBlock; // Whether an ELSE block ran already
}; };
struct ContentSpan { struct ContentSpan {
@@ -47,21 +45,40 @@ struct ContentSpan {
}; };
struct ViewedContent { struct ViewedContent {
ContentSpan span; ContentSpan span; // Span of chars
size_t offset = 0; size_t offset = 0; // Cursor into `span.ptr`
ViewedContent(ContentSpan const &span_) : span(span_) {} ViewedContent(ContentSpan const &span_) : span(span_) {}
ViewedContent(std::shared_ptr<char[]> ptr, size_t size) : span({.ptr = ptr, .size = size}) {} ViewedContent(std::shared_ptr<char[]> ptr, size_t size) : span({.ptr = ptr, .size = size}) {}
// Whether the char `distance` places past the cursor still lies inside the span
bool canPeek(uint8_t distance) const {
	size_t const index = offset + distance;
	return index < span.size;
}
// Char located `distance` places past the cursor, converted to `uint8_t`.
// No bounds check is performed here; `canPeek` is the matching range test.
uint8_t peek(uint8_t distance) const {
	size_t const index = offset + distance;
	return span.ptr[index];
}
// Builds a pointer to the char under the cursor that shares ownership with `span.ptr`
// (aliasing constructor: the ownership comes from `span.ptr`, the address from the cursor)
std::shared_ptr<char[]> makeSharedContentPtr() const {
	char *cursor = &span.ptr[offset];
	return std::shared_ptr<char[]>(span.ptr, cursor);
}
}; };
struct BufferedContent { struct BufferedContent {
int fd; int fd; // File from which to read chars
size_t index = 0; // Read index into the buffer char buf[LEXER_BUF_SIZE] = {}; // Circular buffer of chars
char buf[LEXER_BUF_SIZE] = {}; // Circular buffer size_t offset = 0; // Cursor into `buf`
size_t nbChars = 0; // Number of "fresh" chars in the buffer size_t size = 0; // Number of "fresh" chars in `buf`
BufferedContent(int fd_) : fd(fd_) {} BufferedContent(int fd_) : fd(fd_) {}
~BufferedContent(); ~BufferedContent();
// Whether `buf` currently holds at least `distance + 1` fresh chars
bool canPeek(uint8_t distance) const {
	return distance < size;
}
// Char located `distance` places past the cursor, wrapping around the end of `buf`.
// Does not check that the char is "fresh"; `canPeek` is the matching test.
uint8_t peek(uint8_t distance) const {
	size_t const index = (offset + distance) % LEXER_BUF_SIZE;
	return buf[index];
}
void advance(); // Increment `offset` circularly, decrement `size`
void refill(); // Read from `fd` to fill `buf`
private:
size_t readMore(size_t startIndex, size_t nbChars);
};
struct IfStackEntry {
bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already
bool reachedElseBlock; // Whether an ELSE block ran already
}; };
struct LexerState { struct LexerState {
@@ -77,8 +94,7 @@ struct LexerState {
bool capturing; // Whether the text being lexed should be captured bool capturing; // Whether the text being lexed should be captured
size_t captureSize; // Amount of text captured size_t captureSize; // Amount of text captured
std::shared_ptr<std::vector<char>> std::shared_ptr<std::vector<char>> captureBuf; // Buffer to send the captured text to if set
captureBuf; // Buffer to send the captured text to if non-null
bool disableMacroArgs; bool disableMacroArgs;
bool disableInterpolation; bool disableInterpolation;
@@ -90,6 +106,10 @@ struct LexerState {
~LexerState(); ~LexerState();
// Builds a pointer to the capture buffer's chars that shares ownership with `captureBuf`
// (aliasing constructor); dereferences `captureBuf`, so it must be set when this is called
std::shared_ptr<char[]> makeSharedCaptureBufPtr() const {
	char *capturedChars = captureBuf->data();
	return std::shared_ptr<char[]>(captureBuf, capturedChars);
}
void setAsCurrentState(); void setAsCurrentState();
bool setFileAsNextState(std::string const &filePath, bool updateStateNow); bool setFileAsNextState(std::string const &filePath, bool updateStateNow);
void setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_); void setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_);

View File

@@ -46,10 +46,8 @@
#include <handleapi.h> // CloseHandle #include <handleapi.h> // CloseHandle
// clang-format on // clang-format on
#define MAP_FAILED nullptr static char *mapFile(int fd, std::string const &path, size_t) {
void *mappingAddr = nullptr;
static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t) {
mappingAddr = MAP_FAILED;
if (HANDLE file = CreateFileA( if (HANDLE file = CreateFileA(
path.c_str(), path.c_str(),
GENERIC_READ, GENERIC_READ,
@@ -67,10 +65,11 @@ static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t)
} }
CloseHandle(file); CloseHandle(file);
} }
return (char *)mappingAddr;
} }
struct MunmapDeleter { struct FileUnmapDeleter {
MunmapDeleter(size_t) {} FileUnmapDeleter(size_t) {}
void operator()(char *mappingAddr) { UnmapViewOfFile(mappingAddr); } void operator()(char *mappingAddr) { UnmapViewOfFile(mappingAddr); }
}; };
@@ -78,8 +77,8 @@ struct MunmapDeleter {
#else // defined(_MSC_VER) || defined(__MINGW32__) #else // defined(_MSC_VER) || defined(__MINGW32__)
#include <sys/mman.h> #include <sys/mman.h>
static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t size) { static char *mapFile(int fd, std::string const &path, size_t size) {
mappingAddr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0); void *mappingAddr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
if (mappingAddr == MAP_FAILED && errno == ENOTSUP) { if (mappingAddr == MAP_FAILED && errno == ENOTSUP) {
// The implementation may not support MAP_PRIVATE; try again with MAP_SHARED // The implementation may not support MAP_PRIVATE; try again with MAP_SHARED
// instead, offering, I believe, weaker guarantees about external modifications to // instead, offering, I believe, weaker guarantees about external modifications to
@@ -88,12 +87,13 @@ static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t
printf("mmap(%s, MAP_PRIVATE) failed, retrying with MAP_SHARED\n", path.c_str()); printf("mmap(%s, MAP_PRIVATE) failed, retrying with MAP_SHARED\n", path.c_str());
mappingAddr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0); mappingAddr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
} }
return mappingAddr != MAP_FAILED ? (char *)mappingAddr : nullptr;
} }
struct MunmapDeleter { struct FileUnmapDeleter {
size_t mappingSize; size_t mappingSize;
MunmapDeleter(size_t mappingSize_) : mappingSize(mappingSize_) {} FileUnmapDeleter(size_t mappingSize_) : mappingSize(mappingSize_) {}
void operator()(char *mappingAddr) { munmap(mappingAddr, mappingSize); } void operator()(char *mappingAddr) { munmap(mappingAddr, mappingSize); }
}; };
@@ -418,18 +418,12 @@ bool LexerState::setFileAsNextState(std::string const &filePath, bool updateStat
bool isMmapped = false; bool isMmapped = false;
if (statBuf.st_size > 0) { if (size_t size = (size_t)statBuf.st_size; statBuf.st_size > 0) {
// Try using `mmap` for better performance // Try using `mmap` for better performance
void *mappingAddr; if (char *mappingAddr = mapFile(fd, path, size); mappingAddr != nullptr) {
mapFile(mappingAddr, fd, path, statBuf.st_size);
if (mappingAddr != MAP_FAILED) {
close(fd); close(fd);
content.emplace<ViewedContent>( content.emplace<ViewedContent>(
std::shared_ptr<char[]>( std::shared_ptr<char[]>(mappingAddr, FileUnmapDeleter(size)), size
(char *)mappingAddr, MunmapDeleter((size_t)statBuf.st_size)
),
(size_t)statBuf.st_size
); );
if (verbose) if (verbose)
printf("File \"%s\" is mmap()ped\n", path.c_str()); printf("File \"%s\" is mmap()ped\n", path.c_str());
@@ -489,10 +483,63 @@ LexerState::~LexerState() {
assert(this != lexerStateEOL); assert(this != lexerStateEOL);
} }
// Moves the cursor one char forward.
// Returns true when the cursor has then gone past the end of `contents`
// (an offset equal to the size is still in range, and yields false).
bool Expansion::advance() {
	size_t const limit = size();
	assert(offset <= limit);
	++offset;
	return offset > limit;
}
BufferedContent::~BufferedContent() { BufferedContent::~BufferedContent() {
close(fd); close(fd);
} }
// Consumes one char: steps the cursor forward circularly and drops one "fresh" char
void BufferedContent::advance() {
	assert(offset < LEXER_BUF_SIZE);
	// Step forward, going back to the start of `buf` upon reaching its end
	if (++offset == LEXER_BUF_SIZE)
		offset = 0;

	// There must be a char to consume
	assert(size > 0);
	--size;
}
// Attempts to top `buf` up to `LEXER_BUF_SIZE` fresh chars by reading from `fd`.
// Since `buf` is circular, the free region may straddle the end of the array,
// in which case it is filled with two reads (`readMore` updates `size` itself).
void BufferedContent::refill() {
	// Everything is computed up front, before `readMore` gets to modify `size`
	size_t const nbWanted = LEXER_BUF_SIZE - size;               // Chars missing for a full buf
	size_t const writeIndex = (offset + size) % LEXER_BUF_SIZE;  // Just past the last fresh char
	size_t const nbUntilWrap = LEXER_BUF_SIZE - writeIndex;      // Room left before the array's end

	// The free region is contiguous: at most one read is required
	if (nbWanted <= nbUntilWrap) {
		if (nbWanted > 0)
			readMore(writeIndex, nbWanted);
		return;
	}

	// The free region wraps around: fill the tail of the array first...
	size_t const nbReadChars = readMore(writeIndex, nbUntilWrap);
	// ...and continue from the array's start only if that read was complete
	if (nbReadChars == nbUntilWrap)
		readMore(0, nbWanted - nbUntilWrap);
}
// Reads at most `nbChars` chars from `fd` into `buf`, starting at `startIndex`.
// The number of chars actually read is added to `size`, and returned.
// A failed `read` is reported through `fatalerror`.
size_t BufferedContent::readMore(size_t startIndex, size_t nbChars) {
	// Writing past the end of `buf` has cost weeks of debugging before; keep this guard.
	assert(startIndex + nbChars <= LEXER_BUF_SIZE);

	ssize_t const readResult = read(fd, &buf[startIndex], nbChars);
	if (readResult == -1)
		fatalerror("Error while reading \"%s\": %s\n", lexerState->path.c_str(), strerror(errno));

	// Past the error check the result is not negative, so the conversion is lossless
	size_t const nbReadChars = static_cast<size_t>(readResult);
	size += nbReadChars;
	return nbReadChars;
}
void lexer_SetMode(LexerMode mode) { void lexer_SetMode(LexerMode mode) {
lexerState->mode = mode; lexerState->mode = mode;
} }
@@ -640,75 +687,32 @@ static std::shared_ptr<std::string> readMacroArg(char name) {
} }
} }
// Reads at most `nbChars` chars from `cbuf.fd` into `cbuf.buf`, starting at `bufIndex`.
// Returns the number of chars actually read; a failed `read` is reported through `fatalerror`.
static size_t readInternal(BufferedContent &cbuf, size_t bufIndex, size_t nbChars) {
	// Writing past the end of the buffer has cost weeks of debugging before; keep this guard.
	assert(bufIndex + nbChars <= LEXER_BUF_SIZE);

	char *dest = &cbuf.buf[bufIndex];
	ssize_t const readResult = read(cbuf.fd, dest, nbChars);
	if (readResult == -1)
		fatalerror("Error while reading \"%s\": %s\n", lexerState->path.c_str(), strerror(errno));

	// Past the error check the result is not negative, so the conversion is lossless
	return static_cast<size_t>(readResult);
}
// We only need one character of lookahead, for macro arguments // We only need one character of lookahead, for macro arguments
static int peekInternal(uint8_t distance) { static int peekInternal(uint8_t distance) {
for (Expansion &exp : lexerState->expansions) { for (Expansion &exp : lexerState->expansions) {
// An expansion that has reached its end will have `exp->offset` == `exp->size()`, // An expansion that has reached its end will have `exp->offset` == `exp->size()`,
// and `peekInternal` will continue with its parent // and `peekInternal` will continue with its parent
assert(exp.offset <= exp.size()); assert(exp.offset <= exp.size());
if (distance < exp.size() - exp.offset) if (exp.canPeek(distance))
return (*exp.contents)[exp.offset + distance]; return exp.peek(distance);
distance -= exp.size() - exp.offset; distance -= exp.size() - exp.offset;
} }
if (distance >= LEXER_BUF_SIZE)
fatalerror(
"Internal lexer error: buffer has insufficient size for peeking (%" PRIu8 " >= %u)\n",
distance,
LEXER_BUF_SIZE
);
if (auto *view = std::get_if<ViewedContent>(&lexerState->content); view) { if (auto *view = std::get_if<ViewedContent>(&lexerState->content); view) {
if (size_t idx = view->offset + distance; idx < view->span.size) if (view->canPeek(distance))
return (uint8_t)view->span.ptr[idx]; return view->peek(distance);
return EOF; return EOF;
} else { } else {
assert(std::holds_alternative<BufferedContent>(lexerState->content)); assert(std::holds_alternative<BufferedContent>(lexerState->content));
auto &cbuf = std::get<BufferedContent>(lexerState->content); auto &cbuf = std::get<BufferedContent>(lexerState->content);
if (cbuf.nbChars > distance) assert(distance < LEXER_BUF_SIZE);
return (uint8_t)cbuf.buf[(cbuf.index + distance) % LEXER_BUF_SIZE]; if (cbuf.canPeek(distance))
return cbuf.peek(distance);
// Buffer isn't full enough, read some chars in // Buffer isn't full enough, read some chars in
size_t target = LEXER_BUF_SIZE - cbuf.nbChars; // Aim: making the buf full cbuf.refill();
if (cbuf.canPeek(distance))
// Compute the index we'll start writing to return cbuf.peek(distance);
size_t writeIndex = (cbuf.index + cbuf.nbChars) % LEXER_BUF_SIZE;
// If the range to fill passes over the buffer wrapping point, we need two reads
if (writeIndex + target > LEXER_BUF_SIZE) {
size_t nbExpectedChars = LEXER_BUF_SIZE - writeIndex;
size_t nbReadChars = readInternal(cbuf, writeIndex, nbExpectedChars);
cbuf.nbChars += nbReadChars;
writeIndex += nbReadChars;
if (writeIndex == LEXER_BUF_SIZE)
writeIndex = 0;
// If the read was incomplete, don't perform a second read
target -= nbReadChars;
if (nbReadChars < nbExpectedChars)
target = 0;
}
if (target != 0)
cbuf.nbChars += readInternal(cbuf, writeIndex, target);
if (cbuf.nbChars > distance)
return (uint8_t)cbuf.buf[(cbuf.index + distance) % LEXER_BUF_SIZE];
// If there aren't enough chars even after refilling, give up // If there aren't enough chars even after refilling, give up
return EOF; return EOF;
} }
@@ -777,11 +781,7 @@ static void shiftChar() {
restart: restart:
if (!lexerState->expansions.empty()) { if (!lexerState->expansions.empty()) {
// Advance within the current expansion // Advance within the current expansion
Expansion &expansion = lexerState->expansions.front(); if (Expansion &exp = lexerState->expansions.front(); exp.advance()) {
assert(expansion.offset <= expansion.size());
expansion.offset++;
if (expansion.offset > expansion.size()) {
// When advancing would go past an expansion's end, // When advancing would go past an expansion's end,
// move up to its parent and try again to advance // move up to its parent and try again to advance
lexerState->expansions.pop_front(); lexerState->expansions.pop_front();
@@ -795,12 +795,7 @@ restart:
} else { } else {
assert(std::holds_alternative<BufferedContent>(lexerState->content)); assert(std::holds_alternative<BufferedContent>(lexerState->content));
auto &cbuf = std::get<BufferedContent>(lexerState->content); auto &cbuf = std::get<BufferedContent>(lexerState->content);
assert(cbuf.index < LEXER_BUF_SIZE); cbuf.advance();
cbuf.index++;
if (cbuf.index == LEXER_BUF_SIZE)
cbuf.index = 0; // Wrap around if necessary
assert(cbuf.nbChars > 0);
cbuf.nbChars--;
} }
} }
} }
@@ -2174,11 +2169,7 @@ static Capture startCapture() {
if (auto *view = std::get_if<ViewedContent>(&lexerState->content); if (auto *view = std::get_if<ViewedContent>(&lexerState->content);
view && lexerState->expansions.empty()) { view && lexerState->expansions.empty()) {
return { return {
.lineNo = lineNo, .lineNo = lineNo, .span = {.ptr = view->makeSharedContentPtr(), .size = 0}
.span = {
.ptr = std::shared_ptr<char[]>(view->span.ptr, &view->span.ptr[view->offset]),
.size = 0,
}
}; };
} else { } else {
assert(lexerState->captureBuf == nullptr); assert(lexerState->captureBuf == nullptr);
@@ -2194,8 +2185,7 @@ static void endCapture(Capture &capture) {
// This being `nullptr` means we're capturing from the capture buffer, which is reallocated // This being `nullptr` means we're capturing from the capture buffer, which is reallocated
// during the whole capture process, and so MUST be retrieved at the end // during the whole capture process, and so MUST be retrieved at the end
if (!capture.span.ptr) if (!capture.span.ptr)
capture.span.ptr = capture.span.ptr = lexerState->makeSharedCaptureBufPtr();
std::shared_ptr<char[]>(lexerState->captureBuf, lexerState->captureBuf->data());
capture.span.size = lexerState->captureSize; capture.span.size = lexerState->captureSize;
// ENDR/ENDM or EOF puts us past the start of the line // ENDR/ENDM or EOF puts us past the start of the line