diff --git a/include/asm/lexer.hpp b/include/asm/lexer.hpp index 42401216..899453d0 100644 --- a/include/asm/lexer.hpp +++ b/include/asm/lexer.hpp @@ -13,7 +13,7 @@ #include "platform.hpp" // SSIZE_MAX -#define LEXER_BUF_SIZE 42 // TODO: determine a sane value for this +#define LEXER_BUF_SIZE 128 // The buffer needs to be large enough for the maximum `peekInternal` lookahead distance static_assert(LEXER_BUF_SIZE > 1, "Lexer buffer size is too small"); // This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB @@ -31,14 +31,12 @@ enum LexerMode { struct Expansion { std::optional name; std::shared_ptr contents; - size_t offset; // Cursor into the contents + size_t offset; // Cursor into `contents` size_t size() const { return contents->size(); } -}; - -struct IfStackEntry { - bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already - bool reachedElseBlock; // Whether an ELSE block ran already + bool canPeek(uint8_t distance) const { return offset + distance < contents->size(); } + uint8_t peek(uint8_t distance) const { return (*contents)[offset + distance]; } + bool advance(); // Increment `offset`; return whether it then exceeds `contents` }; struct ContentSpan { @@ -47,21 +45,40 @@ struct ContentSpan { }; struct ViewedContent { - ContentSpan span; - size_t offset = 0; + ContentSpan span; // Span of chars + size_t offset = 0; // Cursor into `span.ptr` ViewedContent(ContentSpan const &span_) : span(span_) {} ViewedContent(std::shared_ptr ptr, size_t size) : span({.ptr = ptr, .size = size}) {} + + bool canPeek(uint8_t distance) const { return offset + distance < span.size; } + uint8_t peek(uint8_t distance) const { return span.ptr[offset + distance]; } + std::shared_ptr makeSharedContentPtr() const { + return std::shared_ptr(span.ptr, &span.ptr[offset]); + } }; struct BufferedContent { - int fd; - size_t index = 0; // Read index into the buffer - char buf[LEXER_BUF_SIZE] = {}; // Circular buffer - size_t nbChars = 0; // Number of "fresh" chars in the buffer + int fd; // File from which to read chars + char buf[LEXER_BUF_SIZE] = {}; // Circular buffer of chars + size_t offset = 0; // Cursor into `buf` + size_t size = 0; // Number of "fresh" chars in `buf` BufferedContent(int fd_) : fd(fd_) {} ~BufferedContent(); + + bool canPeek(uint8_t distance) const { return size > distance; } + uint8_t peek(uint8_t distance) const { return buf[(offset + distance) % LEXER_BUF_SIZE]; } + void advance(); // Increment `offset` circularly, decrement `size` + void refill(); // Read from `fd` to fill `buf` + +private: + size_t readMore(size_t startIndex, size_t nbChars); +}; + +struct IfStackEntry { + bool ranIfBlock; // Whether an IF/ELIF/ELSE block ran already + bool reachedElseBlock; // Whether an ELSE block ran already }; struct LexerState { @@ -77,8 +94,7 @@ struct LexerState { bool capturing; // Whether the text being lexed should be captured size_t captureSize; // Amount of text captured - std::shared_ptr> - captureBuf; // Buffer to send the captured text to if non-null + std::shared_ptr> captureBuf; // Buffer to send the captured text to if set bool disableMacroArgs; bool disableInterpolation; @@ -90,6 +106,10 @@ struct LexerState { ~LexerState(); + std::shared_ptr makeSharedCaptureBufPtr() const { + return std::shared_ptr(captureBuf, captureBuf->data()); + } + void setAsCurrentState(); bool setFileAsNextState(std::string const &filePath, bool updateStateNow); void setViewAsNextState(char const *name, ContentSpan const &span, uint32_t lineNo_); diff --git a/src/asm/lexer.cpp b/src/asm/lexer.cpp index f23532a4..03581060 100644 --- a/src/asm/lexer.cpp +++ b/src/asm/lexer.cpp @@ -46,10 +46,8 @@ #include // CloseHandle // clang-format on - #define MAP_FAILED nullptr - -static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t) { - mappingAddr = MAP_FAILED; +static char *mapFile(int fd, std::string const &path, size_t) { + void *mappingAddr = nullptr; if (HANDLE file = CreateFileA( path.c_str(), GENERIC_READ, @@ -67,10 +65,11 @@ static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t) } CloseHandle(file); } + return (char *)mappingAddr; } -struct MunmapDeleter { - MunmapDeleter(size_t) {} +struct FileUnmapDeleter { + FileUnmapDeleter(size_t) {} void operator()(char *mappingAddr) { UnmapViewOfFile(mappingAddr); } }; @@ -78,8 +77,8 @@ struct MunmapDeleter { #else // defined(_MSC_VER) || defined(__MINGW32__) #include -static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t size) { - mappingAddr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0); +static char *mapFile(int fd, std::string const &path, size_t size) { + void *mappingAddr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0); if (mappingAddr == MAP_FAILED && errno == ENOTSUP) { // The implementation may not support MAP_PRIVATE; try again with MAP_SHARED // instead, offering, I believe, weaker guarantees about external modifications to @@ -88,12 +87,13 @@ static void mapFile(void *&mappingAddr, int fd, std::string const &path, size_t printf("mmap(%s, MAP_PRIVATE) failed, retrying with MAP_SHARED\n", path.c_str()); mappingAddr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0); } + return mappingAddr != MAP_FAILED ? (char *)mappingAddr : nullptr; } -struct MunmapDeleter { +struct FileUnmapDeleter { size_t mappingSize; - MunmapDeleter(size_t mappingSize_) : mappingSize(mappingSize_) {} + FileUnmapDeleter(size_t mappingSize_) : mappingSize(mappingSize_) {} void operator()(char *mappingAddr) { munmap(mappingAddr, mappingSize); } }; @@ -418,18 +418,12 @@ bool LexerState::setFileAsNextState(std::string const &filePath, bool updateStat bool isMmapped = false; - if (statBuf.st_size > 0) { + if (size_t size = (size_t)statBuf.st_size; statBuf.st_size > 0) { // Try using `mmap` for better performance - void *mappingAddr; - mapFile(mappingAddr, fd, path, statBuf.st_size); - - if (mappingAddr != MAP_FAILED) { + if (char *mappingAddr = mapFile(fd, path, size); mappingAddr != nullptr) { close(fd); content.emplace( - std::shared_ptr( - (char *)mappingAddr, MunmapDeleter((size_t)statBuf.st_size) - ), - (size_t)statBuf.st_size + std::shared_ptr(mappingAddr, FileUnmapDeleter(size)), size ); if (verbose) printf("File \"%s\" is mmap()ped\n", path.c_str()); @@ -489,10 +483,63 @@ LexerState::~LexerState() { assert(this != lexerStateEOL); } +bool Expansion::advance() { + assert(offset <= size()); + offset++; + return offset > size(); +} + BufferedContent::~BufferedContent() { close(fd); } +void BufferedContent::advance() { + assert(offset < LEXER_BUF_SIZE); + offset++; + if (offset == LEXER_BUF_SIZE) + offset = 0; // Wrap around if necessary + assert(size > 0); + size--; +} + +void BufferedContent::refill() { + size_t target = LEXER_BUF_SIZE - size; // Aim: making the buf full + + // Compute the index we'll start writing to + size_t startIndex = (offset + size) % LEXER_BUF_SIZE; + + // If the range to fill passes over the buffer wrapping point, we need two reads + if (startIndex + target > LEXER_BUF_SIZE) { + size_t nbExpectedChars = LEXER_BUF_SIZE - startIndex; + size_t nbReadChars = readMore(startIndex, nbExpectedChars); + + startIndex += nbReadChars; + if (startIndex == LEXER_BUF_SIZE) + startIndex = 0; + + // If the read was incomplete, don't perform a second read + target -= nbReadChars; + if (nbReadChars < nbExpectedChars) + target = 0; + } + if (target != 0) + readMore(startIndex, target); +} + +size_t BufferedContent::readMore(size_t startIndex, size_t nbChars) { + // This buffer overflow made me lose WEEKS of my life. Never again. + assert(startIndex + nbChars <= LEXER_BUF_SIZE); + ssize_t nbReadChars = read(fd, &buf[startIndex], nbChars); + + if (nbReadChars == -1) + fatalerror("Error while reading \"%s\": %s\n", lexerState->path.c_str(), strerror(errno)); + + size += nbReadChars; + + // `nbReadChars` cannot be negative, so it's fine to cast to `size_t` + return (size_t)nbReadChars; +} + void lexer_SetMode(LexerMode mode) { lexerState->mode = mode; } @@ -640,75 +687,32 @@ static std::shared_ptr readMacroArg(char name) { } } -static size_t readInternal(BufferedContent &cbuf, size_t bufIndex, size_t nbChars) { - // This buffer overflow made me lose WEEKS of my life. Never again. - assert(bufIndex + nbChars <= LEXER_BUF_SIZE); - ssize_t nbReadChars = read(cbuf.fd, &cbuf.buf[bufIndex], nbChars); - - if (nbReadChars == -1) - fatalerror("Error while reading \"%s\": %s\n", lexerState->path.c_str(), strerror(errno)); - - // `nbReadChars` cannot be negative, so it's fine to cast to `size_t` - return (size_t)nbReadChars; -} - // We only need one character of lookahead, for macro arguments static int peekInternal(uint8_t distance) { for (Expansion &exp : lexerState->expansions) { // An expansion that has reached its end will have `exp->offset` == `exp->size()`, // and `peekInternal` will continue with its parent assert(exp.offset <= exp.size()); - if (distance < exp.size() - exp.offset) - return (*exp.contents)[exp.offset + distance]; + if (exp.canPeek(distance)) + return exp.peek(distance); distance -= exp.size() - exp.offset; } - if (distance >= LEXER_BUF_SIZE) - fatalerror( - "Internal lexer error: buffer has insufficient size for peeking (%" PRIu8 " >= %u)\n", - distance, - LEXER_BUF_SIZE - ); - if (auto *view = std::get_if(&lexerState->content); view) { - if (size_t idx = view->offset + distance; idx < view->span.size) - return (uint8_t)view->span.ptr[idx]; + if (view->canPeek(distance)) + return view->peek(distance); return EOF; } else { assert(std::holds_alternative(lexerState->content)); auto &cbuf = std::get(lexerState->content); - if (cbuf.nbChars > distance) - return (uint8_t)cbuf.buf[(cbuf.index + distance) % LEXER_BUF_SIZE]; - + assert(distance < LEXER_BUF_SIZE); + if (cbuf.canPeek(distance)) + return cbuf.peek(distance); // Buffer isn't full enough, read some chars in - size_t target = LEXER_BUF_SIZE - cbuf.nbChars; // Aim: making the buf full - - // Compute the index we'll start writing to - size_t writeIndex = (cbuf.index + cbuf.nbChars) % LEXER_BUF_SIZE; - - // If the range to fill passes over the buffer wrapping point, we need two reads - if (writeIndex + target > LEXER_BUF_SIZE) { - size_t nbExpectedChars = LEXER_BUF_SIZE - writeIndex; - size_t nbReadChars = readInternal(cbuf, writeIndex, nbExpectedChars); - - cbuf.nbChars += nbReadChars; - - writeIndex += nbReadChars; - if (writeIndex == LEXER_BUF_SIZE) - writeIndex = 0; - - // If the read was incomplete, don't perform a second read - target -= nbReadChars; - if (nbReadChars < nbExpectedChars) - target = 0; - } - if (target != 0) - cbuf.nbChars += readInternal(cbuf, writeIndex, target); - - if (cbuf.nbChars > distance) - return (uint8_t)cbuf.buf[(cbuf.index + distance) % LEXER_BUF_SIZE]; - + cbuf.refill(); + if (cbuf.canPeek(distance)) + return cbuf.peek(distance); // If there aren't enough chars even after refilling, give up return EOF; } @@ -777,11 +781,7 @@ static void shiftChar() { restart: if (!lexerState->expansions.empty()) { // Advance within the current expansion - Expansion &expansion = lexerState->expansions.front(); - - assert(expansion.offset <= expansion.size()); - expansion.offset++; - if (expansion.offset > expansion.size()) { + if (Expansion &exp = lexerState->expansions.front(); exp.advance()) { // When advancing would go past an expansion's end, // move up to its parent and try again to advance lexerState->expansions.pop_front(); @@ -795,12 +795,7 @@ restart: } else { assert(std::holds_alternative(lexerState->content)); auto &cbuf = std::get(lexerState->content); - assert(cbuf.index < LEXER_BUF_SIZE); - cbuf.index++; - if (cbuf.index == LEXER_BUF_SIZE) - cbuf.index = 0; // Wrap around if necessary - assert(cbuf.nbChars > 0); - cbuf.nbChars--; + cbuf.advance(); } } } @@ -2174,11 +2169,7 @@ static Capture startCapture() { if (auto *view = std::get_if(&lexerState->content); view && lexerState->expansions.empty()) { return { - .lineNo = lineNo, - .span = { - .ptr = std::shared_ptr(view->span.ptr, &view->span.ptr[view->offset]), - .size = 0, - } + .lineNo = lineNo, .span = {.ptr = view->makeSharedContentPtr(), .size = 0} }; } else { assert(lexerState->captureBuf == nullptr); @@ -2194,8 +2185,7 @@ static void endCapture(Capture &capture) { // This being `nullptr` means we're capturing from the capture buffer, which is reallocated // during the whole capture process, and so MUST be retrieved at the end if (!capture.span.ptr) - capture.span.ptr = - std::shared_ptr(lexerState->captureBuf, lexerState->captureBuf->data()); + capture.span.ptr = lexerState->makeSharedCaptureBufPtr(); capture.span.size = lexerState->captureSize; // ENDR/ENDM or EOF puts us past the start of the line