Use std::variant for lexer mmap/buffer state (#1328)

This commit is contained in:
Sylvie
2024-03-03 18:45:50 -05:00
committed by GitHub
parent 6b67c82b94
commit 8cf446b14c
2 changed files with 178 additions and 136 deletions

View File

@@ -6,6 +6,7 @@
#include <deque> #include <deque>
#include <optional> #include <optional>
#include <string> #include <string>
#include <variant>
#include <vector> #include <vector>
#include "platform.hpp" // SSIZE_MAX #include "platform.hpp" // SSIZE_MAX
@@ -44,15 +45,18 @@ struct IfStackEntry {
}; };
struct MmappedLexerState { struct MmappedLexerState {
union { char *ptr;
char const *unreferenced;
char *referenced; // Non-`const` only so it can be `munmap()`ped
} ptr;
size_t size; size_t size;
size_t offset; size_t offset;
bool isReferenced; // If a macro in this file requires not unmapping it bool isReferenced; // If a macro in this file requires not unmapping it
}; };
struct ViewedLexerState {
char const *ptr;
size_t size;
size_t offset;
};
struct BufferedLexerState { struct BufferedLexerState {
int fd; int fd;
size_t index; // Read index into the buffer size_t index; // Read index into the buffer
@@ -63,16 +67,6 @@ struct BufferedLexerState {
struct LexerState { struct LexerState {
char const *path; char const *path;
// mmap()-dependent IO state
bool isMmapped;
union {
MmappedLexerState mmap; // If mmap()ed
BufferedLexerState cbuf; // Otherwise
};
// Common state
bool isFile;
enum LexerMode mode; enum LexerMode mode;
bool atLineStart; bool atLineStart;
uint32_t lineNo; uint32_t lineNo;
@@ -90,6 +84,13 @@ struct LexerState {
size_t macroArgScanDistance; // Max distance already scanned for macro args size_t macroArgScanDistance; // Max distance already scanned for macro args
bool expandStrings; bool expandStrings;
std::deque<Expansion> expansions; // Front is the innermost current expansion std::deque<Expansion> expansions; // Front is the innermost current expansion
std::variant<
std::monostate,
MmappedLexerState,
ViewedLexerState,
BufferedLexerState
> content;
}; };
extern LexerState *lexerState; extern LexerState *lexerState;

View File

@@ -17,6 +17,7 @@
#include <string> #include <string>
#include <string.h> #include <string.h>
#include <unordered_map> #include <unordered_map>
#include <variant>
#include <vector> #include <vector>
#ifndef _MSC_VER #ifndef _MSC_VER
#include <unistd.h> #include <unistd.h>
@@ -369,64 +370,68 @@ void lexer_ReachELSEBlock()
bool lexer_OpenFile(LexerState &state, char const *path) bool lexer_OpenFile(LexerState &state, char const *path)
{ {
bool isStdin = !strcmp(path, "-"); if (!strcmp(path, "-")) {
struct stat fileInfo; state.path = "<stdin>";
state.content = BufferedLexerState{
// Give stdin a nicer file name .fd = STDIN_FILENO,
if (isStdin) .index = 0,
path = "<stdin>"; .buf = {},
if (!isStdin && stat(path, &fileInfo) != 0) { .nbChars = 0
error("Failed to stat file \"%s\": %s\n", path, strerror(errno)); };
return false; if (verbose)
} printf("Opening stdin\n");
state.path = path; } else {
state.isFile = true; struct stat fileInfo;
state.cbuf.fd = isStdin ? STDIN_FILENO : open(path, O_RDONLY); if (stat(path, &fileInfo) != 0) {
if (state.cbuf.fd < 0) { error("Failed to stat file \"%s\": %s\n", path, strerror(errno));
error("Failed to open file \"%s\": %s\n", path, strerror(errno)); return false;
return false;
}
state.isMmapped = false; // By default, assume it won't be mmap()ed
if (!isStdin && fileInfo.st_size > 0) {
// Try using `mmap` for better performance
// Important: do NOT assign to `state.mmap.ptr.referenced` directly, to avoid a
// cast that may alter an eventual `MAP_FAILED` value. It would also invalidate
// `state.cbuf.fd`, being on the other side of the union.
void *mappingAddr;
mapFile(mappingAddr, state.cbuf.fd, state.path, fileInfo.st_size);
if (mappingAddr == MAP_FAILED) {
// If mmap()ing failed, try again using another method (below)
state.isMmapped = false;
} else {
// IMPORTANT: the `union` mandates this is accessed before other members!
close(state.cbuf.fd);
state.isMmapped = true;
state.mmap.isReferenced = false; // By default, a state isn't referenced
state.mmap.ptr.referenced = (char *)mappingAddr;
assert(fileInfo.st_size >= 0);
state.mmap.size = (size_t)fileInfo.st_size;
state.mmap.offset = 0;
if (verbose)
printf("File %s successfully mmap()ped\n", path);
} }
} state.path = path;
if (!state.isMmapped) {
// Sometimes mmap() fails or isn't available, so have a fallback int fd = open(path, O_RDONLY);
if (verbose) { if (fd < 0) {
if (isStdin) error("Failed to open file \"%s\": %s\n", path, strerror(errno));
printf("Opening stdin\n"); return false;
else if (fileInfo.st_size == 0) }
printf("File %s is empty\n", path);
else bool isMmapped = false;
printf("File %s opened as regular, errno reports \"%s\"\n",
path, strerror(errno)); if (fileInfo.st_size > 0) {
// Try using `mmap` for better performance
void *mappingAddr;
mapFile(mappingAddr, fd, state.path, fileInfo.st_size);
if (mappingAddr != MAP_FAILED) {
close(fd);
state.content = MmappedLexerState{
.ptr = (char *)mappingAddr,
.size = (size_t)fileInfo.st_size,
.offset = 0,
.isReferenced = false
};
if (verbose)
printf("File \"%s\" is mmap()ped\n", path);
isMmapped = true;
}
}
if (!isMmapped) {
// Sometimes mmap() fails or isn't available, so have a fallback
state.content = BufferedLexerState{
.fd = fd,
.index = 0,
.buf = {},
.nbChars = 0
};
if (verbose) {
if (fileInfo.st_size == 0) {
printf("File \"%s\" is empty\n", path);
} else {
printf("File \"%s\" is opened; errno reports: %s\n",
path, strerror(errno));
}
}
} }
state.cbuf.index = 0;
state.cbuf.nbChars = 0;
} }
initState(state); initState(state);
@@ -438,19 +443,22 @@ void lexer_OpenFileView(LexerState &state, char const *path, char const *buf, si
uint32_t lineNo) uint32_t lineNo)
{ {
state.path = path; // Used to report read errors in `peekInternal` state.path = path; // Used to report read errors in `peekInternal`
state.isFile = false; state.content = ViewedLexerState{
state.isMmapped = true; // It's not *really* mmap()ed, but it behaves the same .ptr = buf,
state.mmap.ptr.unreferenced = buf; .size = size,
state.mmap.size = size; .offset = 0
state.mmap.offset = 0; };
initState(state); initState(state);
state.lineNo = lineNo; // Will be incremented at first line start state.lineNo = lineNo; // Will be incremented at first line start
} }
void lexer_RestartRept(uint32_t lineNo) void lexer_RestartRept(uint32_t lineNo)
{ {
lexerState->mmap.offset = 0; std::visit(Visitor{
[](MmappedLexerState &mmap) { mmap.offset = 0; },
[](ViewedLexerState &view) { view.offset = 0; },
[](auto &) {},
}, lexerState->content);
initState(*lexerState); initState(*lexerState);
lexerState->lineNo = lineNo; lexerState->lineNo = lineNo;
} }
@@ -470,10 +478,16 @@ void lexer_CleanupState(LexerState &state)
// `lexerStateEOL`, but there's currently no situation in which this should happen. // `lexerStateEOL`, but there's currently no situation in which this should happen.
assert(&state != lexerStateEOL); assert(&state != lexerStateEOL);
if (!state.isMmapped) std::visit(Visitor{
close(state.cbuf.fd); [](MmappedLexerState &mmap) {
else if (state.isFile && !state.mmap.isReferenced) if (!mmap.isReferenced)
munmap(state.mmap.ptr.referenced, state.mmap.size); munmap(mmap.ptr, mmap.size);
},
[](BufferedLexerState &cbuf) {
close(cbuf.fd);
},
[](auto &) {},
}, state.content);
} }
void lexer_SetMode(enum LexerMode mode) void lexer_SetMode(enum LexerMode mode)
@@ -628,11 +642,11 @@ static char const *readMacroArg(char name)
return str; return str;
} }
static size_t readInternal(size_t bufIndex, size_t nbChars) static size_t readInternal(BufferedLexerState &cbuf, size_t bufIndex, size_t nbChars)
{ {
// This buffer overflow made me lose WEEKS of my life. Never again. // This buffer overflow made me lose WEEKS of my life. Never again.
assert(bufIndex + nbChars <= LEXER_BUF_SIZE); assert(bufIndex + nbChars <= LEXER_BUF_SIZE);
ssize_t nbReadChars = read(lexerState->cbuf.fd, &lexerState->cbuf.buf[bufIndex], nbChars); ssize_t nbReadChars = read(cbuf.fd, &cbuf.buf[bufIndex], nbChars);
if (nbReadChars == -1) if (nbReadChars == -1)
fatalerror("Error while reading \"%s\": %s\n", lexerState->path, strerror(errno)); fatalerror("Error while reading \"%s\": %s\n", lexerState->path, strerror(errno));
@@ -657,45 +671,56 @@ static int peekInternal(uint8_t distance)
fatalerror("Internal lexer error: buffer has insufficient size for peeking (%" fatalerror("Internal lexer error: buffer has insufficient size for peeking (%"
PRIu8 " >= %u)\n", distance, LEXER_BUF_SIZE); PRIu8 " >= %u)\n", distance, LEXER_BUF_SIZE);
if (lexerState->isMmapped) { return std::visit(Visitor{
size_t index = lexerState->mmap.offset + distance; [&distance](MmappedLexerState &mmap) -> int {
if (size_t idx = mmap.offset + distance; idx < mmap.size)
return index < lexerState->mmap.size ? return (uint8_t)mmap.ptr[idx];
(uint8_t)lexerState->mmap.ptr.unreferenced[index] : EOF;
}
if (lexerState->cbuf.nbChars <= distance) {
// Buffer isn't full enough, read some chars in
size_t target = LEXER_BUF_SIZE - lexerState->cbuf.nbChars; // Aim: making the buf full
// Compute the index we'll start writing to
size_t writeIndex = (lexerState->cbuf.index + lexerState->cbuf.nbChars) % LEXER_BUF_SIZE;
// If the range to fill passes over the buffer wrapping point, we need two reads
if (writeIndex + target > LEXER_BUF_SIZE) {
size_t nbExpectedChars = LEXER_BUF_SIZE - writeIndex;
size_t nbReadChars = readInternal(writeIndex, nbExpectedChars);
lexerState->cbuf.nbChars += nbReadChars;
writeIndex += nbReadChars;
if (writeIndex == LEXER_BUF_SIZE)
writeIndex = 0;
// If the read was incomplete, don't perform a second read
target -= nbReadChars;
if (nbReadChars < nbExpectedChars)
target = 0;
}
if (target != 0)
lexerState->cbuf.nbChars += readInternal(writeIndex, target);
// If there aren't enough chars even after refilling, give up
if (lexerState->cbuf.nbChars <= distance)
return EOF; return EOF;
} },
[&distance](ViewedLexerState &view) -> int {
if (size_t idx = view.offset + distance; idx < view.size)
return (uint8_t)view.ptr[idx];
return EOF;
},
[&distance](BufferedLexerState &cbuf) -> int {
if (cbuf.nbChars > distance)
return (uint8_t)cbuf.buf[(cbuf.index + distance) % LEXER_BUF_SIZE];
return (unsigned char)lexerState->cbuf.buf[(lexerState->cbuf.index + distance) % LEXER_BUF_SIZE]; // Buffer isn't full enough, read some chars in
size_t target = LEXER_BUF_SIZE - cbuf.nbChars; // Aim: making the buf full
// Compute the index we'll start writing to
size_t writeIndex = (cbuf.index + cbuf.nbChars) % LEXER_BUF_SIZE;
// If the range to fill passes over the buffer wrapping point, we need two reads
if (writeIndex + target > LEXER_BUF_SIZE) {
size_t nbExpectedChars = LEXER_BUF_SIZE - writeIndex;
size_t nbReadChars = readInternal(cbuf, writeIndex, nbExpectedChars);
cbuf.nbChars += nbReadChars;
writeIndex += nbReadChars;
if (writeIndex == LEXER_BUF_SIZE)
writeIndex = 0;
// If the read was incomplete, don't perform a second read
target -= nbReadChars;
if (nbReadChars < nbExpectedChars)
target = 0;
}
if (target != 0)
cbuf.nbChars += readInternal(cbuf, writeIndex, target);
if (cbuf.nbChars > distance)
return (uint8_t)cbuf.buf[(cbuf.index + distance) % LEXER_BUF_SIZE];
// If there aren't enough chars even after refilling, give up
return EOF;
},
[](std::monostate) -> int {
return EOF;
}
}, lexerState->content);
} }
// forward declarations for peek // forward declarations for peek
@@ -775,16 +800,23 @@ restart:
} else { } else {
// Advance within the file contents // Advance within the file contents
lexerState->colNo++; lexerState->colNo++;
if (lexerState->isMmapped) { std::visit(Visitor{
lexerState->mmap.offset++; [](MmappedLexerState &mmap) {
} else { mmap.offset++;
assert(lexerState->cbuf.index < LEXER_BUF_SIZE); },
lexerState->cbuf.index++; [](ViewedLexerState &view) {
if (lexerState->cbuf.index == LEXER_BUF_SIZE) view.offset++;
lexerState->cbuf.index = 0; // Wrap around if necessary },
assert(lexerState->cbuf.nbChars > 0); [](BufferedLexerState &cbuf) {
lexerState->cbuf.nbChars--; assert(cbuf.index < LEXER_BUF_SIZE);
} cbuf.index++;
if (cbuf.index == LEXER_BUF_SIZE)
cbuf.index = 0; // Wrap around if necessary
assert(cbuf.nbChars > 0);
cbuf.nbChars--;
},
[](std::monostate) {}
}, lexerState->content);
} }
} }
@@ -2273,15 +2305,24 @@ static void startCapture(CaptureBody &capture)
lexerState->disableInterpolation = true; lexerState->disableInterpolation = true;
capture.lineNo = lexer_GetLineNo(); capture.lineNo = lexer_GetLineNo();
capture.body = std::visit(Visitor{
[](MmappedLexerState &mmap) -> char const * {
return lexerState->expansions.empty() ? &mmap.ptr[mmap.offset] : nullptr;
},
[](ViewedLexerState &view) -> char const * {
return lexerState->expansions.empty() ? &view.ptr[view.offset] : nullptr;
},
[](auto &) -> char const * {
return nullptr;
}
}, lexerState->content);
if (lexerState->isMmapped && lexerState->expansions.empty()) { if (capture.body == nullptr) {
capture.body = &lexerState->mmap.ptr.unreferenced[lexerState->mmap.offset]; // Indicates to retrieve the capture buffer when done capturing
} else {
assert(lexerState->captureBuf == nullptr); assert(lexerState->captureBuf == nullptr);
lexerState->captureBuf = new(std::nothrow) std::vector<char>(); lexerState->captureBuf = new(std::nothrow) std::vector<char>();
if (!lexerState->captureBuf) if (!lexerState->captureBuf)
fatalerror("Failed to allocate capture buffer: %s\n", strerror(errno)); fatalerror("Failed to allocate capture buffer: %s\n", strerror(errno));
capture.body = nullptr; // Indicate to retrieve the capture buffer when done capturing
} }
} }
@@ -2362,8 +2403,8 @@ bool lexer_CaptureMacroBody(CaptureBody &capture)
startCapture(capture); startCapture(capture);
// If the file is `mmap`ed, we need not to unmap it to keep access to the macro // If the file is `mmap`ed, we need not to unmap it to keep access to the macro
if (lexerState->isMmapped) if (MmappedLexerState *mmap = std::get_if<MmappedLexerState>(&lexerState->content); mmap)
lexerState->mmap.isReferenced = true; mmap->isReferenced = true;
int c = EOF; int c = EOF;