Implement new string functions (#1655)

`STRFIND`, `STRRFIND`, `STRCHAR`, `STRSLICE`, `CHARCMP`, `CHARSIZE`, and `REVCHAR`
This commit is contained in:
Rangi
2025-02-14 23:09:45 +01:00
committed by GitHub
parent ad4d9da4cf
commit 3feb75f84f
27 changed files with 584 additions and 134 deletions

View File

@@ -31,6 +31,29 @@ struct CharmapNode {
struct Charmap {
	std::string name;
	std::vector<CharmapNode> nodes; // first node is reserved for the root node

	// Walks the trie depth-first with an explicit stack, starting at the root (node 0
	// with an empty mapping). For every terminal node, `callback(nodeIdx, mapping)` is
	// invoked, where `mapping` is the byte string spelled by the path to that node.
	// If the callback returns false, the walk stops immediately and this returns false;
	// otherwise every terminal node is visited and this returns true.
	template<typename F>
	bool forEachChar(F callback) const {
		std::stack<std::pair<size_t, std::string>> pending;
		pending.emplace(0, "");

		while (!pending.empty()) {
			std::pair<size_t, std::string> entry = std::move(pending.top());
			pending.pop();

			CharmapNode const &node = nodes[entry.first];
			if (node.isTerminal() && !callback(entry.first, entry.second)) {
				return false;
			}

			// Queue every existing child; a zero index means "no child for this byte".
			for (unsigned c = 0; c < std::size(node.next); c++) {
				size_t nextIdx = node.next[c];
				if (nextIdx) {
					pending.push({nextIdx, entry.second + static_cast<char>(c)});
				}
			}
		}

		return true;
	}
};
static std::deque<Charmap> charmapList;
@@ -44,24 +67,12 @@ bool charmap_ForEach(
void (*charFunc)(std::string const &, std::vector<int32_t>)
) {
for (Charmap const &charmap : charmapList) {
// Traverse the trie depth-first to derive the character mappings in definition order
std::map<size_t, std::string> mappings;
// clang-format off: nested initializers
for (std::stack<std::pair<size_t, std::string>> prefixes({{0, ""}});
!prefixes.empty();) {
// clang-format on
auto [nodeIdx, mapping] = std::move(prefixes.top());
prefixes.pop();
CharmapNode const &node = charmap.nodes[nodeIdx];
if (node.isTerminal()) {
mappings[nodeIdx] = mapping;
}
for (unsigned c = 0; c < 256; c++) {
if (size_t nextIdx = node.next[c]; nextIdx) {
prefixes.push({nextIdx, mapping + static_cast<char>(c)});
}
}
}
charmap.forEachChar([&mappings](size_t nodeIdx, std::string const &mapping) {
mappings[nodeIdx] = mapping;
return true;
});
mapFunc(charmap.name);
for (auto [nodeIdx, mapping] : mappings) {
charFunc(mapping, charmap.nodes[nodeIdx].value);
@@ -178,6 +189,22 @@ bool charmap_HasChar(std::string const &mapping) {
return charmap.nodes[nodeIdx].isTerminal();
}
size_t charmap_CharSize(std::string const &mapping) {
Charmap const &charmap = *currentCharmap;
size_t nodeIdx = 0;
for (char c : mapping) {
nodeIdx = charmap.nodes[nodeIdx].next[static_cast<uint8_t>(c)];
if (!nodeIdx) {
return 0;
}
}
CharmapNode const &node = charmap.nodes[nodeIdx];
return node.isTerminal() ? node.value.size() : 0;
}
std::vector<int32_t> charmap_Convert(std::string const &input) {
std::vector<int32_t> output;
for (std::string_view inputView = input; charmap_ConvertNext(inputView, &output);) {}
@@ -263,3 +290,20 @@ size_t charmap_ConvertNext(std::string_view &input, std::vector<int32_t> *output
input = input.substr(inputIdx);
return matchLen;
}
std::string charmap_Reverse(std::vector<int32_t> const &value, bool &unique) {
Charmap const &charmap = *currentCharmap;
std::string revMapping;
unique = charmap.forEachChar([&](size_t nodeIdx, std::string const &mapping) {
if (charmap.nodes[nodeIdx].value == value) {
if (revMapping.empty()) {
revMapping = mapping;
} else {
revMapping.clear();
return false;
}
}
return true;
});
return revMapping;
}

View File

@@ -240,19 +240,26 @@ static std::unordered_map<std::string, int, CaseInsensitive, CaseInsensitive> ke
{"TZCOUNT", T_(OP_TZCOUNT) },
{"STRCAT", T_(OP_STRCAT) },
{"STRCHAR", T_(OP_STRCHAR) },
{"STRCMP", T_(OP_STRCMP) },
{"STRFIND", T_(OP_STRFIND) },
{"STRFMT", T_(OP_STRFMT) },
{"STRIN", T_(OP_STRIN) },
{"STRLEN", T_(OP_STRLEN) },
{"STRLWR", T_(OP_STRLWR) },
{"STRRFIND", T_(OP_STRRFIND) },
{"STRRIN", T_(OP_STRRIN) },
{"STRRPL", T_(OP_STRRPL) },
{"STRSLICE", T_(OP_STRSLICE) },
{"STRSUB", T_(OP_STRSUB) },
{"STRUPR", T_(OP_STRUPR) },
{"CHARCMP", T_(OP_CHARCMP) },
{"CHARLEN", T_(OP_CHARLEN) },
{"CHARSIZE", T_(OP_CHARSIZE) },
{"CHARSUB", T_(OP_CHARSUB) },
{"INCHARMAP", T_(OP_INCHARMAP) },
{"REVCHAR", T_(OP_REVCHAR) },
{"INCLUDE", T_(POP_INCLUDE) },
{"PRINT", T_(POP_PRINT) },

View File

@@ -64,9 +64,13 @@
// Forward declarations of the string helpers used by the grammar actions below.
static uint32_t strToNum(std::vector<int32_t> const &s);
static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName);
static size_t strlenUTF8(std::string const &str, bool printErrors);
// Returns the characters of `str` whose 0-based indices lie in [start, stop).
static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop);
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len);
static size_t charlenUTF8(std::string const &str);
// Returns the piece of `str` consumed by the charmap conversion at 0-based index `idx`.
static std::string strcharUTF8(std::string const &str, uint32_t idx);
static std::string charsubUTF8(std::string const &str, uint32_t pos);
// Three-way comparison (-1, 0, 1) of two strings by their charmap-converted values.
static int32_t charcmp(std::string_view str1, std::string_view str2);
// Adds `len` to a negative index; warns and clamps to 0 if it is still negative.
static uint32_t adjustNegativeIndex(int32_t idx, size_t len, char const *functionName);
static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName);
static std::string strrpl(std::string_view str, std::string const &old, std::string const &rep);
static std::string strfmt(
@@ -260,7 +264,9 @@
%token OP_BANK "BANK"
%token OP_BITWIDTH "BITWIDTH"
%token OP_CEIL "CEIL"
%token OP_CHARCMP "CHARCMP"
%token OP_CHARLEN "CHARLEN"
%token OP_CHARSIZE "CHARSIZE"
%token OP_CHARSUB "CHARSUB"
%token OP_COS "COS"
%token OP_DEF "DEF"
@@ -274,18 +280,23 @@
%token OP_LOG "LOG"
%token OP_LOW "LOW"
%token OP_POW "POW"
%token OP_REVCHAR "REVCHAR"
%token OP_ROUND "ROUND"
%token OP_SIN "SIN"
%token OP_SIZEOF "SIZEOF"
%token OP_STARTOF "STARTOF"
%token OP_STRCAT "STRCAT"
%token OP_STRCHAR "STRCHAR"
%token OP_STRCMP "STRCMP"
%token OP_STRFIND "STRFIND"
%token OP_STRFMT "STRFMT"
%token OP_STRIN "STRIN"
%token OP_STRLEN "STRLEN"
%token OP_STRLWR "STRLWR"
%token OP_STRRFIND "STRRFIND"
%token OP_STRRIN "STRRIN"
%token OP_STRRPL "STRRPL"
%token OP_STRSLICE "STRSLICE"
%token OP_STRSUB "STRSUB"
%token OP_STRUPR "STRUPR"
%token OP_TAN "TAN"
@@ -1461,6 +1472,14 @@ relocexpr_no_str:
| OP_STRCMP LPAREN string COMMA string RPAREN {
$$.makeNumber($3.compare($5));
}
| OP_STRFIND LPAREN string COMMA string RPAREN {
// STRFIND(str, sub): 0-based offset of the first occurrence of `sub` in `str`
// as reported by `find`, or -1 when `sub` does not occur.
// NOTE(review): `find` presumably counts bytes, not UTF-8 characters — confirm.
size_t pos = $3.find($5);
$$.makeNumber(pos != std::string::npos ? pos : -1);
}
| OP_STRRFIND LPAREN string COMMA string RPAREN {
// STRRFIND(str, sub): 0-based offset of the last occurrence of `sub` in `str`
// as reported by `rfind`, or -1 when `sub` does not occur.
// NOTE(review): `rfind` presumably counts bytes, not UTF-8 characters — confirm.
size_t pos = $3.rfind($5);
$$.makeNumber(pos != std::string::npos ? pos : -1);
}
| OP_STRIN LPAREN string COMMA string RPAREN {
size_t pos = $3.find($5);
$$.makeNumber(pos != std::string::npos ? pos + 1 : 0);
@@ -1478,6 +1497,16 @@ relocexpr_no_str:
| OP_INCHARMAP LPAREN string RPAREN {
$$.makeNumber(charmap_HasChar($3));
}
| OP_CHARCMP LPAREN string COMMA string RPAREN {
// CHARCMP(str1, str2): compares the two strings by their charmap-converted
// values (see charcmp); the result is -1, 0, or 1.
$$.makeNumber(charcmp($3, $5));
}
| OP_CHARSIZE LPAREN string RPAREN {
// CHARSIZE(char): number of values the given mapping converts to in the
// current charmap. charmap_CharSize returns 0 when the string is not a
// complete mapping; that is reported as an error and 0 is still produced.
size_t charSize = charmap_CharSize($3);
if (charSize == 0) {
::error("CHARSIZE: No character mapping for \"%s\"\n", $3.c_str());
}
$$.makeNumber(charSize);
}
| LPAREN relocexpr RPAREN {
$$ = std::move($2);
}
@@ -1515,6 +1544,17 @@ string:
STRING {
$$ = std::move($1);
}
| OP_STRSLICE LPAREN string COMMA iconst COMMA iconst RPAREN {
// STRSLICE(str, start, stop): characters with 0-based indices in [start, stop).
// Negative indices are offset by the string's length (as counted by strlenUTF8)
// via adjustNegativeIndex before slicing.
size_t len = strlenUTF8($3, false);
uint32_t start = adjustNegativeIndex($5, len, "STRSLICE");
uint32_t stop = adjustNegativeIndex($7, len, "STRSLICE");
$$ = strsliceUTF8($3, start, stop);
}
| OP_STRSLICE LPAREN string COMMA iconst RPAREN {
	// STRSLICE(str, start): slice from `start` through the end of the string.
	// strsliceUTF8 treats its stop index as exclusive (it stops once it has
	// counted `stop` characters), so "through the end" is `len`, not `len - 1`.
	// Passing `len - 1` dropped the last character, and for an empty string it
	// wrapped around to a huge stop index and raised a spurious warning.
	size_t len = strlenUTF8($3, false);
	uint32_t start = adjustNegativeIndex($5, len, "STRSLICE");
	$$ = strsliceUTF8($3, start, len);
}
| OP_STRSUB LPAREN string COMMA iconst COMMA uconst RPAREN {
size_t len = strlenUTF8($3, false);
uint32_t pos = adjustNegativePos($5, len, "STRSUB");
@@ -1525,11 +1565,25 @@ string:
uint32_t pos = adjustNegativePos($5, len, "STRSUB");
$$ = strsubUTF8($3, pos, pos > len ? 0 : len + 1 - pos);
}
| OP_STRCHAR LPAREN string COMMA iconst RPAREN {
// STRCHAR(str, idx): the piece of `str` at 0-based index `idx`, where the
// string is split by charmap conversion (see strcharUTF8). A negative `idx`
// is offset by the length from charlenUTF8 via adjustNegativeIndex.
size_t len = charlenUTF8($3);
uint32_t idx = adjustNegativeIndex($5, len, "STRCHAR");
$$ = strcharUTF8($3, idx);
}
| OP_CHARSUB LPAREN string COMMA iconst RPAREN {
size_t len = charlenUTF8($3);
uint32_t pos = adjustNegativePos($5, len, "CHARSUB");
$$ = charsubUTF8($3, pos);
}
| OP_REVCHAR LPAREN charmap_args RPAREN {
// REVCHAR(values...): the mapping in the current charmap whose value equals
// the given sequence. charmap_Reverse sets `unique` to false (and returns an
// empty string) when more than one mapping matches; an empty result with
// `unique` still true means nothing matched. Both cases are errors.
bool unique;
$$ = charmap_Reverse($3, unique);
if (!unique) {
::error("REVCHAR: Multiple character mappings to values\n");
} else if ($$.empty()) {
::error("REVCHAR: No character mapping to values\n");
}
}
| OP_STRCAT LPAREN RPAREN {
$$.clear();
}
@@ -2516,6 +2570,70 @@ static size_t strlenUTF8(std::string const &str, bool printErrors) {
return len;
}
// Returns the characters of `str` whose 0-based indices lie in [start, stop).
// Characters are counted with the UTF-8 decoder; an invalid byte is reported as an
// error and counted as one character. Warnings are issued when `start` or `stop`
// lies beyond the end of the string, and an error when the string ends in the
// middle of a character. If `stop` is not greater than `start`, the result is empty.
static std::string strsliceUTF8(std::string const &str, uint32_t start, uint32_t stop) {
	char const *bytes = str.c_str();
	size_t offset = 0;
	uint32_t decoderState = 0;
	uint32_t codepoint = 0;
	uint32_t charCount = 0;

	// Consumes bytes until `charCount` reaches `limit` or the terminating NUL is hit.
	// A decoder result of 0 completes a character; a result of 1 is handled as an
	// invalid byte, which resets the decoder and also counts as one character.
	auto advanceTo = [&](uint32_t limit) {
		for (; bytes[offset] && charCount < limit; offset++) {
			auto const result = decode(&decoderState, &codepoint, bytes[offset]);
			if (result == 1) {
				errorInvalidUTF8Byte(bytes[offset], "STRSLICE");
				decoderState = 0;
			}
			if (result == 0 || result == 1) {
				charCount++;
			}
		}
	};

	advanceTo(start);

	// A start index equal to the string's length is accepted silently (it yields an
	// empty slice); only a start index beyond that is worth a warning.
	if (!bytes[offset] && start > charCount) {
		warning(
		    WARNING_BUILTIN_ARG,
		    "STRSLICE: Start index %" PRIu32 " is past the end of the string\n",
		    start
		);
	}

	size_t const sliceBegin = offset;

	advanceTo(stop);

	// The string ended in the middle of a multi-byte character.
	if (decoderState != 0) {
		error("STRSLICE: Incomplete UTF-8 character\n");
		charCount++;
	}

	if (charCount < stop) {
		warning(
		    WARNING_BUILTIN_ARG,
		    "STRSLICE: Stop index %" PRIu32 " is past the end of the string\n",
		    stop
		);
	}

	return std::string(bytes + sliceBegin, bytes + offset);
}
static std::string strsubUTF8(std::string const &str, uint32_t pos, uint32_t len) {
char const *ptr = str.c_str();
size_t index = 0;
@@ -2584,6 +2702,29 @@ static size_t charlenUTF8(std::string const &str) {
return len;
}
// Returns the piece of `str` at 0-based index `idx`, where the string is split into
// pieces by successive charmap conversions. Warns and returns an empty string when
// `idx` is at or beyond the number of pieces.
static std::string strcharUTF8(std::string const &str, uint32_t idx) {
	std::string_view remaining = str;

	// Skip the first `idx` pieces, giving up early once nothing more can be converted.
	uint32_t skipped = 0;
	while (skipped < idx && charmap_ConvertNext(remaining, nullptr) != 0) {
		skipped++;
	}

	// Remember where the requested piece begins, then consume it to learn its length.
	std::string_view const pieceAndRest = remaining;
	size_t const consumed = charmap_ConvertNext(remaining, nullptr);
	if (consumed == 0) {
		warning(
		    WARNING_BUILTIN_ARG,
		    "STRCHAR: Index %" PRIu32 " is past the end of the string\n",
		    idx
		);
	}

	size_t const pieceLen = pieceAndRest.length() - remaining.length();
	return std::string(pieceAndRest.substr(0, pieceLen));
}
static std::string charsubUTF8(std::string const &str, uint32_t pos) {
std::string_view view = str;
size_t charLen = 1;
@@ -2607,6 +2748,46 @@ static std::string charsubUTF8(std::string const &str, uint32_t pos) {
return std::string(start);
}
// Three-way comparison of two strings by their charmap-converted values.
// Each string is converted one piece at a time, and the resulting values are compared
// pairwise. Returns -1, 0, or 1; a string that runs out of values first compares
// as smaller.
static int32_t charcmp(std::string_view str1, std::string_view str2) {
	std::vector<int32_t> values1, values2;
	size_t cursor1 = 0, cursor2 = 0;

	// Once every buffered value has been consumed, convert the next piece of `rest`.
	// The buffer is left empty when nothing more could be converted.
	auto refill = [](std::string_view &rest, std::vector<int32_t> &values, size_t &cursor) {
		if (cursor < values.size()) {
			return;
		}
		cursor = 0;
		values.clear();
		charmap_ConvertNext(rest, &values);
	};

	while (true) {
		refill(str1, values1, cursor1);
		refill(str2, values2, cursor2);

		bool const exhausted1 = values1.empty();
		bool const exhausted2 = values2.empty();
		if (exhausted1 && exhausted2) {
			return 0;
		}
		if (exhausted1) {
			return -1;
		}
		if (exhausted2) {
			return 1;
		}

		int32_t const lhs = values1[cursor1++];
		int32_t const rhs = values2[cursor2++];
		if (lhs < rhs) {
			return -1;
		}
		if (lhs > rhs) {
			return 1;
		}
	}
}
// Normalizes a 0-based index argument of a string function.
// A negative `idx` counts from the end of the string, so `len` is added to it
// (-1 becomes the last character). If it is still negative after that, a warning
// naming `functionName` is issued and 0 is returned.
static uint32_t adjustNegativeIndex(int32_t idx, size_t len, char const *functionName) {
	if (idx >= 0) {
		return static_cast<uint32_t>(idx);
	}

	idx += len;
	if (idx >= 0) {
		return static_cast<uint32_t>(idx);
	}

	warning(WARNING_BUILTIN_ARG, "%s: Index starts at 0\n", functionName);
	return static_cast<uint32_t>(0);
}
static uint32_t adjustNegativePos(int32_t pos, size_t len, char const *functionName) {
// STRSUB and CHARSUB adjust negative position arguments the same way,
// such that position -1 is the last character of a string.