Merge pull request #360 from jidoc01/master

Improve charmap structure with trie.
This commit is contained in:
Antonio Niño Díaz
2019-07-07 11:46:08 +01:00
committed by GitHub
2 changed files with 88 additions and 60 deletions

View File

@@ -13,14 +13,25 @@
/* Upper bound on the number of key-value pairs a single charmap accepts. */
#define MAXCHARMAPS 512 #define MAXCHARMAPS 512
/* Upper bound on the length, in bytes, of a single charmap key. */
#define CHARMAPLENGTH 16 #define CHARMAPLENGTH 16
/*
 * Size of the trie node pool: at most CHARMAPLENGTH nodes for each of the
 * MAXCHARMAPS keys, plus one slot for the root node.
 */
#define MAXCHARNODES (MAXCHARMAPS * CHARMAPLENGTH + 1)
/*
* A single node of the charmap trie.
*
* A key is stored as a path of nodes starting at the root, one node per
* input byte. The node at which a registered key ends carries that key's
* output value; the other nodes on the path only lead towards longer keys.
*/
struct Charnode {
uint8_t code; /* output value of the key that ends at this node; only meaningful when isCode is set. */
uint8_t isCode; /* set to 1 when a registered key ends at this node; otherwise the node is only a bridge to longer keys. */
struct Charnode *next[256]; /* child node for each possible next byte value; a null entry means no registered key continues with that byte. */
};
/*
* A character map: key-value pairs stored as a trie whose nodes are taken
* from the fixed-size pool embedded in the structure.
*/
struct Charmap { struct Charmap {
int32_t count; int32_t charCount; /* number of key-value pairs registered so far; this is the count returned to callers of charmap_Add. */
char input[MAXCHARMAPS][CHARMAPLENGTH + 1]; int32_t nodeCount; /* number of trie nodes handed out from nodes[], not counting the root. */
char output[MAXCHARMAPS]; struct Charnode nodes[MAXCHARNODES]; /* node pool; nodes[0] is reserved for the root of the trie. */
}; };
/*
 * NOTE(review): the definition is outside this view. charmap_Convert treats
 * the return value as the number of bytes read from source, with 0 meaning
 * that no character could be read -- confirm against the definition.
 */
int32_t readUTF8Char(char *destination, char *source); int32_t readUTF8Char(char *destination, char *source);
/*
 * Register the key `input` with the value `output` in the active charmap.
 * Returns -1 when the charmap is full or the key is too long, otherwise the
 * number of registered pairs.
 */
int32_t charmap_Add(char *input, uint8_t output); int32_t charmap_Add(char *input, uint8_t output);
/*
 * Convert the string at *input through the active charmap. *input is
 * replaced by a newly allocated buffer and its length in bytes is returned.
 */
int32_t charmap_Convert(char **input); int32_t charmap_Convert(char **input);

View File

@@ -42,11 +42,10 @@ int32_t readUTF8Char(char *dest, char *src)
/*
 * Register a key-value pair in the active charmap (trie-based version).
 *
 * The charmap used is globalCharmap when pCurrentSection is null. When a
 * section is current, the section's own charmap is used; one is allocated
 * with calloc and attached to the section in a branch that is only partly
 * visible here.
 *
 * input:  NUL-terminated key; keys longer than CHARMAPLENGTH bytes are
 *         rejected.
 * output: value stored for the key.
 *
 * Returns -1 if the charmap already holds MAXCHARMAPS pairs or the key is
 * too long. Otherwise returns the number of pairs registered after the call.
 * If the key was already registered, its first value is kept and the
 * unchanged count is returned.
 */
int32_t charmap_Add(char *input, uint8_t output) int32_t charmap_Add(char *input, uint8_t output)
{ {
int32_t i; int32_t i;
size_t input_length; uint8_t v;
char temp1i[CHARMAPLENGTH + 1], temp2i[CHARMAPLENGTH + 1];
char temp1o = 0, temp2o = 0;
struct Charmap *charmap; struct Charmap *charmap;
struct Charnode *curr_node, *temp_node;
if (pCurrentSection) { if (pCurrentSection) {
if (pCurrentSection->charmap) { if (pCurrentSection->charmap) {
@@ -55,91 +54,109 @@ int32_t charmap_Add(char *input, uint8_t output)
charmap = calloc(1, sizeof(struct Charmap)); charmap = calloc(1, sizeof(struct Charmap));
if (charmap == NULL) if (charmap == NULL)
fatalerror("Not enough memory for charmap"); fatalerror("Not enough memory for charmap");
pCurrentSection->charmap = charmap; pCurrentSection->charmap = charmap;
} }
} else { } else {
charmap = &globalCharmap; charmap = &globalCharmap;
} }
if (charmap->count > MAXCHARMAPS || strlen(input) > CHARMAPLENGTH) if (charmap->charCount >= MAXCHARMAPS || strlen(input) > CHARMAPLENGTH)
return -1; return -1;
/* trie version: start at the root and follow or extend the path one key byte at a time. */
input_length = strlen(input); curr_node = &charmap->nodes[0];
if (input_length > 1) {
i = 0; for (i = 0; (v = (uint8_t)input[i]); i++) {
while (i < charmap->count + 1) { if (curr_node->next[v]) {
if (input_length > strlen(charmap->input[i])) { curr_node = curr_node->next[v];
memcpy(temp1i, charmap->input[i],
CHARMAPLENGTH + 1);
memcpy(charmap->input[i], input, input_length);
temp1o = charmap->output[i];
charmap->output[i] = output;
i++;
break;
}
i++;
}
while (i < charmap->count + 1) {
memcpy(temp2i, charmap->input[i], CHARMAPLENGTH + 1);
memcpy(charmap->input[i], temp1i, CHARMAPLENGTH + 1);
memcpy(temp1i, temp2i, CHARMAPLENGTH + 1);
temp2o = charmap->output[i];
charmap->output[i] = temp1o;
temp1o = temp2o;
i++;
}
memcpy(charmap->input[charmap->count + 1], temp1i,
CHARMAPLENGTH + 1);
charmap->output[charmap->count + 1] = temp1o;
} else { } else {
/* trie version: take the next unused pool slot; slot 0 is the root, hence the + 1. */
memcpy(charmap->input[charmap->count], input, input_length); temp_node = &charmap->nodes[charmap->nodeCount + 1];
charmap->output[charmap->count] = output;
curr_node->next[v] = temp_node;
curr_node = temp_node;
++charmap->nodeCount;
} }
return ++charmap->count; }
/* a key that is already registered keeps its first value; the new value is ignored. */
if (curr_node->isCode)
return charmap->charCount;
curr_node->code = output;
curr_node->isCode = 1;
return ++charmap->charCount;
} }
/*
 * Convert the string at *input through the active charmap (trie-based
 * version).
 *
 * The charmap used is the current section's charmap when pCurrentSection is
 * set and has one, otherwise globalCharmap.
 *
 * At each position the longest registered key that matches the remaining
 * input is replaced by its one-byte value. Where no key matches,
 * readUTF8Char is called and the number of bytes it reports is copied from
 * the input unchanged; if it reports 0, a single 0 byte is written and one
 * input byte is skipped.
 *
 * On return *input points to a buffer obtained from malloc that holds the
 * converted bytes, and the return value is the number of bytes written.
 * This function does not write a terminating NUL to the buffer and does not
 * free the string *input pointed to on entry.
 *
 * NOTE(review): the buffer is sized with strlen(*input); for an empty input
 * that is a zero-size request, for which malloc is allowed to return NULL,
 * and that would be reported through fatalerror.
 * NOTE(review): foundCode is assigned only inside the matching loop and is
 * read only when match is non-zero.
 */
int32_t charmap_Convert(char **input) int32_t charmap_Convert(char **input)
{ {
struct Charmap *charmap; struct Charmap *charmap;
struct Charnode *charnode;
char outchar[CHARMAPLENGTH + 1]; char *output;
char *buffer; char outchar[8];
int32_t i, j, length;
int32_t i, match, length;
uint8_t v, foundCode;
if (pCurrentSection && pCurrentSection->charmap) if (pCurrentSection && pCurrentSection->charmap)
charmap = pCurrentSection->charmap; charmap = pCurrentSection->charmap;
else else
charmap = &globalCharmap; charmap = &globalCharmap;
buffer = malloc(strlen(*input)); output = malloc(strlen(*input));
if (buffer == NULL) if (output == NULL)
fatalerror("Not enough memory for buffer"); fatalerror("Not enough memory for buffer");
length = 0; length = 0;
while (**input) { while (**input) {
j = 0; charnode = &charmap->nodes[0];
for (i = 0; i < charmap->count; i++) {
j = strlen(charmap->input[i]); /*
if (memcmp(*input, charmap->input[i], j) == 0) { * Find the longest registered key that matches at the current position.
outchar[0] = charmap->output[i]; * Several keys may match here (one being a prefix of another), or none.
outchar[1] = 0; * When several match, the longest one is used,
* so shorter matches along the same path are ignored.
*/
for (i = match = 0; (v = (*input)[i]);) {
if (!charnode->next[v])
break; break;
charnode = charnode->next[v];
i++;
if (charnode->isCode) {
match = i;
foundCode = charnode->code;
} }
j = 0;
} }
if (!j) if (match) {
j = readUTF8Char(outchar, *input); output[length] = foundCode;
if (!outchar[0]) { length += 1;
buffer[length++] = 0;
} else { } else {
for (i = 0; outchar[i]; i++) /*
buffer[length++] = outchar[i]; * No key matched: copy through the bytes that
* readUTF8Char reports for the next character.
*/
match = readUTF8Char(outchar, *input);
if (match) {
memcpy(output + length, *input, match);
} else {
output[length] = 0;
match = 1;
} }
*input += j;
length += match;
} }
*input = buffer;
*input += match;
}
*input = output;
return length; return length;
} }