diff --git a/LICENSE b/LICENSE index 8304b668..74639978 100644 --- a/LICENSE +++ b/LICENSE @@ -18,6 +18,10 @@ released under the following license: rgbfix was rewritten from scratch by Anthony J. Bentley, and is released under the ISC license; see the source file for the text of the license. +The UTF-8 decoder in src/asm/charmap.c was written by Björn Höhrmann and is +released under the MIT license. The remainder of charmap.c was written by +stag019, and is released under the ISC license. + extern/err.c is derived from the Musl C library, http://www.musl-libc.org, and is released under the MIT license. diff --git a/src/asm/charmap.c b/src/asm/charmap.c index cd0b7fac..370f1986 100644 --- a/src/asm/charmap.c +++ b/src/asm/charmap.c @@ -1,3 +1,57 @@ +/* + * UTF-8 decoder copyright © 2008–2009 Björn Höhrmann + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
/* (end of the MIT permission notice covering the UTF-8 decoder below) */

#include <stdint.h>

/* Decoder states that callers of decode() need to recognise. */
enum {
	UTF8_ACCEPT = 0,	/* a complete character has just been consumed */
	UTF8_REJECT = 1		/* the bytes seen so far cannot be valid UTF-8 */
};

/*
 * Lookup table for the UTF-8 decoding automaton.
 *
 * Entries 0..255 map an input byte to a character class.  The entries from
 * index 256 onward are the transition table, addressed by decode() as
 * 256 + state * 16 + class.
 */
static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Advance the UTF-8 decoder by one input byte.
 *
 * state: automaton state, updated in place.  Use UTF8_ACCEPT to begin a new
 *        character.
 * codep: code point accumulator, updated in place.  It is only read while a
 *        multi-byte sequence is in progress (*state != UTF8_ACCEPT), so it
 *        need not be initialised before the first byte of a character.
 * byte:  the next input byte.  It indexes utf8d directly, so it must be in
 *        the range 0..255.
 *
 * Returns the new value of *state: UTF8_ACCEPT once a whole character has
 * been consumed, UTF8_REJECT when the input is not valid UTF-8, and any other
 * value while more continuation bytes are expected.
 */
uint32_t
decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
	uint32_t type = utf8d[byte];

	/*
	 * A continuation byte contributes its low six bits; a leading byte
	 * contributes whatever payload bits its class leaves unmasked.
	 */
	*codep = (*state != UTF8_ACCEPT) ?
	    (byte & 0x3fu) | (*codep << 6) :
	    (0xff >> type) & (byte);

	*state = utf8d[256 + *state * 16 + type];
	return *state;
}

/*
 * Copyright © 2013 stag019
 */

/*
 * Copy the UTF-8 character at the start of src into dest and NUL-terminate
 * the copy.
 *
 * Every byte of the character is passed through decode() exactly once, so
 * continuation bytes are validated as well as the leading byte.  The decoder
 * table accepts sequences of at most four bytes, so dest needs room for five
 * bytes including the terminator.
 *
 * A NUL byte is an ordinary one-byte character as far as the decoder is
 * concerned: calling this on an empty string copies the NUL and returns 1.
 * A NUL in the middle of a multi-byte sequence is rejected like any other
 * malformed input.
 *
 * Returns the number of bytes copied, not counting the terminator.  Malformed
 * input is reported through fatalerror() (presumably non-returning -- confirm
 * against its definition).
 */
int
readUTF8Char(char *dest, char *src)
{
	uint32_t state = UTF8_ACCEPT;
	uint32_t codep;
	int i = 0;

	for (;;) {
		if (decode(&state, &codep, (uint8_t)src[i]) == UTF8_REJECT) {
			fatalerror("invalid UTF-8 character");
		}

		dest[i] = src[i];
		i++;

		if (state == UTF8_ACCEPT) {
			dest[i] = '\0';
			return i;
		}
	}
}