67 lines
1.6 KiB
67 lines
1.6 KiB
1 year ago
|
#include <stdint.h>
|
||
|
#include "util/utf8.h"
|
||
|
|
||
|
/* Reports whether the byte x, reinterpreted as an unsigned 8-bit value,
 * lies within the inclusive interval [low, high].
 *
 * The cast matters because plain `char` may be signed: without it, bytes
 * with the high bit set would compare as negative numbers.
 */
static bool in_range(char x, uint8_t low, uint8_t high) {
    const uint8_t byte = (uint8_t)x;

    if (byte < low) {
        return false;
    }
    return byte <= high;
}
|
||
|
|
||
|
bool is_utf8(const char *string) {
    /* Returns true iff the NUL-terminated string is 'well-formed', as
     * defined by Unicode Standard 15.0.0. See Chapter 3, D92 and Table 3-7.
     * The empty string is well-formed.
     *
     * UTF-8 strings are sequences of code points encoded in one of the
     * following ways. The first byte determines the pattern.
     *
     *   00..7F
     *   C2..DF   80..BF
     *   E0       A0..BF   80..BF
     *   E1..EC   80..BF   80..BF
     *   ED       80..9F   80..BF
     *   EE..EF   80..BF   80..BF
     *   F0       90..BF   80..BF   80..BF
     *   F1..F3   80..BF   80..BF   80..BF
     *   F4       80..8F   80..BF   80..BF
     */
    enum { PATTERN_COUNT = 9 };

    /* One row per pattern above. Each row holds inclusive (low, high)
     * pairs: entries [2*j] and [2*j + 1] bound the j-th byte of the
     * sequence. Unused trailing entries are zero and are never read.
     * The tables are read-only, so they are kept in static storage
     * instead of being rebuilt on every call.
     */
    static const uint8_t range_table[PATTERN_COUNT][8] = {
        {0x00, 0x7F},
        {0xC2, 0xDF, 0x80, 0xBF},
        {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF},
        {0xE1, 0xEC, 0x80, 0xBF, 0x80, 0xBF},
        {0xED, 0xED, 0x80, 0x9F, 0x80, 0xBF},
        {0xEE, 0xEF, 0x80, 0xBF, 0x80, 0xBF},
        {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
        {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
        {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF},
    };
    /* Total number of bytes in the sequence described by each row. */
    static const int lengths[PATTERN_COUNT] = {
        1, 2, 3, 3, 3, 3, 4, 4, 4
    };

    while (string[0]) {
        bool accept = false;
        for (int i = 0; i < PATTERN_COUNT; i++) {
            /* The lead-byte ranges are disjoint, so at most one row
             * can match the current byte. */
            if (!in_range(string[0], range_table[i][0],
                          range_table[i][1])) {
                continue;
            }
            for (int j = 1; j < lengths[i]; j++) {
                if (!in_range(string[j], range_table[i][2 * j],
                              range_table[i][2 * j + 1])) {
                    // Early exit is necessary to avoid reading past
                    // the null terminator: a NUL byte is outside every
                    // continuation range, so we return here before
                    // string[j + 1] is ever touched.
                    return false;
                }
            }
            string += lengths[i];
            accept = true;
            break;
        }
        if (!accept) {
            /* The lead byte matched no pattern (80..C1 or F5..FF). */
            return false;
        }
    }

    return true;
}
|