diff options
author | Nick Wellnhofer <wellnhofer@aevum.de> | 2015-06-09 19:27:33 +0200 |
---|---|---|
committer | Nick Wellnhofer <wellnhofer@aevum.de> | 2015-06-09 19:32:35 +0200 |
commit | 38f6ac470d3b597446d4663a00efbe6ebce8ee5e (patch) | |
tree | 75af766e702d5899959b91ae7bd99e186e846283 /src/utf8.c | |
parent | 8d997c85ee1452480ed3d821ce0642f7e6e5b9e6 (diff) |
Further optimize utf8proc_valid
Assume a multi-byte sequence and rework switch statement into if/else
for another 2% speedup.
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- | src/utf8.c | 71 |
1 files changed, 34 insertions, 37 deletions
(Line breaks restored; indentation and blank context lines are reconstructed from the hunk line counts.)

```diff
@@ -54,9 +54,11 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
 }
 
 // Validate a single UTF-8 character according to RFC 3629.
+// Assumes a multi-byte UTF-8 sequence.
 static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 {
 	int length = utf8proc_utf8class[str[0]];
+	assert(length != 1);
 
 	if (!length)
 		return -1;
@@ -64,53 +66,48 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 	if ((bufsize_t)length > str_len)
 		return -str_len;
 
-	switch (length) {
-	case 2:
-		if ((str[1] & 0xC0) != 0x80)
-			return -1;
+	if ((str[1] & 0xC0) != 0x80)
+		return -1;
+
+	if (length == 2) {
 		if (str[0] < 0xC2) {
 			// Overlong
 			return -length;
 		}
-		break;
-
-	case 3:
-		if ((str[1] & 0xC0) != 0x80)
-			return -1;
+	}
+	else {
 		if ((str[2] & 0xC0) != 0x80)
 			return -2;
 
-		if (str[0] == 0xE0) {
-			if (str[1] < 0xA0) {
-				// Overlong
-				return -length;
-			}
-		} else if (str[0] == 0xED) {
-			if (str[1] >= 0xA0) {
-				// Surrogate
-				return -length;
-			}
-		}
-		break;
-	case 4:
-		if ((str[1] & 0xC0) != 0x80)
-			return -1;
-		if ((str[2] & 0xC0) != 0x80)
-			return -2;
-		if ((str[3] & 0xC0) != 0x80)
-			return -3;
-		if (str[0] == 0xF0) {
-			if (str[1] < 0x90) {
-				// Overlong
-				return -length;
+		if (length == 3) {
+			if (str[0] == 0xE0) {
+				if (str[1] < 0xA0) {
+					// Overlong
+					return -length;
+				}
+			} else if (str[0] == 0xED) {
+				if (str[1] >= 0xA0) {
+					// Surrogate
+					return -length;
+				}
 			}
-		} else if (str[0] >= 0xF4) {
-			if (str[0] > 0xF4 || str[1] >= 0x90) {
-				// Above 0x10FFFF
-				return -length;
+		}
+		else {
+			if ((str[3] & 0xC0) != 0x80)
+				return -3;
+
+			if (str[0] == 0xF0) {
+				if (str[1] < 0x90) {
+					// Overlong
+					return -length;
+				}
+			} else if (str[0] >= 0xF4) {
+				if (str[0] > 0xF4 || str[1] >= 0x90) {
+					// Above 0x10FFFF
+					return -length;
+				}
 			}
 		}
-		break;
 	}
 
 	return length;
```