diff options
author | John MacFarlane <jgm@berkeley.edu> | 2015-06-09 09:54:55 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2015-06-09 09:54:55 -0700 |
commit | bc14d869323650e936c7143dcf941b28ccd5b57d (patch) | |
tree | e46c0ec079ff6e62c35bd7de544f4a794caeaa27 /src | |
parent | a173d0bb746b1afc6a4942a2536c9008da35b572 (diff) | |
parent | 8d997c85ee1452480ed3d821ce0642f7e6e5b9e6 (diff) |
Merge pull request #57 from nwellnhof/optimize_utf8proc_detab
Optimize utf8proc_detab
Diffstat (limited to 'src')
-rw-r--r-- | src/utf8.c | 62 |
1 files changed, 40 insertions, 22 deletions
```diff
@@ -56,21 +56,18 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
 // Validate a single UTF-8 character according to RFC 3629.
 static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 {
-	int length = utf8proc_charlen(str, str_len);
+	int length = utf8proc_utf8class[str[0]];
 
-	if (length <= 0)
-		return length;
+	if (!length)
+		return -1;
 
-	switch (length) {
-	case 1:
-		if (str[0] == 0x00) {
-			// ASCII NUL is technically valid but rejected
-			// for security reasons.
-			return -length;
-		}
-		break;
+	if ((bufsize_t)length > str_len)
+		return -str_len;
 
+	switch (length) {
 	case 2:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
 		if (str[0] < 0xC2) {
 			// Overlong
 			return -length;
@@ -78,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		break;
 
 	case 3:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
+		if ((str[2] & 0xC0) != 0x80)
+			return -2;
 		if (str[0] == 0xE0) {
 			if (str[1] < 0xA0) {
 				// Overlong
@@ -92,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		break;
 
 	case 4:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
+		if ((str[2] & 0xC0) != 0x80)
+			return -2;
+		if ((str[3] & 0xC0) != 0x80)
+			return -3;
 		if (str[0] == 0xF0) {
 			if (str[1] < 0x90) {
 				// Overlong
@@ -117,10 +124,27 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 
 	while (i < size) {
 		bufsize_t org = i;
+		int charlen = 0;
+
+		while (i < size && line[i] != '\t') {
+			if (line[i] >= 0x80) {
+				charlen = utf8proc_valid(line + i, size - i);
+				if (charlen < 0) {
+					charlen = -charlen;
+					break;
+				}
+				i += charlen;
+			}
+			else if (line[i] == '\0') {
+				// ASCII NUL is technically valid but rejected
+				// for security reasons.
+				charlen = 1;
+				break;
+			}
+			else {
+				i++;
+			}
 
-		while (i < size && line[i] != '\t' && line[i] != '\0'
-				&& line[i] < 0x80) {
-			i++;
 			tab++;
 		}
 
@@ -136,14 +160,8 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_valid(line + i, size - i);
-
-			if (charlen >= 0) {
-				cmark_strbuf_put(ob, line + i, charlen);
-			} else {
-				encode_unknown(ob);
-				charlen = -charlen;
-			}
+			// Invalid UTF-8
+			encode_unknown(ob);
 
 			i += charlen;
 			tab += 1;
```