diff options
author | Vicent Marti <tanoku@gmail.com> | 2014-09-06 20:48:05 +0200 |
---|---|---|
committer | Vicent Marti <tanoku@gmail.com> | 2014-09-09 03:39:16 +0200 |
commit | 61e3e606e64221eaa5cf3d83dc598d5a42818d10 (patch) | |
tree | 1dfb6309c0e0fd7de8094c9da6497992b156350c /src/utf8.c | |
parent | 278b89d092cae8fe9cdd6346c69512886d36abbd (diff) |
UTF8-aware detabbing and entity handling
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- | src/utf8.c | 59 |
1 files changed, 53 insertions, 6 deletions
@@ -3,7 +3,7 @@ #include <unistd.h> #include <assert.h> -#include "stmd.h" +#include "utf8.h" static const int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -23,6 +23,12 @@ static const int8_t utf8proc_utf8class[256] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; +static void encode_unknown(strbuf *buf) +{ + static const unsigned char repl[] = {239, 191, 189}; + strbuf_put(buf, repl, 3); +} + ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) { ssize_t length, i; @@ -46,6 +52,46 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return length; } +void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size) +{ + static const unsigned char whitespace[] = " "; + + size_t i = 0, tab = 0; + + while (i < size) { + size_t org = i; + + while (i < size && line[i] != '\t' && line[i] <= 0x80) { + i++; tab++; + } + + if (i > org) + strbuf_put(ob, line + org, i - org); + + if (i >= size) + break; + + if (line[i] == '\t') { + int numspaces = 4 - (tab % 4); + strbuf_put(ob, whitespace, numspaces); + i += 1; + tab += numspaces; + } else { + ssize_t charlen = utf8proc_charlen(line + i, size - i); + + if (charlen < 0) { + encode_unknown(ob); + i++; + } else { + strbuf_put(ob, line + i, charlen); + i += charlen; + } + + tab += 1; + } + } +} + ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) { ssize_t length; @@ -89,9 +135,9 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) unsigned char dst[4]; int len = 0; - if (uc < 0x00) { - assert(false); - } else if (uc < 0x80) { + assert(uc >= 0); + + if (uc < 0x80) { dst[0] = uc; len = 1; } else if (uc < 0x800) { @@ -116,7 +162,8 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) dst[3] = 0x80 + (uc & 0x3F); len = 4; } else { - assert(false); + encode_unknown(buf); + return; } strbuf_put(buf, dst, len); @@ -133,7 +180,7 @@ void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) ssize_t char_len = utf8proc_iterate(str, len, &c); if (char_len < 0) { - bufpush(0xFFFD); + encode_unknown(dest); continue; } |