summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/utf8.c 65
-rw-r--r-- src/utf8.h 1
2 files changed, 62 insertions, 4 deletions
diff --git a/src/utf8.c b/src/utf8.c
index e144c72..e4ea8e2 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf)
strbuf_put(buf, repl, 3);
}
-int utf8proc_charlen(const uint8_t *str, int str_len)
+static int utf8proc_charlen(const uint8_t *str, int str_len)
{
int length, i;
@@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len)
return length;
}
+/* Validate a single UTF-8 character according to RFC 3629.
+ *
+ * The sequence length is obtained from utf8proc_charlen(str, str_len); a
+ * result of zero or less is handed straight back to the caller. Otherwise
+ * the leading bytes are inspected for sequences that are rejected here:
+ * ASCII NUL, overlong forms, UTF-16 surrogates, and values above 0x10FFFF.
+ *
+ * Returns the length in bytes when the character is accepted, or the
+ * negated length when one of the checks below rejects it.
+ *
+ * NOTE(review): str[1] is read for lengths 3 and 4; this presumably relies
+ * on utf8proc_charlen only reporting a length that fits within str_len --
+ * confirm against its definition.
+ */
+static int utf8proc_valid(const uint8_t *str, int str_len)
+{
+ int length = utf8proc_charlen(str, str_len);
+
+ if (length <= 0)
+ return length;
+
+ switch (length) {
+ case 1:
+ if (str[0] == 0x00) {
+ // ASCII NUL is technically valid UTF-8 but is rejected
+ // here for security reasons.
+ return -length;
+ }
+ break;
+
+ case 2:
+ if (str[0] < 0xC2) {
+ // Overlong: a two-byte lead below 0xC2 can only encode values under 0x80
+ return -length;
+ }
+ break;
+
+ case 3:
+ if (str[0] == 0xE0) {
+ if (str[1] < 0xA0) {
+ // Overlong: 0xE0 followed by a byte below 0xA0 encodes a value under 0x800
+ return -length;
+ }
+ }
+ else if (str[0] == 0xED) {
+ if (str[1] >= 0xA0) {
+ // Surrogate: 0xED followed by 0xA0 or higher lands at U+D800 or above
+ return -length;
+ }
+ }
+ break;
+
+ case 4:
+ if (str[0] == 0xF0) {
+ if (str[1] < 0x90) {
+ // Overlong: 0xF0 followed by a byte below 0x90 encodes a value under 0x10000
+ return -length;
+ }
+ }
+ else if (str[0] >= 0xF4) {
+ if (str[0] > 0xF4 || str[1] >= 0x90) {
+ // Above 0x10FFFF, the highest code point RFC 3629 allows
+ return -length;
+ }
+ }
+ break;
+ }
+
+ return length;
+}
+
void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
{
static const uint8_t whitespace[] = " ";
@@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
while (i < size) {
size_t org = i;
- while (i < size && line[i] != '\t' && line[i] <= 0x80) {
+ while (i < size && line[i] != '\t' && line[i] != '\0'
+ && line[i] < 0x80) {
i++; tab++;
}
@@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
i += 1;
tab += numspaces;
} else {
- int charlen = utf8proc_charlen(line + i, size - i);
+ int charlen = utf8proc_valid(line + i, size - i);
if (charlen >= 0) {
strbuf_put(ob, line + i, charlen);
diff --git a/src/utf8.h b/src/utf8.h
index 319e39a..7df1573 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -11,7 +11,6 @@ extern "C" {
void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);
void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst);
-int utf8proc_charlen(const uint8_t *str, int str_len);
void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);
int utf8proc_is_space(int32_t uc);
int utf8proc_is_punctuation(int32_t uc);