Merge pull request #217 from nwellnhof/utf8_validation

UTF-8 validation
author: John MacFarlane <jgm@berkeley.edu> 2014-11-24 21:29:31 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2014-11-24 21:29:31 -0800
commit: e5ab7bdbaa452d8bf7026875ee81fcb9c4a3c6d1 (patch)
tree: a9619d7e90846b60588c1948648888f9f4356bf9 /src
parent: 6291b2340055acfc5487141af5c1adb76cdf0662 (diff)
parent: a5ba5add1d72874fd40168eac54ed39e7b82bf49 (diff)
2 files changed, 62 insertions, 4 deletions
diff --git a/src/utf8.c b/src/utf8.c
index e144c72..e4ea8e2 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf)
 	strbuf_put(buf, repl, 3);
 }
 
-int utf8proc_charlen(const uint8_t *str, int str_len)
+static int utf8proc_charlen(const uint8_t *str, int str_len)
 {
 	int length, i;
 
@@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len)
 	return length;
 }
 
+// Validate one UTF-8 sequence per RFC 3629: returns utf8proc_charlen's length, or a value <= 0 if rejected.
+static int utf8proc_valid(const uint8_t *str, int str_len)
+{
+	int length = utf8proc_charlen(str, str_len);
+
+	if (length <= 0)
+		return length;
+
+	switch (length) {
+	case 1:
+		if (str[0] == 0x00) {
+			// ASCII NUL is technically valid but rejected
+			// for security reasons.
+			return -length;
+		}
+		break;
+
+	case 2:
+		if (str[0] < 0xC2) {
+			// Overlong: a 2-byte lead of 0xC0/0xC1 would encode a code point below U+0080.
+			return -length;
+		}
+		break;
+
+	case 3:
+		if (str[0] == 0xE0) {
+			if (str[1] < 0xA0) {
+				// Overlong: 0xE0 followed by a byte below 0xA0 would encode a code point below U+0800.
+				return -length;
+			}
+		}
+		else if (str[0] == 0xED) {
+			if (str[1] >= 0xA0) {
+				// Surrogate: 0xED followed by 0xA0 or above falls in U+D800..U+DFFF.
+				return -length;
+			}
+		}
+		break;
+
+	case 4:
+		if (str[0] == 0xF0) {
+			if (str[1] < 0x90) {
+				// Overlong: 0xF0 followed by a byte below 0x90 would encode a code point below U+10000.
+				return -length;
+			}
+		}
+		else if (str[0] >= 0xF4) {
+			if (str[0] > 0xF4 || str[1] >= 0x90) {
+				// Above U+10FFFF, the upper limit of the UTF-8 range in RFC 3629.
+				return -length;
+			}
+		}
+		break;
+	}
+
+	return length;
+}
+
 void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 {
 	static const uint8_t whitespace[] = "    ";
@@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 	while (i < size) {
 		size_t org = i;
 
-		while (i < size && line[i] != '\t' && line[i] <= 0x80) {
+		while (i < size && line[i] != '\t' && line[i] != '\0'
+		       && line[i] < 0x80) {
 			i++; tab++;
 		}
 
@@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_charlen(line + i, size - i);
+			int charlen = utf8proc_valid(line + i, size - i);
 
 			if (charlen >= 0) {
 				strbuf_put(ob, line + i, charlen);
diff --git a/src/utf8.h b/src/utf8.h
index 319e39a..7df1573 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -11,7 +11,6 @@ extern "C" {
 void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);
 void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
 int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst);
-int utf8proc_charlen(const uint8_t *str, int str_len);
 void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);
 int utf8proc_is_space(int32_t uc);
 int utf8proc_is_punctuation(int32_t uc);
author	John MacFarlane <jgm@berkeley.edu>	2014-11-24 21:29:31 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2014-11-24 21:29:31 -0800
commit	e5ab7bdbaa452d8bf7026875ee81fcb9c4a3c6d1 (patch)
tree	a9619d7e90846b60588c1948648888f9f4356bf9 /src
parent	6291b2340055acfc5487141af5c1adb76cdf0662 (diff)
parent	a5ba5add1d72874fd40168eac54ed39e7b82bf49 (diff)