summary refs log tree commit diff
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-07-13 09:21:35 -0700
committer John MacFarlane <jgm@berkeley.edu> 2015-07-13 10:15:55 -0700
commit ac39623d667999cfae1444b46508a9a423b0df1b (patch)
tree 40579cea4365b373fdc2831c2e43c2288671d028
parent 6dcd2beafdfbc9f694916bcdfa822b896aa44177 (diff)
Added `CMARK_OPT_SAFE` option and `--safe` command-line flag.
* Added `CMARK_OPT_SAFE`. This option disables rendering of raw HTML and potentially dangerous links.
* Added `--safe` option in command-line program.
* Updated `cmark.3` man page.
* Added `scan_dangerous_url` to scanners.
* In HTML, suppress rendering of raw HTML and potentially dangerous links if `CMARK_OPT_SAFE` is set. Dangerous URLs are those that begin with `javascript:`, `vbscript:`, `file:`, or `data:` (except for `image/png`, `image/gif`, `image/jpeg`, or `image/webp` mime types).
* Added `api_test` for `CMARK_OPT_SAFE`.
* Rewrote `README.md` on security.
-rw-r--r-- README.md 15
-rw-r--r-- api_test/main.c 16
-rw-r--r-- man/man1/cmark.1 8
-rw-r--r-- man/man3/cmark.3 18
-rw-r--r-- src/cmark.h 8
-rw-r--r-- src/html.c 38
-rw-r--r-- src/main.c 3
-rw-r--r-- src/scanners.c 315
-rw-r--r-- src/scanners.h 2
-rw-r--r-- src/scanners.re 14
10 files changed, 422 insertions, 15 deletions
diff --git a/README.md b/README.md
index c0ca22d..5cfbb10 100644
--- a/README.md
+++ b/README.md
@@ -139,11 +139,16 @@ Usage
Instructions for the use of the command line program and library can
be found in the man pages in the `man` subdirectory.
-**A note on security:**
-This library does not attempt to sanitize link attributes or
-raw HTML. If you use it in applications that accept
-untrusted user input, you must run the output through an HTML
-sanitizer to protect against
+Security
+--------
+
+By default, the library will pass through raw HTML and potentially
+dangerous links (`javascript:`, `vbscript:`, `data:`, `file:`).
+
+It is recommended that users either disable this potentially unsafe
+feature by using the option `CMARK_OPT_SAFE` (or `--safe` with the
+command-line program), or run the output through an HTML sanitizer
+to protect against
[XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting).
Contributing
diff --git a/api_test/main.c b/api_test/main.c
index 104371c..dfb5483 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -714,6 +714,21 @@ numeric_entities(test_batch_runner *runner)
}
static void
+test_safe(test_batch_runner *runner)
+{
+ // Test safe mode
+ static const char raw_html[] =
+ "<div>\nhi\n</div>\n\n<a>hi</a>\n[link](JAVAscript:alert('hi'))\n![image](file:my.js)\n";
+ char *html = cmark_markdown_to_html(raw_html,
+ sizeof(raw_html) - 1,
+ CMARK_OPT_DEFAULT |
+ CMARK_OPT_SAFE);
+ STR_EQ(runner, html, "<!-- raw HTML omitted -->\n<p><!-- raw HTML omitted -->hi<!-- raw HTML omitted -->\n<a href=\"\">link</a>\n<img src=\"\" alt=\"image\" /></p>\n",
+ "input with raw HTML and dangerous links");
+ free(html);
+}
+
+static void
test_md_to_html(test_batch_runner *runner, const char *markdown,
const char *expected_html, const char *msg)
{
@@ -741,6 +756,7 @@ int main() {
line_endings(runner);
numeric_entities(runner);
test_cplusplus(runner);
+ test_safe(runner);
test_print_summary(runner);
retval = test_ok(runner) ? 0 : 1;
diff --git a/man/man1/cmark.1 b/man/man1/cmark.1
index 64fa697..8dd9165 100644
--- a/man/man1/cmark.1
+++ b/man/man1/cmark.1
@@ -45,6 +45,14 @@ be rendered as curly quotes, depending on their position.
\f[C]\-\-\-\f[] will be rendered as an em-dash.
\f[C]...\f[] will be rendered as ellipses.
.TP 12n
+.B \-\-safe
+Do not render raw HTML or potentially dangerous URLs.
+(Raw HTML is replaced by a placeholder comment; potentially
+dangerous URLs are replaced by empty strings.) Dangerous
+URLs are those that begin with `javascript:`, `vbscript:`,
+`file:`, or `data:` (except for `image/png`, `image/gif`,
+`image/jpeg`, or `image/webp` mime types).
+.TP 12n
.B \-\-help
Print usage information.
.TP 12n
diff --git a/man/man3/cmark.3 b/man/man3/cmark.3
index 288fadc..1359fcc 100644
--- a/man/man3/cmark.3
+++ b/man/man3/cmark.3
@@ -1,4 +1,4 @@
-.TH cmark 3 "July 12, 2015" "LOCAL" "Library Functions Manual"
+.TH cmark 3 "July 13, 2015" "LOCAL" "Library Functions Manual"
.SH
NAME
.PP
@@ -569,6 +569,22 @@ dashes.
Validate UTF\-8 in the input before parsing, replacing illegal sequences
with the replacement character U+FFFD.
+.PP
+.nf
+\fC
+.RS 0n
+#define CMARK_OPT_SAFE 32
+.RE
+\f[]
+.fi
+
+.PP
+Suppress raw HTML and unsafe links (\f[C]javascript:\f[],
+\f[C]vbscript:\f[], \f[C]file:\f[], and \f[C]data:\f[], except for
+\f[C]image/png\f[], \f[C]image/gif\f[], \f[C]image/jpeg\f[], or
+\f[C]image/webp\f[] mime types). Raw HTML is replaced by a placeholder
+HTML comment. Unsafe links are replaced by empty strings.
+
.SS
Version information
diff --git a/src/cmark.h b/src/cmark.h
index 7ae6d36..4a85f26 100644
--- a/src/cmark.h
+++ b/src/cmark.h
@@ -516,6 +516,14 @@ char *cmark_render_latex(cmark_node *root, int options, int width);
*/
#define CMARK_OPT_VALIDATE_UTF8 16
+/** Suppress raw HTML and unsafe links (`javascript:`, `vbscript:`,
+ * `file:`, and `data:`, except for `image/png`, `image/gif`,
+ * `image/jpeg`, or `image/webp` mime types). Raw HTML is replaced
+ * by a placeholder HTML comment. Unsafe links are replaced by
+ * empty strings.
+ */
+#define CMARK_OPT_SAFE 32
+
/**
* ## Version information
*/
diff --git a/src/html.c b/src/html.c
index 8cf8835..48a80d6 100644
--- a/src/html.c
+++ b/src/html.c
@@ -8,6 +8,7 @@
#include "node.h"
#include "buffer.h"
#include "houdini.h"
+#include "scanners.h"
// Functions to convert cmark_nodes to HTML strings.
@@ -174,7 +175,13 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
case CMARK_NODE_HTML:
cr(html);
- cmark_strbuf_put(html, node->as.literal.data, node->as.literal.len);
+ if (options & CMARK_OPT_SAFE) {
+ cmark_strbuf_puts(html, "<!-- raw HTML omitted -->");
+ } else {
+ cmark_strbuf_put(html, node->as.literal.data,
+ node->as.literal.len);
+ }
+ cr(html);
break;
case CMARK_NODE_HRULE:
@@ -228,7 +235,12 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
break;
case CMARK_NODE_INLINE_HTML:
- cmark_strbuf_put(html, node->as.literal.data, node->as.literal.len);
+ if (options & CMARK_OPT_SAFE) {
+ cmark_strbuf_puts(html, "<!-- raw HTML omitted -->");
+ } else {
+ cmark_strbuf_put(html, node->as.literal.data,
+ node->as.literal.len);
+ }
break;
case CMARK_NODE_STRONG:
@@ -250,15 +262,19 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
case CMARK_NODE_LINK:
if (entering) {
cmark_strbuf_puts(html, "<a href=\"");
- houdini_escape_href(html, node->as.link.url.data,
- node->as.link.url.len);
+ if (!((options & CMARK_OPT_SAFE) &&
+ scan_dangerous_url(&node->as.link.url, 0))) {
+ houdini_escape_href(html,
+ node->as.link.url.data,
+ node->as.link.url.len);
+ }
if (node->as.link.title.len) {
cmark_strbuf_puts(html, "\" title=\"");
- escape_html(html, node->as.link.title.data,
- node->as.link.title.len);
+ escape_html(html,
+ node->as.link.title.data,
+ node->as.link.title.len);
}
-
cmark_strbuf_puts(html, "\">");
} else {
cmark_strbuf_puts(html, "</a>");
@@ -268,9 +284,13 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
case CMARK_NODE_IMAGE:
if (entering) {
cmark_strbuf_puts(html, "<img src=\"");
- houdini_escape_href(html, node->as.link.url.data,
- node->as.link.url.len);
+ if (!((options & CMARK_OPT_SAFE) &&
+ scan_dangerous_url(&node->as.link.url, 0))) {
+ houdini_escape_href(html,
+ node->as.link.url.data,
+ node->as.link.url.len);
+ }
cmark_strbuf_puts(html, "\" alt=\"");
state->plain = node;
} else {
diff --git a/src/main.c b/src/main.c
index 26e42ca..7fae7e4 100644
--- a/src/main.c
+++ b/src/main.c
@@ -28,6 +28,7 @@ void print_usage()
printf(" --width WIDTH Specify wrap width (default 0 = nowrap)\n");
printf(" --sourcepos Include source position attribute\n");
printf(" --hardbreaks Treat newlines as hard line breaks\n");
+ printf(" --safe Suppress raw HTML and dangerous URLs\n");
printf(" --smart Use smart punctuation\n");
printf(" --normalize Consolidate adjacent text nodes\n");
printf(" --help, -h Print usage information\n");
@@ -93,6 +94,8 @@ int main(int argc, char *argv[])
options |= CMARK_OPT_HARDBREAKS;
} else if (strcmp(argv[i], "--smart") == 0) {
options |= CMARK_OPT_SMART;
+ } else if (strcmp(argv[i], "--safe") == 0) {
+ options |= CMARK_OPT_SAFE;
} else if (strcmp(argv[i], "--normalize") == 0) {
options |= CMARK_OPT_NORMALIZE;
} else if (strcmp(argv[i], "--validate-utf8") == 0) {
diff --git a/src/scanners.c b/src/scanners.c
index b3963a3..75fdb46 100644
--- a/src/scanners.c
+++ b/src/scanners.c
@@ -20730,3 +20730,318 @@ yy2270:
}
}
+
+// Returns the length of the matched scheme prefix if a URL begins with javascript:,
+// vbscript:, file:, or data: (except data:image/{png,gif,jpeg,webp}); otherwise 0.
+bufsize_t _scan_dangerous_url(const unsigned char *p)
+{
+ const unsigned char *marker = NULL;
+ const unsigned char *start = p;
+
+{
+ unsigned char yych;
+ unsigned int yyaccept = 0;
+ yych = *(marker = p);
+ if (yych <= 'f') {
+ if (yych <= 'I') {
+ if (yych <= 'C') {
+ if (yych != '\n') goto yy2278;
+ } else {
+ if (yych <= 'D') goto yy2274;
+ if (yych == 'F') goto yy2277;
+ goto yy2278;
+ }
+ } else {
+ if (yych <= 'V') {
+ if (yych <= 'J') goto yy2275;
+ if (yych <= 'U') goto yy2278;
+ goto yy2276;
+ } else {
+ if (yych == 'd') goto yy2274;
+ if (yych <= 'e') goto yy2278;
+ goto yy2277;
+ }
+ }
+ } else {
+ if (yych <= 0xDF) {
+ if (yych <= 'u') {
+ if (yych == 'j') goto yy2275;
+ goto yy2278;
+ } else {
+ if (yych <= 'v') goto yy2276;
+ if (yych <= 0x7F) goto yy2278;
+ if (yych >= 0xC2) goto yy2279;
+ }
+ } else {
+ if (yych <= 0xEF) {
+ if (yych <= 0xE0) goto yy2281;
+ if (yych == 0xED) goto yy2286;
+ goto yy2282;
+ } else {
+ if (yych <= 0xF0) goto yy2283;
+ if (yych <= 0xF3) goto yy2284;
+ if (yych <= 0xF4) goto yy2285;
+ }
+ }
+ }
+yy2273:
+ { return 0; }
+yy2274:
+ yyaccept = 0;
+ yych = *(marker = ++p);
+ if (yych == 'A') goto yy2308;
+ if (yych == 'a') goto yy2308;
+ goto yy2273;
+yy2275:
+ yyaccept = 0;
+ yych = *(marker = ++p);
+ if (yych == 'A') goto yy2299;
+ if (yych == 'a') goto yy2299;
+ goto yy2273;
+yy2276:
+ yyaccept = 0;
+ yych = *(marker = ++p);
+ if (yych == 'B') goto yy2292;
+ if (yych == 'b') goto yy2292;
+ goto yy2273;
+yy2277:
+ yyaccept = 0;
+ yych = *(marker = ++p);
+ if (yych == 'I') goto yy2287;
+ if (yych == 'i') goto yy2287;
+ goto yy2273;
+yy2278:
+ yych = *++p;
+ goto yy2273;
+yy2279:
+ yych = *++p;
+ if (yych <= 0x7F) goto yy2280;
+ if (yych <= 0xBF) goto yy2278;
+yy2280:
+ p = marker;
+ if (yyaccept == 0) {
+ goto yy2273;
+ } else {
+ goto yy2291;
+ }
+yy2281:
+ yych = *++p;
+ if (yych <= 0x9F) goto yy2280;
+ if (yych <= 0xBF) goto yy2279;
+ goto yy2280;
+yy2282:
+ yych = *++p;
+ if (yych <= 0x7F) goto yy2280;
+ if (yych <= 0xBF) goto yy2279;
+ goto yy2280;
+yy2283:
+ yych = *++p;
+ if (yych <= 0x8F) goto yy2280;
+ if (yych <= 0xBF) goto yy2282;
+ goto yy2280;
+yy2284:
+ yych = *++p;
+ if (yych <= 0x7F) goto yy2280;
+ if (yych <= 0xBF) goto yy2282;
+ goto yy2280;
+yy2285:
+ yych = *++p;
+ if (yych <= 0x7F) goto yy2280;
+ if (yych <= 0x8F) goto yy2282;
+ goto yy2280;
+yy2286:
+ yych = *++p;
+ if (yych <= 0x7F) goto yy2280;
+ if (yych <= 0x9F) goto yy2279;
+ goto yy2280;
+yy2287:
+ yych = *++p;
+ if (yych == 'L') goto yy2288;
+ if (yych != 'l') goto yy2280;
+yy2288:
+ yych = *++p;
+ if (yych == 'E') goto yy2289;
+ if (yych != 'e') goto yy2280;
+yy2289:
+ yych = *++p;
+ if (yych != ':') goto yy2280;
+yy2290:
+ ++p;
+yy2291:
+ { return (bufsize_t)(p - start); }
+yy2292:
+ yych = *++p;
+ if (yych == 'S') goto yy2293;
+ if (yych != 's') goto yy2280;
+yy2293:
+ yych = *++p;
+ if (yych == 'C') goto yy2294;
+ if (yych != 'c') goto yy2280;
+yy2294:
+ yych = *++p;
+ if (yych == 'R') goto yy2295;
+ if (yych != 'r') goto yy2280;
+yy2295:
+ yych = *++p;
+ if (yych == 'I') goto yy2296;
+ if (yych != 'i') goto yy2280;
+yy2296:
+ yych = *++p;
+ if (yych == 'P') goto yy2297;
+ if (yych != 'p') goto yy2280;
+yy2297:
+ yych = *++p;
+ if (yych == 'T') goto yy2298;
+ if (yych != 't') goto yy2280;
+yy2298:
+ yych = *++p;
+ if (yych == ':') goto yy2290;
+ goto yy2280;
+yy2299:
+ yych = *++p;
+ if (yych == 'V') goto yy2300;
+ if (yych != 'v') goto yy2280;
+yy2300:
+ yych = *++p;
+ if (yych == 'A') goto yy2301;
+ if (yych != 'a') goto yy2280;
+yy2301:
+ yych = *++p;
+ if (yych == 'S') goto yy2302;
+ if (yych != 's') goto yy2280;
+yy2302:
+ yych = *++p;
+ if (yych == 'C') goto yy2303;
+ if (yych != 'c') goto yy2280;
+yy2303:
+ yych = *++p;
+ if (yych == 'R') goto yy2304;
+ if (yych != 'r') goto yy2280;
+yy2304:
+ yych = *++p;
+ if (yych == 'I') goto yy2305;
+ if (yych != 'i') goto yy2280;
+yy2305:
+ yych = *++p;
+ if (yych == 'P') goto yy2306;
+ if (yych != 'p') goto yy2280;
+yy2306:
+ yych = *++p;
+ if (yych == 'T') goto yy2307;
+ if (yych != 't') goto yy2280;
+yy2307:
+ yych = *++p;
+ if (yych == ':') goto yy2290;
+ goto yy2280;
+yy2308:
+ yych = *++p;
+ if (yych == 'T') goto yy2309;
+ if (yych != 't') goto yy2280;
+yy2309:
+ yych = *++p;
+ if (yych == 'A') goto yy2310;
+ if (yych != 'a') goto yy2280;
+yy2310:
+ yych = *++p;
+ if (yych != ':') goto yy2280;
+ yyaccept = 1;
+ yych = *(marker = ++p);
+ if (yych == 'I') goto yy2312;
+ if (yych != 'i') goto yy2291;
+yy2312:
+ yych = *++p;
+ if (yych == 'M') goto yy2313;
+ if (yych != 'm') goto yy2280;
+yy2313:
+ yych = *++p;
+ if (yych == 'A') goto yy2314;
+ if (yych != 'a') goto yy2280;
+yy2314:
+ yych = *++p;
+ if (yych == 'G') goto yy2315;
+ if (yych != 'g') goto yy2280;
+yy2315:
+ yych = *++p;
+ if (yych == 'E') goto yy2316;
+ if (yych != 'e') goto yy2280;
+yy2316:
+ yych = *++p;
+ if (yych != '/') goto yy2280;
+ yych = *++p;
+ if (yych <= 'W') {
+ if (yych <= 'J') {
+ if (yych == 'G') goto yy2319;
+ if (yych <= 'I') goto yy2280;
+ goto yy2320;
+ } else {
+ if (yych == 'P') goto yy2318;
+ if (yych <= 'V') goto yy2280;
+ goto yy2321;
+ }
+ } else {
+ if (yych <= 'j') {
+ if (yych == 'g') goto yy2319;
+ if (yych <= 'i') goto yy2280;
+ goto yy2320;
+ } else {
+ if (yych <= 'p') {
+ if (yych <= 'o') goto yy2280;
+ } else {
+ if (yych == 'w') goto yy2321;
+ goto yy2280;
+ }
+ }
+ }
+yy2318:
+ yych = *++p;
+ if (yych == 'N') goto yy2329;
+ if (yych == 'n') goto yy2329;
+ goto yy2280;
+yy2319:
+ yych = *++p;
+ if (yych == 'I') goto yy2328;
+ if (yych == 'i') goto yy2328;
+ goto yy2280;
+yy2320:
+ yych = *++p;
+ if (yych == 'P') goto yy2326;
+ if (yych == 'p') goto yy2326;
+ goto yy2280;
+yy2321:
+ yych = *++p;
+ if (yych == 'E') goto yy2322;
+ if (yych != 'e') goto yy2280;
+yy2322:
+ yych = *++p;
+ if (yych == 'B') goto yy2323;
+ if (yych != 'b') goto yy2280;
+yy2323:
+ yych = *++p;
+ if (yych == 'P') goto yy2324;
+ if (yych != 'p') goto yy2280;
+yy2324:
+ ++p;
+ { return 0; }
+yy2326:
+ yych = *++p;
+ if (yych == 'E') goto yy2327;
+ if (yych != 'e') goto yy2280;
+yy2327:
+ yych = *++p;
+ if (yych == 'G') goto yy2324;
+ if (yych == 'g') goto yy2324;
+ goto yy2280;
+yy2328:
+ yych = *++p;
+ if (yych == 'F') goto yy2324;
+ if (yych == 'f') goto yy2324;
+ goto yy2280;
+yy2329:
+ ++p;
+ if ((yych = *p) == 'G') goto yy2324;
+ if (yych == 'g') goto yy2324;
+ goto yy2280;
+}
+
+}
+
diff --git a/src/scanners.h b/src/scanners.h
index db8eeb8..a6a71bf 100644
--- a/src/scanners.h
+++ b/src/scanners.h
@@ -26,6 +26,7 @@ bufsize_t _scan_hrule(const unsigned char *p);
bufsize_t _scan_open_code_fence(const unsigned char *p);
bufsize_t _scan_close_code_fence(const unsigned char *p);
bufsize_t _scan_entity(const unsigned char *p);
+bufsize_t _scan_dangerous_url(const unsigned char *p);
#define scan_scheme(c, n) _scan_at(&_scan_scheme, c, n)
#define scan_autolink_uri(c, n) _scan_at(&_scan_autolink_uri, c, n)
@@ -47,6 +48,7 @@ bufsize_t _scan_entity(const unsigned char *p);
#define scan_open_code_fence(c, n) _scan_at(&_scan_open_code_fence, c, n)
#define scan_close_code_fence(c, n) _scan_at(&_scan_close_code_fence, c, n)
#define scan_entity(c, n) _scan_at(&_scan_entity, c, n)
+#define scan_dangerous_url(c, n) _scan_at(&_scan_dangerous_url, c, n)
#ifdef __cplusplus
}
diff --git a/src/scanners.re b/src/scanners.re
index efa6731..fbe3283 100644
--- a/src/scanners.re
+++ b/src/scanners.re
@@ -315,3 +315,17 @@ bufsize_t _scan_entity(const unsigned char *p)
.? { return 0; }
*/
}
+
+// Returns the length of the matched scheme prefix if a URL begins with javascript:,
+// vbscript:, file:, or data: (except data:image/{png,gif,jpeg,webp}); otherwise 0.
+bufsize_t _scan_dangerous_url(const unsigned char *p)
+{
+ const unsigned char *marker = NULL;
+ const unsigned char *start = p;
+/*!re2c
+ 'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; }
+ 'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); }
+ .? { return 0; }
+*/
+}
+