diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/blocks.c | 11 | ||||
| -rw-r--r-- | src/buffer.c | 6 | ||||
| -rw-r--r-- | src/html/houdini.h | 44 | ||||
| -rw-r--r-- | src/html/houdini_href_e.c | 115 | ||||
| -rw-r--r-- | src/html/houdini_html_e.c | 89 | ||||
| -rw-r--r-- | src/html/html.c | 212 | ||||
| -rw-r--r-- | src/stmd.h | 1 | ||||
| -rw-r--r-- | src/utf8.c | 7 | 
8 files changed, 473 insertions, 12 deletions
| diff --git a/src/blocks.c b/src/blocks.c index 71dc830..42f20db 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -8,6 +8,7 @@  #include "scanners.h"  #include "uthash.h" +static void incorporate_line(gh_buf *ln, int line_number, block** curptr);  static void finalize(block* b, int line_number);  static block* make_block(int tag, int start_line, int start_column) @@ -390,7 +391,7 @@ static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)  	}  } -static block *finalize_parsing(block *document, int linenum) +static block *finalize_document(block *document, int linenum)  {  	while (document != document->top) {  		finalize(document, linenum); @@ -411,7 +412,7 @@ extern block *stmd_parse_file(FILE *f)  	block *document = make_document();  	while (fgets((char *)buffer, sizeof(buffer), f)) { -		expand_tabs(&line, buffer, strlen(buffer)); +		expand_tabs(&line, buffer, strlen((char *)buffer));  		incorporate_line(&line, linenum, &document);  		gh_buf_clear(&line);  		linenum++; @@ -429,7 +430,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len)  	block *document = make_document();  	while (buffer < end) { -		const char *eol = memchr(buffer, '\n', end - buffer); +		const unsigned char *eol = memchr(buffer, '\n', end - buffer);  		if (!eol) {  			expand_tabs(&line, buffer, end - buffer); @@ -449,9 +450,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len)  }  // Process one line at a time, modifying a block. -// Returns 0 if successful.  curptr is changed to point to -// the currently open block. 
-extern void incorporate_line(gh_buf *ln, int line_number, block** curptr) +static void incorporate_line(gh_buf *ln, int line_number, block** curptr)  {  	block* last_matched_container;  	int offset = 0; diff --git a/src/buffer.c b/src/buffer.c index 17dc864..cfc6a7e 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -245,11 +245,11 @@ int gh_buf_cmp(const gh_buf *a, const gh_buf *b)  int gh_buf_strchr(const gh_buf *buf, int c, int pos)  { -	const char *p = memchr(buf->ptr + pos, c, buf->size - pos); +	const unsigned char *p = memchr(buf->ptr + pos, c, buf->size - pos);  	if (!p)  		return -1; -	return (int)(p - buf->ptr); +	return (int)(p - (const unsigned char *)buf->ptr);  }  int gh_buf_strrchr(const gh_buf *buf, int c, int pos) @@ -264,7 +264,7 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos)  	return -1;  } -void gh_buf_truncate(gh_buf *buf, size_t len) +void gh_buf_truncate(gh_buf *buf, int len)  {  	if (len < buf->size) {  		buf->size = len; diff --git a/src/html/houdini.h b/src/html/houdini.h new file mode 100644 index 0000000..31fe917 --- /dev/null +++ b/src/html/houdini.h @@ -0,0 +1,44 @@ +#ifndef __HOUDINI_H__ +#define __HOUDINI_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include "buffer.h" + +#define likely(x)       __builtin_expect((x),1) +#define unlikely(x)     __builtin_expect((x),0) + +#ifdef HOUDINI_USE_LOCALE +#	define _isxdigit(c) isxdigit(c) +#	define _isdigit(c) isdigit(c) +#else +/* + * Helper _isdigit methods -- do not trust the current locale + * */ +#	define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL) +#	define _isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) +#define HOUDINI_UNESCAPED_SIZE(x) (x) + +extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure); +extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t 
size); +extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c new file mode 100644 index 0000000..59fe850 --- /dev/null +++ b/src/html/houdini_href_e.c @@ -0,0 +1,115 @@ +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "html/houdini.h" + +/* + * The following characters will not be escaped: + * + *		-_.+!*'(),%#@?=;:/,+&$ alphanum + * + * Note that this character set is the addition of: + * + *	- The characters which are safe to be in an URL + *	- The characters which are *not* safe to be in + *	an URL because they are RESERVED characters. + * + * We assume (lazily) that any RESERVED char that + * appears inside an URL is actually meant to + * have its native function (i.e. as an URL + * component/separator) and hence needs no escaping. + * + * There are two exceptions: the characters & (amp) + * and ' (single quote) do not appear in the table. + * They are meant to appear in the URL as components, + * yet they require special HTML-entity escaping + * to generate valid HTML markup. + * + * All other characters will be escaped to %XX. 
+ * + */ +static const char HREF_SAFE[] = { +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, +	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +int +houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) +{ +	static const uint8_t hex_chars[] = "0123456789ABCDEF"; +	size_t  i = 0, org; +	uint8_t hex_str[3]; + +	hex_str[0] = '%'; + +	while (i < size) { +		org = i; +		while (i < size && HREF_SAFE[src[i]] != 0) +			i++; + +		if (likely(i > org)) { +			if (unlikely(org == 0)) { +				if (i >= size) +					return 0; + +				gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); +			} + +			gh_buf_put(ob, src + org, i - org); +		} + +		/* escaping */ +		if (i >= size) +			break; + +		switch (src[i]) { +		/* amp appears all the time in URLs, but needs +		 * HTML-entity escaping to be inside an href */ +		case '&': +			gh_buf_puts(ob, "&amp;"); +			break; + +		/* the single quote is a valid URL character +		 * according to the standard; it needs HTML +		 * entity escaping too */ +		case '\'': +			gh_buf_puts(ob, "&#x27;"); +			break; + +		/* the space can be escaped to %20 or a plus +		 * sign. we're going with the generic escape +		 * for now. 
the plus thing is more commonly seen +		 * when building GET strings */ +#if 0 +		case ' ': +			gh_buf_putc(ob, '+'); +			break; +#endif + +		/* every other character goes with a %XX escaping */ +		default: +			hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; +			hex_str[2] = hex_chars[src[i] & 0xF]; +			gh_buf_put(ob, hex_str, 3); +		} + +		i++; +	} + +	return 1; +} diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c new file mode 100644 index 0000000..316c5ce --- /dev/null +++ b/src/html/houdini_html_e.c @@ -0,0 +1,89 @@ +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "html/houdini.h" + +/** + * According to the OWASP rules: + * + * & --> &amp; + * < --> &lt; + * > --> &gt; + * " --> &quot; + * ' --> &#x27;     &apos; is not recommended + * / --> &#x2F;     forward slash is included as it helps end an HTML entity + * + */ +static const char HTML_ESCAPE_TABLE[] = { +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const char *HTML_ESCAPES[] = { +        "", +        "&quot;", +        "&amp;", +        "&#39;", +        "&#47;", +        "&lt;", +        "&gt;" +}; + +int +houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) +{ +	size_t  i = 0, org, esc = 0; + +	while (i < 
size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) +			i++; + +		if (i > org) { +			if (unlikely(org == 0)) { +				if (i >= size) +					return 0; + +				gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); +			} + +			gh_buf_put(ob, src + org, i - org); +		} + +		/* escaping */ +		if (unlikely(i >= size)) +			break; + +		/* The forward slash is only escaped in secure mode */ +		if (src[i] == '/' && !secure) { +			gh_buf_putc(ob, '/'); +		} else { +			gh_buf_puts(ob, HTML_ESCAPES[esc]); +		} + +		i++; +	} + +	return 1; +} + +int +houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size) +{ +	return houdini_escape_html0(ob, src, size, 1); +} diff --git a/src/html/html.c b/src/html/html.c new file mode 100644 index 0000000..2f160ca --- /dev/null +++ b/src/html/html.c @@ -0,0 +1,212 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +#include "stmd.h" +#include "debug.h" +#include "scanners.h" +#include "html/houdini.h" + +// Functions to convert block and inline lists to HTML strings. + +static void escape_html(gh_buf *dest, const unsigned char *source, int length) +{ +	if (length < 0) +		length = strlen((char *)source); + +	houdini_escape_html0(dest, source, (size_t)length, 0); +} + +static void escape_href(gh_buf *dest, const unsigned char *source, int length) +{ +	if (length < 0) +		length = strlen((char *)source); + +	houdini_escape_href(dest, source, (size_t)length); +} + +static inline void cr(gh_buf *html) +{ +	if (html->size && html->ptr[html->size - 1] != '\n') +		gh_buf_putc(html, '\n'); +} + +// Convert a block list to HTML, appending the output to html. 
+void blocks_to_html(gh_buf *html, block *b, bool tight) +{ +	struct ListData *data; + +	while(b != NULL) { +		switch(b->tag) { +			case document: +				blocks_to_html(html, b->children, false); +				break; + +			case paragraph: +				if (tight) { +					inlines_to_html(html, b->inline_content); +				} else { +					cr(html); +					gh_buf_puts(html, "<p>"); +					inlines_to_html(html, b->inline_content); +					gh_buf_puts(html, "</p>"); +					cr(html); +				} +				break; + +			case block_quote: +				cr(html); +				gh_buf_puts(html, "<blockquote>"); +				blocks_to_html(html, b->children, false); +				gh_buf_puts(html, "</blockquote>"); +				cr(html); +				break; + +			case list_item: +				cr(html); +				gh_buf_puts(html, "<li>"); +				blocks_to_html(html, b->children, tight); +				gh_buf_trim(html); +				gh_buf_puts(html, "</li>"); +				cr(html); +				break; + +			case list: +				// make sure a list starts at the beginning of the line: +				cr(html); +				data = &(b->attributes.list_data); + +				if (data->start > 1) { +					gh_buf_printf(html, "<%s start=\"%d\">\n", +							data->list_type == bullet ? "ul" : "ol", +							data->start); +				} else { +					gh_buf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n"); +				} + +				blocks_to_html(html, b->children, data->tight); +				gh_buf_puts(html, data->list_type == bullet ? 
"</ul>" : "</ol>"); +				cr(html); +				break; + +			case atx_header: +			case setext_header: +				cr(html); +				gh_buf_printf(html, "<h%d>", b->attributes.header_level); +				inlines_to_html(html, b->inline_content); +				gh_buf_printf(html, "</h%d>", b->attributes.header_level); +				cr(html); +				break; + +			case indented_code: +			case fenced_code: +				/* TODO: fenced code lang attributes */ +				cr(html); +				gh_buf_puts(html, "<pre><code>"); +				escape_html(html, b->string_content.ptr, b->string_content.size); +				gh_buf_puts(html, "</pre></code>"); +				cr(html); +				break; + +			case html_block: +				gh_buf_put(html, b->string_content.ptr, b->string_content.size); +				break; + +			case hrule: +				gh_buf_puts(html, "<hr />"); +				cr(html); +				break; + +			case reference_def: +				break; + +			default: +				assert(false); +		} + +		b = b->next; +	} +} + +// Convert an inline list to HTML, appending the output to html. +void inlines_to_html(gh_buf *html, inl* ils) +{ +	gh_buf scrap = GH_BUF_INIT; + +	while(ils != NULL) { +		switch(ils->tag) { +			case INL_STRING: +				escape_html(html, ils->content.literal.data, ils->content.literal.len); +				break; + +			case INL_LINEBREAK: +				gh_buf_puts(html, "<br />\n"); +				break; + +			case INL_SOFTBREAK: +				gh_buf_putc(html, '\n'); +				break; + +			case INL_CODE: +				gh_buf_puts(html, "<code>"); +				escape_html(html, ils->content.literal.data, ils->content.literal.len); +				gh_buf_puts(html, "</code>"); +				break; + +			case INL_RAW_HTML: +			case INL_ENTITY: +				gh_buf_put(html, +						ils->content.literal.data, +						ils->content.literal.len); +				break; + +			case INL_LINK: +				gh_buf_puts(html, "<a href=\""); +				escape_href(html, ils->content.linkable.url, -1); + +				if (ils->content.linkable.title) { +					gh_buf_puts(html, "\" title=\""); +					escape_html(html, ils->content.linkable.title, -1); +				} + +				gh_buf_puts(html, "\">"); +				inlines_to_html(html, 
ils->content.inlines); +				gh_buf_puts(html, "</a>"); +				break; + +			case INL_IMAGE: +				gh_buf_puts(html, "<img src=\""); +				escape_href(html, ils->content.linkable.url, -1); + +				inlines_to_html(&scrap, ils->content.inlines); +				if (scrap.size) { +					gh_buf_puts(html, "\" alt=\""); +					escape_html(html, scrap.ptr, scrap.size); +				} +				gh_buf_clear(&scrap); + +				if (ils->content.linkable.title) { +					gh_buf_puts(html, "\" title=\""); +					escape_html(html, ils->content.linkable.title, -1); +				} + +				gh_buf_puts(html, "\"/>"); +				break; + +			case INL_STRONG: +				gh_buf_puts(html, "<strong>"); +				inlines_to_html(html, ils->content.inlines); +				gh_buf_puts(html, "</strong>"); +				break; + +			case INL_EMPH: +				gh_buf_puts(html, "<em>"); +				inlines_to_html(html, ils->content.inlines); +				gh_buf_puts(html, "</em>"); +				break; +		} +		ils = ils->next; +	} +} @@ -1,4 +1,5 @@  #include <stdbool.h> +#include <stdio.h>  #include "buffer.h"  #include "uthash.h" @@ -1,6 +1,7 @@  #include <stdlib.h>  #include <stdint.h>  #include <unistd.h> +#include <assert.h>  #include "stmd.h" @@ -83,9 +84,9 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst)  	return length;  } -void utf8_encode_char(int32_t uc, gh_buf *buf) +void utf8proc_encode_char(int32_t uc, gh_buf *buf)  { -	char dst[4]; +	unsigned char dst[4];  	int len = 0;  	if (uc < 0x00) { @@ -99,7 +100,7 @@ void utf8_encode_char(int32_t uc, gh_buf *buf)  		len = 2;  	} else if (uc == 0xFFFF) {  		dst[0] = 0xFF; -		return 1; +		len = 1;  	} else if (uc == 0xFFFE) {  		dst[0] = 0xFE;  		len = 1; | 
