summaryrefslogtreecommitdiff
path: root/src/html/houdini_href_e.c
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2014-09-17 14:05:04 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2014-09-17 14:05:04 -0700
commit309173a493aea59cce5cce1b52b86e01b041bb8f (patch)
tree30eea56599b096d11cf4eaad38d395b3910a35b4 /src/html/houdini_href_e.c
parent6326bc748c8f5f225d82c01fe6763776f2bbd88e (diff)
parent3aa56049d4b52b55a2313e51698090ee81e10036 (diff)
Merge pull request #66 from vmg/revamp
Enfastenate the C Parsenator
Diffstat (limited to 'src/html/houdini_href_e.c')
-rw-r--r--src/html/houdini_href_e.c107
1 files changed, 107 insertions, 0 deletions
diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c
new file mode 100644
index 0000000..12456ce
--- /dev/null
+++ b/src/html/houdini_href_e.c
@@ -0,0 +1,107 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/*
+ * The following characters will not be escaped:
+ *
+ * -_.+!*'(),%#@?=;:/,+&$ alphanum
+ *
+ * Note that this character set is the addition of:
+ *
+ * - The characters which are safe to be in an URL
+ * - The characters which are *not* safe to be in
+ * an URL because they are RESERVED characters.
+ *
+ * We asume (lazily) that any RESERVED char that
+ * appears inside an URL is actually meant to
+ * have its native function (i.e. as an URL
+ * component/separator) and hence needs no escaping.
+ *
+ * There are two exceptions: the chacters & (amp)
+ * and ' (single quote) do not appear in the table.
+ * They are meant to appear in the URL as components,
+ * yet they require special HTML-entity escaping
+ * to generate valid HTML markup.
+ *
+ * All other characters will be escaped to %XX.
+ *
+ */
+static const char HREF_SAFE[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+int
+houdini_escape_href(strbuf *ob, const uint8_t *src, size_t size)
+{
+ static const uint8_t hex_chars[] = "0123456789ABCDEF";
+ size_t i = 0, org;
+ uint8_t hex_str[3];
+
+ hex_str[0] = '%';
+
+ while (i < size) {
+ org = i;
+ while (i < size && HREF_SAFE[src[i]] != 0)
+ i++;
+
+ if (likely(i > org))
+ strbuf_put(ob, src + org, i - org);
+
+ /* escaping */
+ if (i >= size)
+ break;
+
+ switch (src[i]) {
+ /* amp appears all the time in URLs, but needs
+ * HTML-entity escaping to be inside an href */
+ case '&':
+ strbuf_puts(ob, "&amp;");
+ break;
+
+ /* the single quote is a valid URL character
+ * according to the standard; it needs HTML
+ * entity escaping too */
+ case '\'':
+ strbuf_puts(ob, "&#x27;");
+ break;
+
+ /* the space can be escaped to %20 or a plus
+ * sign. we're going with the generic escape
+ * for now. the plus thing is more commonly seen
+ * when building GET strings */
+#if 0
+ case ' ':
+ strbuf_putc(ob, '+');
+ break;
+#endif
+
+ /* every other character goes with a %XX escaping */
+ default:
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
+ hex_str[2] = hex_chars[src[i] & 0xF];
+ strbuf_put(ob, hex_str, 3);
+ }
+
+ i++;
+ }
+
+ return 1;
+}