diff options
author | John MacFarlane <jgm@berkeley.edu> | 2015-06-16 09:54:31 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2015-06-16 12:59:47 -0700 |
commit | 208c794def61eb819ed6eebe1d51867613addce0 (patch) | |
tree | 0d0f81dab960befc5efa7124ae900ddd64e43be3 /src/houdini_html_u.c | |
parent | f904f701cf4390b4d5531c5626c5cf08d85a913f (diff) |
Replace gperf-based entity lookup with binary tree lookup.
The primary advantage is a big reduction in the size of
the compiled library and executable (> 100K).
There should be no measurable performance difference in
normal documents. I detected a slight performance
hit (around 5%) in a file containing 1,000,000 entities.
* Removed `src/html_unescape.gperf` and `src/html_unescape.h`.
* Added `src/entities.h` (generated by `tools/make_entities_h.py`).
* Added binary tree lookup functions to `houdini_html_u.c`, and
use the data in `src/entities.h`.
Diffstat (limited to 'src/houdini_html_u.c')
-rw-r--r-- | src/houdini_html_u.c | 53 |
1 file changed, 43 insertions, 10 deletions
```diff
diff --git a/src/houdini_html_u.c b/src/houdini_html_u.c
index 2362b04..6f94c23 100644
--- a/src/houdini_html_u.c
+++ b/src/houdini_html_u.c
@@ -5,7 +5,44 @@
 #include "buffer.h"
 #include "houdini.h"
 #include "utf8.h"
-#include "html_unescape.h"
+#include "entities.h"
+
+/* Binary tree lookup code for entities added by JGM */
+
+static unsigned long
+S_hash(const unsigned char *str, int len)
+{
+	unsigned long hash = 5381;
+	int i;
+
+	for (i = 0; i < len; i++) {
+		hash = (((hash << 5) + hash) + str[i]) & 0xFFFFFFFF; /* hash * 33 + c */
+	}
+
+	return hash;
+}
+
+static unsigned char *
+S_lookup(int i, unsigned long key)
+{
+	if (cmark_entities[i].value == key) {
+		return cmark_entities[i].bytes;
+	} else {
+		int next = key < cmark_entities[i].value ?
+			cmark_entities[i].less : cmark_entities[i].greater;
+		if (next == 0) {
+			return NULL;
+		} else {
+			return S_lookup(next, key);
+		}
+	}
+}
+
+static unsigned char *
+S_lookup_entity(const unsigned char *s, int len)
+{
+	return S_lookup(cmark_entities_root, S_hash(s, len));
+}
 
 bufsize_t
 houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, bufsize_t size)
@@ -57,22 +94,18 @@ houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, bufsize_t size)
 	}
 
 	else {
-		if (size > MAX_WORD_LENGTH)
-			size = MAX_WORD_LENGTH;
+		if (size > CMARK_ENTITY_MAX_LENGTH)
+			size = CMARK_ENTITY_MAX_LENGTH;
 
-		for (i = MIN_WORD_LENGTH; i < size; ++i) {
+		for (i = CMARK_ENTITY_MIN_LENGTH; i < size; ++i) {
 			if (src[i] == ' ')
 				break;
 
 			if (src[i] == ';') {
-				const struct html_ent *entity = find_entity((char *)src, i);
+				const unsigned char *entity = S_lookup_entity(src, i);
 
 				if (entity != NULL) {
-					bufsize_t len = 0;
-					while (len < 8 && entity->utf8[len] != '\0') {
-						++len;
-					}
-					cmark_strbuf_put(ob, entity->utf8, len);
+					cmark_strbuf_puts(ob, (const char *)entity);
 					return i + 1;
 				}
 
```