summary | refs | log | tree | commit | diff
path: root/tools/make_entities_h.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/make_entities_h.py')
-rw-r--r-- tools/make_entities_h.py 88
1 files changed, 9 insertions, 79 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
index 9342286..48492c7 100644
--- a/tools/make_entities_h.py
+++ b/tools/make_entities_h.py
@@ -4,79 +4,12 @@
import html
-# We use this simple hashing algorithm to convert a string
-# to an integer:
-def djb2(s):
- bs = list(s.encode('utf-8'))
- hash = 5381
- for b in bs:
- hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF
- return hash
-
entities5 = html.entities.html5
# remove keys without semicolons. For some reason the list
# has duplicates of a few things, like auml, one with and one
# without a semicolon.
-entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';']
-
-# Note that most entries in the entity table end with ';', but in a few
-# cases we have both a version with ';' and one without, so we strip out
-# the latter to avoid duplicates:
-hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities])
-
-# Confirm no hash collisions
-hashes = [x for [x,_,_] in hashed_data]
-assert(len(hashes) == len(set(hashes)))
-
-# indices is a dictionary - given a hash it spits out the ordering
-# of this entity in the list (the array index)
-indices = {}
-i = 0
-
-for x in hashed_data:
- indices[x[0]] = i
- i = i + 1
-
-# Formats integer as C octal escape.
-def toesc(x):
- return '\\' + oct(x)[2:]
-
-# Lines is the list of lines in the array.
-# We don't fill them in order, so we initialize the whole array first.
-lines = [""] * len(hashed_data)
-
-# Takes hashed_data or some sublist of it, and a midpoint (array index)
-# in this list. Adds to lines a line for the midpoint, then calls
-# itself recursively for the earlier and later elements. Each node
-# contains indices for elements with a lesser hash and elements with
-# a greater hash. An index of -1 means we're at a leaf node.
-def to_binary_array(xs, mid):
- # divide in half, and form binary array from each half
- x = xs[mid]
- lesses = xs[0:mid]
- greaters = xs[mid+1:]
- midlesses = len(lesses) // 2
- midgreaters = len(greaters) // 2
- if len(lesses) == 0:
- ml = -1
- else:
- ml = indices[lesses[midlesses][0]]
- if len(greaters) == 0:
- mg = -1
- else:
- mg = indices[greaters[midgreaters][0]]
- lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
- ''.join(map(toesc, x[1])) + "\", " + str(ml) +
- ", " + str(mg) + "}, /* &" + x[2] + "; */")
- if len(lesses) > 0:
- to_binary_array(lesses, midlesses)
- if len(greaters) > 0:
- to_binary_array(greaters, midgreaters)
-
-# Now call this to fill up the array lines:
-mid = len(hashed_data) // 2
-to_binary_array(hashed_data, mid)
+entities = sorted([(k[:-1], entities5[k].encode('utf-8')) for k in entities5.keys() if k[-1] == ';'])
# Print out the header:
print("""#ifndef CMARK_ENTITIES_H
@@ -87,24 +20,21 @@ extern "C" {
#endif
struct cmark_entity_node {
- unsigned long value;
- unsigned char *bytes;
- int less;
- int greater;
+ unsigned char *entity;
+ unsigned char bytes[8];
};
#define CMARK_ENTITY_MIN_LENGTH 2
-#define CMARK_ENTITY_MAX_LENGTH 31
-""")
+#define CMARK_ENTITY_MAX_LENGTH 31""")
-print("static const struct cmark_entity_node cmark_entities[] = {");
+print("#define CMARK_NUM_ENTITIES " + str(len(entities)));
-for line in lines:
- print(line);
+print("\nstatic const struct cmark_entity_node cmark_entities[] = {");
-print("};\n");
+for (ent, bs) in entities:
+ print('{(unsigned char*)"' + ent + '", {' + ', '.join(map(str, bs)) + ', 0}},')
-print("static const int cmark_entities_root = " + str(mid) + ";");
+print("};\n");
print("""
#ifdef __cplusplus