diff options
Diffstat (limited to 'tools/make_entities_h.py')
-rw-r--r-- | tools/make_entities_h.py | 88 |
1 files changed, 9 insertions, 79 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py index 9342286..48492c7 100644 --- a/tools/make_entities_h.py +++ b/tools/make_entities_h.py @@ -4,79 +4,12 @@ import html -# We use this simple hashing algorithm to convert a string -# to an integer: -def djb2(s): - bs = list(s.encode('utf-8')) - hash = 5381 - for b in bs: - hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF - return hash - entities5 = html.entities.html5 # remove keys without semicolons. For some reason the list # has duplicates of a few things, like auml, one with and one # without a semicolon. -entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';'] - -# Note that most entries in the entity table end with ';', but in a few -# cases we have both a version with ';' and one without, so we strip out -# the latter to avoid duplicates: -hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities]) - -# Confirm no hash collisions -hashes = [x for [x,_,_] in hashed_data] -assert(len(hashes) == len(set(hashes))) - -# indices is a dictionary - given a hash it spits out the ordering -# of this entity in the list (the array index) -indices = {} -i = 0 - -for x in hashed_data: - indices[x[0]] = i - i = i + 1 - -# Formats integer as C octal escape. -def toesc(x): - return '\\' + oct(x)[2:] - -# Lines is the list of lines in the array. -# We don't fill them in order, so we initialize the whole array first. -lines = [""] * len(hashed_data) - -# Takes hashed_data or some sublist of it, and a midpoint (array index) -# in this list. Adds to lines a line for the midpoint, then calls -# itself recursively for the earlier and later elements. Each node -# contains indices for elements with a lesser hash and elements with -# a greater hash. An index of -1 means we're at a leaf node. -def to_binary_array(xs, mid): - # divide in half, and form binary array from each half - x = xs[mid] - lesses = xs[0:mid] - greaters = xs[mid+1:] - midlesses = len(lesses) // 2 - midgreaters = len(greaters) // 2 - if len(lesses) == 0: - ml = -1 - else: - ml = indices[lesses[midlesses][0]] - if len(greaters) == 0: - mg = -1 - else: - mg = indices[greaters[midgreaters][0]] - lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" + - ''.join(map(toesc, x[1])) + "\", " + str(ml) + - ", " + str(mg) + "}, /* &" + x[2] + "; */") - if len(lesses) > 0: - to_binary_array(lesses, midlesses) - if len(greaters) > 0: - to_binary_array(greaters, midgreaters) - -# Now call this to fill up the array lines: -mid = len(hashed_data) // 2 -to_binary_array(hashed_data, mid) +entities = sorted([(k[:-1], entities5[k].encode('utf-8')) for k in entities5.keys() if k[-1] == ';']) # Print out the header: print("""#ifndef CMARK_ENTITIES_H @@ -87,24 +20,21 @@ extern "C" { #endif struct cmark_entity_node { - unsigned long value; - unsigned char *bytes; - int less; - int greater; + unsigned char *entity; + unsigned char bytes[8]; }; #define CMARK_ENTITY_MIN_LENGTH 2 -#define CMARK_ENTITY_MAX_LENGTH 31 -""") +#define CMARK_ENTITY_MAX_LENGTH 31""") -print("static const struct cmark_entity_node cmark_entities[] = {"); +print("#define CMARK_NUM_ENTITIES " + str(len(entities))); -for line in lines: - print(line); +print("\nstatic const struct cmark_entity_node cmark_entities[] = {"); -print("};\n"); +for (ent, bs) in entities: + print('{(unsigned char*)"' + ent + '", {' + ', '.join(map(str, bs)) + ', 0}},') -print("static const int cmark_entities_root = " + str(mid) + ";"); +print("};\n"); print(""" #ifdef __cplusplus |