1 files changed, 9 insertions, 79 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
index 9342286..48492c7 100644
--- a/tools/make_entities_h.py
+++ b/tools/make_entities_h.py
@@ -4,79 +4,12 @@
 
 import html
 
-# We use this simple hashing algorithm to convert a string
-# to an integer:
-def djb2(s):
-  bs = list(s.encode('utf-8'))
-  hash = 5381
-  for b in bs:
-    hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF
-  return hash
-
 entities5 = html.entities.html5
 
 # remove keys without semicolons.  For some reason the list
 # has duplicates of a few things, like auml, one with and one
 # without a semicolon.
-entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';']
-
-# Note that most entries in the entity table end with ';', but in a few
-# cases we have both a version with ';' and one without, so we strip out
-# the latter to avoid duplicates:
-hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities])
-
-# Confirm no hash collisions
-hashes = [x for [x,_,_] in hashed_data]
-assert(len(hashes) == len(set(hashes)))
-
-# indices is a dictionary - given a hash it spits out the ordering
-# of this entity in the list (the array index)
-indices = {}
-i = 0
-
-for x in hashed_data:
-  indices[x[0]] = i
-  i = i + 1
-
-# Formats integer as C octal escape.
-def toesc(x):
-  return '\\' + oct(x)[2:]
-
-# Lines is the list of lines in the array.
-# We don't fill them in order, so we initialize the whole array first.
-lines = [""] * len(hashed_data)
-
-# Takes hashed_data or some sublist of it, and a midpoint (array index)
-# in this list.  Adds to lines a line for the midpoint, then calls
-# itself recursively for the earlier and later elements.  Each node
-# contains indices for elements with a lesser hash and elements with
-# a greater hash.  An index of -1 means we're at a leaf node.
-def to_binary_array(xs, mid):
-  # divide in half, and form binary array from each half
-  x = xs[mid]
-  lesses = xs[0:mid]
-  greaters = xs[mid+1:]
-  midlesses = len(lesses) // 2
-  midgreaters = len(greaters) // 2
-  if len(lesses) == 0:
-    ml = -1
-  else:
-    ml = indices[lesses[midlesses][0]]
-  if len(greaters) == 0:
-    mg = -1
-  else:
-    mg = indices[greaters[midgreaters][0]]
-  lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
-                          ''.join(map(toesc, x[1])) + "\", " + str(ml) +
-                          ", " + str(mg) + "}, /* &" + x[2] + "; */")
-  if len(lesses) > 0:
-    to_binary_array(lesses, midlesses)
-  if len(greaters) > 0:
-    to_binary_array(greaters, midgreaters)
-
-# Now call this to fill up the array lines:
-mid = len(hashed_data) // 2
-to_binary_array(hashed_data, mid)
+entities = sorted((k[:-1], v.encode('utf-8')) for k, v in entities5.items() if k.endswith(';'))  # (name, UTF-8 bytes), ordered by name
 
 # Print out the header:
 print("""#ifndef CMARK_ENTITIES_H
@@ -87,24 +20,21 @@ extern "C" {
 #endif
 
 struct cmark_entity_node {
-	unsigned long value;
-	unsigned char *bytes;
-	int less;
-	int greater;
+	unsigned char *entity;
+	unsigned char bytes[8];
 };
 
 #define CMARK_ENTITY_MIN_LENGTH 2
-#define CMARK_ENTITY_MAX_LENGTH 31
-""")
+#define CMARK_ENTITY_MAX_LENGTH 31""")
 
-print("static const struct cmark_entity_node cmark_entities[] = {");
+print("#define CMARK_NUM_ENTITIES " + str(len(entities)))  # number of rows in cmark_entities[]
 
-for line in lines:
-  print(line);
+print("\nstatic const struct cmark_entity_node cmark_entities[] = {")
 
-print("};\n");
+for name, utf8 in entities:  # one C initializer per entity; the byte list gets a trailing 0
+  print('{(unsigned char*)"%s", {%s, 0}},' % (name, ', '.join(str(b) for b in utf8)))
 
-print("static const int cmark_entities_root = " + str(mid) + ";");
+print("};\n")
 
 print("""
 #ifdef __cplusplus