# tools/make_entities_h.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

# Creates the C data structures for a binary lookup table of HTML entities,
# using Python's html5 entity data (html.entities.html5).
# Usage: python3 tools/make_entities_h.py > src/entities.h

import html

# Simple string hash used to turn an entity name into an integer key.
# The string is encoded as UTF-8 and folded byte by byte, starting from
# 5381 and computing acc * 33 + byte at each step; the running value is
# masked to 32 bits so the result is always in the range 0..0xFFFFFFFF.
def djb2(s):
  acc = 5381
  for byte in s.encode('utf-8'):
    # acc * 33 is the same as (acc << 5) + acc.
    acc = (acc * 33 + byte) & 0xFFFFFFFF
  return acc

entities5 = html.entities.html5

# Build one [hash, utf8_bytes, name] record per entity, sorted (primarily)
# by hash.  Most names in the entity table end with ';', but a few exist
# both with and without it; only the ';' forms are kept so that no entity
# appears twice.  The hash is computed over the name without its trailing
# ';', while the stored name keeps it.
hashed_data = sorted(
    [djb2(name[:-1]), entities5[name].encode('utf-8'), name]
    for name in entities5
    if name[-1] == ';')

# indices maps an entity's hash to the position of its record in
# hashed_data, which is also its index in the generated C array.
# If two records ever shared a hash, the later position would win.
indices = {record[0]: position
           for position, record in enumerate(hashed_data)}

# Formats a non-negative integer (here: one byte value) as a C octal
# escape: a backslash followed by the octal digits, with no zero padding.
# For example 38 becomes \46.
def toesc(x):
  octal_digits = oct(x)[2:]  # drop the "0o" prefix that oct() adds
  return '\\' + octal_digits

# lines holds the text of each row of the generated C array, one slot per
# record in hashed_data.  The slots are filled out of order (tree order,
# not array order), so every slot is created up front as an empty string.
lines = ["" for _ in hashed_data]

# Writes the array row for xs[mid] into lines, then recurses into the
# records before it and the records after it.
#
#   xs  - hashed_data, or a contiguous slice of it (sorted by hash)
#   mid - position within xs of the record that becomes this subtree's root
#
# Each row has the form
#   {hash, (unsigned char*)"octal-escaped bytes", less, greater}, /* &name */
# where less/greater are the array indices (looked up through indices) of
# the roots of the lower-hash and higher-hash subtrees.  A value of 0 is
# written when there is no subtree on that side.
#
# NOTE(review): 0 is also the real array index of the record with the
# smallest hash, so a "less" link that points at that record is
# indistinguishable from "no child" -- confirm how the C lookup code
# treats index 0.
def to_binary_array(xs, mid):
  node = xs[mid]
  below = xs[:mid]
  above = xs[mid + 1:]

  # The middle element of each half becomes the root of that subtree.
  below_mid = len(below) // 2
  above_mid = len(above) // 2
  less_index = indices[below[below_mid][0]] if below else 0
  greater_index = indices[above[above_mid][0]] if above else 0

  escaped_bytes = ''.join(toesc(byte) for byte in node[1])
  lines[indices[node[0]]] = (
      '{%s, (unsigned char*)"%s", %s, %s}, /* &%s */'
      % (node[0], escaped_bytes, less_index, greater_index, node[2]))

  if below:
    to_binary_array(below, below_mid)
  if above:
    to_binary_array(above, above_mid)

# Fill in lines, using the middle record of hashed_data as the tree root.
mid = len(hashed_data) // 2
to_binary_array(hashed_data, mid)

# Everything below writes the generated header to stdout: the include
# guard and extern "C" opener, the node struct and length limits, the
# entity array itself, the index of the root node, and the closing guard.
header_prologue = """#ifndef CMARK_ENTITIES_H
#define CMARK_ENTITIES_H

#ifdef __cplusplus
extern "C" {
#endif

struct cmark_entity_node {
	unsigned long value;
	unsigned char *bytes;
	int less;
	int greater;
};

#define CMARK_ENTITY_MIN_LENGTH 2
#define CMARK_ENTITY_MAX_LENGTH 31
"""

header_epilogue = """
#ifdef __cplusplus
}
#endif

#endif
"""

print(header_prologue)

print("static struct cmark_entity_node cmark_entities[] = {")
for row in lines:
  print(row)
print("};\n")

# The lookup code needs to know where the tree starts.
print("static int cmark_entities_root = %s;" % mid)

print(header_epilogue)