1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
# Creates C data structures for binary lookup table of entities,
# using python's html5 entity data.
# Usage: python3 tools/make_entities_h.py > src/entities.h
import html
# We use this simple hashing algorithm to convert a string
# to an integer:
def djb2(s):
bs = list(s.encode('utf-8'))
hash = 5381
for b in bs:
hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF
return hash
entities5 = html.entities.html5
# Note that most entries in the entity table end with ';', but in a few
# cases we have both a version with ';' and one without, so we strip out
# the latter to avoid duplicates:
hashed_data = sorted([[int(djb2(s[:-1])), entities5[s].encode('utf-8'), s]
for s in entities5.keys() if s[-1] == ';'])
# indices is a dictionary - given a hash it spits out the ordering
# of this entity in the list (the array index)
indices = {}
i = 0
for x in hashed_data:
indices[x[0]] = i
i = i + 1
# Formats integer as C octal escape.
def toesc(x):
return '\\' + oct(x)[2:]
# Lines is the list of lines in the array.
# We don't fill them in order, so we initialize the whole array first.
lines = [""] * len(hashed_data)
# Takes hashed_data or some sublist of it, and a midpoint (array index)
# in this list. Adds to lines a line for the midpoint, then calls
# itself recursively for the earlier and later elements. Each node
# contains indices for elements with a lesser hash and elements with
# a greater hash. An index of 0 means we're at a leaf node.
def to_binary_array(xs, mid):
# divide in half, and form binary array from each half
x = xs[mid]
lesses = xs[0:mid]
greaters = xs[mid+1:]
midlesses = len(lesses) // 2
midgreaters = len(greaters) // 2
if len(lesses) == 0:
ml = 0
else:
ml = indices[lesses[midlesses][0]]
if len(greaters) == 0:
mg = 0
else:
mg = indices[greaters[midgreaters][0]]
lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
''.join(map(toesc, x[1])) + "\", " + str(ml) +
", " + str(mg) + "}, /* &" + x[2] + " */")
if len(lesses) > 0:
to_binary_array(lesses, midlesses)
if len(greaters) > 0:
to_binary_array(greaters, midgreaters)
# Now call this to fill up the array lines:
mid = len(hashed_data) // 2
to_binary_array(hashed_data, mid)
# Print out the header:
print("""#ifndef CMARK_ENTITIES_H
#define CMARK_ENTITIES_H
#ifdef __cplusplus
extern "C" {
#endif
struct cmark_entity_node {
unsigned long value;
unsigned char *bytes;
int less;
int greater;
};
#define CMARK_ENTITY_MIN_LENGTH 2
#define CMARK_ENTITY_MAX_LENGTH 31
""")
print("static struct cmark_entity_node cmark_entities[] = {");
for line in lines:
print(line);
print("};\n");
print("static int cmark_entities_root = " + str(mid) + ";");
print("""
#ifdef __cplusplus
}
#endif
#endif
""")
|