# html2ast.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from HTMLParser import HTMLParser, HTMLParseError
import re
from htmlentitydefs import name2codepoint
import cgi

class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that accumulates a text rendering in ``output``.

    Character data is appended unchanged.  Comments, declarations and
    processing instructions are appended re-wrapped in their delimiters.
    Entity and character references are resolved and appended through
    ``output_char``, which backslash-escapes newline, double quote and
    backslash.  Start and end tags produce no output; they only maintain
    the ``in_pre`` flag.
    """

    # Tag names that is_block_tag() reports as block-level.  Built once at
    # class-creation time instead of on every call.
    _BLOCK_TAGS = frozenset([
        'article', 'header', 'aside', 'hgroup', 'blockquote',
        'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
        'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
        'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
        'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
        'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style',
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        # True between a <pre> start tag and the matching </pre> end tag.
        self.in_pre = False
        # Text accumulated so far by the handlers below.
        self.output = u""

    def handle_data(self, data):
        """Append character data unchanged."""
        self.output += data

    def handle_endtag(self, tag):
        """Clear ``in_pre`` on </pre>; end tags emit nothing."""
        if tag == "pre":
            self.in_pre = False

    def handle_starttag(self, tag, attrs):
        """Set ``in_pre`` on <pre>; start tags emit nothing."""
        if tag == "pre":
            self.in_pre = True

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags: no output and no ``in_pre`` change."""
        pass

    def handle_comment(self, data):
        """Append the comment re-wrapped in ``<!--`` ... ``-->``."""
        self.output += '<!--' + data + '-->'

    def handle_decl(self, data):
        """Append the declaration re-wrapped in ``<!`` ... ``>``."""
        self.output += '<!' + data + '>'

    def unknown_decl(self, data):
        """Append an unrecognised declaration re-wrapped in ``<!`` ... ``>``."""
        self.output += '<!' + data + '>'

    def handle_pi(self, data):
        """Append the processing instruction re-wrapped in ``<?`` ... ``>``."""
        self.output += '<?' + data + '>'

    def handle_entityref(self, name):
        """Resolve a named entity (``&name;``) and append it.

        A name missing from ``name2codepoint`` is appended as the literal
        text ``&name;`` rather than being passed on as ``None``, which used
        to raise ``TypeError`` in ``output_char``.
        """
        try:
            c = unichr(name2codepoint[name])
        except KeyError:
            self.output += u"&" + name + u";"
        else:
            self.output_char(c)

    def handle_charref(self, name):
        """Resolve a numeric reference (``&#NNN;`` / ``&#xHHH;``) and append it.

        Both ``x`` and ``X`` are accepted as the hexadecimal prefix.  A
        reference that cannot be turned into a character (malformed digits
        or a code point out of range) is appended as the literal text
        ``&#name;`` instead of crashing.
        """
        try:
            if name[:1] in ("x", "X"):
                c = unichr(int(name[1:], 16))
            else:
                c = unichr(int(name))
        except (ValueError, OverflowError):
            self.output += u"&#" + name + u";"
        else:
            self.output_char(c)

    # Helpers.
    def output_char(self, c):
        """Append one resolved character, backslash-escaping LF, ``"`` and ``\\``."""
        if c == u'\n':
            self.output += "\\n"
        elif c == u'"':
            self.output += "\\\""
        elif c == u'\\':
            self.output += "\\\\"
        else:
            self.output += c

    def is_block_tag(self, tag):
        """Return True if ``tag`` is one of the names in ``_BLOCK_TAGS``."""
        return tag in self._BLOCK_TAGS

def html2ast(html):
    """Run ``html`` through MyHTMLParser and return the accumulated output.

    ``html`` may be a UTF-8 encoded byte string or an already-decoded
    unicode string.  It is decoded once up front so that every chunk,
    including CDATA sections copied straight into the output, is unicode;
    previously raw CDATA bytes were appended to the unicode buffer, which
    failed on non-ASCII content.

    Returns the parser's ``output`` string.  Raises ``UnicodeDecodeError``
    if a byte-string input is not valid UTF-8.
    """
    if isinstance(html, bytes):
        html = html.decode('UTF-8')
    # Alternatives, in priority order: a complete CDATA section, any other
    # "<...>" markup, or a run of text without "<".  DOTALL lets a CDATA
    # section span several lines.  A "<" that is never followed by ">"
    # matches none of the alternatives and is skipped by finditer.
    html_chunk_re = re.compile(r"(<!\[CDATA\[.*?\]\]>|<[^>]*>|[^<]+)",
                               re.DOTALL)
    parser = MyHTMLParser()
    # We work around HTMLParser's limitations parsing CDATA
    # by breaking the input into chunks and passing CDATA chunks
    # through verbatim.
    for chunk in html_chunk_re.finditer(html):
        text = chunk.group(0)
        if text.startswith("<![CDATA"):
            parser.output += text
        else:
            parser.feed(text)
    parser.close()
    return parser.output


if __name__ == "__main__":
    # Filter mode: read HTML from stdin, write the converted text to stdout.
    result = html2ast(sys.stdin.read())
    # Under Python 2 (which this module targets), sys.stdout.encoding is
    # None when stdout is a pipe or file, and printing non-ASCII unicode
    # then raises UnicodeEncodeError.  Encode explicitly, using the
    # terminal's encoding when there is one and UTF-8 otherwise.
    encoding = getattr(sys.stdout, "encoding", None) or "UTF-8"
    sys.stdout.write(result.encode(encoding) + "\n")