|
1 | 1 | from __future__ import absolute_import, division, unicode_literals |
2 | 2 |
|
3 | | -try: |
4 | | - import json |
5 | | -except ImportError: |
6 | | - import simplejson as json |
7 | | - |
8 | | -from html5lib import html5parser, sanitizer, constants, treebuilders |
9 | | - |
10 | | - |
11 | | -def toxmlFactory(): |
12 | | - tree = treebuilders.getTreeBuilder("etree") |
13 | | - |
14 | | - def toxml(element): |
15 | | - # encode/decode roundtrip required for Python 2.6 compatibility |
16 | | - result_bytes = tree.implementation.tostring(element, encoding="utf-8") |
17 | | - return result_bytes.decode("utf-8") |
18 | | - |
19 | | - return toxml |
20 | | - |
21 | | - |
22 | | -def runSanitizerTest(name, expected, input, toxml=None): |
23 | | - if toxml is None: |
24 | | - toxml = toxmlFactory() |
25 | | - expected = ''.join([toxml(token) for token in html5parser.HTMLParser(). |
26 | | - parseFragment(expected)]) |
27 | | - expected = json.loads(json.dumps(expected)) |
| 3 | +from html5lib import constants |
| 4 | +from html5lib import parseFragment, serialize |
| 5 | +from html5lib.filters import sanitizer |
| 6 | + |
| 7 | + |
| 8 | +def runSanitizerTest(name, expected, input): |
| 9 | + parsed = parseFragment(expected) |
| 10 | + expected = serialize(parsed, |
| 11 | + omit_optional_tags=False, |
| 12 | + use_trailing_solidus=True, |
| 13 | + space_before_trailing_solidus=False, |
| 14 | + quote_attr_values=True, |
| 15 | + quote_char='"') |
28 | 16 | assert expected == sanitize_html(input) |
29 | 17 |
|
30 | 18 |
|
31 | | -def sanitize_html(stream, toxml=None): |
32 | | - if toxml is None: |
33 | | - toxml = toxmlFactory() |
34 | | - return ''.join([toxml(token) for token in |
35 | | - html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer). |
36 | | - parseFragment(stream)]) |
| 19 | +def sanitize_html(stream): |
| 20 | + parsed = parseFragment(stream) |
| 21 | + serialized = serialize(parsed, |
| 22 | + sanitize=True, |
| 23 | + omit_optional_tags=False, |
| 24 | + use_trailing_solidus=True, |
| 25 | + space_before_trailing_solidus=False, |
| 26 | + quote_attr_values=True, |
| 27 | + quote_char='"') |
| 28 | + return serialized |
37 | 29 |
|
38 | 30 |
|
39 | 31 | def test_should_handle_astral_plane_characters(): |
40 | | - assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>𝒵 𝔸</p>") |
| 32 | + assert '<p>\U0001d4b5 \U0001d538</p>' == sanitize_html("<p>𝒵 𝔸</p>") |
41 | 33 |
|
42 | 34 |
|
43 | 35 | def test_should_allow_relative_uris(): |
44 | | - assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>') |
| 36 | + assert '<p><a href="/example.com"></a></p>' == sanitize_html('<p><a href="/example.com"></a></p>') |
45 | 37 |
|
46 | 38 |
|
47 | 39 | def test_sanitizer(): |
48 | | - toxml = toxmlFactory() |
49 | | - for tag_name in sanitizer.HTMLSanitizer.allowed_elements: |
| 40 | + for ns, tag_name in sanitizer.allowed_elements: |
50 | 41 | if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: |
51 | 42 | continue # TODO |
52 | 43 | if tag_name != tag_name.lower(): |
53 | 44 | continue # TODO |
54 | 45 | if tag_name == 'image': |
55 | 46 | yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, |
56 | 47 | "<img title=\"1\"/>foo <bad>bar</bad> baz", |
57 | | - "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
58 | | - toxml) |
| 48 | + "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) |
59 | 49 | elif tag_name == 'br': |
60 | 50 | yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, |
61 | 51 | "<br title=\"1\"/>foo <bad>bar</bad> baz<br/>", |
62 | | - "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
63 | | - toxml) |
| 52 | + "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) |
64 | 53 | elif tag_name in constants.voidElements: |
65 | 54 | yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, |
66 | 55 | "<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name, |
67 | | - "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
68 | | - toxml) |
| 56 | + "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) |
69 | 57 | else: |
70 | 58 | yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, |
71 | 59 | "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
72 | | - "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
73 | | - toxml) |
| 60 | + "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) |
74 | 61 |
|
75 | | - for tag_name in sanitizer.HTMLSanitizer.allowed_elements: |
76 | | - tag_name = tag_name.upper() |
77 | | - yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name, |
78 | | - "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
79 | | - "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), |
80 | | - toxml) |
81 | | - |
82 | | - for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: |
| 62 | + for ns, attribute_name in sanitizer.allowed_attributes: |
83 | 63 | if attribute_name != attribute_name.lower(): |
84 | 64 | continue # TODO |
85 | 65 | if attribute_name == 'style': |
86 | 66 | continue |
87 | 67 | attribute_value = 'foo' |
88 | | - if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri: |
89 | | - attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0] |
| 68 | + if attribute_name in sanitizer.attr_val_is_uri: |
| 69 | + attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0] |
90 | 70 | yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name, |
91 | 71 | "<p %s=\"%s\">foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), |
92 | | - "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), |
93 | | - toxml) |
94 | | - |
95 | | - for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: |
96 | | - attribute_name = attribute_name.upper() |
97 | | - yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name, |
98 | | - "<p>foo <bad>bar</bad> baz</p>", |
99 | | - "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name, |
100 | | - toxml) |
| 72 | + "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value)) |
101 | 73 |
|
102 | | - for protocol in sanitizer.HTMLSanitizer.allowed_protocols: |
| 74 | + for protocol in sanitizer.allowed_protocols: |
103 | 75 | rest_of_uri = '//sub.domain.tld/path/object.ext' |
104 | 76 | if protocol == 'data': |
105 | 77 | rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' |
106 | 78 | yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, |
107 | 79 | "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), |
108 | | - """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), |
109 | | - toxml) |
| 80 | + """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri)) |
110 | 81 |
|
111 | 82 | yield (runSanitizerTest, "test_invalid_data_uri", |
112 | 83 | "<audio controls=\"\"></audio>", |
113 | | - "<audio controls=\"\" src=\"data:foobar\"></audio>", |
114 | | - toxml) |
| 84 | + "<audio controls=\"\" src=\"data:foobar\"></audio>") |
115 | 85 |
|
116 | 86 | yield (runSanitizerTest, "test_invalid_ipv6_url", |
117 | 87 | "<a>", |
118 | | - "<a href=\"h://]\">", |
119 | | - toxml) |
| 88 | + "<a href=\"h://]\">") |
120 | 89 |
|
121 | 90 | yield (runSanitizerTest, "test_data_uri_disallowed_type", |
122 | 91 | "<audio controls=\"\"></audio>", |
123 | | - "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>", |
124 | | - toxml) |
| 92 | + "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>") |
125 | 93 |
|
126 | | - for protocol in sanitizer.HTMLSanitizer.allowed_protocols: |
| 94 | + for protocol in sanitizer.allowed_protocols: |
127 | 95 | rest_of_uri = '//sub.domain.tld/path/object.ext' |
128 | 96 | if protocol == 'data': |
129 | 97 | rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' |
130 | 98 | protocol = protocol.upper() |
131 | 99 | yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, |
132 | 100 | "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), |
133 | | - """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), |
134 | | - toxml) |
| 101 | + """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri)) |
0 commit comments