Skip to content

Commit 1d3434f

Browse files
Marc DMgsnedders
authored andcommitted
Fix html5lib#63: treewalker fails to handle bytes data from outwith html5lib
We do, ourselves, ensure everything we put in the tree is unicode under Python 2; users, however, may not be so careful and assign an attribute using a bare string, for example. We should handle this gracefully under Python 2 by coercing gently into unicode, failing in the normal way if we are unable to decode the bytes. I (Geoffrey, the committer) am not entirely convinced this is the right way to test this; on the other hand, a thorough testsuite for the issues previously in the treewalker would be large, especially considering the large number of treewalkers we support.
1 parent 84d8a74 commit 1d3434f

File tree

2 files changed

+104
-33
lines changed

2 files changed

+104
-33
lines changed

html5lib/tests/test_treewalkers.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,54 @@ def test_treewalker():
310310
"document")]
311311
errors = errors.split("\n")
312312
yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls
313+
314+
315+
def set_attribute_on_first_child(docfrag, name, value, treeName):
316+
"""naively sets an attribute on the first child of the document
317+
fragment passed in"""
318+
setter = {'ElementTree': lambda d: d[0].set,
319+
'DOM': lambda d: d.firstChild.setAttribute}
320+
setter['cElementTree'] = setter['ElementTree']
321+
try:
322+
setter.get(treeName, setter['DOM'])(docfrag)(name, value)
323+
except AttributeError:
324+
setter['ElementTree'](docfrag)(name, value)
325+
326+
327+
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
328+
"""tests what happens when we add attributes to the intext"""
329+
treeName, treeClass = tree
330+
parser = html5parser.HTMLParser(tree=treeClass["builder"])
331+
document = parser.parseFragment(intext)
332+
for nom, val in attrs_to_add:
333+
set_attribute_on_first_child(document, nom, val, treeName)
334+
335+
document = treeClass.get("adapter", lambda x: x)(document)
336+
output = convertTokens(treeClass["walker"](document))
337+
output = attrlist.sub(sortattrs, output)
338+
if not output in expected:
339+
raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
340+
341+
342+
def test_treewalker_six_mix():
343+
"""Str/Unicode mix. If str attrs added to tree"""
344+
345+
# On Python 2.x string literals are of type str. Unless, like this
346+
# file, the programmer imports unicode_literals from __future__.
347+
# In that case, string literals become objects of type unicode.
348+
349+
# This test simulates a Py2 user, modifying attributes on a document
350+
# fragment but not using the u'' syntax nor importing unicode_literals
351+
sm_tests = [
352+
('<a href="http://example.com">Example</a>',
353+
[(str('class'), str('test123'))],
354+
'<a>\n class="test123"\n href="http://example.com"\n "Example"'),
355+
356+
('<link href="http://example.com/cow">',
357+
[(str('rel'), str('alternate'))],
358+
'<link>\n href="http://example.com/cow"\n rel="alternate"\n "Example"')
359+
]
360+
361+
for tree in treeTypes.items():
362+
for intext, attrs, expected in sm_tests:
363+
yield runTreewalkerEditTest, intext, expected, attrs, tree

html5lib/treewalkers/_base.py

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import absolute_import, division, unicode_literals
2-
from six import text_type
2+
from six import text_type, string_types
33

44
import gettext
55
_ = gettext.gettext
@@ -8,6 +8,24 @@
88
spaceCharacters = "".join(spaceCharacters)
99

1010

11+
def to_text(s, blank_if_none=True):
12+
"""Wrapper around six.text_type to convert None to empty string"""
13+
if s is None:
14+
if blank_if_none:
15+
return ""
16+
else:
17+
return None
18+
elif isinstance(s, text_type):
19+
return s
20+
else:
21+
return text_type(s)
22+
23+
24+
def is_text_or_none(string):
25+
"""Wrapper around isinstance(string_types) or is None"""
26+
return string is None or isinstance(string, string_types)
27+
28+
1129
class TreeWalker(object):
1230
def __init__(self, tree):
1331
self.tree = tree
@@ -19,45 +37,47 @@ def error(self, msg):
1937
return {"type": "SerializeError", "data": msg}
2038

2139
def emptyTag(self, namespace, name, attrs, hasChildren=False):
22-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
23-
assert isinstance(name, text_type), type(name)
24-
assert all((namespace is None or isinstance(namespace, text_type)) and
25-
isinstance(name, text_type) and
26-
isinstance(value, text_type)
40+
assert namespace is None or isinstance(namespace, string_types), type(namespace)
41+
assert isinstance(name, string_types), type(name)
42+
assert all((namespace is None or isinstance(namespace, string_types)) and
43+
isinstance(name, string_types) and
44+
isinstance(value, string_types)
2745
for (namespace, name), value in attrs.items())
2846

29-
yield {"type": "EmptyTag", "name": name,
30-
"namespace": namespace,
47+
yield {"type": "EmptyTag", "name": to_text(name, False),
48+
"namespace": to_text(namespace),
3149
"data": attrs}
3250
if hasChildren:
3351
yield self.error(_("Void element has children"))
3452

3553
def startTag(self, namespace, name, attrs):
36-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
37-
assert isinstance(name, text_type), type(name)
38-
assert all((namespace is None or isinstance(namespace, text_type)) and
39-
isinstance(name, text_type) and
40-
isinstance(value, text_type)
54+
assert namespace is None or isinstance(namespace, string_types), type(namespace)
55+
assert isinstance(name, string_types), type(name)
56+
assert all((namespace is None or isinstance(namespace, string_types)) and
57+
isinstance(name, string_types) and
58+
isinstance(value, string_types)
4159
for (namespace, name), value in attrs.items())
4260

4361
return {"type": "StartTag",
44-
"name": name,
45-
"namespace": namespace,
46-
"data": attrs}
62+
"name": text_type(name),
63+
"namespace": to_text(namespace),
64+
"data": dict(((to_text(namespace, False), to_text(name)),
65+
to_text(value, False))
66+
for (namespace, name), value in attrs.items())}
4767

4868
def endTag(self, namespace, name):
49-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
50-
assert isinstance(name, text_type), type(namespace)
69+
assert namespace is None or isinstance(namespace, string_types), type(namespace)
70+
assert isinstance(name, string_types), type(namespace)
5171

5272
return {"type": "EndTag",
53-
"name": name,
54-
"namespace": namespace,
73+
"name": to_text(name, False),
74+
"namespace": to_text(namespace),
5575
"data": {}}
5676

5777
def text(self, data):
58-
assert isinstance(data, text_type), type(data)
78+
assert isinstance(data, string_types), type(data)
5979

60-
data = data
80+
data = to_text(data)
6181
middle = data.lstrip(spaceCharacters)
6282
left = data[:len(data) - len(middle)]
6383
if left:
@@ -71,25 +91,25 @@ def text(self, data):
7191
yield {"type": "SpaceCharacters", "data": right}
7292

7393
def comment(self, data):
74-
assert isinstance(data, text_type), type(data)
94+
assert isinstance(data, string_types), type(data)
7595

76-
return {"type": "Comment", "data": data}
96+
return {"type": "Comment", "data": text_type(data)}
7797

7898
def doctype(self, name, publicId=None, systemId=None, correct=True):
79-
assert name is None or isinstance(name, text_type), type(name)
80-
assert publicId is None or isinstance(publicId, text_type), type(publicId)
81-
assert systemId is None or isinstance(systemId, text_type), type(systemId)
99+
assert is_text_or_none(name), type(name)
100+
assert is_text_or_none(publicId), type(publicId)
101+
assert is_text_or_none(systemId), type(systemId)
82102

83103
return {"type": "Doctype",
84-
"name": name if name is not None else "",
85-
"publicId": publicId,
86-
"systemId": systemId,
87-
"correct": correct}
104+
"name": to_text(name),
105+
"publicId": to_text(publicId),
106+
"systemId": to_text(systemId),
107+
"correct": to_text(correct)}
88108

89109
def entity(self, name):
90-
assert isinstance(name, text_type), type(name)
110+
assert isinstance(name, string_types), type(name)
91111

92-
return {"type": "Entity", "name": name}
112+
return {"type": "Entity", "name": text_type(name)}
93113

94114
def unknown(self, nodeType):
95115
return self.error(_("Unknown node type: ") + nodeType)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy