Skip to content

Commit deb205a

Browse files
committed
Get the lxml treewalker working under the joint codebase under Py2.
This hard-codes the fact that lxml uses UTF-8 (byte) strings under Py2, and adds asserts to the generic treewalker to ensure we have Unicode strings.
1 parent 82377ec commit deb205a

File tree

2 files changed

+56
-26
lines changed

2 files changed

+56
-26
lines changed

html5lib/treewalkers/_base.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,37 +17,45 @@ def __iter__(self):
1717
def error(self, msg):
1818
return {"type": "SerializeError", "data": msg}
1919

20-
def normalizeAttrs(self, attrs):
21-
newattrs = {}
22-
if attrs:
23-
#TODO: treewalkers should always have attrs
24-
for (namespace,name),value in attrs.items():
25-
assert namespace is None or isinstance(namespace, text_type), type(namespace)
26-
assert isinstance(name, text_type)
27-
assert isinstance(value, text_type)
28-
newattrs[(namespace,name)] = value
29-
return newattrs
30-
3120
def emptyTag(self, namespace, name, attrs, hasChildren=False):
21+
assert namespace is None or isinstance(namespace, text_type), type(namespace)
22+
assert isinstance(name, text_type), type(name)
23+
assert all((namespace is None or isinstance(namespace, text_type)) and
24+
isinstance(name, text_type) and
25+
isinstance(value, text_type)
26+
for (namespace, name), value in attrs.items())
27+
3228
yield {"type": "EmptyTag", "name": name,
3329
"namespace":namespace,
34-
"data": self.normalizeAttrs(attrs)}
30+
"data": attrs}
3531
if hasChildren:
3632
yield self.error(_("Void element has children"))
3733

3834
def startTag(self, namespace, name, attrs):
35+
assert namespace is None or isinstance(namespace, text_type), type(namespace)
36+
assert isinstance(name, text_type), type(name)
37+
assert all((namespace is None or isinstance(namespace, text_type)) and
38+
isinstance(name, text_type) and
39+
isinstance(value, text_type)
40+
for (namespace, name), value in attrs.items())
41+
3942
return {"type": "StartTag",
4043
"name": name,
4144
"namespace":namespace,
42-
"data": self.normalizeAttrs(attrs)}
45+
"data": attrs}
4346

4447
def endTag(self, namespace, name):
48+
assert namespace is None or isinstance(namespace, text_type), type(namespace)
49+
assert isinstance(name, text_type), type(namespace)
50+
4551
return {"type": "EndTag",
4652
"name": name,
4753
"namespace":namespace,
4854
"data": {}}
4955

5056
def text(self, data):
57+
assert isinstance(data, text_type), type(data)
58+
5159
data = data
5260
middle = data.lstrip(spaceCharacters)
5361
left = data[:len(data)-len(middle)]
@@ -62,16 +70,24 @@ def text(self, data):
6270
yield {"type": "SpaceCharacters", "data": right}
6371

6472
def comment(self, data):
73+
assert isinstance(data, text_type), type(data)
74+
6575
return {"type": "Comment", "data": data}
6676

6777
def doctype(self, name, publicId=None, systemId=None, correct=True):
78+
assert name is None or isinstance(name, text_type), type(name)
79+
assert publicId is None or isinstance(publicId, text_type), type(publicId)
80+
assert systemId is None or isinstance(systemId, text_type), type(systemId)
81+
6882
return {"type": "Doctype",
69-
"name": name is not None and name or "",
83+
"name": name if name is not None else "",
7084
"publicId": publicId,
7185
"systemId": systemId,
7286
"correct": correct}
7387

7488
def entity(self, name):
89+
assert isinstance(name, text_type), type(name)
90+
7591
return {"type": "Entity", "name": name}
7692

7793
def unknown(self, nodeType):

html5lib/treewalkers/lxmletree.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from __future__ import absolute_import, division, unicode_literals
2+
from six import text_type
23

34
from lxml import etree
45
from html5lib.treebuilders.etree import tag_regexp
@@ -12,14 +13,23 @@
1213
from html5lib.constants import voidElements
1314
from html5lib import ihatexml
1415

16+
def ensure_str(s):
17+
if s is None:
18+
return None
19+
elif isinstance(s, text_type):
20+
return s
21+
else:
22+
return s.decode("utf-8", "strict")
23+
1524
class Root(object):
1625
def __init__(self, et):
1726
self.elementtree = et
1827
self.children = []
1928
if et.docinfo.internalDTD:
20-
self.children.append(Doctype(self, et.docinfo.root_name,
21-
et.docinfo.public_id,
22-
et.docinfo.system_url))
29+
self.children.append(Doctype(self,
30+
ensure_str(et.docinfo.root_name),
31+
ensure_str(et.docinfo.public_id),
32+
ensure_str(et.docinfo.system_url)))
2333
root = et.getroot()
2434
node = root
2535

@@ -67,15 +77,17 @@ def __init__(self, fragment_root, obj):
6777
self.root_node = fragment_root
6878
self.obj = obj
6979
if hasattr(self.obj, 'text'):
70-
self.text = self.obj.text
80+
self.text = ensure_str(self.obj.text)
7181
else:
7282
self.text = None
7383
if hasattr(self.obj, 'tail'):
74-
self.tail = self.obj.tail
84+
self.tail = ensure_str(self.obj.tail)
7585
else:
7686
self.tail = None
7787
self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
78-
assert not self.isstring or isinstance(obj, str) or sys.version_info.major == 2
88+
# Support for bytes here is Py2
89+
if self.isstring:
90+
self.obj = ensure_str(self.obj)
7991

8092
def __getattr__(self, name):
8193
return getattr(self.obj, name)
@@ -120,7 +132,7 @@ def getNodeDetails(self, node):
120132
if isinstance(node, tuple): # Text node
121133
node, key = node
122134
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
123-
return _base.TEXT, getattr(node, key)
135+
return _base.TEXT, ensure_str(getattr(node, key))
124136

125137
elif isinstance(node, Root):
126138
return (_base.DOCUMENT,)
@@ -129,24 +141,26 @@ def getNodeDetails(self, node):
129141
return _base.DOCTYPE, node.name, node.public_id, node.system_id
130142

131143
elif isinstance(node, FragmentWrapper) and node.isstring:
132-
return _base.TEXT, node
144+
return _base.TEXT, node.obj
133145

134146
elif node.tag == etree.Comment:
135-
return _base.COMMENT, node.text
147+
return _base.COMMENT, ensure_str(node.text)
136148

137149
elif node.tag == etree.Entity:
138-
return _base.ENTITY, node.text[1:-1] # strip &;
150+
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
139151

140152
else:
141153
#This is assumed to be an ordinary element
142-
match = tag_regexp.match(node.tag)
154+
match = tag_regexp.match(ensure_str(node.tag))
143155
if match:
144156
namespace, tag = match.groups()
145157
else:
146158
namespace = None
147-
tag = node.tag
159+
tag = ensure_str(node.tag)
148160
attrs = {}
149161
for name, value in list(node.attrib.items()):
162+
name = ensure_str(name)
163+
value = ensure_str(value)
150164
match = tag_regexp.match(name)
151165
if match:
152166
attrs[(match.group(1),match.group(2))] = value

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy