Skip to content

Commit 73215c5

Browse files
committed
Merge pull request #222 from gsnedders/lint_fixes
Various fixes for the lint filter, and use it to validate treewalker sanity in tests.
2 parents: af0199c + ca6591c; commit 73215c5

File tree

5 files changed: 74 additions and 132 deletions.

html5lib/filters/lint.py

Lines changed: 44 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -1,90 +1,77 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
from six import text_type
4+
35
from . import _base
4-
from ..constants import cdataElements, rcdataElements, voidElements
6+
from ..constants import namespaces, voidElements
57

68
from ..constants import spaceCharacters
79
spaceCharacters = "".join(spaceCharacters)
810

911

10-
class LintError(Exception):
11-
pass
12-
13-
1412
class Filter(_base.Filter):
1513
def __iter__(self):
1614
open_elements = []
17-
contentModelFlag = "PCDATA"
1815
for token in _base.Filter.__iter__(self):
1916
type = token["type"]
2017
if type in ("StartTag", "EmptyTag"):
18+
namespace = token["namespace"]
2119
name = token["name"]
22-
if contentModelFlag != "PCDATA":
23-
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
24-
if not isinstance(name, str):
25-
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
26-
if not name:
27-
raise LintError("Empty tag name")
28-
if type == "StartTag" and name in voidElements:
29-
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
30-
elif type == "EmptyTag" and name not in voidElements:
31-
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
20+
assert namespace is None or isinstance(namespace, text_type)
21+
assert namespace != ""
22+
assert isinstance(name, text_type)
23+
assert name != ""
24+
assert isinstance(token["data"], dict)
25+
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
26+
assert type == "EmptyTag"
27+
else:
28+
assert type == "StartTag"
3229
if type == "StartTag":
33-
open_elements.append(name)
34-
for name, value in token["data"]:
35-
if not isinstance(name, str):
36-
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
37-
if not name:
38-
raise LintError("Empty attribute name")
39-
if not isinstance(value, str):
40-
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
41-
if name in cdataElements:
42-
contentModelFlag = "CDATA"
43-
elif name in rcdataElements:
44-
contentModelFlag = "RCDATA"
45-
elif name == "plaintext":
46-
contentModelFlag = "PLAINTEXT"
30+
open_elements.append((namespace, name))
31+
for (namespace, name), value in token["data"].items():
32+
assert namespace is None or isinstance(namespace, text_type)
33+
assert namespace != ""
34+
assert isinstance(name, text_type)
35+
assert name != ""
36+
assert isinstance(value, text_type)
4737

4838
elif type == "EndTag":
39+
namespace = token["namespace"]
4940
name = token["name"]
50-
if not isinstance(name, str):
51-
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
52-
if not name:
53-
raise LintError("Empty tag name")
54-
if name in voidElements:
55-
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
56-
start_name = open_elements.pop()
57-
if start_name != name:
58-
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
59-
contentModelFlag = "PCDATA"
41+
assert namespace is None or isinstance(namespace, text_type)
42+
assert namespace != ""
43+
assert isinstance(name, text_type)
44+
assert name != ""
45+
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
46+
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
47+
else:
48+
start = open_elements.pop()
49+
assert start == (namespace, name)
6050

6151
elif type == "Comment":
62-
if contentModelFlag != "PCDATA":
63-
raise LintError("Comment not in PCDATA content model flag")
52+
data = token["data"]
53+
assert isinstance(data, text_type)
6454

6555
elif type in ("Characters", "SpaceCharacters"):
6656
data = token["data"]
67-
if not isinstance(data, str):
68-
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
69-
if not data:
70-
raise LintError("%(type)s token with empty data" % {"type": type})
57+
assert isinstance(data, text_type)
58+
assert data != ""
7159
if type == "SpaceCharacters":
72-
data = data.strip(spaceCharacters)
73-
if data:
74-
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
60+
assert data.strip(spaceCharacters) == ""
7561

7662
elif type == "Doctype":
7763
name = token["name"]
78-
if contentModelFlag != "PCDATA":
79-
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
80-
if not isinstance(name, str):
81-
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
82-
# XXX: what to do with token["data"] ?
64+
assert name is None or isinstance(name, text_type)
65+
assert token["publicId"] is None or isinstance(name, text_type)
66+
assert token["systemId"] is None or isinstance(name, text_type)
67+
68+
elif type == "Entity":
69+
assert isinstance(token["name"], text_type)
8370

84-
elif type in ("ParseError", "SerializeError"):
85-
pass
71+
elif type == "SerializerError":
72+
assert isinstance(token["data"], text_type)
8673

8774
else:
88-
raise LintError("Unknown token type: %(type)s" % {"type": type})
75+
assert False, "Unknown token type: %(type)s" % {"type": type}
8976

9077
yield token

html5lib/tests/test_treewalkers.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
from .support import get_data_files, TestData, convertExpected
1515

1616
from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants
17+
from html5lib.filters.lint import Filter as Lint
1718

1819

1920
treeTypes = {
@@ -77,21 +78,21 @@ def test_all_tokens(self):
7778
expected = [
7879
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
7980
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
80-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
81+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
8182
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
8283
{'data': 'a', 'type': 'Characters'},
8384
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
8485
{'data': 'b', 'type': 'Characters'},
85-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
86+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
8687
{'data': 'c', 'type': 'Characters'},
87-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
88-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
88+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
89+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
8990
]
9091
for treeName, treeCls in sorted(treeTypes.items()):
9192
p = html5parser.HTMLParser(tree=treeCls["builder"])
9293
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
9394
document = treeCls.get("adapter", lambda x: x)(document)
94-
output = treeCls["walker"](document)
95+
output = Lint(treeCls["walker"](document))
9596
for expectedToken, outputToken in zip(expected, output):
9697
self.assertEqual(expectedToken, outputToken)
9798

@@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
111112

112113
document = treeClass.get("adapter", lambda x: x)(document)
113114
try:
114-
output = treewalkers.pprint(treeClass["walker"](document))
115+
output = treewalkers.pprint(Lint(treeClass["walker"](document)))
115116
output = attrlist.sub(sortattrs, output)
116117
expected = attrlist.sub(sortattrs, convertExpected(expected))
117118
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],

html5lib/treewalkers/_base.py

Lines changed: 17 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,7 @@
11
from __future__ import absolute_import, division, unicode_literals
2-
from six import text_type, string_types
32

43
from xml.dom import Node
5-
from ..constants import voidElements, spaceCharacters
4+
from ..constants import namespaces, voidElements, spaceCharacters
65

76
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
87
"TreeWalker", "NonRecursiveTreeWalker"]
@@ -18,24 +17,6 @@
1817
spaceCharacters = "".join(spaceCharacters)
1918

2019

21-
def to_text(s, blank_if_none=True):
22-
"""Wrapper around six.text_type to convert None to empty string"""
23-
if s is None:
24-
if blank_if_none:
25-
return ""
26-
else:
27-
return None
28-
elif isinstance(s, text_type):
29-
return s
30-
else:
31-
return text_type(s)
32-
33-
34-
def is_text_or_none(string):
35-
"""Wrapper around isinstance(string_types) or is None"""
36-
return string is None or isinstance(string, string_types)
37-
38-
3920
class TreeWalker(object):
4021
def __init__(self, tree):
4122
self.tree = tree
@@ -47,47 +28,25 @@ def error(self, msg):
4728
return {"type": "SerializeError", "data": msg}
4829

4930
def emptyTag(self, namespace, name, attrs, hasChildren=False):
50-
assert namespace is None or isinstance(namespace, string_types), type(namespace)
51-
assert isinstance(name, string_types), type(name)
52-
assert all((namespace is None or isinstance(namespace, string_types)) and
53-
isinstance(name, string_types) and
54-
isinstance(value, string_types)
55-
for (namespace, name), value in attrs.items())
56-
57-
yield {"type": "EmptyTag", "name": to_text(name, False),
58-
"namespace": to_text(namespace),
31+
yield {"type": "EmptyTag", "name": name,
32+
"namespace": namespace,
5933
"data": attrs}
6034
if hasChildren:
6135
yield self.error("Void element has children")
6236

6337
def startTag(self, namespace, name, attrs):
64-
assert namespace is None or isinstance(namespace, string_types), type(namespace)
65-
assert isinstance(name, string_types), type(name)
66-
assert all((namespace is None or isinstance(namespace, string_types)) and
67-
isinstance(name, string_types) and
68-
isinstance(value, string_types)
69-
for (namespace, name), value in attrs.items())
70-
7138
return {"type": "StartTag",
72-
"name": text_type(name),
73-
"namespace": to_text(namespace),
74-
"data": dict(((to_text(namespace, False), to_text(name)),
75-
to_text(value, False))
76-
for (namespace, name), value in attrs.items())}
39+
"name": name,
40+
"namespace": namespace,
41+
"data": attrs}
7742

7843
def endTag(self, namespace, name):
79-
assert namespace is None or isinstance(namespace, string_types), type(namespace)
80-
assert isinstance(name, string_types), type(namespace)
81-
8244
return {"type": "EndTag",
83-
"name": to_text(name, False),
84-
"namespace": to_text(namespace),
85-
"data": {}}
45+
"name": name,
46+
"namespace": namespace}
8647

8748
def text(self, data):
88-
assert isinstance(data, string_types), type(data)
89-
90-
data = to_text(data)
49+
data = data
9150
middle = data.lstrip(spaceCharacters)
9251
left = data[:len(data) - len(middle)]
9352
if left:
@@ -101,25 +60,16 @@ def text(self, data):
10160
yield {"type": "SpaceCharacters", "data": right}
10261

10362
def comment(self, data):
104-
assert isinstance(data, string_types), type(data)
105-
106-
return {"type": "Comment", "data": text_type(data)}
107-
108-
def doctype(self, name, publicId=None, systemId=None, correct=True):
109-
assert is_text_or_none(name), type(name)
110-
assert is_text_or_none(publicId), type(publicId)
111-
assert is_text_or_none(systemId), type(systemId)
63+
return {"type": "Comment", "data": data}
11264

65+
def doctype(self, name, publicId=None, systemId=None):
11366
return {"type": "Doctype",
114-
"name": to_text(name),
115-
"publicId": to_text(publicId),
116-
"systemId": to_text(systemId),
117-
"correct": to_text(correct)}
67+
"name": name,
68+
"publicId": publicId,
69+
"systemId": systemId}
11870

11971
def entity(self, name):
120-
assert isinstance(name, string_types), type(name)
121-
122-
return {"type": "Entity", "name": text_type(name)}
72+
return {"type": "Entity", "name": name}
12373

12474
def unknown(self, nodeType):
12575
return self.error("Unknown node type: " + nodeType)
@@ -154,7 +104,7 @@ def __iter__(self):
154104

155105
elif type == ELEMENT:
156106
namespace, name, attributes, hasChildren = details
157-
if name in voidElements:
107+
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
158108
for token in self.emptyTag(namespace, name, attributes,
159109
hasChildren):
160110
yield token
@@ -187,7 +137,7 @@ def __iter__(self):
187137
type, details = details[0], details[1:]
188138
if type == ELEMENT:
189139
namespace, name, attributes, hasChildren = details
190-
if name not in voidElements:
140+
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
191141
yield self.endTag(namespace, name)
192142
if self.tree is currentNode:
193143
currentNode = None

html5lib/treewalkers/genshistream.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ def tokens(self, event, next):
4848
elif kind == END:
4949
name = data.localname
5050
namespace = data.namespace
51-
if name not in voidElements:
51+
if namespace != namespaces["html"] or name not in voidElements:
5252
yield self.endTag(namespace, name)
5353

5454
elif kind == COMMENT:

html5lib/treewalkers/lxmletree.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,8 +118,10 @@ def __len__(self):
118118
class TreeWalker(_base.NonRecursiveTreeWalker):
119119
def __init__(self, tree):
120120
if hasattr(tree, "getroot"):
121+
self.fragmentChildren = set()
121122
tree = Root(tree)
122123
elif isinstance(tree, list):
124+
self.fragmentChildren = set(tree)
123125
tree = FragmentRoot(tree)
124126
_base.NonRecursiveTreeWalker.__init__(self, tree)
125127
self.filter = ihatexml.InfosetFilter()
@@ -137,7 +139,7 @@ def getNodeDetails(self, node):
137139
return _base.DOCTYPE, node.name, node.public_id, node.system_id
138140

139141
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
140-
return _base.TEXT, node.obj
142+
return _base.TEXT, ensure_str(node.obj)
141143

142144
elif node.tag == etree.Comment:
143145
return _base.COMMENT, ensure_str(node.text)
@@ -197,5 +199,7 @@ def getParentNode(self, node):
197199
if key == "text":
198200
return node
199201
# else: fallback to "normal" processing
202+
elif node in self.fragmentChildren:
203+
return None
200204

201205
return node.getparent()

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy