Commit 38ec086

Move token normalisation to the tokenizer

1 parent b2e4802 · commit 38ec086

6 files changed: +151 -68 lines changed
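In short: duplicate-attribute removal ("first wins") and attribute ordering now happen in HTMLTokenizer.emitCurrentToken rather than in HTMLParser.normalizeToken, which is deleted; start tags leave the tokenizer with token["data"] already an ordered mapping (a plain dict on Python 3.7 or newer, an OrderedDict otherwise). A minimal sketch of the user-visible behaviour, mirroring the removed test_duplicate_attribute:

# Minimal sketch of the behaviour this commit preserves, mirroring the
# removed test_duplicate_attribute; assumes the default etree treebuilder.
import html5lib

doc = html5lib.parse('<p class=a class=b>')
p = doc[1][0]  # <html>'s children are <head> and <body>; body[0] is the <p>
assert p.get('class') == 'a'  # duplicate attribute dropped, first one wins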

html5lib/_tokenizer.py

Lines changed: 15 additions & 1 deletion

@@ -2,7 +2,8 @@
 
 from six import unichr as chr
 
-from collections import deque
+from collections import deque, OrderedDict
+from sys import version_info
 
 from .constants import spaceCharacters
 from .constants import entities
@@ -17,6 +18,11 @@
 
 entitiesTrie = Trie(entities)
 
+if version_info >= (3, 7):
+    attributeMap = dict
+else:
+    attributeMap = OrderedDict
+
 
 class HTMLTokenizer(object):
     """ This class takes care of tokenizing HTML.
@@ -228,6 +234,14 @@ def emitCurrentToken(self):
         # Add token to the queue to be yielded
         if (token["type"] in tagTokenTypes):
             token["name"] = token["name"].translate(asciiUpper2Lower)
+            if token["type"] == tokenTypes["StartTag"]:
+                raw = token["data"]
+                data = attributeMap(raw)
+                if len(raw) > len(data):
+                    # we had some duplicated attribute, fix so first wins
+                    data.update(raw[::-1])
+                token["data"] = data
+
             if token["type"] == tokenTypes["EndTag"]:
                 if token["data"]:
                     self.tokenQueue.append({"type": tokenTypes["ParseError"],
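The first-wins trick is worth unpacking: attributeMap(raw) keeps the *last* value for each duplicated name, so when duplicates are detected the code re-applies the pairs in reverse with data.update(raw[::-1]), restoring the first occurrence's value while keeping the key order fixed by the initial construction. A worked sketch of just that fix-up, assuming dict ordering semantics (Python 3.7 or newer); raw here is a hypothetical stand-in for the tokenizer's accumulated attribute list:

# Worked sketch of the first-wins fix-up above (standalone example;
# `raw` stands in for the tokenizer's accumulated (name, value) pairs).
raw = [('a', '1'), ('b', '2'), ('a', '3')]

data = dict(raw)            # {'a': '3', 'b': '2'}: the last duplicate won
if len(raw) > len(data):
    data.update(raw[::-1])  # re-apply pairs reversed: 'a' ends up '1' again
assert data == {'a': '1', 'b': '2'}
assert list(data) == ['a', 'b']  # insertion order of first occurrences kept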

html5lib/html5parser.py

Lines changed: 3 additions & 28 deletions

@@ -3,9 +3,6 @@
 
 import types
 
-from collections import OrderedDict
-from sys import version_info
-
 from . import _inputstream
 from . import _tokenizer
 
@@ -26,12 +23,6 @@
 )
 
 
-if version_info >= (3, 7):
-    attributeMap = dict
-else:
-    attributeMap = OrderedDict
-
-
 def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
     """Parse an HTML document as a string or file-like object into a tree
 
@@ -210,7 +201,7 @@ def mainLoop(self):
         DoctypeToken = tokenTypes["Doctype"]
         ParseErrorToken = tokenTypes["ParseError"]
 
-        for token in self.normalizedTokens():
+        for token in self.tokenizer:
             prev_token = None
             new_token = token
             while new_token is not None:
@@ -268,10 +259,6 @@ def mainLoop(self):
             if reprocess:
                 assert self.phase not in phases
 
-    def normalizedTokens(self):
-        for token in self.tokenizer:
-            yield self.normalizeToken(token)
-
     def parse(self, stream, *args, **kwargs):
         """Parse a HTML document into a well-formed tree
 
@@ -333,18 +320,6 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        if self.strict:
            raise ParseError(E[errorcode] % datavars)
 
-    def normalizeToken(self, token):
-        # HTML5 specific normalizations to the token stream
-        if token["type"] == tokenTypes["StartTag"]:
-            raw = token["data"]
-            data = attributeMap(raw)
-            if len(raw) > len(data):
-                # we had some duplicated attribute, fix so first wins
-                data.update(raw[::-1])
-            token["data"] = data
-
-        return token
-
     def adjustMathMLAttributes(self, token):
         adjust_attributes(token, adjustMathMLAttributes)
 
@@ -2803,8 +2778,8 @@ def processEndTag(self, token):
 def adjust_attributes(token, replacements):
     needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
     if needs_adjustment:
-        token['data'] = attributeMap((replacements.get(k, k), v)
-                                     for k, v in token['data'].items())
+        token['data'] = type(token['data'])((replacements.get(k, k), v)
+                                            for k, v in token['data'].items())
 
 
 def impliedTagToken(name, type="EndTag", attributes=None,
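With the module-level attributeMap gone from the parser, adjust_attributes now rebuilds the mapping with type(token['data']), so whatever ordered mapping the tokenizer produced (dict or OrderedDict) survives foreign-attribute adjustment. A minimal sketch of that pattern; token and replacements here are hypothetical stand-ins, with the refx -> refX entry borrowed from html5lib's SVG case fix-ups:

# Minimal sketch of the type-preserving rebuild in adjust_attributes.
from collections import OrderedDict

token = {'data': OrderedDict([('refx', '2'), ('b', '3')])}
replacements = {'refx': 'refX'}  # e.g. an SVG attribute case adjustment

token['data'] = type(token['data'])((replacements.get(k, k), v)
                                    for k, v in token['data'].items())
assert isinstance(token['data'], OrderedDict)  # mapping type preserved
assert list(token['data'].items()) == [('refX', '2'), ('b', '3')]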

html5lib/tests/test_parser2.py

Lines changed: 2 additions & 38 deletions

@@ -1,12 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from six import PY2, text_type, unichr
+from six import PY2, text_type
 
 import io
 
 from . import support # noqa
 
-from html5lib.constants import namespaces, tokenTypes
+from html5lib.constants import namespaces
 from html5lib import parse, parseFragment, HTMLParser
 
 
@@ -53,42 +53,6 @@ def test_unicode_file():
     assert parse(io.StringIO("a")) is not None
 
 
-def test_maintain_attribute_order():
-    # This is here because we impl it in parser and not tokenizer
-    p = HTMLParser()
-    # generate loads to maximize the chance a hash-based mutation will occur
-    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
-    token = {'name': 'html',
-             'selfClosing': False,
-             'selfClosingAcknowledged': False,
-             'type': tokenTypes["StartTag"],
-             'data': attrs}
-    out = p.normalizeToken(token)
-    attr_order = list(out["data"].keys())
-    assert attr_order == [x for x, i in attrs]
-
-
-def test_duplicate_attribute():
-    # This is here because we impl it in parser and not tokenizer
-    doc = parse('<p class=a class=b>')
-    el = doc[1][0]
-    assert el.get("class") == "a"
-
-
-def test_maintain_duplicate_attribute_order():
-    # This is here because we impl it in parser and not tokenizer
-    p = HTMLParser()
-    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
-    token = {'name': 'html',
-             'selfClosing': False,
-             'selfClosingAcknowledged': False,
-             'type': tokenTypes["StartTag"],
-             'data': attrs + [('a', len(attrs))]}
-    out = p.normalizeToken(token)
-    attr_order = list(out["data"].keys())
-    assert attr_order == [x for x, i in attrs]
-
-
 def test_debug_log():
     parser = HTMLParser(debug=True)
     parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

html5lib/tests/test_tokenizer2.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import io
+
+from six import unichr, text_type
+
+from html5lib._tokenizer import HTMLTokenizer
+from html5lib.constants import tokenTypes
+
+
+def ignore_parse_errors(toks):
+    for tok in toks:
+        if tok['type'] != tokenTypes['ParseError']:
+            yield tok
+
+
+def test_maintain_attribute_order():
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == len(attrs)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
+        assert in_name == out_name
+        assert in_value == out_value
+
+
+def test_duplicate_attribute():
+    stream = io.StringIO("<span a=1 a=2 a=3>")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == 1
+    assert list(attrs_tok.items()) == [('a', '1')]
+
+
+def test_maintain_duplicate_attribute_order():
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + " a=100>")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == len(attrs)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
+        assert in_name == out_name
+        assert in_value == out_value

html5lib/tests/test_treewalkers.py

Lines changed: 64 additions & 0 deletions

@@ -1,7 +1,9 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import itertools
+import sys
 
+from six import unichr, text_type
 import pytest
 
 try:
@@ -135,3 +137,65 @@ def test_lxml_xml():
     output = Lint(walker(lxmltree))
 
     assert list(output) == expected
+
+
+@pytest.mark.parametrize("treeName",
+                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
+                                                        pytest.mark.skipif(sys.version_info < (3, 7), reason="dict order undef")])
+                          for treeName in sorted(treeTypes.keys())])
+def test_maintain_attribute_order(treeName):
+    treeAPIs = treeTypes[treeName]
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    data = "<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">"
+
+    parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
+    document = parser.parseFragment(data)
+
+    document = treeAPIs.get("adapter", lambda x: x)(document)
+    output = list(Lint(treeAPIs["walker"](document)))
+
+    assert len(output) == 2
+    assert output[0]['type'] == 'StartTag'
+    assert output[1]['type'] == "EndTag"
+
+    attrs_out = output[0]['data']
+    assert len(attrs) == len(attrs_out)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_out.items()):
+        assert (None, in_name) == out_name
+        assert in_value == out_value
+
+
+@pytest.mark.parametrize("treeName",
+                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
+                                                        pytest.mark.skipif(sys.version_info < (3, 7), reason="dict order undef")])
+                          for treeName in sorted(treeTypes.keys())])
+def test_maintain_attribute_order_adjusted(treeName):
+    treeAPIs = treeTypes[treeName]
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    # generate loads to maximize the chance a hash-based mutation will occur
+    data = "<svg a=1 refx=2 b=3 xml:lang=4 c=5>"
+
+    parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
+    document = parser.parseFragment(data)
+
+    document = treeAPIs.get("adapter", lambda x: x)(document)
+    output = list(Lint(treeAPIs["walker"](document)))
+
+    assert len(output) == 2
+    assert output[0]['type'] == 'StartTag'
+    assert output[1]['type'] == "EndTag"
+
+    attrs_out = output[0]['data']
+
+    assert list(attrs_out.items()) == [((None, 'a'), '1'),
+                                       ((None, 'refX'), '2'),
+                                       ((None, 'b'), '3'),
+                                       (('http://www.w3.org/XML/1998/namespace', 'lang'), '4'),
+                                       ((None, 'c'), '5')]
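These walker-level tests pin down the public shape of attributes after adjustment: keys are (namespace, name) tuples, refx is case-corrected to refX, xml:lang lands in the XML namespace, and everything stays in source order. A hedged sketch of inspecting that output directly, assuming the default etree treebuilder/walker and Python 3.7 or newer (ordered dicts):

# Hedged sketch of the walker output the tests above assert on.
import html5lib
from html5lib import getTreeWalker

fragment = html5lib.parseFragment("<svg a=1 refx=2 b=3 xml:lang=4 c=5>")
walker = getTreeWalker("etree")
start_tag = next(tok for tok in walker(fragment) if tok['type'] == 'StartTag')
# Keys are (namespace, local-name) tuples, e.g. (None, 'refX') and
# ('http://www.w3.org/XML/1998/namespace', 'lang'), in source order.
print(list(start_tag['data'].items()))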

html5lib/tests/tokenizer.py

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ def processDoctype(self, token):
 
     def processStartTag(self, token):
        self.outputTokens.append(["StartTag", token["name"],
-                                  dict(token["data"][::-1]), token["selfClosing"]])
+                                  token["data"], token["selfClosing"]])
 
     def processEmptyTag(self, token):
         if token["name"] not in constants.voidElements:
