Skip to content

Commit 699276b

Browse files
authored
Merge pull request #257 from gsnedders/det_encoding
Update encoding detection; r=nobody!
2 parents dce9d62 + fc9f63b commit 699276b

File tree

7 files changed

+137
-83
lines changed

7 files changed

+137
-83
lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the charset keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, likely_encoding, and default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/")) as f:
54-
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
54+
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/") as f:
65-
document = html5lib.parse(f, encoding=f.info().get_content_charset())
65+
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

html5lib/html5parser.py

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,17 @@
2828
)
2929

3030

31-
def parse(doc, treebuilder="etree", encoding=None,
32-
namespaceHTMLElements=True, scripting=False):
31+
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
3332
"""Parse a string or file-like object into a tree"""
3433
tb = treebuilders.getTreeBuilder(treebuilder)
3534
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
36-
return p.parse(doc, encoding=encoding, scripting=scripting)
35+
return p.parse(doc, **kwargs)
3736

3837

39-
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
40-
namespaceHTMLElements=True, scripting=False):
38+
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
4139
tb = treebuilders.getTreeBuilder(treebuilder)
4240
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
43-
return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting)
41+
return p.parseFragment(doc, container=container, **kwargs)
4442

4543

4644
def method_decorator_metaclass(function):
@@ -59,18 +57,13 @@ class HTMLParser(object):
5957
"""HTML parser. Generates a tree structure from a stream of (possibly
6058
malformed) HTML"""
6159

62-
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
63-
strict=False, namespaceHTMLElements=True, debug=False):
60+
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
6461
"""
6562
strict - raise an exception when a parse error is encountered
6663
6764
tree - a treebuilder class controlling the type of tree that will be
6865
returned. Built in treebuilders can be accessed through
6966
html5lib.treebuilders.getTreeBuilder(treeType)
70-
71-
tokenizer - a class that provides a stream of tokens to the treebuilder.
72-
This may be replaced for e.g. a sanitizer which converts some tags to
73-
text
7467
"""
7568

7669
# Raise an exception on the first error encountered
@@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
7972
if tree is None:
8073
tree = treebuilders.getTreeBuilder("etree")
8174
self.tree = tree(namespaceHTMLElements)
82-
self.tokenizer_class = tokenizer
8375
self.errors = []
8476

8577
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
8678
getPhases(debug).items()])
8779

88-
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
89-
parseMeta=True, useChardet=True, scripting=False, **kwargs):
80+
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
9081

9182
self.innerHTMLMode = innerHTML
9283
self.container = container
9384
self.scripting = scripting
94-
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
95-
parseMeta=parseMeta,
96-
useChardet=useChardet,
97-
parser=self, **kwargs)
85+
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
9886
self.reset()
9987

10088
try:
@@ -232,8 +220,7 @@ def normalizedTokens(self):
232220
for token in self.tokenizer:
233221
yield self.normalizeToken(token)
234222

235-
def parse(self, stream, encoding=None, parseMeta=True,
236-
useChardet=True, scripting=False):
223+
def parse(self, stream, *args, **kwargs):
237224
"""Parse a HTML document into a well-formed tree
238225
239226
stream - a filelike object or string containing the HTML to be parsed
@@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True,
245232
246233
scripting - treat noscript elements as if javascript was turned on
247234
"""
248-
self._parse(stream, innerHTML=False, encoding=encoding,
249-
parseMeta=parseMeta, useChardet=useChardet, scripting=scripting)
235+
self._parse(stream, False, None, *args, **kwargs)
250236
return self.tree.getDocument()
251237

252-
def parseFragment(self, stream, container="div", encoding=None,
253-
parseMeta=False, useChardet=True, scripting=False):
254-
# pylint:disable=unused-argument
238+
def parseFragment(self, stream, *args, **kwargs):
255239
"""Parse a HTML fragment into a well-formed tree fragment
256240
257241
container - name of the element we're setting the innerHTML property
@@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
266250
267251
scripting - treat noscript elements as if javascript was turned on
268252
"""
269-
self._parse(stream, True, container=container,
270-
encoding=encoding, scripting=scripting)
253+
self._parse(stream, True, *args, **kwargs)
271254
return self.tree.getFragment()
272255

273256
def parseError(self, errorcode="XXX-undefined-error", datavars=None):

html5lib/inputstream.py

Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
return b"".join(rv)
129129

130130

131-
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
131+
def HTMLInputStream(source, **kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source, http_client.HTTPResponse) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode = isinstance(source, text_type)
143143

144144
if isUnicode:
145-
if encoding is not None:
146-
raise TypeError("Cannot explicitly set an encoding with a unicode string")
145+
encodings = [x for x in kwargs if x.endswith("_encoding")]
146+
if encodings:
147+
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
147148

148-
return HTMLUnicodeInputStream(source)
149+
return HTMLUnicodeInputStream(source, **kwargs)
149150
else:
150-
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
151+
return HTMLBinaryInputStream(source, **kwargs)
151152

152153

153154
class HTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174
regardless of any BOM or later declaration (such as in a meta
174175
element)
175176
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178177
"""
179178

180179
if not utils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390
"""
392391

393-
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
392+
def __init__(self, source, override_encoding=None, transport_encoding=None,
393+
same_origin_parent_encoding=None, likely_encoding=None,
394+
default_encoding="windows-1252", useChardet=True):
394395
"""Initialises the HTMLInputStream.
395396
396397
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404
regardless of any BOM or later declaration (such as in a meta
404405
element)
405406
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408407
"""
409408
# Raw Stream - for unicode objects this will encode to utf-8 and set
410409
# self.charEncoding as appropriate
411410
self.rawStream = self.openStream(source)
412411

413412
HTMLUnicodeInputStream.__init__(self, self.rawStream)
414413

415-
self.charEncoding = (lookupEncoding(encoding), "certain")
416-
417414
# Encoding Information
418415
# Number of bytes to use when looking for a meta element with
419416
# encoding information
420417
self.numBytesMeta = 1024
421418
# Number of bytes to use when detecting encoding using chardet
422419
self.numBytesChardet = 100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding = "windows-1252"
420+
# Things from args
421+
self.override_encoding = override_encoding
422+
self.transport_encoding = transport_encoding
423+
self.same_origin_parent_encoding = same_origin_parent_encoding
424+
self.likely_encoding = likely_encoding
425+
self.default_encoding = default_encoding
425426

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0] is None):
428-
self.charEncoding = self.detectEncoding(parseMeta, chardet)
429-
assert self.charEncoding[0] is not None
427+
# Determine encoding
428+
self.charEncoding = self.determineEncoding(useChardet)
429+
assert self.charEncoding[0] is not None
430430

431431
# Call superclass
432432
self.reset()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454

455455
return stream
456456

457-
def detectEncoding(self, parseMeta=True, chardet=True):
458-
# First look for a BOM
457+
def determineEncoding(self, chardet=True):
458+
# BOMs take precedence over everything
459459
# This will also read past the BOM if present
460-
encoding = self.detectBOM()
461-
confidence = "certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
if encoding is None and parseMeta:
465-
encoding = self.detectEncodingMeta()
466-
confidence = "tentative"
460+
charEncoding = self.detectBOM(), "certain"
461+
if charEncoding[0] is not None:
462+
return charEncoding
463+
464+
# If we've been overridden, we've been overridden
465+
charEncoding = lookupEncoding(self.override_encoding), "certain"
466+
if charEncoding[0] is not None:
467+
return charEncoding
468+
469+
# Now check the transport layer
470+
charEncoding = lookupEncoding(self.transport_encoding), "certain"
471+
if charEncoding[0] is not None:
472+
return charEncoding
473+
474+
# Look for meta elements with encoding information
475+
charEncoding = self.detectEncodingMeta(), "tentative"
476+
if charEncoding[0] is not None:
477+
return charEncoding
478+
479+
# Parent document encoding
480+
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
481+
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
482+
return charEncoding
483+
484+
# "likely" encoding
485+
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
486+
if charEncoding[0] is not None:
487+
return charEncoding
488+
467489
# Guess with chardet, if available
468-
if encoding is None and chardet:
469-
confidence = "tentative"
490+
if chardet:
470491
try:
471492
from chardet.universaldetector import UniversalDetector
493+
except ImportError:
494+
pass
495+
else:
472496
buffers = []
473497
detector = UniversalDetector()
474498
while not detector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505
detector.close()
482506
encoding = lookupEncoding(detector.result['encoding'])
483507
self.rawStream.seek(0)
484-
except ImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
if encoding is None:
488-
confidence = "tentative"
489-
encoding = lookupEncoding(self.defaultEncoding)
508+
if encoding is not None:
509+
return encoding, "tentative"
510+
511+
# Try the default encoding
512+
charEncoding = lookupEncoding(self.default_encoding), "tentative"
513+
if charEncoding[0] is not None:
514+
return charEncoding
490515

491-
return encoding, confidence
516+
# Fallback to html5lib's default if even that hasn't worked
517+
return lookupEncoding("windows-1252"), "tentative"
492518

493519
def changeEncoding(self, newEncoding):
494520
assert self.charEncoding[1] != "certain"

html5lib/tests/test_encoding.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import os
44

5+
import pytest
6+
57
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
68
from html5lib import HTMLParser, inputstream
79

@@ -11,7 +13,7 @@ def test_basic_prescan_length():
1113
pad = 1024 - len(data) + 1
1214
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
1315
assert len(data) == 1024 # Sanity
14-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
16+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
1517
assert 'utf-8' == stream.charEncoding[0].name
1618

1719

@@ -20,14 +22,59 @@ def test_parser_reparse():
2022
pad = 10240 - len(data) + 1
2123
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
2224
assert len(data) == 10240 # Sanity
23-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
25+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
2426
assert 'windows-1252' == stream.charEncoding[0].name
2527
p = HTMLParser(namespaceHTMLElements=False)
2628
doc = p.parse(data, useChardet=False)
2729
assert 'utf-8' == p.documentEncoding
2830
assert doc.find(".//title").text == "Caf\u00E9"
2931

3032

33+
@pytest.mark.parametrize("expected,data,kwargs", [
34+
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
35+
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
36+
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
37+
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
38+
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
39+
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
40+
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
41+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
42+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
43+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
44+
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
45+
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
46+
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
47+
("windows-1252", b"", {}),
48+
])
49+
def test_parser_args(expected, data, kwargs):
50+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
51+
assert expected == stream.charEncoding[0].name
52+
p = HTMLParser()
53+
p.parse(data, useChardet=False, **kwargs)
54+
assert expected == p.documentEncoding
55+
56+
57+
@pytest.mark.parametrize("kwargs", [
58+
{"override_encoding": "iso-8859-2"},
59+
{"override_encoding": None},
60+
{"transport_encoding": "iso-8859-2"},
61+
{"transport_encoding": None},
62+
{"same_origin_parent_encoding": "iso-8859-2"},
63+
{"same_origin_parent_encoding": None},
64+
{"likely_encoding": "iso-8859-2"},
65+
{"likely_encoding": None},
66+
{"default_encoding": "iso-8859-2"},
67+
{"default_encoding": None},
68+
{"foo_encoding": "iso-8859-2"},
69+
{"foo_encoding": None},
70+
])
71+
def test_parser_args_raises(kwargs):
72+
with pytest.raises(TypeError) as exc_info:
73+
p = HTMLParser()
74+
p.parse("", useChardet=False, **kwargs)
75+
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
76+
77+
3178
def runParserEncodingTest(data, encoding):
3279
p = HTMLParser()
3380
assert p.documentEncoding is None
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):
3885

3986

4087
def runPreScanEncodingTest(data, encoding):
41-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
88+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
4289
encoding = encoding.lower().decode("ascii")
4390

4491
# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
55102
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
56103
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
57104

105+
58106
# pylint:disable=wrong-import-position
59107
try:
60108
import chardet # noqa

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy