diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index b0f14f39..14a494a7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -129,6 +129,17 @@ def reset(self):
 
         self.framesetOK = True
 
+    @property
+    def documentEncoding(self):
+        """The name of the character encoding
+        that was used to decode the input stream,
+        or :obj:`None` if that is not determined yet.
+
+        """
+        if not hasattr(self, 'tokenizer'):
+            return None
+        return self.tokenizer.stream.documentEncoding
+
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 004bdd4a..1fb38a2f 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object):
 
     _defaultChunkSize = 10240
 
+    documentEncoding = None  # No encoding involved for Unicode input.
+
     def __init__(self, source):
         """Initialises the HTMLInputStream.
 
@@ -413,6 +415,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Call superclass
         self.reset()
 
+    @property
+    def documentEncoding(self):
+        return self.charEncoding[0]
+
     def reset(self):
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index f314421d..fb713761 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
         self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 
+def test_unicode_input_encoding():
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    p.parse(b'', useChardet=False)
+    assert p.documentEncoding == 'iso8859-2'
+
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    p.parse('')
+    assert p.documentEncoding is None
+
+    p = HTMLParser()
+    assert p.documentEncoding is None
+    try:
+        p.parse('', encoding='latin3')
+    except TypeError:
+        pass
+    else:
+        assert 0, 'Expected TypeError'
+    assert p.documentEncoding is None
+
+
 def runParserEncodingTest(data, encoding):
     p = HTMLParser()
+    assert p.documentEncoding is None
     p.parse(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
-    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
+    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
 
 
 def runPreScanEncodingTest(data, encoding):

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy