From 9aab9221cbc1ea301e8da5096e02653b58947e78 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 30 Nov 2013 23:01:43 +0000 Subject: [PATCH 1/2] Add a usedEncoding method to HTML5Parser, fix #121 --- html5lib/html5parser.py | 10 ++++++++++ html5lib/inputstream.py | 6 ++++++ html5lib/tests/test_encoding.py | 25 ++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index b0f14f39..f0121a4b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -129,6 +129,16 @@ def reset(self): self.framesetOK = True + def usedEncoding(self): + """Return the name of the character encoding + that was used to decode the input stream, + or :obj:`None` if that is not determined yet. + + """ + if not hasattr(self, 'tokenizer'): + return None + return self.tokenizer.stream.usedEncoding() + def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 004bdd4a..0275c0b8 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -175,6 +175,9 @@ def __init__(self, source): self.reset() + def usedEncoding(self): + return None # No encoding involved for Unicode input. + def reset(self): self.chunk = "" self.chunkSize = 0 @@ -413,6 +416,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): # Call superclass self.reset() + def usedEncoding(self): + return self.charEncoding[0] + def reset(self): self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, 'replace') diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index f314421d..6a1a6d0d 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -26,12 +26,35 @@ def test_codec_name_d(self): self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252") +def test_unicode_input_encoding(): + p = HTMLParser() + assert p.usedEncoding() is None + p.parse(b'', useChardet=False) + assert p.usedEncoding() == 'iso8859-2' + + p = HTMLParser() + assert p.usedEncoding() is None + p.parse('') + assert p.usedEncoding() is None + + p = HTMLParser() + assert p.usedEncoding() is None + try: + p.parse('', encoding='latin3') + except TypeError: + pass + else: + assert 0, 'Expected TypeError' + assert p.usedEncoding() is None + + def runParserEncodingTest(data, encoding): p = HTMLParser() + assert p.usedEncoding() is None p.parse(data, useChardet=False) encoding = encoding.lower().decode("ascii") - assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0]) + assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding()) def runPreScanEncodingTest(data, encoding): From 722bfd36028fc5dd65babb266499e3b0b1bb770b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 2 Dec 2013 14:44:02 +0000 Subject: [PATCH 2/2] Rename the usedEncoding method to documentEncoding, and make it a property. --- html5lib/html5parser.py | 7 ++++--- html5lib/inputstream.py | 8 ++++---- html5lib/tests/test_encoding.py | 16 ++++++++-------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index f0121a4b..14a494a7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -129,15 +129,16 @@ def reset(self): self.framesetOK = True - def usedEncoding(self): - """Return the name of the character encoding + @property + def documentEncoding(self): + """The name of the character encoding that was used to decode the input stream, or :obj:`None` if that is not determined yet. """ if not hasattr(self, 'tokenizer'): return None - return self.tokenizer.stream.usedEncoding() + return self.tokenizer.stream.documentEncoding def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 0275c0b8..1fb38a2f 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -142,6 +142,8 @@ class HTMLUnicodeInputStream(object): _defaultChunkSize = 10240 + documentEncoding = None # No encoding involved for Unicode input. + def __init__(self, source): """Initialises the HTMLInputStream. @@ -175,9 +177,6 @@ def __init__(self, source): self.reset() - def usedEncoding(self): - return None # No encoding involved for Unicode input. - def reset(self): self.chunk = "" self.chunkSize = 0 @@ -416,7 +415,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): # Call superclass self.reset() - def usedEncoding(self): + @property + def documentEncoding(self): return self.charEncoding[0] def reset(self): diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 6a1a6d0d..fb713761 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -28,33 +28,33 @@ def test_codec_name_d(self): def test_unicode_input_encoding(): p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None p.parse(b'', useChardet=False) - assert p.usedEncoding() == 'iso8859-2' + assert p.documentEncoding == 'iso8859-2' p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None p.parse('') - assert p.usedEncoding() is None + assert p.documentEncoding is None p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None try: p.parse('', encoding='latin3') except TypeError: pass else: assert 0, 'Expected TypeError' - assert p.usedEncoding() is None + assert p.documentEncoding is None def runParserEncodingTest(data, encoding): p = HTMLParser() - assert p.usedEncoding() is None + assert p.documentEncoding is None p.parse(data, useChardet=False) encoding = encoding.lower().decode("ascii") - assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding()) + assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding) def runPreScanEncodingTest(data, encoding): pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy