diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..674fabe2 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type -from six.moves import http_client import codecs import re @@ -119,22 +118,23 @@ def _readFromBuffer(self, bytes): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): - if isinstance(source, http_client.HTTPResponse): - # Work around Python bug #20007: read(0) closes the connection. + if hasattr(source, "read"): + # Do not use .read(0) because of Python bug #20007 # http://bugs.python.org/issue20007 - isUnicode = False - elif hasattr(source, "read"): - isUnicode = isinstance(source.read(0), text_type) + firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize) + isUnicode = isinstance(firstChunk, text_type) else: isUnicode = isinstance(source, text_type) + firstChunk = None if isUnicode: if encoding is not None: raise TypeError("Cannot explicitly set an encoding with a unicode string") - return HTMLUnicodeInputStream(source) + return HTMLUnicodeInputStream(source, firstChunk) else: - return HTMLBinaryInputStream(source, encoding, parseMeta, chardet) + return HTMLBinaryInputStream(source, firstChunk, encoding, parseMeta, + chardet) class HTMLUnicodeInputStream(object): @@ -147,7 +147,7 @@ class HTMLUnicodeInputStream(object): _defaultChunkSize = 10240 - def __init__(self, source): + def __init__(self, source, firstChunk=""): """Initialises the HTMLInputStream. 
HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -163,6 +163,7 @@ def __init__(self, source): parseMeta - Look for a <meta> element containing encoding information """ + # XXX do something with firstChunk # Craziness if len("\U0010FFFF") == 1: @@ -378,7 +379,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): """ - def __init__(self, source, encoding=None, parseMeta=True, chardet=True): + def __init__(self, source, firstChunk=b"", encoding=None, + parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -394,6 +396,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): parseMeta - Look for a <meta> element containing encoding information """ + # XXX do something with firstChunk + # Raw Stream - for unicode objects this will encode to utf-8 and set # self.charEncoding as appropriate self.rawStream = self.openStream(source) diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 2a876c1d..b769c6e1 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,3 +1,5 @@ +# coding: utf8 + from __future__ import absolute_import, division, unicode_literals from . 
import support # flake8: noqa @@ -6,6 +8,7 @@ from io import BytesIO from six.moves import http_client +from six.moves.urllib.response import addinfourl from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) @@ -156,6 +159,25 @@ def test_position2(self): self.assertEqual(stream.char(), "d") self.assertEqual(stream.position(), (2, 1)) + def test_non_seekable_stream(self): + class Stream(object): + def __init__(self, data): + self.data = data + + def read(self, n=None): + if n is None: + data = self.data + self.data = b'' + return data + else: + data = self.data[:n] + self.data = self.data[n:] + return data + + # Fails when firstChunk is ignored + stream = HTMLInputStream(Stream(b"Test")) + self.assertEqual(stream.charsUntil(" "), "Test") + def test_python_issue_20007(self): """ Make sure we have a work-around for Python bug #20007 @@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None): stream = HTMLInputStream(source) self.assertEqual(stream.charsUntil(" "), "Text") + def test_python_issue_20007_addinfourl(self): + """ + Same as above, but the source is not necessarily an instance + of HTTPResponse. + """ + class FakeSocket(object): + def makefile(self, _mode, _bufsize=None): + return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") + + source = http_client.HTTPResponse(FakeSocket()) + source.begin() + try: + source = addinfourl(source, None, None) + except AttributeError: + # Fails on Python 2.x where HTTPResponse does not have .readline() + # Apparently, addinfourl is only used with HTTPResponse on 3.x + pass + else: + stream = HTMLInputStream(source) + self.assertEqual(stream.charsUntil(" "), "Text") def buildTestSuite(): return unittest.defaultTestLoader.loadTestsFromName(__name__)
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: