From a5e861a5041c2c18893a1f916b5bd446b2a5bc06 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 31 Dec 2013 10:22:51 +0100 Subject: [PATCH 1/2] WIP More general fix for #127 with addinfourl See #134. --- html5lib/inputstream.py | 25 ++++++++++++--------- html5lib/tests/test_stream.py | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..339005e9 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type -from six.moves import http_client import codecs import re @@ -119,22 +118,24 @@ def _readFromBuffer(self, bytes): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): - if isinstance(source, http_client.HTTPResponse): - # Work around Python bug #20007: read(0) closes the connection. + if hasattr(source, "read"): + # Do no use .read(0) because of Python bug #20007 # http://bugs.python.org/issue20007 - isUnicode = False - elif hasattr(source, "read"): - isUnicode = isinstance(source.read(0), text_type) + firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize) + print(firstChunk) + isUnicode = isinstance(firstChunk, text_type) else: isUnicode = isinstance(source, text_type) + firstChunk = "" if isUnicode else b"" if isUnicode: if encoding is not None: raise TypeError("Cannot explicitly set an encoding with a unicode string") - return HTMLUnicodeInputStream(source) + return HTMLUnicodeInputStream(source, firstChunk) else: - return HTMLBinaryInputStream(source, encoding, parseMeta, chardet) + return HTMLBinaryInputStream( + source, firstChunk, encoding, parseMeta, chardet) class HTMLUnicodeInputStream(object): @@ -147,7 +148,7 @@ class HTMLUnicodeInputStream(object): _defaultChunkSize = 10240 - def __init__(self, source): + def __init__(self, source, firstChunk=""): """Initialises the HTMLInputStream. 
HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -163,6 +164,7 @@ def __init__(self, source): parseMeta - Look for a element containing encoding information """ + # XXX do something with firstChunk # Craziness if len("\U0010FFFF") == 1: @@ -378,7 +380,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): """ - def __init__(self, source, encoding=None, parseMeta=True, chardet=True): + def __init__(self, source, firstChunk=b"", encoding=None, + parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -394,6 +397,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): parseMeta - Look for a element containing encoding information """ + # XXX do something with firstChunk + # Raw Stream - for unicode objects this will encode to utf-8 and set # self.charEncoding as appropriate self.rawStream = self.openStream(source) diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 2a876c1d..b769c6e1 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,3 +1,5 @@ +# coding: utf8 + from __future__ import absolute_import, division, unicode_literals from . 
import support # flake8: noqa @@ -6,6 +8,7 @@ from io import BytesIO from six.moves import http_client +from six.moves.urllib.response import addinfourl from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) @@ -156,6 +159,25 @@ def test_position2(self): self.assertEqual(stream.char(), "d") self.assertEqual(stream.position(), (2, 1)) + def test_non_seekable_stream(self): + class Stream(object): + def __init__(self, data): + self.data = data + + def read(self, n=None): + if n is None: + data = self.data + self.data = b'' + return data + else: + data = self.data[:n] + self.data = self.data[n:] + return data + + # Fails when firstChunk is ignored + stream = HTMLInputStream(Stream(b"Test")) + self.assertEqual(stream.charsUntil(" "), "Test") + def test_python_issue_20007(self): """ Make sure we have a work-around for Python bug #20007 @@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None): stream = HTMLInputStream(source) self.assertEqual(stream.charsUntil(" "), "Text") + def test_python_issue_20007_addinfourl(self): + """ + Same as above, but the source is not necessarily an instance + of HTTPResponse. + """ + class FakeSocket(object): + def makefile(self, _mode, _bufsize=None): + return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") + + source = http_client.HTTPResponse(FakeSocket()) + source.begin() + try: + source = addinfourl(source, None, None) + except AttributeError: + # Fails on Python 2.x where HTTPResponse does not have .readline() + # Apparently, addinfourl is only used with HTTPResponse on 3.x + pass + else: + stream = HTMLInputStream(source) + self.assertEqual(stream.charsUntil(" "), "Text") def buildTestSuite(): return unittest.defaultTestLoader.loadTestsFromName(__name__) From b4861c60184bf73cc40774888fd01cbb5b3ddcef Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 31 Dec 2013 17:09:27 +0100 Subject: [PATCH 2/2] Address some review comments. 
--- html5lib/inputstream.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 339005e9..674fabe2 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -119,14 +119,13 @@ def _readFromBuffer(self, bytes): def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): if hasattr(source, "read"): - # Do no use .read(0) because of Python bug #20007 + # Do not use .read(0) because of Python bug #20007 # http://bugs.python.org/issue20007 firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize) - print(firstChunk) isUnicode = isinstance(firstChunk, text_type) else: isUnicode = isinstance(source, text_type) - firstChunk = "" if isUnicode else b"" + firstChunk = None if isUnicode: if encoding is not None: @@ -134,8 +133,8 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): return HTMLUnicodeInputStream(source, firstChunk) else: - return HTMLBinaryInputStream( - source, firstChunk, encoding, parseMeta, chardet) + return HTMLBinaryInputStream(source, firstChunk, encoding, parseMeta, + chardet) class HTMLUnicodeInputStream(object): pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy