diff --git a/.pylintrc b/.pylintrc index ea74d5db..c60b8510 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,7 +3,7 @@ ignore=tests [MESSAGES CONTROL] # messages up to fixme should probably be fixed somehow -disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda +disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda,bad-option-value,star-args,abstract-class-little-used,abstract-class-not-used [FORMAT] max-line-length=139 diff --git a/.travis.yml b/.travis.yml index 94bb87e7..7f9aecd5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,6 +27,7 @@ script: - if [[ $TRAVIS_PYTHON_VERSION == pypy* ]]; then py.test; fi - if [[ $TRAVIS_PYTHON_VERSION != pypy* ]]; then coverage run -m pytest; fi - bash flake8-run.sh + - pylint --rcfile=.pylintrc html5lib after_script: - python debug-info.py diff --git a/debug-info.py b/debug-info.py index f93fbdbe..c213f7cc 100644 --- a/debug-info.py +++ b/debug-info.py @@ -1,4 +1,4 @@ -from __future__ import print_function, unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals import platform import sys diff --git a/flake8-run.sh b/flake8-run.sh index d9264946..e8652e9e 100755 --- a/flake8-run.sh +++ b/flake8-run.sh @@ -5,5 +5,6 @@ if [[ ! -x $(which flake8) ]]; then exit 1 fi +flake8 --version flake8 `dirname $0` exit $? diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py index 79f2331e..a6787ac4 100644 --- a/html5lib/_inputstream.py +++ b/html5lib/_inputstream.py @@ -238,8 +238,9 @@ def position(self): return (line + 1, col) def char(self): - """ Read one character from the stream or queue if available. Return - EOF when EOF is reached. + """Read one character from the stream or queue if available. + + Return EOF when EOF is reached. """ # Read a new chunk from the input stream if necessary if self.chunkOffset >= self.chunkSize: @@ -318,7 +319,7 @@ def characterErrorsUCS2(self, data): self.errors.append("invalid-codepoint") def charsUntil(self, characters, opposite=False): - """ Returns a string of characters from the stream up to but not + """Returns a string of characters from the stream up to but not including any character in 'characters' or EOF. 'characters' must be a container that supports the 'in' method and iteration over its characters. @@ -330,7 +331,7 @@ def charsUntil(self, characters, opposite=False): except KeyError: if __debug__: for c in characters: - assert(ord(c) < 128) + assert ord(c) < 128 regex = "".join(["\\x%02x" % ord(c) for c in characters]) if not opposite: regex = "^%s" % regex @@ -449,7 +450,7 @@ def openStream(self, source): try: stream.seek(stream.tell()) - except: # pylint:disable=bare-except + except Exception: # pylint: disable=broad-except stream = BufferedStream(stream) return stream @@ -567,8 +568,7 @@ def detectBOM(self): return None def detectEncodingMeta(self): - """Report the encoding declared by the meta element - """ + """Report the encoding declared by the meta element.""" buffer = self.rawStream.read(self.numBytesMeta) assert isinstance(buffer, bytes) parser = EncodingParser(buffer) @@ -686,10 +686,12 @@ def jumpTo(self, bytes): class EncodingParser(object): - """Mini parser for detecting character encoding from meta elements""" + """Mini parser for detecting character encoding from meta elements.""" def __init__(self, data): - """string - the data to work on for encoding detection""" + """Constructor. + + data - the data to work on for encoding detection""" self.data = EncodingBytes(data) self.encoding = None diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 6078f66a..4cf46c2a 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -19,7 +19,7 @@ class HTMLTokenizer(object): - """ This class takes care of tokenizing HTML. + """This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. @@ -47,7 +47,7 @@ def __init__(self, stream, parser=None, **kwargs): super(HTMLTokenizer, self).__init__() def __iter__(self): - """ This is where the magic happens. + """This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token @@ -215,8 +215,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) def processEntityInAttribute(self, allowedChar): - """This method replaces the need for "entityInAttributeValueState". - """ + """This method replaces the need for "entityInAttributeValueState".""" self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) def emitCurrentToken(self): @@ -1686,8 +1685,7 @@ def bogusDoctypeState(self): self.stream.unget(data) self.tokenQueue.append(self.currentToken) self.state = self.dataState - else: - pass + return True def cdataSectionState(self): diff --git a/html5lib/_utils.py b/html5lib/_utils.py index 03f0dab7..8cfe5ee6 100644 --- a/html5lib/_utils.py +++ b/html5lib/_utils.py @@ -30,14 +30,14 @@ # We need this with u"" because of http://bugs.jython.org/issue2039 _x = eval('u"\\uD800"') # pylint:disable=eval-used assert isinstance(_x, text_type) -except: # pylint:disable=bare-except +except Exception: # pylint: disable=broad-except supports_lone_surrogates = False else: supports_lone_surrogates = True class MethodDispatcher(dict): - """Dict with 2 special properties: + """Dict with 2 special properties. On initiation, keys that are lists, sets or tuples are converted to multiple keys so accessing any one of the items in the original diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index b5ddcb93..292af95e 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -705,7 +705,7 @@ class Filter(base.Filter): - """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" + """Sanitization of XHTML+MathML+SVG and of inline style attributes.""" def __init__(self, source, allowed_elements=allowed_elements, diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 2abd63e4..bb500811 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -55,10 +55,11 @@ def __new__(meta, classname, bases, classDict): class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly - malformed) HTML""" + malformed) HTML""" def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): - """ + """Constructor. + strict - raise an exception when a parse error is encountered tree - a treebuilder class controlling the type of tree that will be @@ -108,10 +109,9 @@ def reset(self): self.tokenizer.state = self.tokenizer.rawtextState elif self.innerHTML == 'plaintext': self.tokenizer.state = self.tokenizer.plaintextState - else: - # state already is data state - # self.tokenizer.state = self.tokenizer.dataState - pass + # else state already is data state + # i.e. self.tokenizer.state = self.tokenizer.dataState + self.phase = self.phases["beforeHtml"] self.phase.insertHtmlElement() self.resetInsertionMode() @@ -262,7 +262,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None): raise ParseError(E[errorcode] % datavars) def normalizeToken(self, token): - """ HTML5 specific normalizations to the token stream """ + """HTML5 specific normalizations to the token stream.""" if token["type"] == tokenTypes["StartTag"]: raw = token["data"] @@ -358,10 +358,7 @@ def log(function): def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: token = args[0] - try: - info = {"type": type_names[token['type']]} - except: - raise + info = {"type": type_names[token['type']]} if token['type'] in tagTokenTypes: info["name"] = token['name'] @@ -383,8 +380,7 @@ def getMetaclass(use_metaclass, metaclass_func): # pylint:disable=unused-argument class Phase(with_metaclass(getMetaclass(debug, log))): - """Base class for helper object that implements each phase of processing - """ + """Base class for helper object that implements each phase of processing.""" def __init__(self, parser, tree): self.parser = parser @@ -1285,7 +1281,7 @@ def startTagSvg(self, token): token["selfClosingAcknowledged"] = True def startTagMisplaced(self, token): - """ Elements that should be children of other elements that have a + """Elements that should be children of other elements that have a different insertion mode; here they are ignored "caption", "col", "colgroup", "frame", "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", @@ -2730,4 +2726,3 @@ def impliedTagToken(name, type="EndTag", attributes=None, class ParseError(Exception): """Error in parsed document""" - pass diff --git a/html5lib/serializer.py b/html5lib/serializer.py index 103dd206..f3786ae9 100644 --- a/html5lib/serializer.py +++ b/html5lib/serializer.py @@ -166,14 +166,14 @@ def __init__(self, **kwargs): self.strict = False def encode(self, string): - assert(isinstance(string, text_type)) + assert isinstance(string, text_type) if self.encoding: return string.encode(self.encoding, "htmlentityreplace") else: return string def encodeStrict(self, string): - assert(isinstance(string, text_type)) + assert isinstance(string, text_type) if self.encoding: return string.encode(self.encoding, "strict") else: @@ -331,4 +331,3 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): class SerializeError(Exception): """Error in serialized tree""" - pass diff --git a/html5lib/tests/conftest.py b/html5lib/tests/conftest.py index ce93eff6..bfacd7e7 100644 --- a/html5lib/tests/conftest.py +++ b/html5lib/tests/conftest.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import, division, unicode_literals + import os.path import pkg_resources diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index dab65c1c..d091cdae 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -49,7 +49,8 @@ } try: - import lxml.etree as lxml # noqa + import lxml.etree as lxml + del lxml except ImportError: treeTypes['lxml'] = None else: @@ -60,7 +61,8 @@ # Genshi impls try: - import genshi # noqa + import genshi + del genshi except ImportError: treeTypes["genshi"] = None else: @@ -132,7 +134,7 @@ def normaliseOutput(self, data): def convert(stripChars): def convertData(data): - """convert the output of str(document) to the format used in the testcases""" + """Convert the output of str(document) to the format used in the testcases""" data = data.split("\n") rv = [] for line in data: diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 9a411c77..de59ef54 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals import os @@ -105,7 +105,8 @@ def test_encoding(): # pylint:disable=wrong-import-position try: - import chardet # noqa + import chardet + del chardet except ImportError: print("chardet not found, skipping chardet tests") else: diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index bcc0bf48..ad5349d7 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -4,11 +4,15 @@ import io -from . import support # noqa +from . import support + from html5lib.constants import namespaces, tokenTypes from html5lib import parse, parseFragment, HTMLParser +# above import has side-effects; mark it as used and del it +del support + # tests that aren't autogenerated from text files def test_assertDoctypeCloneable(): diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index 9333286e..f7cd0037 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -74,7 +74,7 @@ def _convertAttrib(self, attribs): attrs = {} for attrib in attribs: name = (attrib["namespace"], attrib["name"]) - assert(name not in attrs) + assert name not in attrs attrs[name] = attrib["value"] return attrs @@ -93,7 +93,7 @@ def runSerializerTest(input, expected, options): encoding = options.get("encoding", None) if encoding: - expected = list(map(lambda x: x.encode(encoding), expected)) + expected = list(x.encode(encoding) for x in expected) result = serialize_html(input, options) if len(expected) == 1: diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 27c39538..f7f6a153 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,7 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # noqa - import codecs import sys from io import BytesIO, StringIO @@ -11,10 +9,15 @@ import six from six.moves import http_client, urllib +from . import support + from html5lib._inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) from html5lib._utils import supports_lone_surrogates +# above import has side-effects; mark it as used and del it +del support + def test_basic(): s = b"abc" @@ -182,8 +185,8 @@ def test_position2(): def test_python_issue_20007(): - """ - Make sure we have a work-around for Python bug #20007 + """Ensure we have a work-around for Python bug #20007. + http://bugs.python.org/issue20007 """ class FakeSocket(object): @@ -198,8 +201,8 @@ def makefile(self, _mode, _bufsize=None): def test_python_issue_20007_b(): - """ - Make sure we have a work-around for Python bug #20007 + """Ensure we have a work-around for Python bug #20007 (part b). + http://bugs.python.org/issue20007 """ if six.PY2: diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 67fc89e5..061b6639 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -50,7 +50,7 @@ def test_all_tokens(): def set_attribute_on_first_child(docfrag, name, value, treeName): - """naively sets an attribute on the first child of the document + """Naively sets an attribute on the first child of the document fragment passed in""" setter = {'ElementTree': lambda d: d[0].set, 'DOM': lambda d: d.firstChild.setAttribute} @@ -62,7 +62,7 @@ def set_attribute_on_first_child(docfrag, name, value, treeName): def runTreewalkerEditTest(intext, expected, attrs_to_add, tree): - """tests what happens when we add attributes to the intext""" + """Test what happens when we add attributes to the intext""" treeName, treeClass = tree if treeClass is None: pytest.skip("Treebuilder not loaded") diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index 1440a722..6649878e 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -132,7 +132,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, def unescape(test): def decode(inp): - """Decode \\uXXXX escapes + r"""Decode \\uXXXX escapes This decodes \\uXXXX escapes, possibly into non-BMP characters when two surrogate character escapes are adjacent to each other. diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py index 4f978466..290736bb 100644 --- a/html5lib/treeadapters/__init__.py +++ b/html5lib/treeadapters/__init__.py @@ -2,11 +2,9 @@ from . import sax -__all__ = ["sax"] - try: - from . import genshi # noqa + from . import genshi except ImportError: - pass + __all__ = ("sax", ) else: - __all__.append("genshi") + __all__ = ("sax", "genshi") diff --git a/html5lib/treeadapters/genshi.py b/html5lib/treeadapters/genshi.py index 04e316df..0e955b46 100644 --- a/html5lib/treeadapters/genshi.py +++ b/html5lib/treeadapters/genshi.py @@ -40,8 +40,7 @@ def to_genshi(walker): yield DOCTYPE, (token["name"], token["publicId"], token["systemId"]), (None, -1, -1) - else: - pass # FIXME: What to do? + # FIXME: What to do if type is not known? if text: yield TEXT, "".join(text), (None, -1, -1) diff --git a/html5lib/treeadapters/sax.py b/html5lib/treeadapters/sax.py index ad47df95..17ba0cf4 100644 --- a/html5lib/treeadapters/sax.py +++ b/html5lib/treeadapters/sax.py @@ -11,7 +11,7 @@ def to_sax(walker, handler): - """Call SAX-like content handler based on treewalker walker""" + """Call SAX-like content handler based on treewalker walker.""" handler.startDocument() for prefix, namespace in prefix_mapping.items(): handler.startPrefixMapping(prefix, namespace) diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py index e2328847..4dd3852b 100644 --- a/html5lib/treebuilders/__init__.py +++ b/html5lib/treebuilders/__init__.py @@ -34,7 +34,7 @@ def getTreeBuilder(treeType, implementation=None, **kwargs): - """Get a TreeBuilder class for various types of tree with built-in support + """Get a TreeBuilder class for various types of tree with built-in support. treeType - the name of the tree type required (case-insensitive). Supported values are: diff --git a/html5lib/treebuilders/base.py b/html5lib/treebuilders/base.py index a4b2792a..c6169ab6 100644 --- a/html5lib/treebuilders/base.py +++ b/html5lib/treebuilders/base.py @@ -52,8 +52,7 @@ def __repr__(self): return "<%s>" % (self.name) def appendChild(self, node): - """Insert node as a child of the current node - """ + """Insert node as a child of the current node.""" raise NotImplementedError def insertText(self, data, insertBefore=None): @@ -69,8 +68,7 @@ def insertBefore(self, node, refNode): raise NotImplementedError def removeChild(self, node): - """Remove node from the children of the current node - """ + """Remove node from the children of the current node.""" raise NotImplementedError def reparentChildren(self, newParent): @@ -90,8 +88,7 @@ def cloneNode(self): raise NotImplementedError def hasContent(self): - """Return true if the node has children or text, false otherwise - """ + """Return true if the node has children or text, false otherwise.""" raise NotImplementedError @@ -367,17 +364,18 @@ def generateImpliedEndTags(self, exclude=None): self.generateImpliedEndTags(exclude) def getDocument(self): - "Return the final tree" + """Return the final tree.""" return self.document def getFragment(self): - "Return the final fragment" + """Return the final fragment.""" # assert self.innerHTML fragment = self.fragmentClass() self.openElements[0].reparentChildren(fragment) return fragment - def testSerializer(self, node): - """Serialize the subtree of node in the format required by unit tests - node - the node from which to start serializing""" + def testSerializer(self, element): + """Serialize the subtree of node in the format required by unit tests. + + element - the node from which to start serializing""" raise NotImplementedError diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 908820c0..e8b6bc56 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -77,7 +77,7 @@ def serializeElement(element, indent=0): while next_element is not None: serializeElement(next_element, indent + 2) next_element = next_element.getnext() - elif isinstance(element, str) or isinstance(element, bytes): + elif isinstance(element, (str, bytes)): # Text in a fragment assert isinstance(element, str) or sys.version_info[0] == 2 rv.append("|%s\"%s\"" % (' ' * indent, element)) diff --git a/parse.py b/parse.py index 3e65c330..c849c1f0 100755 --- a/parse.py +++ b/parse.py @@ -3,6 +3,8 @@ Parse a document to a tree, with optional profiling """ +from __future__ import absolute_import, division, unicode_literals, print_function + import sys import traceback @@ -33,7 +35,7 @@ def parse(): if contentType: (mediaType, params) = cgi.parse_header(contentType) encoding = params.get('charset') - except: + except Exception: pass elif f == '-': f = sys.stdin @@ -94,7 +96,7 @@ def parse(): def run(parseMethod, f, encoding, scripting): try: document = parseMethod(f, override_encoding=encoding, scripting=scripting) - except: + except Exception: document = None traceback.print_exc() return document @@ -127,7 +129,7 @@ def printOutput(parser, document, opts): for opt in serializer.HTMLSerializer.options: try: kwargs[opt] = getattr(opts, opt) - except: + except AttributeError: pass if not kwargs['quote_char']: del kwargs['quote_char'] diff --git a/requirements-test.txt b/requirements-test.txt index 40df78d4..64e9cf75 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,20 @@ -r requirements.txt flake8<3.0 +flake8-docstrings +flake8-string-format +flake8-future-import +flake8-debugger +flake8-print +hacking +flake8-tuple +flake8-dodgy +ebb-lint ; python_version > '2.6' + +pylint ; python_version > '2.6' +pylint<1.4 ; python_version <= '2.6' +astroid<1.3.6 ; python_version <= '2.6' + pytest pytest-expect>=1.1,<2.0 mock diff --git a/setup.py b/setup.py index 00fee241..cd94afdc 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from __future__ import print_function +from __future__ import absolute_import, division, print_function, unicode_literals import ast import codecs @@ -83,7 +83,7 @@ def default_environment(): with open(join(here, "html5lib", "__init__.py"), "rb") as init_file: t = ast.parse(init_file.read(), filename="__init__.py", mode="exec") assert isinstance(t, ast.Module) - assignments = filter(lambda x: isinstance(x, ast.Assign), t.body) + assignments = (x for x in t.body if isinstance(x, ast.Assign)) for a in assignments: if (len(a.targets) == 1 and isinstance(a.targets[0], ast.Name) and diff --git a/tox.ini b/tox.ini index da64de71..24fcaef9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,6 +4,15 @@ envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional} [testenv] deps = flake8<3.0 + flake8-docstrings + flake8-string-format + flake8-future-import + flake8-debugger + flake8-print + hacking + flake8-tuple + flake8-dodgy + ebb-lint ; python_version > '2.6' pytest pytest-expect>=1.1,<2.0 mock @@ -15,3 +24,8 @@ deps = commands = {envbindir}/py.test {toxinidir}/flake8-run.sh + +[flake8] +ignore = D1,D202,D204,D205,D208,D209,D400,D401,FI13,FI50,FI51,FI53,FI54,H101,H301,H304,H306,H403,H405,L101,L102,L201,L202,L204,L205,T003 +min-version = 2.6 +require-code = True diff --git a/utils/entities.py b/utils/entities.py index 6dccf5f0..6d91587a 100644 --- a/utils/entities.py +++ b/utils/entities.py @@ -1,3 +1,5 @@ +from __future__ import absolute_import, division, unicode_literals, print_function + import json import html5lib diff --git a/utils/spider.py b/utils/spider.py index 3a325888..2cdc0aaf 100644 --- a/utils/spider.py +++ b/utils/spider.py @@ -1,17 +1,19 @@ #!/usr/bin/env python -"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree +"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree. usage: import spider s = spider.Spider() s.spider("http://www.google.com", maxURLs=100) """ +from __future__ import absolute_import, division, unicode_literals, print_function import urllib.request import urllib.error import urllib.parse import urllib.robotparser -import md5 + +from hashlib import md5 import httplib2 @@ -46,7 +48,7 @@ def parse(self, content): p = html5lib.HTMLParser(tree=etree.TreeBuilder) try: tree = p.parse(content) - except: + except Exception: self.buggyURLs.add(self.currentURL) failed = True print("BUGGY:", self.currentURL) @@ -57,7 +59,7 @@ def parse(self, content): def loadURL(self, url): resp, content = self.http.request(url, "GET") self.currentURL = url - digest = md5.md5(content).hexdigest() + digest = md5(content).hexdigest() if digest in self.contentDigest: content = None self.visitedURLs.add(url) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy