diff --git a/.prospector.yaml b/.prospector.yaml
new file mode 100644
index 00000000..7e8efe1a
--- /dev/null
+++ b/.prospector.yaml
@@ -0,0 +1,21 @@
+strictness: veryhigh
+doc-warnings: false
+test-warnings: false
+
+max-line-length: 139
+
+requirements:
+  - requirements.txt
+  - requirements-test.txt
+  - requirements-optional.txt
+
+ignore-paths:
+  - parse.py
+  - utils/
+
+python-targets:
+  - 2
+  - 3
+
+mccabe:
+  run: false
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 00000000..ea74d5db
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,10 @@
+[MASTER]
+ignore=tests
+
+[MESSAGES CONTROL]
+# messages up to fixme should probably be fixed somehow
+disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda
+
+[FORMAT]
+max-line-length=139
+single-line-if-stmt=no
diff --git a/flake8-run.sh b/flake8-run.sh
index 685ec6ab..d9264946 100755
--- a/flake8-run.sh
+++ b/flake8-run.sh
@@ -5,8 +5,5 @@ if [[ ! -x $(which flake8) ]]; then
     exit 1
 fi
 
-find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501
-flake1=$?
-flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py
-flake2=$?
-exit $[$flake1 || $flake2]
+flake8 `dirname $0`
+exit $?
diff --git a/html5lib/constants.py b/html5lib/constants.py
index 2244933c..df1f061e 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -2819,7 +2819,6 @@
     0x0d: "\u000D",
     0x80: "\u20AC",
     0x81: "\u0081",
-    0x81: "\u0081",
     0x82: "\u201A",
     0x83: "\u0192",
     0x84: "\u201E",
diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
index caddd318..7f81c0d1 100644
--- a/html5lib/filters/sanitizer.py
+++ b/html5lib/filters/sanitizer.py
@@ -765,15 +765,15 @@ def sanitize_token(self, token):
             if ((namespace, name) in self.allowed_elements or
                     (namespace is None and
                      (namespaces["html"], name) in self.allowed_elements)):
-                return self.allowed_token(token, token_type)
+                return self.allowed_token(token)
             else:
-                return self.disallowed_token(token, token_type)
+                return self.disallowed_token(token)
         elif token_type == "Comment":
             pass
         else:
             return token
 
-    def allowed_token(self, token, token_type):
+    def allowed_token(self, token):
         if "data" in token:
             attrs = token["data"]
             attr_names = set(attrs.keys())
@@ -823,7 +823,8 @@ def allowed_token(self, token, token_type):
             token["data"] = attrs
         return token
 
-    def disallowed_token(self, token, token_type):
+    def disallowed_token(self, token):
+        token_type = token["type"]
         if token_type == "EndTag":
             token["data"] = "</%s>" % token["name"]
         elif token["data"]:
@@ -862,7 +863,7 @@ def sanitize_css(self, style):
                                                 'padding']:
                 for keyword in value.split():
                     if keyword not in self.allowed_css_keywords and \
-                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
+                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                         break
                 else:
                     clean.append(prop + ': ' + value + ';')
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index e6808425..331b8fd7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -121,7 +121,7 @@ def reset(self):
             self.phase.insertHtmlElement()
             self.resetInsertionMode()
         else:
-            self.innerHTML = False
+            self.innerHTML = False  # pylint:disable=redefined-variable-type
             self.phase = self.phases["initial"]
 
         self.lastPhase = None
@@ -241,6 +241,7 @@ def parse(self, stream, encoding=None, parseMeta=True,
 
     def parseFragment(self, stream, container="div", encoding=None,
                       parseMeta=False, useChardet=True, scripting=False):
+        # pylint:disable=unused-argument
        """Parse a HTML fragment into a well-formed tree fragment
 
        container - name of the element we're setting the innerHTML property
@@ -259,8 +260,10 @@ def parseFragment(self, stream, container="div", encoding=None,
                              encoding=encoding, scripting=scripting)
         return self.tree.getFragment()
 
-    def parseError(self, errorcode="XXX-undefined-error", datavars={}):
+    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
         # XXX The idea is to make errorcode mandatory.
+        if datavars is None:
+            datavars = {}
         self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
         if self.strict:
             raise ParseError(E[errorcode] % datavars)
@@ -361,6 +364,7 @@ def adjustForeignAttributes(self, token):
             del token["data"][originalName]
 
     def reparseTokenNormal(self, token):
+        # pylint:disable=unused-argument
         self.parser.phase()
 
     def resetInsertionMode(self):
@@ -458,6 +462,7 @@ def getMetaclass(use_metaclass, metaclass_func):
         else:
             return type
 
+    # pylint:disable=unused-argument
     class Phase(with_metaclass(getMetaclass(debug, log))):
         """Base class for helper object that implements each phase of processing
         """
@@ -948,8 +953,8 @@ class InBodyPhase(Phase):
         def __init__(self, parser, tree):
             Phase.__init__(self, parser, tree)
 
-            # Keep a ref to this for special handling of whitespace in <pre>
-            self.processSpaceCharactersNonPre = self.processSpaceCharacters
+            # Set this to the default handler
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
 
             self.startTagHandler = utils.MethodDispatcher([
                 ("html", self.startTagHtml),
@@ -1082,7 +1087,7 @@ def processCharacters(self, token):
                      for char in token["data"]])):
                 self.parser.framesetOK = False
 
-        def processSpaceCharacters(self, token):
+        def processSpaceCharactersNonPre(self, token):
             self.tree.reconstructActiveFormattingElements()
             self.tree.insertText(token["data"])
 
@@ -2763,6 +2768,7 @@ def startTagOther(self, token):
         def processEndTag(self, token):
             self.parser.parseError("expected-eof-but-got-end-tag",
                                    {"name": token["name"]})
+    # pylint:enable=unused-argument
 
     return {
         "initial": InitialPhase,
diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
index 5da5d938..d6d1d6fb 100644
--- a/html5lib/ihatexml.py
+++ b/html5lib/ihatexml.py
@@ -175,9 +175,9 @@ def escapeRegexp(string):
     return string
 
 # output from the above
-nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
 
-nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
 
 # Simpler things
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
@@ -186,7 +186,7 @@ def escapeRegexp(string):
 class InfosetFilter(object):
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
 
-    def __init__(self, replaceChars=None,
+    def __init__(self,
                  dropXmlnsLocalName=False,
                  dropXmlnsAttrNs=False,
                  preventDoubleDashComments=False,
@@ -217,7 +217,7 @@ def coerceAttribute(self, name, namespace=None):
         else:
             return self.toXmlName(name)
 
-    def coerceElement(self, name, namespace=None):
+    def coerceElement(self, name):
         return self.toXmlName(name)
 
     def coerceComment(self, data):
@@ -232,7 +232,7 @@ def coerceComment(self, data):
 
     def coerceCharacters(self, data):
         if self.replaceFormFeedCharacters:
-            for i in range(data.count("\x0C")):
+            for _ in range(data.count("\x0C")):
                 warnings.warn("Text cannot contain U+000C", DataLossWarning)
             data = data.replace("\x0C", " ")
         # Other non-xml characters
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 15acba0d..58d626c9 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -19,12 +19,6 @@
 except ImportError:
     BytesIO = StringIO
 
-try:
-    from io import BufferedIOBase
-except ImportError:
-    class BufferedIOBase(object):
-        pass
-
 # Non-unicode versions of constants for use in the pre-parser
 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@@ -32,15 +26,17 @@ class BufferedIOBase(object):
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
 
-invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa
 
 if utils.supports_lone_surrogates:
     # Use one extra step of indirection and create surrogates with
-    # unichr. Not using this indirection would introduce an illegal
+    # eval. Not using this indirection would introduce an illegal
     # unicode literal on platforms not supporting such lone
     # surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
-                                    eval('"\\uD800-\\uDFFF"'))
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
+                                    "]")
 else:
     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
 
@@ -296,7 +292,7 @@ def readChunk(self, chunkSize=None):
         return True
 
     def characterErrorsUCS4(self, data):
-        for i in range(len(invalid_unicode_re.findall(data))):
+        for _ in range(len(invalid_unicode_re.findall(data))):
             self.errors.append("invalid-codepoint")
 
     def characterErrorsUCS2(self, data):
@@ -453,7 +449,7 @@ def openStream(self, source):
 
         try:
             stream.seek(stream.tell())
-        except:
+        except:  # pylint:disable=bare-except
             stream = BufferedStream(stream)
 
         return stream
@@ -571,6 +567,7 @@ def __new__(self, value):
         return bytes.__new__(self, value.lower())
 
     def __init__(self, value):
+        # pylint:disable=unused-argument
         self._position = -1
 
     def __iter__(self):
@@ -681,7 +678,7 @@ def getEncoding(self):
             (b" 1) or
-                (not is_ucs4 and len(v) > 2)):
-            continue
-        if v != "&":
-            if len(v) == 2:
-                v = utils.surrogatePairToCodepoint(v)
-            else:
-                v = ord(v)
-            if v not in encode_entity_map or k.islower():
-                # prefer < over < and similarly for &, >, etc.
-                encode_entity_map[v] = k
-
-    def htmlentityreplace_errors(exc):
-        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
-            res = []
-            codepoints = []
-            skip = False
-            for i, c in enumerate(exc.object[exc.start:exc.end]):
-                if skip:
-                    skip = False
-                    continue
-                index = i + exc.start
-                if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
-                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
-                    skip = True
-                else:
-                    codepoint = ord(c)
-                codepoints.append(codepoint)
-            for cp in codepoints:
-                e = encode_entity_map.get(cp)
-                if e:
-                    res.append("&")
-                    res.append(e)
-                    if not e.endswith(";"):
-                        res.append(";")
-                else:
-                    res.append("&#x%s;" % (hex(cp)[2:]))
-            return ("".join(res), exc.end)
-        else:
-            return xmlcharrefreplace_errors(exc)
 
-    register_error(unicode_encode_errors, htmlentityreplace_errors)
+encode_entity_map = {}
+is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+    # skip multi-character entities
+    if ((is_ucs4 and len(v) > 1) or
+            (not is_ucs4 and len(v) > 2)):
+        continue
+    if v != "&":
+        if len(v) == 2:
+            v = utils.surrogatePairToCodepoint(v)
+        else:
+            v = ord(v)
+        if v not in encode_entity_map or k.islower():
+            # prefer < over < and similarly for &, >, etc.
+            encode_entity_map[v] = k
+
+
+def htmlentityreplace_errors(exc):
+    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+        res = []
+        codepoints = []
+        skip = False
+        for i, c in enumerate(exc.object[exc.start:exc.end]):
+            if skip:
+                skip = False
+                continue
+            index = i + exc.start
+            if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
+                codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
+                skip = True
+            else:
+                codepoint = ord(c)
+            codepoints.append(codepoint)
+        for cp in codepoints:
+            e = encode_entity_map.get(cp)
+            if e:
+                res.append("&")
+                res.append(e)
+                if not e.endswith(";"):
+                    res.append(";")
+            else:
+                res.append("&#x%s;" % (hex(cp)[2:]))
+        return ("".join(res), exc.end)
+    else:
+        return xmlcharrefreplace_errors(exc)
 
-    del register_error
+register_error("htmlentityreplace", htmlentityreplace_errors)
 
 
 class HTMLSerializer(object):
@@ -168,7 +163,7 @@ def __init__(self, **kwargs):
     def encode(self, string):
         assert(isinstance(string, text_type))
         if self.encoding:
-            return string.encode(self.encoding, unicode_encode_errors)
+            return string.encode(self.encoding, "htmlentityreplace")
         else:
             return string
 
@@ -180,6 +175,7 @@ def encodeStrict(self, string):
             return string
 
     def serialize(self, treewalker, encoding=None):
+        # pylint:disable=too-many-nested-blocks
         self.encoding = encoding
         in_cdata = False
         self.errors = []
@@ -241,7 +237,7 @@ def serialize(self, treewalker, encoding=None):
                     in_cdata = True
                 elif in_cdata:
                     self.serializeError("Unexpected child element of a CDATA element")
-                for (attr_namespace, attr_name), attr_value in token["data"].items():
+                for (_, attr_name), attr_value in token["data"].items():
                     # TODO: Add namespace support here
                     k = attr_name
                     v = attr_value
@@ -328,6 +324,6 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
             raise SerializeError
 
 
-def SerializeError(Exception):
+class SerializeError(Exception):
     """Error in serialized tree"""
     pass
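
The serializer now registers its error handler under the plain name "htmlentityreplace" and opts into it per encode() call. As a sketch of the underlying codecs mechanism (the dashreplace handler is illustrative, not part of html5lib): an encode error handler receives the UnicodeEncodeError and returns a replacement plus the position at which to resume.

    import codecs

    def dash_replace(exc):
        # Replace each unencodable character with "-" and resume after it.
        if isinstance(exc, UnicodeEncodeError):
            return ("-" * (exc.end - exc.start), exc.end)
        raise exc

    codecs.register_error("dashreplace", dash_replace)

    # Any encode() call can now select the handler by name:
    print(u"caf\u00e9".encode("ascii", "dashreplace"))  # b'caf-'
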
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
index 6e6a916b..6ae09dbe 100644
--- a/html5lib/tests/support.py
+++ b/html5lib/tests/support.py
@@ -1,5 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
+# pylint:disable=wrong-import-position
+
 import os
 import sys
 import codecs
@@ -13,7 +15,7 @@
                                                 os.path.pardir,
                                                 os.path.pardir)))
 
-from html5lib import treebuilders, treewalkers, treeadapters
+from html5lib import treebuilders, treewalkers, treeadapters  # noqa
 del base_path
 
 # Build a dict of available trees
@@ -26,14 +28,14 @@
 }
 
 # ElementTree impls
-import xml.etree.ElementTree as ElementTree
+import xml.etree.ElementTree as ElementTree  # noqa
 treeTypes['ElementTree'] = {
     "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
     "walker": treewalkers.getTreeWalker("etree", ElementTree)
 }
 
 try:
-    import xml.etree.cElementTree as cElementTree
+    import xml.etree.cElementTree as cElementTree  # noqa
 except ImportError:
     treeTypes['cElementTree'] = None
 else:
@@ -47,7 +49,7 @@
         }
 
 try:
-    import lxml.etree as lxml  # flake8: noqa
+    import lxml.etree as lxml  # noqa
 except ImportError:
     treeTypes['lxml'] = None
 else:
@@ -58,7 +60,7 @@
 
 # Genshi impls
 try:
-    import genshi  # flake8: noqa
+    import genshi  # noqa
 except ImportError:
     pass
 else:
@@ -68,6 +70,8 @@
         "walker": treewalkers.getTreeWalker("genshi")
     }
 
+# pylint:enable=wrong-import-position
+
 
 def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
     return sorted(glob.glob(os.path.join(search_dir, subdirectory, files)))
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index 09504654..c5d2af12 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -51,19 +51,21 @@ def runPreScanEncodingTest(data, encoding):
 def test_encoding():
     for filename in get_data_files("encoding"):
         tests = _TestData(filename, b"data", encoding=None)
-        for idx, test in enumerate(tests):
+        for test in tests:
             yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
             yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
 
+# pylint:disable=wrong-import-position
 try:
     try:
-        import charade  # flake8: noqa
+        import charade  # noqa
     except ImportError:
-        import chardet  # flake8: noqa
+        import chardet  # noqa
 except ImportError:
     print("charade/chardet not found, skipping chardet tests")
 else:
     def test_chardet():
-        with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp:
+        with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
             encoding = inputstream.HTMLInputStream(fp.read()).charEncoding
             assert encoding[0].name == "big5"
+# pylint:enable=wrong-import-position
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
index 2f3ba2c8..f8e1ac43 100644
--- a/html5lib/tests/test_parser2.py
+++ b/html5lib/tests/test_parser2.py
@@ -2,10 +2,8 @@
 
 import io
 
-import pytest
+from . import support  # noqa
 
-from . import support  # flake8: noqa
-from html5lib import html5parser
 from html5lib.constants import namespaces
 from html5lib import parse
 
@@ -23,29 +21,29 @@ def test_line_counter():
 
 def test_namespace_html_elements_0_dom():
     doc = parse("",
-                         treebuilder="dom",
-                         namespaceHTMLElements=True)
+                treebuilder="dom",
+                namespaceHTMLElements=True)
     assert doc.childNodes[0].namespaceURI == namespaces["html"]
 
 
 def test_namespace_html_elements_1_dom():
     doc = parse("",
-                         treebuilder="dom",
-                         namespaceHTMLElements=False)
+                treebuilder="dom",
+                namespaceHTMLElements=False)
     assert doc.childNodes[0].namespaceURI is None
 
 
 def test_namespace_html_elements_0_etree():
     doc = parse("",
-                         treebuilder="etree",
-                         namespaceHTMLElements=True)
+                treebuilder="etree",
+                namespaceHTMLElements=True)
     assert doc.tag == "{%s}html" % (namespaces["html"],)
 
 
 def test_namespace_html_elements_1_etree():
     doc = parse("",
-                         treebuilder="etree",
-                         namespaceHTMLElements=False)
+                treebuilder="etree",
+                namespaceHTMLElements=False)
     assert doc.tag == "html"
 
 
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index 1f8a06f6..e19deea8 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -4,7 +4,7 @@
 from html5lib.filters import sanitizer
 
 
-def runSanitizerTest(name, expected, input):
+def runSanitizerTest(_, expected, input):
     parsed = parseFragment(expected)
     expected = serialize(parsed,
                          omit_optional_tags=False,
@@ -63,7 +63,8 @@ def test_sanitizer():
     for ns, tag_name in sanitizer.allowed_elements:
         if ns != constants.namespaces["html"]:
             continue
-        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'select']:
+        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
+                        'tfoot', 'th', 'thead', 'tr', 'select']:
             continue  # TODO
         if tag_name == 'image':
             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
index b3ffe0df..b3cda7d7 100644
--- a/html5lib/tests/test_serializer.py
+++ b/html5lib/tests/test_serializer.py
@@ -12,6 +12,7 @@
 from html5lib.serializer import HTMLSerializer, serialize
 from html5lib.treewalkers._base import TreeWalker
 
+# pylint:disable=wrong-import-position
 optionals_loaded = []
 
 try:
@@ -19,6 +20,7 @@
     optionals_loaded.append("lxml")
 except ImportError:
     pass
+# pylint:enable=wrong-import-position
 
 default_namespace = constants.namespaces["html"]
 
@@ -219,5 +221,5 @@ def test_serializer():
     for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
         with open(filename) as fp:
             tests = json.load(fp)
-            for index, test in enumerate(tests['tests']):
+            for test in tests['tests']:
                 yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 3b659fbb..77e411d5 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -1,15 +1,20 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from . import support  # flake8: noqa
+from . import support  # noqa
+
 import codecs
-from io import BytesIO
-import socket
+import sys
+from io import BytesIO, StringIO
+
+import pytest
 
 import six
 from six.moves import http_client, urllib
 
 from html5lib.inputstream import (BufferedStream, HTMLInputStream,
                                   HTMLUnicodeInputStream, HTMLBinaryInputStream)
+from html5lib.utils import supports_lone_surrogates
+
 
 def test_basic():
     s = b"abc"
@@ -17,6 +22,7 @@ def test_basic():
     read = fp.read(10)
     assert read == s
 
+
 def test_read_length():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
@@ -28,17 +34,23 @@ def test_read_length():
     read4 = fp.read(4)
     assert read4 == b""
 
+
 def test_tell():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
+    assert read1 == b"a"
     assert fp.tell() == 1
     read2 = fp.read(2)
+    assert read2 == b"bc"
     assert fp.tell() == 3
     read3 = fp.read(3)
+    assert read3 == b"def"
     assert fp.tell() == 6
     read4 = fp.read(4)
+    assert read4 == b""
     assert fp.tell() == 6
 
+
 def test_seek():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
@@ -55,20 +67,26 @@ def test_seek():
     read5 = fp.read(2)
     assert read5 == b"ef"
 
+
 def test_seek_tell():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
+    assert read1 == b"a"
     assert fp.tell() == 1
     fp.seek(0)
     read2 = fp.read(1)
+    assert read2 == b"a"
     assert fp.tell() == 1
     read3 = fp.read(2)
+    assert read3 == b"bc"
     assert fp.tell() == 3
     fp.seek(2)
     read4 = fp.read(2)
+    assert read4 == b"cd"
     assert fp.tell() == 4
     fp.seek(4)
     read5 = fp.read(2)
+    assert read5 == b"ef"
     assert fp.tell() == 6
 
 
@@ -85,11 +103,13 @@ def test_char_ascii():
     assert stream.charEncoding[0].name == 'windows-1252'
     assert stream.char() == "'"
 
+
 def test_char_utf8():
     stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
     assert stream.charEncoding[0].name == 'utf-8'
     assert stream.char() == '\u2018'
 
+
 def test_char_win1252():
     stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
     assert stream.charEncoding[0].name == 'windows-1252'
@@ -97,16 +117,19 @@ def test_char_win1252():
     assert stream.char() == "\xf1"
     assert stream.char() == "\u2019"
 
+
 def test_bom():
     stream = HTMLInputStream(codecs.BOM_UTF8 + b"'")
     assert stream.charEncoding[0].name == 'utf-8'
     assert stream.char() == "'"
 
+
 def test_utf_16():
     stream = HTMLInputStream((' ' * 1025).encode('utf-16'))
     assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be']
     assert len(stream.charsUntil(' ', True)) == 1025
 
+
 def test_newlines():
     stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
     assert stream.position() == (1, 0)
@@ -117,11 +140,13 @@ def test_newlines():
     assert stream.charsUntil('e') == "x"
     assert stream.position() == (4, 5)
 
+
 def test_newlines2():
     size = HTMLUnicodeInputStream._defaultChunkSize
     stream = HTMLInputStream("\r" * size + "\n")
     assert stream.charsUntil('x') == "\n" * size
 
+
 def test_position():
     stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")
     assert stream.position() == (1, 0)
@@ -140,6 +165,7 @@ def test_position():
     assert stream.charsUntil('h') == "e\nf\ng"
     assert stream.position() == (6, 1)
 
+
 def test_position2():
     stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
     assert stream.position() == (1, 0)
@@ -154,6 +180,7 @@ def test_position2():
     assert stream.char() == "d"
     assert stream.position() == (2, 1)
 
+
 def test_python_issue_20007():
     """
     Make sure we have a work-around for Python bug #20007
@@ -161,6 +188,7 @@ def test_python_issue_20007():
     """
     class FakeSocket(object):
         def makefile(self, _mode, _bufsize=None):
+            # pylint:disable=unused-argument
             return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
 
     source = http_client.HTTPResponse(FakeSocket())
@@ -168,6 +196,7 @@ def makefile(self, _mode, _bufsize=None):
     stream = HTMLInputStream(source)
     assert stream.charsUntil(" ") == "Text"
 
+
 def test_python_issue_20007_b():
     """
     Make sure we have a work-around for Python bug #20007
@@ -178,6 +207,7 @@ def test_python_issue_20007_b():
 
     class FakeSocket(object):
         def makefile(self, _mode, _bufsize=None):
+            # pylint:disable=unused-argument
             return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
 
     source = http_client.HTTPResponse(FakeSocket())
@@ -185,3 +215,109 @@ def makefile(self, _mode, _bufsize=None):
     wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
     stream = HTMLInputStream(wrapped)
     assert stream.charsUntil(" ") == "Text"
+
+
+@pytest.mark.parametrize("inp,num",
+                         [("\u0000", 0),
+                          ("\u0001", 1),
+                          ("\u0008", 1),
+                          ("\u0009", 0),
+                          ("\u000A", 0),
+                          ("\u000B", 1),
+                          ("\u000C", 0),
+                          ("\u000D", 0),
+                          ("\u000E", 1),
+                          ("\u001F", 1),
+                          ("\u0020", 0),
+                          ("\u007E", 0),
+                          ("\u007F", 1),
+                          ("\u009F", 1),
+                          ("\u00A0", 0),
+                          ("\uFDCF", 0),
+                          ("\uFDD0", 1),
+                          ("\uFDEF", 1),
+                          ("\uFDF0", 0),
+                          ("\uFFFD", 0),
+                          ("\uFFFE", 1),
+                          ("\uFFFF", 1),
+                          ("\U0001FFFD", 0),
+                          ("\U0001FFFE", 1),
+                          ("\U0001FFFF", 1),
+                          ("\U0002FFFD", 0),
+                          ("\U0002FFFE", 1),
+                          ("\U0002FFFF", 1),
+                          ("\U0003FFFD", 0),
+                          ("\U0003FFFE", 1),
+                          ("\U0003FFFF", 1),
+                          ("\U0004FFFD", 0),
+                          ("\U0004FFFE", 1),
+                          ("\U0004FFFF", 1),
+                          ("\U0005FFFD", 0),
+                          ("\U0005FFFE", 1),
+                          ("\U0005FFFF", 1),
+                          ("\U0006FFFD", 0),
+                          ("\U0006FFFE", 1),
+                          ("\U0006FFFF", 1),
+                          ("\U0007FFFD", 0),
+                          ("\U0007FFFE", 1),
+                          ("\U0007FFFF", 1),
+                          ("\U0008FFFD", 0),
+                          ("\U0008FFFE", 1),
+                          ("\U0008FFFF", 1),
+                          ("\U0009FFFD", 0),
+                          ("\U0009FFFE", 1),
+                          ("\U0009FFFF", 1),
+                          ("\U000AFFFD", 0),
+                          ("\U000AFFFE", 1),
+                          ("\U000AFFFF", 1),
+                          ("\U000BFFFD", 0),
+                          ("\U000BFFFE", 1),
+                          ("\U000BFFFF", 1),
+                          ("\U000CFFFD", 0),
+                          ("\U000CFFFE", 1),
+                          ("\U000CFFFF", 1),
+                          ("\U000DFFFD", 0),
+                          ("\U000DFFFE", 1),
+                          ("\U000DFFFF", 1),
+                          ("\U000EFFFD", 0),
+                          ("\U000EFFFE", 1),
+                          ("\U000EFFFF", 1),
+                          ("\U000FFFFD", 0),
+                          ("\U000FFFFE", 1),
+                          ("\U000FFFFF", 1),
+                          ("\U0010FFFD", 0),
+                          ("\U0010FFFE", 1),
+                          ("\U0010FFFF", 1),
+                          ("\x01\x01\x01", 3),
+                          ("a\x01a\x01a\x01a", 3)])
+def test_invalid_codepoints(inp, num):
+    stream = HTMLUnicodeInputStream(StringIO(inp))
+    for _i in range(len(inp)):
+        stream.char()
+    assert len(stream.errors) == num
+
+
+@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
+@pytest.mark.parametrize("inp,num",
+                         [("'\\uD7FF'", 0),
+                          ("'\\uD800'", 1),
+                          ("'\\uDBFF'", 1),
+                          ("'\\uDC00'", 1),
+                          ("'\\uDFFF'", 1),
+                          ("'\\uE000'", 0),
+                          ("'\\uD800\\uD800\\uD800'", 3),
+                          ("'a\\uD800a\\uD800a\\uD800a'", 3),
+                          ("'\\uDFFF\\uDBFF'", 2),
+                          pytest.mark.skipif(sys.maxunicode == 0xFFFF,
+                                             ("'\\uDBFF\\uDFFF'", 2),
+                                             reason="narrow Python")])
+def test_invalid_codepoints_surrogates(inp, num):
+    inp = eval(inp)  # pylint:disable=eval-used
+    fp = StringIO(inp)
+    if ord(max(fp.read())) > 0xFFFF:
+        pytest.skip("StringIO altered string")
+    fp.seek(0)
+    stream = HTMLUnicodeInputStream(fp)
+    for _i in range(len(inp)):
+        stream.char()
+    assert len(stream.errors) == num
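
The eval(inp) step above mirrors the indirection used in inputstream.py and utils.py: a lone-surrogate literal such as "\uD800" written directly in source can be rejected at compile time on platforms without lone-surrogate support (e.g. Jython), so the tests keep it inside an ordinary string and only materialize it behind a guard. A standalone sketch of that probe, under the same assumption:

    try:
        _lone = eval('"\\uD800"')  # pylint:disable=eval-used
        supports_lone = len(_lone) == 1
    except Exception:  # platforms that reject lone surrogates raise here
        supports_lone = False
    print(supports_lone)
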
diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py
index 5f38b6c3..95e56c00 100644
--- a/html5lib/tests/test_treeadapters.py
+++ b/html5lib/tests/test_treeadapters.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from . import support  # flake8: noqa
+from . import support  # noqa
 
 import html5lib
 from html5lib.treeadapters import sax
@@ -25,7 +25,7 @@ def test_to_sax():
         ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
         ('characters', '\n        '),
         ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
-        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
         ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
         ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
         ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 045d9d7b..332027ac 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -31,7 +31,7 @@ def test_all_tokens():
         {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
         {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
     ]
-    for treeName, treeCls in sorted(treeTypes.items()):
+    for _, treeCls in sorted(treeTypes.items()):
         if treeCls is None:
             continue
         p = html5parser.HTMLParser(tree=treeCls["builder"])
diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py
index c6163a1f..255c1859 100644
--- a/html5lib/tests/tokenizer.py
+++ b/html5lib/tests/tokenizer.py
@@ -19,6 +19,7 @@ def __init__(self, initialState, lastStartTag=None):
         self._lastStartTag = lastStartTag
 
     def parse(self, stream, encoding=None, innerHTML=False):
+        # pylint:disable=unused-argument
         tokenizer = self.tokenizer(stream, encoding)
         self.outputTokens = []
 
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index 79774578..dd6ea75f 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -1,9 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
-try:
-    chr = unichr # flake8: noqa
-except NameError:
-    pass
+from six import unichr as chr
 
 from collections import deque
 
@@ -147,8 +144,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
         output = "&"
 
         charStack = [self.stream.char()]
-        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
-                or (allowedChar is not None and allowedChar == charStack[0])):
+        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
+                (allowedChar is not None and allowedChar == charStack[0])):
             self.stream.unget(charStack[0])
 
         elif charStack[0] == "#":
@@ -924,7 +921,7 @@ def attributeNameState(self):
             if self.lowercaseAttrName:
                 self.currentToken["data"][-1][0] = (
                     self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
-            for name, value in self.currentToken["data"][:-1]:
+            for name, _ in self.currentToken["data"][:-1]:
                 if self.currentToken["data"][-1][0] == name:
                     self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                             "duplicate-attribute"})
@@ -1716,11 +1713,11 @@ def cdataSectionState(self):
                 else:
                     data.append(char)
 
-        data = "".join(data)
+        data = "".join(data)  # pylint:disable=redefined-variable-type
         # Deal with null here rather than in the parser
         nullCount = data.count("\u0000")
         if nullCount > 0:
-            for i in range(nullCount):
+            for _ in range(nullCount):
                 self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                         "data": "invalid-codepoint"})
             data = data.replace("\u0000", "\uFFFD")
diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py
index 57d71304..4f978466 100644
--- a/html5lib/treeadapters/__init__.py
+++ b/html5lib/treeadapters/__init__.py
@@ -5,7 +5,7 @@
 __all__ = ["sax"]
 
 try:
-    from . import genshi  # flake8: noqa
+    from . import genshi  # noqa
 except ImportError:
     pass
 else:
diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py
index 8196f591..900a724c 100644
--- a/html5lib/treebuilders/_base.py
+++ b/html5lib/treebuilders/_base.py
@@ -126,6 +126,7 @@ class TreeBuilder(object):
     commentClass - the class to use for comments
     doctypeClass - the class to use for doctypes
     """
+    # pylint:disable=not-callable
 
     # Document class
     documentClass = None
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index 8656244f..b7df74b2 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -109,7 +109,7 @@ def getNameTuple(self):
 
         nameTuple = property(getNameTuple)
 
-    class TreeBuilder(_base.TreeBuilder):
+    class TreeBuilder(_base.TreeBuilder):  # pylint:disable=unused-variable
         def documentClass(self):
             self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
             return weakref.proxy(self)
@@ -158,6 +158,7 @@ def insertText(self, data, parent=None):
             else:
                 # HACK: allow text nodes as children of the document node
                 if hasattr(self.dom, '_child_node_types'):
+                    # pylint:disable=protected-access
                     if Node.TEXT_NODE not in self.dom._child_node_types:
                         self.dom._child_node_types = list(self.dom._child_node_types)
                         self.dom._child_node_types.append(Node.TEXT_NODE)
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
index 2c8ed19f..d394148d 100644
--- a/html5lib/treebuilders/etree.py
+++ b/html5lib/treebuilders/etree.py
@@ -1,4 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
 from six import text_type
 
 import re
@@ -253,7 +255,7 @@ def serializeElement(element, indent=0):
 
         return "\n".join(rv)
 
-    def tostring(element):
+    def tostring(element):  # pylint:disable=unused-variable
         """Serialize an element and its child nodes to a string"""
         rv = []
         filter = ihatexml.InfosetFilter()
@@ -307,7 +309,7 @@ def serializeElement(element):
 
         return "".join(rv)
 
-    class TreeBuilder(_base.TreeBuilder):
+    class TreeBuilder(_base.TreeBuilder):  # pylint:disable=unused-variable
         documentClass = Document
         doctypeClass = DocumentType
         elementClass = Element
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index 138b30bd..2a69769b 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -10,6 +10,7 @@
 """
 
 from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
 
 import warnings
 import re
@@ -53,7 +54,6 @@ def _getChildNodes(self):
 
 def testSerializer(element):
     rv = []
-    finalText = None
     infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
 
     def serializeElement(element, indent=0):
@@ -128,16 +128,12 @@ def serializeElement(element, indent=0):
                 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
     serializeElement(element, 0)
 
-    if finalText is not None:
-        rv.append("|%s\"%s\"" % (' ' * 2, finalText))
-
     return "\n".join(rv)
 
 
 def tostring(element):
     """Serialize an element and its child nodes to a string"""
     rv = []
-    finalText = None
 
     def serializeElement(element):
         if not hasattr(element, "tag"):
@@ -173,9 +169,6 @@ def serializeElement(element):
 
     serializeElement(element)
 
-    if finalText is not None:
-        rv.append("%s\"" % (' ' * 2, finalText))
-
     return "".join(rv)
 
 
@@ -193,9 +186,11 @@ def __init__(self, namespaceHTMLElements, fullTree=False):
         self.namespaceHTMLElements = namespaceHTMLElements
 
         class Attributes(dict):
-            def __init__(self, element, value={}):
+            def __init__(self, element, value=None):
+                if value is None:
+                    value = {}
                 self._element = element
-                dict.__init__(self, value)
+                dict.__init__(self, value)  # pylint:disable=non-parent-init-called
                 for key, value in self.items():
                     if isinstance(key, tuple):
                         name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@@ -303,12 +298,14 @@ def insertDoctype(self, token):
             self.doctype = doctype
 
     def insertCommentInitial(self, data, parent=None):
+        assert parent is None or parent is self.document
+        assert self.document._elementTree is None
         self.initial_comments.append(data)
 
     def insertCommentMain(self, data, parent=None):
         if (parent == self.document and
                 self.document._elementTree.getroot()[-1].tag == comment_type):
-                warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
         super(TreeBuilder, self).insertComment(data, parent)
 
     def insertRoot(self, token):
diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py
index 73c8e26a..d3b0c50e 100644
--- a/html5lib/treewalkers/etree.py
+++ b/html5lib/treewalkers/etree.py
@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
     ElementTree = ElementTreeImplementation
     ElementTreeCommentType = ElementTree.Comment("asd").tag
 
-    class TreeWalker(_base.NonRecursiveTreeWalker):
+    class TreeWalker(_base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
         """Given the particular ElementTree representation, this implementation,
         to avoid using recursion, returns "nodes" as tuples with the following
         content:
@@ -38,7 +38,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
         """
         def getNodeDetails(self, node):
             if isinstance(node, tuple):  # It might be the root Element
-                elt, key, parents, flag = node
+                elt, _, _, flag = node
                 if flag in ("text", "tail"):
                     return _base.TEXT, getattr(elt, flag)
                 else:
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index 83cd1654..61cbfede 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -25,7 +25,7 @@ def __iter__(self):
                 yield token
 
     def tokens(self, event, next):
-        kind, data, pos = event
+        kind, data, _ = event
         if kind == START:
             tag, attribs = data
             name = tag.localname
diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
index 36850086..7d99adc2 100644
--- a/html5lib/treewalkers/lxmletree.py
+++ b/html5lib/treewalkers/lxmletree.py
@@ -117,6 +117,7 @@ def __len__(self):
 
 class TreeWalker(_base.NonRecursiveTreeWalker):
     def __init__(self, tree):
+        # pylint:disable=redefined-variable-type
         if hasattr(tree, "getroot"):
             self.fragmentChildren = set()
             tree = Root(tree)
diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py
index a8cca8a9..a5ba4bf1 100644
--- a/html5lib/trie/__init__.py
+++ b/html5lib/trie/__init__.py
@@ -4,9 +4,11 @@
 
 Trie = PyTrie
 
+# pylint:disable=wrong-import-position
 try:
     from .datrie import Trie as DATrie
 except ImportError:
     pass
 else:
     Trie = DATrie
+# pylint:enable=wrong-import-position
diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py
index 724486b1..25eece46 100644
--- a/html5lib/trie/_base.py
+++ b/html5lib/trie/_base.py
@@ -7,7 +7,8 @@ class Trie(Mapping):
     """Abstract base class for tries"""
 
     def keys(self, prefix=None):
-        keys = super().keys()
+        # pylint:disable=arguments-differ
+        keys = super(Trie, self).keys()
 
         if prefix is None:
             return set(keys)
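
The super(Trie, self).keys() spelling matters because zero-argument super() is Python 3 syntax only, and this codebase still targets Python 2 as well (see python-targets in .prospector.yaml). A sketch of the portable form:

    class Base(object):
        def keys(self):
            return ["a"]

    class Child(Base):
        def keys(self):
            # Works on Python 2 and 3; bare super().keys() is Python-3-only.
            return super(Child, self).keys() + ["b"]

    print(Child().keys())  # ['a', 'b']
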
diff --git a/html5lib/utils.py b/html5lib/utils.py
index c70de172..5fe237a0 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -22,12 +22,12 @@
 # surrogates, and there is no mechanism to further escape such
 # escapes.
 try:
-    _x = eval('"\\uD800"')
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
     if not isinstance(_x, text_type):
         # We need this with u"" because of http://bugs.jython.org/issue2039
-        _x = eval('u"\\uD800"')
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
         assert isinstance(_x, text_type)
-except:
+except:  # pylint:disable=bare-except
     supports_lone_surrogates = False
 else:
     supports_lone_surrogates = True
@@ -52,7 +52,7 @@ def __init__(self, items=()):
         # anything here.
         _dictEntries = []
         for name, value in items:
-            if type(name) in (list, tuple, frozenset, set):
+            if isinstance(name, (list, tuple, frozenset, set)):
                 for item in name:
                     _dictEntries.append((item, value))
             else:
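
The switch from "type(name) in (...)" to isinstance() in MethodDispatcher is more than style: the type() test rejects subclasses, while isinstance() accepts them. A quick illustration:

    class NameList(list):
        pass

    names = NameList(["a", "b"])
    print(type(names) in (list, tuple))      # False: exact-type comparison
    print(isinstance(names, (list, tuple)))  # True: subclass-aware
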
diff --git a/parse.py b/parse.py
index cceea84d..2ed8f1c2 100755
--- a/parse.py
+++ b/parse.py
@@ -5,7 +5,6 @@
 """
 
 import sys
-import os
 import traceback
 from optparse import OptionParser
 
@@ -15,9 +14,10 @@
 from html5lib import constants
 from html5lib import utils
 
+
 def parse():
     optParser = getOptParser()
-    opts,args = optParser.parse_args()
+    opts, args = optParser.parse_args()
     encoding = "utf8"
 
     try:
@@ -25,7 +25,10 @@ def parse():
         # Try opening from the internet
         if f.startswith('http://'):
             try:
-                import urllib.request, urllib.parse, urllib.error, cgi
+                import urllib.request
+                import urllib.parse
+                import urllib.error
+                import cgi
                 f = urllib.request.urlopen(f)
                 contentType = f.headers.get('content-type')
                 if contentType:
@@ -41,7 +44,7 @@ def parse():
             try:
                 # Try opening from file system
                 f = open(f, "rb")
-            except IOError as e:                
+            except IOError as e:
                 sys.stderr.write("Unable to open file: %s\n" % e)
                 sys.exit(1)
     except IndexError:
@@ -82,14 +85,15 @@ def parse():
         if document:
             printOutput(p, document, opts)
             t2 = time.time()
-            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
         else:
-            sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
+            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
     else:
         document = run(parseMethod, f, encoding, opts.scripting)
         if document:
             printOutput(p, document, opts)
 
+
 def run(parseMethod, f, encoding, scripting):
     try:
         document = parseMethod(f, encoding=encoding, scripting=scripting)
@@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting):
         traceback.print_exc()
     return document
 
+
 def printOutput(parser, document, opts):
     if opts.encoding:
         print("Encoding:", parser.tokenizer.stream.charEncoding)
@@ -116,7 +121,7 @@ def printOutput(parser, document, opts):
             elif tb == "etree":
                 sys.stdout.write(utils.default_etree.tostring(document))
         elif opts.tree:
-            if not hasattr(document,'__getitem__'):
+            if not hasattr(document, '__getitem__'):
                 document = [document]
             for fragment in document:
                 print(parser.tree.testSerializer(fragment))
@@ -126,7 +131,7 @@ def printOutput(parser, document, opts):
             kwargs = {}
             for opt in serializer.HTMLSerializer.options:
                 try:
-                    kwargs[opt] = getattr(opts,opt)
+                    kwargs[opt] = getattr(opts, opt)
                 except:
                     pass
             if not kwargs['quote_char']:
@@ -142,12 +147,14 @@ def printOutput(parser, document, opts):
                 encoding = "utf-8"
             for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                 sys.stdout.write(text)
-            if not text.endswith('\n'): sys.stdout.write('\n')
+            if not text.endswith('\n'):
+                sys.stdout.write('\n')
     if opts.error:
-        errList=[]
+        errList = []
         for pos, errorcode, datavars in parser.errors:
-            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
-        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
+            errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
+        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
+
 
 def getOptParser():
     parser = OptionParser(usage=__doc__)
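
One idiom in the error-reporting hunk above is easy to misread: "Line %i Col %i" % pos works without building a fresh tuple because pos already is the (line, col) tuple from parser.errors. For instance:

    pos = (12, 7)
    print("Line %i Col %i" % pos)  # Line 12 Col 7
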
diff --git a/setup.cfg b/setup.cfg
index 2a9acf13..3152ac54 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,11 @@
 [bdist_wheel]
 universal = 1
+
+[pep8]
+ignore = N
+max-line-length = 139
+exclude = .git,__pycache__,.tox,doc
+
+[flake8]
+ignore = N
+max-line-length = 139
diff --git a/setup.py b/setup.py
index b6ea24af..b42ba400 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 
 
-classifiers=[
+classifiers = [
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
     'License :: OSI Approved :: MIT License',
@@ -20,9 +20,9 @@
     'Programming Language :: Python :: 3.5',
     'Topic :: Software Development :: Libraries :: Python Modules',
     'Topic :: Text Processing :: Markup :: HTML'
-    ]
+]
 
-packages = ['html5lib'] + ['html5lib.'+name
+packages = ['html5lib'] + ['html5lib.' + name
                            for name in os.listdir(os.path.join('html5lib'))
                            if os.path.isdir(os.path.join('html5lib', name)) and
                            not name.startswith('.') and name != 'tests']
@@ -39,9 +39,9 @@
     assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
     for a in assignments:
         if (len(a.targets) == 1 and
-              isinstance(a.targets[0], ast.Name) and
-              a.targets[0].id == "__version__" and
-              isinstance(a.value, ast.Str)):
+                isinstance(a.targets[0], ast.Name) and
+                a.targets[0].id == "__version__" and
+                isinstance(a.value, ast.Str)):
             version = a.value.s
 
 setup(name='html5lib',
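
The re-indented condition above belongs to the usual read-the-version-without-importing idiom: parse the package's __init__.py with ast and pull the __version__ assignment out of the tree. A minimal sketch of the same technique (the function name and fallback return value are illustrative, not part of the patch):

    import ast

    def get_version(path):
        with open(path) as f:
            tree = ast.parse(f.read())
        for node in ast.walk(tree):
            if (isinstance(node, ast.Assign) and
                    len(node.targets) == 1 and
                    isinstance(node.targets[0], ast.Name) and
                    node.targets[0].id == "__version__" and
                    isinstance(node.value, ast.Str)):
                return node.value.s  # the literal string assigned
        return None
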
diff --git a/utils/entities.py b/utils/entities.py
index 116a27cb..6dccf5f0 100644
--- a/utils/entities.py
+++ b/utils/entities.py
@@ -2,50 +2,59 @@
 
 import html5lib
 
+
 def parse(path="html5ents.xml"):
     return html5lib.parse(open(path), treebuilder="lxml")
 
+
 def entity_table(tree):
     return dict((entity_name("".join(tr[0].xpath(".//text()"))),
                  entity_characters(tr[1].text))
                 for tr in tree.xpath("//h:tbody/h:tr",
-                                     namespaces={"h":"http://www.w3.org/1999/xhtml"}))
+                                     namespaces={"h": "http://www.w3.org/1999/xhtml"}))
+
 
 def entity_name(inp):
     return inp.strip()
 
+
 def entity_characters(inp):
     return "".join(codepoint_to_character(item)
-                    for item in inp.split()
-                    if item)
+                   for item in inp.split()
+                   if item)
+
 
 def codepoint_to_character(inp):
-    return ("\U000"+inp[2:]).decode("unicode-escape")
+    return ("\\U000" + inp[2:]).decode("unicode-escape")
+
 
 def make_tests_json(entities):
     test_list = make_test_list(entities)
     tests_json = {"tests":
-                      [make_test(*item) for item in test_list]
+                  [make_test(*item) for item in test_list]
                   }
     return tests_json
 
+
 def make_test(name, characters, good):
     return {
-        "description":test_description(name, good),
-        "input":"&%s"%name,
-        "output":test_expected(name, characters, good)
-        }
+        "description": test_description(name, good),
+        "input": "&%s" % name,
+        "output": test_expected(name, characters, good)
+    }
+
 
 def test_description(name, good):
     with_semicolon = name.endswith(";")
-    semicolon_text = {True:"with a semi-colon",
-                      False:"without a semi-colon"}[with_semicolon]
+    semicolon_text = {True: "with a semi-colon",
+                      False: "without a semi-colon"}[with_semicolon]
     if good:
-        text = "Named entity: %s %s"%(name, semicolon_text)
+        text = "Named entity: %s %s" % (name, semicolon_text)
     else:
-        text = "Bad named entity: %s %s"%(name, semicolon_text)
+        text = "Bad named entity: %s %s" % (name, semicolon_text)
     return text
 
+
 def test_expected(name, characters, good):
     rv = []
     if not good or not name.endswith(";"):
@@ -53,6 +62,7 @@ def test_expected(name, characters, good):
     rv.append(["Character", characters])
     return rv
 
+
 def make_test_list(entities):
     tests = []
     for entity_name, characters in entities.items():
@@ -61,20 +71,23 @@ def make_test_list(entities):
         tests.append((entity_name, characters, True))
     return sorted(tests)
 
+
 def subentity_exists(entity_name, entities):
     for i in range(1, len(entity_name)):
         if entity_name[:-i] in entities:
             return True
     return False
 
+
 def make_entities_code(entities):
-    entities_text = "\n".join("    \"%s\": u\"%s\","%(
-            name, entities[name].encode(
-                "unicode-escape").replace("\"", "\\\""))
-                              for name in sorted(entities.keys()))
+    entities_text = "\n".join("    \"%s\": u\"%s\"," % (
+        name, entities[name].encode(
+            "unicode-escape").replace("\"", "\\\""))
+        for name in sorted(entities.keys()))
     return """entities = {
 %s
-}"""%entities_text
+}""" % entities_text
+
 
 def main():
     entities = entity_table(parse())
@@ -85,4 +98,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
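
codepoint_to_character carries the one behavioural fix in this file: the old "\U000" prefix was a truncated \U escape (a SyntaxError on Python 3, and only accidentally harmless inside a Python 2 byte string), whereas "\\U000" builds the literal ten-character escape that the unicode-escape codec then turns into a real character. A Python 3 spelling of the same idea, assuming fixed-width U+XXXXX inputs as the [2:] slice and "000" padding imply:

    def codepoint_to_character(inp):
        # "U+1D49C" -> the text "\\U0001D49C" -> the actual character
        return ("\\U000" + inp[2:]).encode("ascii").decode("unicode-escape")

    print(codepoint_to_character("U+1D49C"))  # MATHEMATICAL SCRIPT CAPITAL A
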
diff --git a/utils/spider.py b/utils/spider.py
index ac5f9fbe..3a325888 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -7,7 +7,9 @@
 s.spider("http://www.google.com", maxURLs=100)
 """
 
-import urllib.request, urllib.error, urllib.parse
+import urllib.request
+import urllib.error
+import urllib.parse
 import urllib.robotparser
 import md5
 
@@ -16,11 +18,13 @@
 import html5lib
 from html5lib.treebuilders import etree
 
+
 class Spider(object):
+
     def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
-        self.buggyURLs=set()
+        self.buggyURLs = set()
         self.robotParser = urllib.robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
@@ -70,18 +74,18 @@ def updateURLs(self, tree):
         update the list of visited and unvisited URLs according to whether we
         have seen them before or not"""
         urls = set()
-        #Remove all links we have already visited
+        # Remove all links we have already visited
         for link in tree.findall(".//a"):
-                try:
-                    url = urllib.parse.urldefrag(link.attrib['href'])[0]
-                    if (url and url not in self.unvisitedURLs and url
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
                         not in self.visitedURLs):
-                        urls.add(url)
-                except KeyError:
-                    pass
+                    urls.add(url)
+            except KeyError:
+                pass
 
-        #Remove all non-http URLs and add a suitable base URL where that is
-        #missing
+        # Remove all non-http URLs and add a suitable base URL where that is
+        # missing
         newUrls = set()
         for url in urls:
             splitURL = list(urllib.parse.urlsplit(url))
@@ -93,23 +97,22 @@ def updateURLs(self, tree):
         urls = newUrls
 
         responseHeaders = {}
-        #Now we want to find the content types of the links we haven't visited
+        # Now we want to find the content types of the links we haven't visited
         for url in urls:
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError as KeyError:
-                #Don't know why this happens
+            except AttributeError:
+                # Don't know why this happens
                 pass
 
-
-        #Remove links not of content-type html or pages not found
-        #XXX - need to deal with other status codes?
+        # Remove links not of content-type html or pages not found
+        # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                      "html" in responseHeaders[url]['content-type'] and
-                      responseHeaders[url]['status'] == "200"])
+                       "html" in responseHeaders[url]['content-type'] and
+                       responseHeaders[url]['status'] == "200"])
 
-        #Now check we are allowed to spider the page
+        # Now check we are allowed to spider the page
         for url in toVisit:
             robotURL = list(urllib.parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
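
The re-wrapped set comprehension above keeps only URLs whose HEAD response answered, claims an HTML content-type, and returned 200; the status is compared as the string "200", which matches how httplib2 reports it. A hedged standalone sketch of the same filter (the helper name is illustrative):

    def html_urls(urls, responseHeaders):
        # Keep URLs that answered the HEAD request, claim an HTML
        # content-type, and came back with status 200.
        return set(url for url in urls
                   if url in responseHeaders and
                   "html" in responseHeaders[url].get("content-type", "") and
                   responseHeaders[url].get("status") == "200")
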
