diff --git a/.prospector.yaml b/.prospector.yaml new file mode 100644 index 00000000..7e8efe1a --- /dev/null +++ b/.prospector.yaml @@ -0,0 +1,21 @@ +strictness: veryhigh +doc-warnings: false +test-warnings: false + +max-line-length: 139 + +requirements: + - requirements.txt + - requirements-test.txt + - requirements-optional.txt + +ignore-paths: + - parse.py + - utils/ + +python-targets: + - 2 + - 3 + +mccabe: + run: false diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..ea74d5db --- /dev/null +++ b/.pylintrc @@ -0,0 +1,10 @@ +[MASTER] +ignore=tests + +[MESSAGES CONTROL] +# messages up to fixme should probably be fixed somehow +disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda + +[FORMAT] +max-line-length=139 +single-line-if-stmt=no diff --git a/flake8-run.sh b/flake8-run.sh index 685ec6ab..d9264946 100755 --- a/flake8-run.sh +++ b/flake8-run.sh @@ -5,8 +5,5 @@ if [[ ! -x $(which flake8) ]]; then exit 1 fi -find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501 -flake1=$? -flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py -flake2=$? -exit $[$flake1 || $flake2] +flake8 `dirname $0` +exit $? 
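The three files above set the lint policy that the rest of this diff depends on: project-wide settings live in .prospector.yaml and .pylintrc, and individual exceptions are handled inline. A rough sketch of how those inline markers scope, with hypothetical code rather than lines from this change:

    # Trailing pragma: silences pylint for this statement only.
    x = eval('"\\uD800"')  # pylint:disable=eval-used

    # Paired pragmas: everything between them is exempt, as done around the
    # deliberately late imports in html5lib/tests/support.py.
    # pylint:disable=wrong-import-position
    import os
    # pylint:enable=wrong-import-position

    # flake8's per-line escape hatch, used throughout this diff:
    from . import support  # noqa

With the [flake8] and [pep8] sections this diff adds to setup.cfg, the simplified flake8-run.sh can invoke plain flake8 and pick up max-line-length = 139 from configuration instead of command-line flags.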
diff --git a/html5lib/constants.py b/html5lib/constants.py index 2244933c..df1f061e 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -2819,7 +2819,6 @@ 0x0d: "\u000D", 0x80: "\u20AC", 0x81: "\u0081", - 0x81: "\u0081", 0x82: "\u201A", 0x83: "\u0192", 0x84: "\u201E", diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index caddd318..7f81c0d1 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -765,15 +765,15 @@ def sanitize_token(self, token): if ((namespace, name) in self.allowed_elements or (namespace is None and (namespaces["html"], name) in self.allowed_elements)): - return self.allowed_token(token, token_type) + return self.allowed_token(token) else: - return self.disallowed_token(token, token_type) + return self.disallowed_token(token) elif token_type == "Comment": pass else: return token - def allowed_token(self, token, token_type): + def allowed_token(self, token): if "data" in token: attrs = token["data"] attr_names = set(attrs.keys()) @@ -823,7 +823,8 @@ def allowed_token(self, token, token_type): token["data"] = attrs return token - def disallowed_token(self, token, token_type): + def disallowed_token(self, token): + token_type = token["type"] if token_type == "EndTag": token["data"] = "</%s>" % token["name"] elif token["data"]: @@ -862,7 +863,7 @@ def sanitize_css(self, style): 'padding']: for keyword in value.split(): if keyword not in self.allowed_css_keywords and \ - not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): + not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa break else: clean.append(prop + ': ' + value + ';') diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index e6808425..331b8fd7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -121,7 +121,7 @@ def reset(self): self.phase.insertHtmlElement() self.resetInsertionMode() else: - self.innerHTML = False + self.innerHTML = False # pylint:disable=redefined-variable-type self.phase = self.phases["initial"] self.lastPhase = None @@ -241,6 +241,7 @@ def parse(self, stream, encoding=None, parseMeta=True, def parseFragment(self, stream, container="div", encoding=None, parseMeta=False, useChardet=True, scripting=False): + # pylint:disable=unused-argument """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property @@ -259,8 +260,10 @@ def parseFragment(self, stream, container="div", encoding=None, encoding=encoding, scripting=scripting) return self.tree.getFragment() - def parseError(self, errorcode="XXX-undefined-error", datavars={}): + def parseError(self, errorcode="XXX-undefined-error", datavars=None): # XXX The idea is to make errorcode mandatory.
+ if datavars is None: + datavars = {} self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: raise ParseError(E[errorcode] % datavars) @@ -361,6 +364,7 @@ def adjustForeignAttributes(self, token): del token["data"][originalName] def reparseTokenNormal(self, token): + # pylint:disable=unused-argument self.parser.phase() def resetInsertionMode(self): @@ -458,6 +462,7 @@ def getMetaclass(use_metaclass, metaclass_func): else: return type + # pylint:disable=unused-argument class Phase(with_metaclass(getMetaclass(debug, log))): """Base class for helper object that implements each phase of processing """ @@ -948,8 +953,8 @@ class InBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - # Keep a ref to this for special handling of whitespace in <pre>
- self.processSpaceCharactersNonPre = self.processSpaceCharacters + # Set this to the default handler + self.processSpaceCharacters = self.processSpaceCharactersNonPre self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), @@ -1082,7 +1087,7 @@ def processCharacters(self, token): for char in token["data"]])): self.parser.framesetOK = False - def processSpaceCharacters(self, token): + def processSpaceCharactersNonPre(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertText(token["data"]) @@ -2763,6 +2768,7 @@ def startTagOther(self, token): def processEndTag(self, token): self.parser.parseError("expected-eof-but-got-end-tag", {"name": token["name"]}) + # pylint:enable=unused-argument return { "initial": InitialPhase, diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 5da5d938..d6d1d6fb 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -175,9 +175,9 @@ def escapeRegexp(string): return string # output from the above -nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1f
bd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa -nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa # Simpler things nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") @@ -186,7 +186,7 @@ def escapeRegexp(string): class InfosetFilter(object): replacementRegexp = re.compile(r"U[\dA-F]{5,5}") - def __init__(self, replaceChars=None, + def __init__(self, dropXmlnsLocalName=False, dropXmlnsAttrNs=False, preventDoubleDashComments=False, @@ -217,7 +217,7 @@ def coerceAttribute(self, name, namespace=None): else: return self.toXmlName(name) - def coerceElement(self, name, namespace=None): + def coerceElement(self, name): return self.toXmlName(name) def coerceComment(self, data): @@ -232,7 +232,7 @@ def coerceComment(self, data): def coerceCharacters(self, data): if self.replaceFormFeedCharacters: - for i in range(data.count("\x0C")): + for _ in range(data.count("\x0C")): warnings.warn("Text cannot contain U+000C", DataLossWarning) data = data.replace("\x0C", " ") # Other non-xml characters diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 15acba0d..58d626c9 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -19,12 +19,6 @@ except ImportError: BytesIO = StringIO -try: - from io import BufferedIOBase -except ImportError: - class BufferedIOBase(object): - pass - # Non-unicode versions of constants for use in the pre-parser spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters]) asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) @@ -32,15 +26,17 @@ class 
BufferedIOBase(object): spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" +invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa if utils.supports_lone_surrogates: # Use one extra step of indirection and create surrogates with - # unichr. Not using this indirection would introduce an illegal + # eval. Not using this indirection would introduce an illegal # unicode literal on platforms not supporting such lone # surrogates. - invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + - eval('"\\uD800-\\uDFFF"')) + assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1 + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] + + eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used + "]") else: invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) @@ -296,7 +292,7 @@ def readChunk(self, chunkSize=None): return True def characterErrorsUCS4(self, data): - for i in range(len(invalid_unicode_re.findall(data))): + for _ in range(len(invalid_unicode_re.findall(data))): self.errors.append("invalid-codepoint") def characterErrorsUCS2(self, data): @@ -453,7 +449,7 @@ def openStream(self, source): try: stream.seek(stream.tell()) - except: + except: # pylint:disable=bare-except stream = BufferedStream(stream) return stream @@ -571,6 +567,7 @@ def __new__(self, value): return bytes.__new__(self, value.lower()) def __init__(self, value): + # pylint:disable=unused-argument self._position = -1 def __iter__(self): @@ -681,7 +678,7 @@ def getEncoding(self): (b" 1) or - (not is_ucs4 and len(v) > 2)): - continue - if v != "&": - if len(v) == 2: - v = utils.surrogatePairToCodepoint(v) - else: - v = ord(v) - if v not in encode_entity_map or k.islower(): - # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
- encode_entity_map[v] = k - - def htmlentityreplace_errors(exc): - if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): - res = [] - codepoints = [] - skip = False - for i, c in enumerate(exc.object[exc.start:exc.end]): - if skip: - skip = False - continue - index = i + exc.start - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) - skip = True - else: - codepoint = ord(c) - codepoints.append(codepoint) - for cp in codepoints: - e = encode_entity_map.get(cp) - if e: - res.append("&") - res.append(e) - if not e.endswith(";"): - res.append(";") - else: - res.append("&#x%s;" % (hex(cp)[2:])) - return ("".join(res), exc.end) - else: - return xmlcharrefreplace_errors(exc) - register_error(unicode_encode_errors, htmlentityreplace_errors) +encode_entity_map = {} +is_ucs4 = len("\U0010FFFF") == 1 +for k, v in list(entities.items()): + # skip multi-character entities + if ((is_ucs4 and len(v) > 1) or + (not is_ucs4 and len(v) > 2)): + continue + if v != "&": + if len(v) == 2: + v = utils.surrogatePairToCodepoint(v) + else: + v = ord(v) + if v not in encode_entity_map or k.islower(): + # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. + encode_entity_map[v] = k + + +def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + codepoints = [] + skip = False + for i, c in enumerate(exc.object[exc.start:exc.end]): + if skip: + skip = False + continue + index = i + exc.start + if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + skip = True + else: + codepoint = ord(c) + codepoints.append(codepoint) + for cp in codepoints: + e = encode_entity_map.get(cp) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append("&#x%s;" % (hex(cp)[2:])) + return ("".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) - del register_error +register_error("htmlentityreplace", htmlentityreplace_errors) class HTMLSerializer(object): @@ -168,7 +163,7 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) if self.encoding: - return string.encode(self.encoding, unicode_encode_errors) + return string.encode(self.encoding, "htmlentityreplace") else: return string @@ -180,6 +175,7 @@ def encodeStrict(self, string): return string def serialize(self, treewalker, encoding=None): + # pylint:disable=too-many-nested-blocks self.encoding = encoding in_cdata = False self.errors = [] @@ -241,7 +237,7 @@ def serialize(self, treewalker, encoding=None): in_cdata = True elif in_cdata: self.serializeError("Unexpected child element of a CDATA element") - for (attr_namespace, attr_name), attr_value in token["data"].items(): + for (_, attr_name), attr_value in token["data"].items(): # TODO: Add namespace support here k = attr_name v = attr_value @@ -328,6 +324,6 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): raise SerializeError -def SerializeError(Exception): +class SerializeError(Exception): """Error in serialized tree""" pass diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index 6e6a916b..6ae09dbe 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -1,5 +1,7 @@ from __future__ import absolute_import, division, unicode_literals +# pylint:disable=wrong-import-position + import os import sys import codecs @@ -13,7 +15,7 @@ os.path.pardir,
os.path.pardir))) -from html5lib import treebuilders, treewalkers, treeadapters +from html5lib import treebuilders, treewalkers, treeadapters # noqa del base_path # Build a dict of available trees @@ -26,14 +28,14 @@ } # ElementTree impls -import xml.etree.ElementTree as ElementTree +import xml.etree.ElementTree as ElementTree # noqa treeTypes['ElementTree'] = { "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True), "walker": treewalkers.getTreeWalker("etree", ElementTree) } try: - import xml.etree.cElementTree as cElementTree + import xml.etree.cElementTree as cElementTree # noqa except ImportError: treeTypes['cElementTree'] = None else: @@ -47,7 +49,7 @@ } try: - import lxml.etree as lxml # flake8: noqa + import lxml.etree as lxml # noqa except ImportError: treeTypes['lxml'] = None else: @@ -58,7 +60,7 @@ # Genshi impls try: - import genshi # flake8: noqa + import genshi # noqa except ImportError: pass else: @@ -68,6 +70,8 @@ "walker": treewalkers.getTreeWalker("genshi") } +# pylint:enable=wrong-import-position + def get_data_files(subdirectory, files='*.dat', search_dir=test_dir): return sorted(glob.glob(os.path.join(search_dir, subdirectory, files))) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 09504654..c5d2af12 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -51,19 +51,21 @@ def runPreScanEncodingTest(data, encoding): def test_encoding(): for filename in get_data_files("encoding"): tests = _TestData(filename, b"data", encoding=None) - for idx, test in enumerate(tests): + for test in tests: yield (runParserEncodingTest, test[b'data'], test[b'encoding']) yield (runPreScanEncodingTest, test[b'data'], test[b'encoding']) +# pylint:disable=wrong-import-position try: try: - import charade # flake8: noqa + import charade # noqa except ImportError: - import chardet # flake8: noqa + import chardet # noqa except ImportError: print("charade/chardet not found, skipping chardet tests") else: def test_chardet(): - with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp: + with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: encoding = inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" +# pylint:enable=wrong-import-position diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 2f3ba2c8..f8e1ac43 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -2,10 +2,8 @@ import io -import pytest +from . import support # noqa -from . 
import support # flake8: noqa -from html5lib import html5parser from html5lib.constants import namespaces from html5lib import parse @@ -23,29 +21,29 @@ def test_line_counter(): def test_namespace_html_elements_0_dom(): doc = parse("<html></html>", - treebuilder="dom", - namespaceHTMLElements=True) + treebuilder="dom", + namespaceHTMLElements=True) assert doc.childNodes[0].namespaceURI == namespaces["html"] def test_namespace_html_elements_1_dom(): doc = parse("<html></html>", - treebuilder="dom", - namespaceHTMLElements=False) + treebuilder="dom", + namespaceHTMLElements=False) assert doc.childNodes[0].namespaceURI is None def test_namespace_html_elements_0_etree(): doc = parse("<html></html>", - treebuilder="etree", - namespaceHTMLElements=True) + treebuilder="etree", + namespaceHTMLElements=True) assert doc.tag == "{%s}html" % (namespaces["html"],) def test_namespace_html_elements_1_etree(): doc = parse("<html></html>", - treebuilder="etree", - namespaceHTMLElements=False) + treebuilder="etree", + namespaceHTMLElements=False) assert doc.tag == "html" diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 1f8a06f6..e19deea8 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -4,7 +4,7 @@ from html5lib.filters import sanitizer -def runSanitizerTest(name, expected, input): +def runSanitizerTest(_, expected, input): parsed = parseFragment(expected) expected = serialize(parsed, omit_optional_tags=False, @@ -63,7 +63,8 @@ def test_sanitizer(): for ns, tag_name in sanitizer.allowed_elements: if ns != constants.namespaces["html"]: continue - if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'select']: + if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', + 'tfoot', 'th', 'thead', 'tr', 'select']: continue # TODO if tag_name == 'image': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index b3ffe0df..b3cda7d7 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -12,6 +12,7 @@ from html5lib.serializer import HTMLSerializer, serialize from html5lib.treewalkers._base import TreeWalker +# pylint:disable=wrong-import-position optionals_loaded = [] try: @@ -19,6 +20,7 @@ optionals_loaded.append("lxml") except ImportError: pass +# pylint:enable=wrong-import-position default_namespace = constants.namespaces["html"] @@ -219,5 +221,5 @@ def test_serializer(): for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)): with open(filename) as fp: tests = json.load(fp) - for index, test in enumerate(tests['tests']): + for test in tests['tests']: yield runSerializerTest, test["input"], test["expected"], test.get("options", {}) diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 3b659fbb..77e411d5 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,15 +1,20 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # flake8: noqa +from . 
import support # noqa + import codecs -from io import BytesIO -import socket +import sys +from io import BytesIO, StringIO + +import pytest import six from six.moves import http_client, urllib from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) +from html5lib.utils import supports_lone_surrogates + def test_basic(): s = b"abc" @@ -17,6 +22,7 @@ def test_basic(): read = fp.read(10) assert read == s + def test_read_length(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) @@ -28,17 +34,23 @@ def test_read_length(): read4 = fp.read(4) assert read4 == b"" + def test_tell(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) + assert read1 == b"a" assert fp.tell() == 1 read2 = fp.read(2) + assert read2 == b"bc" assert fp.tell() == 3 read3 = fp.read(3) + assert read3 == b"def" assert fp.tell() == 6 read4 = fp.read(4) + assert read4 == b"" assert fp.tell() == 6 + def test_seek(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) @@ -55,20 +67,26 @@ def test_seek(): read5 = fp.read(2) assert read5 == b"ef" + def test_seek_tell(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) + assert read1 == b"a" assert fp.tell() == 1 fp.seek(0) read2 = fp.read(1) + assert read2 == b"a" assert fp.tell() == 1 read3 = fp.read(2) + assert read3 == b"bc" assert fp.tell() == 3 fp.seek(2) read4 = fp.read(2) + assert read4 == b"cd" assert fp.tell() == 4 fp.seek(4) read5 = fp.read(2) + assert read5 == b"ef" assert fp.tell() == 6 @@ -85,11 +103,13 @@ def test_char_ascii(): assert stream.charEncoding[0].name == 'windows-1252' assert stream.char() == "'" + def test_char_utf8(): stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == '\u2018' + def test_char_win1252(): stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252')) assert stream.charEncoding[0].name == 'windows-1252' @@ -97,16 +117,19 @@ def test_char_win1252(): assert stream.char() == "\xf1" assert stream.char() == "\u2019" + def test_bom(): stream = HTMLInputStream(codecs.BOM_UTF8 + b"'") assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == "'" + def test_utf_16(): stream = HTMLInputStream((' ' * 1025).encode('utf-16')) assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be'] assert len(stream.charsUntil(' ', True)) == 1025 + def test_newlines(): stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe") assert stream.position() == (1, 0) @@ -117,11 +140,13 @@ def test_newlines(): assert stream.charsUntil('e') == "x" assert stream.position() == (4, 5) + def test_newlines2(): size = HTMLUnicodeInputStream._defaultChunkSize stream = HTMLInputStream("\r" * size + "\n") assert stream.charsUntil('x') == "\n" * size + def test_position(): stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh") assert stream.position() == (1, 0) @@ -140,6 +165,7 @@ def test_position(): assert stream.charsUntil('h') == "e\nf\ng" assert stream.position() == (6, 1) + def test_position2(): stream = HTMLUnicodeInputStreamShortChunk("abc\nd") assert stream.position() == (1, 0) @@ -154,6 +180,7 @@ def test_position2(): assert stream.char() == "d" assert stream.position() == (2, 1) + def test_python_issue_20007(): """ Make sure we have a work-around for Python bug #20007 @@ -161,6 +188,7 @@ def test_python_issue_20007(): """ class FakeSocket(object): def makefile(self, _mode, _bufsize=None): + # pylint:disable=unused-argument 
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") source = http_client.HTTPResponse(FakeSocket()) @@ -168,6 +196,7 @@ def makefile(self, _mode, _bufsize=None): stream = HTMLInputStream(source) assert stream.charsUntil(" ") == "Text" + def test_python_issue_20007_b(): """ Make sure we have a work-around for Python bug #20007 @@ -178,6 +207,7 @@ def test_python_issue_20007_b(): class FakeSocket(object): def makefile(self, _mode, _bufsize=None): + # pylint:disable=unused-argument return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") source = http_client.HTTPResponse(FakeSocket()) @@ -185,3 +215,109 @@ def makefile(self, _mode, _bufsize=None): wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com") stream = HTMLInputStream(wrapped) assert stream.charsUntil(" ") == "Text" + + +@pytest.mark.parametrize("inp,num", + [("\u0000", 0), + ("\u0001", 1), + ("\u0008", 1), + ("\u0009", 0), + ("\u000A", 0), + ("\u000B", 1), + ("\u000C", 0), + ("\u000D", 0), + ("\u000E", 1), + ("\u001F", 1), + ("\u0020", 0), + ("\u007E", 0), + ("\u007F", 1), + ("\u009F", 1), + ("\u00A0", 0), + ("\uFDCF", 0), + ("\uFDD0", 1), + ("\uFDEF", 1), + ("\uFDF0", 0), + ("\uFFFD", 0), + ("\uFFFE", 1), + ("\uFFFF", 1), + ("\U0001FFFD", 0), + ("\U0001FFFE", 1), + ("\U0001FFFF", 1), + ("\U0002FFFD", 0), + ("\U0002FFFE", 1), + ("\U0002FFFF", 1), + ("\U0003FFFD", 0), + ("\U0003FFFE", 1), + ("\U0003FFFF", 1), + ("\U0004FFFD", 0), + ("\U0004FFFE", 1), + ("\U0004FFFF", 1), + ("\U0005FFFD", 0), + ("\U0005FFFE", 1), + ("\U0005FFFF", 1), + ("\U0006FFFD", 0), + ("\U0006FFFE", 1), + ("\U0006FFFF", 1), + ("\U0007FFFD", 0), + ("\U0007FFFE", 1), + ("\U0007FFFF", 1), + ("\U0008FFFD", 0), + ("\U0008FFFE", 1), + ("\U0008FFFF", 1), + ("\U0009FFFD", 0), + ("\U0009FFFE", 1), + ("\U0009FFFF", 1), + ("\U000AFFFD", 0), + ("\U000AFFFE", 1), + ("\U000AFFFF", 1), + ("\U000BFFFD", 0), + ("\U000BFFFE", 1), + ("\U000BFFFF", 1), + ("\U000CFFFD", 0), + ("\U000CFFFE", 1), + ("\U000CFFFF", 1), + ("\U000DFFFD", 0), + ("\U000DFFFE", 1), + ("\U000DFFFF", 1), + ("\U000EFFFD", 0), + ("\U000EFFFE", 1), + ("\U000EFFFF", 1), + ("\U000FFFFD", 0), + ("\U000FFFFE", 1), + ("\U000FFFFF", 1), + ("\U0010FFFD", 0), + ("\U0010FFFE", 1), + ("\U0010FFFF", 1), + ("\x01\x01\x01", 3), + ("a\x01a\x01a\x01a", 3)]) +def test_invalid_codepoints(inp, num): + stream = HTMLUnicodeInputStream(StringIO(inp)) + for _i in range(len(inp)): + stream.char() + assert len(stream.errors) == num + + +@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates") +@pytest.mark.parametrize("inp,num", + [("'\\uD7FF'", 0), + ("'\\uD800'", 1), + ("'\\uDBFF'", 1), + ("'\\uDC00'", 1), + ("'\\uDFFF'", 1), + ("'\\uE000'", 0), + ("'\\uD800\\uD800\\uD800'", 3), + ("'a\\uD800a\\uD800a\\uD800a'", 3), + ("'\\uDFFF\\uDBFF'", 2), + pytest.mark.skipif(sys.maxunicode == 0xFFFF, + ("'\\uDBFF\\uDFFF'", 2), + reason="narrow Python")]) +def test_invalid_codepoints_surrogates(inp, num): + inp = eval(inp) # pylint:disable=eval-used + fp = StringIO(inp) + if ord(max(fp.read())) > 0xFFFF: + pytest.skip("StringIO altered string") + fp.seek(0) + stream = HTMLUnicodeInputStream(fp) + for _i in range(len(inp)): + stream.char() + assert len(stream.errors) == num diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py index 5f38b6c3..95e56c00 100644 --- a/html5lib/tests/test_treeadapters.py +++ b/html5lib/tests/test_treeadapters.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # flake8: noqa +from . 
import support # noqa import html5lib from html5lib.treeadapters import sax @@ -25,7 +25,7 @@ def test_to_sax(): ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'), ('characters', '\n '), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'), - ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}), diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 045d9d7b..332027ac 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -31,7 +31,7 @@ def test_all_tokens(): {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} ] - for treeName, treeCls in sorted(treeTypes.items()): + for _, treeCls in sorted(treeTypes.items()): if treeCls is None: continue p = html5parser.HTMLParser(tree=treeCls["builder"]) diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index c6163a1f..255c1859 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -19,6 +19,7 @@ def __init__(self, initialState, lastStartTag=None): self._lastStartTag = lastStartTag def parse(self, stream, encoding=None, innerHTML=False): + # pylint:disable=unused-argument tokenizer = self.tokenizer(stream, encoding) self.outputTokens = [] diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 79774578..dd6ea75f 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -1,9 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -try: - chr = unichr # flake8: noqa -except NameError: - pass +from six import unichr as chr from collections import deque @@ -147,8 +144,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): output = "&" charStack = [self.stream.char()] - if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") - or (allowedChar is not None and allowedChar == charStack[0])): + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or + (allowedChar is not None and allowedChar == charStack[0])): self.stream.unget(charStack[0]) elif charStack[0] == "#": @@ -924,7 +921,7 @@ def attributeNameState(self): if self.lowercaseAttrName: self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) - for name, value in self.currentToken["data"][:-1]: + for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "duplicate-attribute"}) @@ -1716,11 +1713,11 @@ def cdataSectionState(self): else: data.append(char) - data = "".join(data) + data = "".join(data) # pylint:disable=redefined-variable-type # Deal with null here rather than in the parser nullCount = data.count("\u0000") if nullCount > 0: - for i in range(nullCount): + for _ in range(nullCount): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) data = data.replace("\u0000", "\uFFFD") diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py index 57d71304..4f978466 100644 --- a/html5lib/treeadapters/__init__.py +++ b/html5lib/treeadapters/__init__.py @@ -5,7 
+5,7 @@ __all__ = ["sax"] try: - from . import genshi # flake8: noqa + from . import genshi # noqa except ImportError: pass else: diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py index 8196f591..900a724c 100644 --- a/html5lib/treebuilders/_base.py +++ b/html5lib/treebuilders/_base.py @@ -126,6 +126,7 @@ class TreeBuilder(object): commentClass - the class to use for comments doctypeClass - the class to use for doctypes """ + # pylint:disable=not-callable # Document class documentClass = None diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 8656244f..b7df74b2 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -109,7 +109,7 @@ def getNameTuple(self): nameTuple = property(getNameTuple) - class TreeBuilder(_base.TreeBuilder): + class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable def documentClass(self): self.dom = Dom.getDOMImplementation().createDocument(None, None, None) return weakref.proxy(self) @@ -158,6 +158,7 @@ def insertText(self, data, parent=None): else: # HACK: allow text nodes as children of the document node if hasattr(self.dom, '_child_node_types'): + # pylint:disable=protected-access if Node.TEXT_NODE not in self.dom._child_node_types: self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types.append(Node.TEXT_NODE) diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 2c8ed19f..d394148d 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -1,4 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +# pylint:disable=protected-access + from six import text_type import re @@ -253,7 +255,7 @@ def serializeElement(element, indent=0): return "\n".join(rv) - def tostring(element): + def tostring(element): # pylint:disable=unused-variable """Serialize an element and its child nodes to a string""" rv = [] filter = ihatexml.InfosetFilter() @@ -307,7 +309,7 @@ def serializeElement(element): return "".join(rv) - class TreeBuilder(_base.TreeBuilder): + class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable documentClass = Document doctypeClass = DocumentType elementClass = Element diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 138b30bd..2a69769b 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -10,6 +10,7 @@ """ from __future__ import absolute_import, division, unicode_literals +# pylint:disable=protected-access import warnings import re @@ -53,7 +54,6 @@ def _getChildNodes(self): def testSerializer(element): rv = [] - finalText = None infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) def serializeElement(element, indent=0): @@ -128,16 +128,12 @@ def serializeElement(element, indent=0): rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) serializeElement(element, 0) - if finalText is not None: - rv.append("|%s\"%s\"" % (' ' * 2, finalText)) - return "\n".join(rv) def tostring(element): """Serialize an element and its child nodes to a string""" rv = [] - finalText = None def serializeElement(element): if not hasattr(element, "tag"): @@ -173,9 +169,6 @@ def serializeElement(element): serializeElement(element) - if finalText is not None: - rv.append("%s\"" % (' ' * 2, finalText)) - return "".join(rv) @@ -193,9 +186,11 @@ def __init__(self, namespaceHTMLElements, fullTree=False): self.namespaceHTMLElements = namespaceHTMLElements class 
Attributes(dict): - def __init__(self, element, value={}): + def __init__(self, element, value=None): + if value is None: + value = {} self._element = element - dict.__init__(self, value) + dict.__init__(self, value) # pylint:disable=non-parent-init-called for key, value in self.items(): if isinstance(key, tuple): name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) @@ -303,12 +298,14 @@ def insertDoctype(self, token): self.doctype = doctype def insertCommentInitial(self, data, parent=None): + assert parent is None or parent is self.document + assert self.document._elementTree is None self.initial_comments.append(data) def insertCommentMain(self, data, parent=None): if (parent == self.document and self.document._elementTree.getroot()[-1].tag == comment_type): - warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) + warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) super(TreeBuilder, self).insertComment(data, parent) def insertRoot(self, token): diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 73c8e26a..d3b0c50e 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class TreeWalker(_base.NonRecursiveTreeWalker): + class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable """Given the particular ElementTree representation, this implementation, to avoid using recursion, returns "nodes" as tuples with the following content: @@ -38,7 +38,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): """ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element - elt, key, parents, flag = node + elt, _, _, flag = node if flag in ("text", "tail"): return _base.TEXT, getattr(elt, flag) else: diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 83cd1654..61cbfede 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -25,7 +25,7 @@ def __iter__(self): yield token def tokens(self, event, next): - kind, data, pos = event + kind, data, _ = event if kind == START: tag, attribs = data name = tag.localname diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 36850086..7d99adc2 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -117,6 +117,7 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): + # pylint:disable=redefined-variable-type if hasattr(tree, "getroot"): self.fragmentChildren = set() tree = Root(tree) diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py index a8cca8a9..a5ba4bf1 100644 --- a/html5lib/trie/__init__.py +++ b/html5lib/trie/__init__.py @@ -4,9 +4,11 @@ Trie = PyTrie +# pylint:disable=wrong-import-position try: from .datrie import Trie as DATrie except ImportError: pass else: Trie = DATrie +# pylint:enable=wrong-import-position diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py index 724486b1..25eece46 100644 --- a/html5lib/trie/_base.py +++ b/html5lib/trie/_base.py @@ -7,7 +7,8 @@ class Trie(Mapping): """Abstract base class for tries""" def keys(self, prefix=None): - keys = super().keys() + # pylint:disable=arguments-differ + keys = super(Trie, self).keys() if prefix is None: return set(keys) 
diff --git a/html5lib/utils.py b/html5lib/utils.py index c70de172..5fe237a0 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -22,12 +22,12 @@ # surrogates, and there is no mechanism to further escape such # escapes. try: - _x = eval('"\\uD800"') + _x = eval('"\\uD800"') # pylint:disable=eval-used if not isinstance(_x, text_type): # We need this with u"" because of http://bugs.jython.org/issue2039 - _x = eval('u"\\uD800"') + _x = eval('u"\\uD800"') # pylint:disable=eval-used assert isinstance(_x, text_type) -except: +except: # pylint:disable=bare-except supports_lone_surrogates = False else: supports_lone_surrogates = True @@ -52,7 +52,7 @@ def __init__(self, items=()): # anything here. _dictEntries = [] for name, value in items: - if type(name) in (list, tuple, frozenset, set): + if isinstance(name, (list, tuple, frozenset, set)): for item in name: _dictEntries.append((item, value)) else: diff --git a/parse.py b/parse.py index cceea84d..2ed8f1c2 100755 --- a/parse.py +++ b/parse.py @@ -5,7 +5,6 @@ """ import sys -import os import traceback from optparse import OptionParser @@ -15,9 +14,10 @@ from html5lib import constants from html5lib import utils + def parse(): optParser = getOptParser() - opts,args = optParser.parse_args() + opts, args = optParser.parse_args() encoding = "utf8" try: @@ -25,7 +25,10 @@ def parse(): # Try opening from the internet if f.startswith('http://'): try: - import urllib.request, urllib.parse, urllib.error, cgi + import urllib.request + import urllib.parse + import urllib.error + import cgi f = urllib.request.urlopen(f) contentType = f.headers.get('content-type') if contentType: @@ -41,7 +44,7 @@ def parse(): try: # Try opening from file system f = open(f, "rb") - except IOError as e: + except IOError as e: sys.stderr.write("Unable to open file: %s\n" % e) sys.exit(1) except IndexError: @@ -82,14 +85,15 @@ def parse(): if document: printOutput(p, document, opts) t2 = time.time() - sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)) + sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1)) else: - sys.stderr.write("\n\nRun took: %fs"%(t1-t0)) + sys.stderr.write("\n\nRun took: %fs" % (t1 - t0)) else: document = run(parseMethod, f, encoding, opts.scripting) if document: printOutput(p, document, opts) + def run(parseMethod, f, encoding, scripting): try: document = parseMethod(f, encoding=encoding, scripting=scripting) @@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting): traceback.print_exc() return document + def printOutput(parser, document, opts): if opts.encoding: print("Encoding:", parser.tokenizer.stream.charEncoding) @@ -116,7 +121,7 @@ def printOutput(parser, document, opts): elif tb == "etree": sys.stdout.write(utils.default_etree.tostring(document)) elif opts.tree: - if not hasattr(document,'__getitem__'): + if not hasattr(document, '__getitem__'): document = [document] for fragment in document: print(parser.tree.testSerializer(fragment)) @@ -126,7 +131,7 @@ def printOutput(parser, document, opts): kwargs = {} for opt in serializer.HTMLSerializer.options: try: - kwargs[opt] = getattr(opts,opt) + kwargs[opt] = getattr(opts, opt) except: pass if not kwargs['quote_char']: @@ -142,12 +147,14 @@ def printOutput(parser, document, opts): encoding = "utf-8" for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding): sys.stdout.write(text) - if not text.endswith('\n'): sys.stdout.write('\n') + if not text.endswith('\n'): + sys.stdout.write('\n') if 
opts.error: - errList=[] + errList = [] for pos, errorcode, datavars in parser.errors: - errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) - sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n") + errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) + sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n") + def getOptParser(): parser = OptionParser(usage=__doc__) diff --git a/setup.cfg b/setup.cfg index 2a9acf13..3152ac54 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,11 @@ [bdist_wheel] universal = 1 + +[pep8] +ignore = N +max-line-length = 139 +exclude = .git,__pycache__,.tox,doc + +[flake8] +ignore = N +max-line-length = 139 diff --git a/setup.py b/setup.py index b6ea24af..b42ba400 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup -classifiers=[ +classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', @@ -20,9 +20,9 @@ 'Programming Language :: Python :: 3.5', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing :: Markup :: HTML' - ] +] -packages = ['html5lib'] + ['html5lib.'+name +packages = ['html5lib'] + ['html5lib.' + name for name in os.listdir(os.path.join('html5lib')) if os.path.isdir(os.path.join('html5lib', name)) and not name.startswith('.') and name != 'tests'] @@ -39,9 +39,9 @@ assignments = filter(lambda x: isinstance(x, ast.Assign), t.body) for a in assignments: if (len(a.targets) == 1 and - isinstance(a.targets[0], ast.Name) and - a.targets[0].id == "__version__" and - isinstance(a.value, ast.Str)): + isinstance(a.targets[0], ast.Name) and + a.targets[0].id == "__version__" and + isinstance(a.value, ast.Str)): version = a.value.s setup(name='html5lib', diff --git a/utils/entities.py b/utils/entities.py index 116a27cb..6dccf5f0 100644 --- a/utils/entities.py +++ b/utils/entities.py @@ -2,50 +2,59 @@ import html5lib + def parse(path="html5ents.xml"): return html5lib.parse(open(path), treebuilder="lxml") + def entity_table(tree): return dict((entity_name("".join(tr[0].xpath(".//text()"))), entity_characters(tr[1].text)) for tr in tree.xpath("//h:tbody/h:tr", - namespaces={"h":"http://www.w3.org/1999/xhtml"})) + namespaces={"h": "http://www.w3.org/1999/xhtml"})) + def entity_name(inp): return inp.strip() + def entity_characters(inp): return "".join(codepoint_to_character(item) - for item in inp.split() - if item) + for item in inp.split() + if item) + def codepoint_to_character(inp): - return ("\U000"+inp[2:]).decode("unicode-escape") + return ("\\U000" + inp[2:]).decode("unicode-escape") + def make_tests_json(entities): test_list = make_test_list(entities) tests_json = {"tests": - [make_test(*item) for item in test_list] + [make_test(*item) for item in test_list] } return tests_json + def make_test(name, characters, good): return { - "description":test_description(name, good), - "input":"&%s"%name, - "output":test_expected(name, characters, good) - } + "description": test_description(name, good), + "input": "&%s" % name, + "output": test_expected(name, characters, good) + } + def test_description(name, good): with_semicolon = name.endswith(";") - semicolon_text = {True:"with a semi-colon", - False:"without a semi-colon"}[with_semicolon] + semicolon_text = {True: "with a semi-colon", + False: "without a semi-colon"}[with_semicolon] if good: - text = "Named entity: %s 
%s"%(name, semicolon_text) + text = "Named entity: %s %s" % (name, semicolon_text) else: - text = "Bad named entity: %s %s"%(name, semicolon_text) + text = "Bad named entity: %s %s" % (name, semicolon_text) return text + def test_expected(name, characters, good): rv = [] if not good or not name.endswith(";"): @@ -53,6 +62,7 @@ def test_expected(name, characters, good): rv.append(["Character", characters]) return rv + def make_test_list(entities): tests = [] for entity_name, characters in entities.items(): @@ -61,20 +71,23 @@ def make_test_list(entities): tests.append((entity_name, characters, True)) return sorted(tests) + def subentity_exists(entity_name, entities): for i in range(1, len(entity_name)): if entity_name[:-i] in entities: return True return False + def make_entities_code(entities): - entities_text = "\n".join(" \"%s\": u\"%s\","%( - name, entities[name].encode( - "unicode-escape").replace("\"", "\\\"")) - for name in sorted(entities.keys())) + entities_text = "\n".join(" \"%s\": u\"%s\"," % ( + name, entities[name].encode( + "unicode-escape").replace("\"", "\\\"")) + for name in sorted(entities.keys())) return """entities = { %s -}"""%entities_text +}""" % entities_text + def main(): entities = entity_table(parse()) @@ -85,4 +98,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/utils/spider.py b/utils/spider.py index ac5f9fbe..3a325888 100644 --- a/utils/spider.py +++ b/utils/spider.py @@ -7,7 +7,9 @@ s.spider("http://www.google.com", maxURLs=100) """ -import urllib.request, urllib.error, urllib.parse +import urllib.request +import urllib.error +import urllib.parse import urllib.robotparser import md5 @@ -16,11 +18,13 @@ import html5lib from html5lib.treebuilders import etree + class Spider(object): + def __init__(self): self.unvisitedURLs = set() self.visitedURLs = set() - self.buggyURLs=set() + self.buggyURLs = set() self.robotParser = urllib.robotparser.RobotFileParser() self.contentDigest = {} self.http = httplib2.Http(".cache") @@ -70,18 +74,18 @@ def updateURLs(self, tree): update the list of visited and unvisited URLs according to whether we have seen them before or not""" urls = set() - #Remove all links we have already visited + # Remove all links we have already visited for link in tree.findall(".//a"): - try: - url = urllib.parse.urldefrag(link.attrib['href'])[0] - if (url and url not in self.unvisitedURLs and url + try: + url = urllib.parse.urldefrag(link.attrib['href'])[0] + if (url and url not in self.unvisitedURLs and url not in self.visitedURLs): - urls.add(url) - except KeyError: - pass + urls.add(url) + except KeyError: + pass - #Remove all non-http URLs and add a suitable base URL where that is - #missing + # Remove all non-http URLs and add a suitable base URL where that is + # missing newUrls = set() for url in urls: splitURL = list(urllib.parse.urlsplit(url)) @@ -93,23 +97,22 @@ def updateURLs(self, tree): urls = newUrls responseHeaders = {} - #Now we want to find the content types of the links we haven't visited + # Now we want to find the content types of the links we haven't visited for url in urls: try: resp, content = self.http.request(url, "HEAD") responseHeaders[url] = resp - except AttributeError as KeyError: - #Don't know why this happens + except AttributeError: + # Don't know why this happens pass - - #Remove links not of content-type html or pages not found - #XXX - need to deal with other status codes? + # Remove links not of content-type html or pages not found + # XXX - need to deal with other status codes? 
toVisit = set([url for url in urls if url in responseHeaders and - "html" in responseHeaders[url]['content-type'] and - responseHeaders[url]['status'] == "200"]) + "html" in responseHeaders[url]['content-type'] and + responseHeaders[url]['status'] == "200"]) - #Now check we are allowed to spider the page + # Now check we are allowed to spider the page for url in toVisit: robotURL = list(urllib.parse.urlsplit(url)[:2]) robotURL.extend(["robots.txt", "", ""])
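Two of the changes above share a pattern worth calling out: parseError() in html5parser.py (datavars={} becoming datavars=None) and Attributes.__init__() in etree_lxml.py (value={} becoming value=None) both replace a mutable default argument with a None sentinel. A minimal sketch of the bug class being avoided, using hypothetical names:

    def buggy(item, seen={}):
        # The default dict is created once, when the function is defined,
        # and shared by every call that omits the argument.
        seen[item] = True
        return seen

    def fixed(item, seen=None):
        # None sentinel: a fresh dict is created on each call.
        if seen is None:
            seen = {}
        seen[item] = True
        return seen

    assert buggy("a") == {"a": True}
    assert buggy("b") == {"a": True, "b": True}   # state leaked from the first call
    assert fixed("b") == {"b": True}              # calls stay independent

Python evaluates default values once, at def time, so the None check restores per-call semantics without changing the signature existing callers see.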
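The serializer portion of this diff replaces the lazily executed register_error(unicode_encode_errors, htmlentityreplace_errors) with a module-level register_error("htmlentityreplace", htmlentityreplace_errors), registering the handler once, under a fixed name, at import time. The registry involved is the standard codecs error-handler mechanism; a self-contained sketch with a hypothetical handler, not html5lib's real one:

    import codecs

    def dash_replace(exc):
        # An encode error handler receives the UnicodeEncodeError and returns
        # a (replacement, resume_position) pair.
        if isinstance(exc, UnicodeEncodeError):
            return ("-" * (exc.end - exc.start), exc.end)
        raise exc

    codecs.register_error("dashreplace", dash_replace)

    assert "caf\u00e9".encode("ascii", "dashreplace") == b"caf-"

html5lib's actual handler works the same way but emits a named entity from encode_entity_map when one exists and a numeric character reference otherwise, which is why HTMLSerializer.encode() can now pass the string "htmlentityreplace" straight to str.encode().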