diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 8884696d..3ec63d72 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -1,90 +1,77 @@ from __future__ import absolute_import, division, unicode_literals +from six import text_type + from . import _base -from ..constants import cdataElements, rcdataElements, voidElements +from ..constants import namespaces, voidElements from ..constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -class LintError(Exception): - pass - - class Filter(_base.Filter): def __iter__(self): open_elements = [] - contentModelFlag = "PCDATA" for token in _base.Filter.__iter__(self): type = token["type"] if type in ("StartTag", "EmptyTag"): + namespace = token["namespace"] name = token["name"] - if contentModelFlag != "PCDATA": - raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) - if not isinstance(name, str): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - if not name: - raise LintError("Empty tag name") - if type == "StartTag" and name in voidElements: - raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) - elif type == "EmptyTag" and name not in voidElements: - raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + assert isinstance(token["data"], dict) + if (not namespace or namespace == namespaces["html"]) and name in voidElements: + assert type == "EmptyTag" + else: + assert type == "StartTag" if type == "StartTag": - open_elements.append(name) - for name, value in token["data"]: - if not isinstance(name, str): - raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) - if not name: - raise LintError("Empty attribute name") - if not isinstance(value, str): - raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) - if name in cdataElements: - contentModelFlag = "CDATA" - elif name in rcdataElements: - contentModelFlag = "RCDATA" - elif name == "plaintext": - contentModelFlag = "PLAINTEXT" + open_elements.append((namespace, name)) + for (namespace, name), value in token["data"].items(): + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + assert isinstance(value, text_type) elif type == "EndTag": + namespace = token["namespace"] name = token["name"] - if not isinstance(name, str): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - if not name: - raise LintError("Empty tag name") - if name in voidElements: - raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) - start_name = open_elements.pop() - if start_name != name: - raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) - contentModelFlag = "PCDATA" + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + if (not namespace or namespace == namespaces["html"]) and name in voidElements: + assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name} + else: + start = open_elements.pop() + assert start == (namespace, name) elif type == "Comment": - if contentModelFlag != "PCDATA": - raise LintError("Comment not in PCDATA content model flag") + data = token["data"] + assert isinstance(data, text_type) elif type in ("Characters", "SpaceCharacters"): data = token["data"] - if not isinstance(data, str): - raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) - if not data: - raise LintError("%(type)s token with empty data" % {"type": type}) + assert isinstance(data, text_type) + assert data != "" if type == "SpaceCharacters": - data = data.strip(spaceCharacters) - if data: - raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data}) + assert data.strip(spaceCharacters) == "" elif type == "Doctype": name = token["name"] - if contentModelFlag != "PCDATA": - raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) - if not isinstance(name, str): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - # XXX: what to do with token["data"] ? + assert name is None or isinstance(name, text_type) + assert token["publicId"] is None or isinstance(name, text_type) + assert token["systemId"] is None or isinstance(name, text_type) + + elif type == "Entity": + assert isinstance(token["name"], text_type) - elif type in ("ParseError", "SerializeError"): - pass + elif type == "SerializerError": + assert isinstance(token["data"], text_type) else: - raise LintError("Unknown token type: %(type)s" % {"type": type}) + assert False, "Unknown token type: %(type)s" % {"type": type} yield token diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index c79d0b1b..e59f25ea 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -14,6 +14,7 @@ from .support import get_data_files, TestData, convertExpected from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants +from html5lib.filters.lint import Filter as Lint treeTypes = { @@ -77,21 +78,21 @@ def test_all_tokens(self): expected = [ {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, {'data': 'a', 'type': 'Characters'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, {'data': 'b', 'type': 'Characters'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, {'data': 'c', 'type': 'Characters'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} ] for treeName, treeCls in sorted(treeTypes.items()): p = html5parser.HTMLParser(tree=treeCls["builder"]) document = p.parse("a
b
c") document = treeCls.get("adapter", lambda x: x)(document) - output = treeCls["walker"](document) + output = Lint(treeCls["walker"](document)) for expectedToken, outputToken in zip(expected, output): self.assertEqual(expectedToken, outputToken) @@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): document = treeClass.get("adapter", lambda x: x)(document) try: - output = treewalkers.pprint(treeClass["walker"](document)) + output = treewalkers.pprint(Lint(treeClass["walker"](document))) output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()], diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index e79a4357..36e1ba24 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -1,8 +1,7 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type, string_types from xml.dom import Node -from ..constants import voidElements, spaceCharacters +from ..constants import namespaces, voidElements, spaceCharacters __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", "TreeWalker", "NonRecursiveTreeWalker"] @@ -18,24 +17,6 @@ spaceCharacters = "".join(spaceCharacters) -def to_text(s, blank_if_none=True): - """Wrapper around six.text_type to convert None to empty string""" - if s is None: - if blank_if_none: - return "" - else: - return None - elif isinstance(s, text_type): - return s - else: - return text_type(s) - - -def is_text_or_none(string): - """Wrapper around isinstance(string_types) or is None""" - return string is None or isinstance(string, string_types) - - class TreeWalker(object): def __init__(self, tree): self.tree = tree @@ -47,47 +28,25 @@ def error(self, msg): return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(name) - assert all((namespace is None or isinstance(namespace, string_types)) and - isinstance(name, string_types) and - isinstance(value, string_types) - for (namespace, name), value in attrs.items()) - - yield {"type": "EmptyTag", "name": to_text(name, False), - "namespace": to_text(namespace), + yield {"type": "EmptyTag", "name": name, + "namespace": namespace, "data": attrs} if hasChildren: yield self.error("Void element has children") def startTag(self, namespace, name, attrs): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(name) - assert all((namespace is None or isinstance(namespace, string_types)) and - isinstance(name, string_types) and - isinstance(value, string_types) - for (namespace, name), value in attrs.items()) - return {"type": "StartTag", - "name": text_type(name), - "namespace": to_text(namespace), - "data": dict(((to_text(namespace, False), to_text(name)), - to_text(value, False)) - for (namespace, name), value in attrs.items())} + "name": name, + "namespace": namespace, + "data": attrs} def endTag(self, namespace, name): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(namespace) - return {"type": "EndTag", - "name": to_text(name, False), - "namespace": to_text(namespace), - "data": {}} + "name": name, + "namespace": namespace} def text(self, data): - assert isinstance(data, string_types), type(data) - - data = to_text(data) + data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] if left: @@ -101,25 +60,16 @@ def text(self, data): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): - assert isinstance(data, string_types), type(data) - - return {"type": "Comment", "data": text_type(data)} - - def doctype(self, name, publicId=None, systemId=None, correct=True): - assert is_text_or_none(name), type(name) - assert is_text_or_none(publicId), type(publicId) - assert is_text_or_none(systemId), type(systemId) + return {"type": "Comment", "data": data} + def doctype(self, name, publicId=None, systemId=None): return {"type": "Doctype", - "name": to_text(name), - "publicId": to_text(publicId), - "systemId": to_text(systemId), - "correct": to_text(correct)} + "name": name, + "publicId": publicId, + "systemId": systemId} def entity(self, name): - assert isinstance(name, string_types), type(name) - - return {"type": "Entity", "name": text_type(name)} + return {"type": "Entity", "name": name} def unknown(self, nodeType): return self.error("Unknown node type: " + nodeType) @@ -154,7 +104,7 @@ def __iter__(self): elif type == ELEMENT: namespace, name, attributes, hasChildren = details - if name in voidElements: + if (not namespace or namespace == namespaces["html"]) and name in voidElements: for token in self.emptyTag(namespace, name, attributes, hasChildren): yield token @@ -187,7 +137,7 @@ def __iter__(self): type, details = details[0], details[1:] if type == ELEMENT: namespace, name, attributes, hasChildren = details - if name not in voidElements: + if (namespace and namespace != namespaces["html"]) or name not in voidElements: yield self.endTag(namespace, name) if self.tree is currentNode: currentNode = None diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 24d33282..83cd1654 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -48,7 +48,7 @@ def tokens(self, event, next): elif kind == END: name = data.localname namespace = data.namespace - if name not in voidElements: + if namespace != namespaces["html"] or name not in voidElements: yield self.endTag(namespace, name) elif kind == COMMENT: diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 90e116d3..173fa082 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -118,8 +118,10 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): if hasattr(tree, "getroot"): + self.fragmentChildren = set() tree = Root(tree) elif isinstance(tree, list): + self.fragmentChildren = set(tree) tree = FragmentRoot(tree) _base.NonRecursiveTreeWalker.__init__(self, tree) self.filter = ihatexml.InfosetFilter() @@ -137,7 +139,7 @@ def getNodeDetails(self, node): return _base.DOCTYPE, node.name, node.public_id, node.system_id elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): - return _base.TEXT, node.obj + return _base.TEXT, ensure_str(node.obj) elif node.tag == etree.Comment: return _base.COMMENT, ensure_str(node.text) @@ -197,5 +199,7 @@ def getParentNode(self, node): if key == "text": return node # else: fallback to "normal" processing + elif node in self.fragmentChildren: + return None return node.getparent() pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy