diff --git a/CHANGES.rst b/CHANGES.rst index fe07f1ec..c6cbe78f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -50,6 +50,20 @@ Released on XXX with a set of keyword arguments: override_encoding, transport_encoding, same_origin_parent_encoding, likely_encoding, and default_encoding.** +* **Move filters._base, treebuilders._base, and treewalkers._base to .base + to clarify their status as public.** + +* **Get rid of the serializer package. Merge serializer.serialize into the + serializer.htmlserializer module and move that to serializer. This means + anyone who used serializer.serialize or serializer.HTMLSerializer needs no + code changes.** + +* **Rename treewalkers.lxmletree to .etree_lxml and + treewalkers.genshistream to .genshi to have a consistent API.** + +* Move a whole load of stuff (inputstream, ihatexml, trie, tokenizer, + utils) to be underscore prefixed to clarify their status as private. + 0.9999999/1.0b8 ~~~~~~~~~~~~~~~ diff --git a/doc/html5lib.filters.rst b/doc/html5lib.filters.rst index 1fda38a7..38d4a956 100644 --- a/doc/html5lib.filters.rst +++ b/doc/html5lib.filters.rst @@ -1,10 +1,10 @@ filters Package =============== -:mod:`_base` Module +:mod:`base` Module ------------------- -.. automodule:: html5lib.filters._base +.. automodule:: html5lib.filters.base :members: :undoc-members: :show-inheritance: diff --git a/doc/html5lib.rst b/doc/html5lib.rst index d4ed12b4..f0646aac 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -25,42 +25,10 @@ html5lib Package :undoc-members: :show-inheritance: -:mod:`ihatexml` Module +:mod:`serializer` Module ---------------------- -.. automodule:: html5lib.ihatexml - :members: - :undoc-members: - :show-inheritance: - -:mod:`inputstream` Module -------------------------- - -.. automodule:: html5lib.inputstream - :members: - :undoc-members: - :show-inheritance: - -:mod:`sanitizer` Module ------------------------ - -.. 
automodule:: html5lib.sanitizer - :members: - :undoc-members: - :show-inheritance: - -:mod:`tokenizer` Module ------------------------ - -.. automodule:: html5lib.tokenizer - :members: - :undoc-members: - :show-inheritance: - -:mod:`utils` Module -------------------- - -.. automodule:: html5lib.utils +.. automodule:: html5lib.serializer :members: :undoc-members: :show-inheritance: @@ -71,7 +39,6 @@ Subpackages .. toctree:: html5lib.filters - html5lib.serializer html5lib.treebuilders html5lib.treewalkers diff --git a/doc/html5lib.serializer.rst b/doc/html5lib.serializer.rst deleted file mode 100644 index fa954742..00000000 --- a/doc/html5lib.serializer.rst +++ /dev/null @@ -1,19 +0,0 @@ -serializer Package -================== - -:mod:`serializer` Package -------------------------- - -.. automodule:: html5lib.serializer - :members: - :undoc-members: - :show-inheritance: - -:mod:`htmlserializer` Module ----------------------------- - -.. automodule:: html5lib.serializer.htmlserializer - :members: - :undoc-members: - :show-inheritance: - diff --git a/doc/html5lib.treebuilders.rst b/doc/html5lib.treebuilders.rst index 99119839..aee82142 100644 --- a/doc/html5lib.treebuilders.rst +++ b/doc/html5lib.treebuilders.rst @@ -9,10 +9,10 @@ treebuilders Package :undoc-members: :show-inheritance: -:mod:`_base` Module +:mod:`base` Module ------------------- -.. automodule:: html5lib.treebuilders._base +.. automodule:: html5lib.treebuilders.base :members: :undoc-members: :show-inheritance: diff --git a/doc/html5lib.treewalkers.rst b/doc/html5lib.treewalkers.rst index 694c8194..46501258 100644 --- a/doc/html5lib.treewalkers.rst +++ b/doc/html5lib.treewalkers.rst @@ -9,10 +9,10 @@ treewalkers Package :undoc-members: :show-inheritance: -:mod:`_base` Module +:mod:`base` Module ------------------- -.. automodule:: html5lib.treewalkers._base +.. 
automodule:: html5lib.treewalkers.base :members: :undoc-members: :show-inheritance: @@ -33,18 +33,19 @@ treewalkers Package :undoc-members: :show-inheritance: -:mod:`genshistream` Module --------------------------- +:mod:`etree_lxml` Module +------------------------ -.. automodule:: html5lib.treewalkers.genshistream +.. automodule:: html5lib.treewalkers.etree_lxml :members: :undoc-members: :show-inheritance: -:mod:`lxmletree` Module ------------------------ -.. automodule:: html5lib.treewalkers.lxmletree +:mod:`genshi` Module +-------------------------- + +.. automodule:: html5lib.treewalkers.genshi :members: :undoc-members: - :show-inheritance: + :show-inheritance: \ No newline at end of file diff --git a/html5lib/ihatexml.py b/html5lib/_ihatexml.py similarity index 100% rename from html5lib/ihatexml.py rename to html5lib/_ihatexml.py diff --git a/html5lib/inputstream.py b/html5lib/_inputstream.py similarity index 99% rename from html5lib/inputstream.py rename to html5lib/_inputstream.py index dafe33ca..79f2331e 100644 --- a/html5lib/inputstream.py +++ b/html5lib/_inputstream.py @@ -10,7 +10,7 @@ from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import ReparseException -from . import utils +from . import _utils from io import StringIO @@ -28,7 +28,7 @@ invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa -if utils.supports_lone_surrogates: +if _utils.supports_lone_surrogates: # Use one extra step of indirection and create surrogates with # eval. 
Not using this indirection would introduce an illegal # unicode literal on platforms not supporting such lone @@ -176,7 +176,7 @@ def __init__(self, source): """ - if not utils.supports_lone_surrogates: + if not _utils.supports_lone_surrogates: # Such platforms will have already checked for such # surrogate errors, so no need to do this checking. self.reportCharacterErrors = None @@ -304,9 +304,9 @@ def characterErrorsUCS2(self, data): codepoint = ord(match.group()) pos = match.start() # Pretty sure there should be endianness issues here - if utils.isSurrogatePair(data[pos:pos + 2]): + if _utils.isSurrogatePair(data[pos:pos + 2]): # We have a surrogate pair! - char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2]) + char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2]) if char_val in non_bmp_invalid_codepoints: self.errors.append("invalid-codepoint") skip = True diff --git a/html5lib/tokenizer.py b/html5lib/_tokenizer.py similarity index 99% rename from html5lib/tokenizer.py rename to html5lib/_tokenizer.py index 3f10c01f..6078f66a 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/_tokenizer.py @@ -11,9 +11,9 @@ from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters -from .inputstream import HTMLInputStream +from ._inputstream import HTMLInputStream -from .trie import Trie +from ._trie import Trie entitiesTrie = Trie(entities) diff --git a/html5lib/trie/__init__.py b/html5lib/_trie/__init__.py similarity index 100% rename from html5lib/trie/__init__.py rename to html5lib/_trie/__init__.py diff --git a/html5lib/trie/_base.py b/html5lib/_trie/_base.py similarity index 100% rename from html5lib/trie/_base.py rename to html5lib/_trie/_base.py diff --git a/html5lib/trie/datrie.py b/html5lib/_trie/datrie.py similarity index 100% rename from html5lib/trie/datrie.py rename to html5lib/_trie/datrie.py diff --git a/html5lib/trie/py.py b/html5lib/_trie/py.py similarity index 100% rename from html5lib/trie/py.py rename to 
html5lib/_trie/py.py diff --git a/html5lib/utils.py b/html5lib/_utils.py similarity index 100% rename from html5lib/utils.py rename to html5lib/_utils.py diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py index fed6996c..4795baec 100644 --- a/html5lib/filters/alphabeticalattributes.py +++ b/html5lib/filters/alphabeticalattributes.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from . import _base +from . import base try: from collections import OrderedDict @@ -8,9 +8,9 @@ from ordereddict import OrderedDict -class Filter(_base.Filter): +class Filter(base.Filter): def __iter__(self): - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): if token["type"] in ("StartTag", "EmptyTag"): attrs = OrderedDict() for name, value in sorted(token["data"].items(), diff --git a/html5lib/filters/_base.py b/html5lib/filters/base.py similarity index 100% rename from html5lib/filters/_base.py rename to html5lib/filters/base.py diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py index ca33b70b..2059ec86 100644 --- a/html5lib/filters/inject_meta_charset.py +++ b/html5lib/filters/inject_meta_charset.py @@ -1,11 +1,11 @@ from __future__ import absolute_import, division, unicode_literals -from . import _base +from . 
import base -class Filter(_base.Filter): +class Filter(base.Filter): def __init__(self, source, encoding): - _base.Filter.__init__(self, source) + base.Filter.__init__(self, source) self.encoding = encoding def __iter__(self): @@ -13,7 +13,7 @@ def __iter__(self): meta_found = (self.encoding is None) pending = [] - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): type = token["type"] if type == "StartTag": if token["name"].lower() == "head": diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index af231d8e..a9c0831a 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -2,21 +2,21 @@ from six import text_type -from . import _base +from . import base from ..constants import namespaces, voidElements from ..constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -class Filter(_base.Filter): +class Filter(base.Filter): def __init__(self, source, require_matching_tags=True): super(Filter, self).__init__(source) self.require_matching_tags = require_matching_tags def __iter__(self): open_elements = [] - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): type = token["type"] if type in ("StartTag", "EmptyTag"): namespace = token["namespace"] diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py index 8f11fff4..f6edb734 100644 --- a/html5lib/filters/optionaltags.py +++ b/html5lib/filters/optionaltags.py @@ -1,9 +1,9 @@ from __future__ import absolute_import, division, unicode_literals -from . import _base +from . import base -class Filter(_base.Filter): +class Filter(base.Filter): def slider(self): previous1 = previous2 = None for token in self.source: diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index 7f81c0d1..b5ddcb93 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -5,13 +5,13 @@ from six.moves import urllib_parse as urlparse -from . import _base +from . 
import base from ..constants import namespaces, prefixes __all__ = ["Filter"] -acceptable_elements = frozenset(( +allowed_elements = frozenset(( (namespaces['html'], 'a'), (namespaces['html'], 'abbr'), (namespaces['html'], 'acronym'), @@ -175,7 +175,7 @@ (namespaces['svg'], 'use'), )) -acceptable_attributes = frozenset(( +allowed_attributes = frozenset(( # HTML attributes (None, 'abbr'), (None, 'accept'), @@ -552,7 +552,7 @@ (None, 'use') )) -acceptable_css_properties = frozenset(( +allowed_css_properties = frozenset(( 'azimuth', 'background-color', 'border-bottom-color', @@ -601,7 +601,7 @@ 'width', )) -acceptable_css_keywords = frozenset(( +allowed_css_keywords = frozenset(( 'auto', 'aqua', 'black', @@ -643,7 +643,7 @@ 'yellow', )) -acceptable_svg_properties = frozenset(( +allowed_svg_properties = frozenset(( 'fill', 'fill-opacity', 'fill-rule', @@ -654,7 +654,7 @@ 'stroke-opacity', )) -acceptable_protocols = frozenset(( +allowed_protocols = frozenset(( 'ed2k', 'ftp', 'http', @@ -680,7 +680,7 @@ 'data', )) -acceptable_content_types = frozenset(( +allowed_content_types = frozenset(( 'image/png', 'image/jpeg', 'image/gif', @@ -689,14 +689,6 @@ 'text/plain', )) -allowed_elements = acceptable_elements -allowed_attributes = acceptable_attributes -allowed_css_properties = acceptable_css_properties -allowed_css_keywords = acceptable_css_keywords -allowed_svg_properties = acceptable_svg_properties -allowed_protocols = acceptable_protocols -allowed_content_types = acceptable_content_types - data_content_type = re.compile(r''' ^ @@ -712,7 +704,7 @@ re.VERBOSE) -class Filter(_base.Filter): +class Filter(base.Filter): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" def __init__(self, source, @@ -739,7 +731,7 @@ def __init__(self, self.svg_allow_local_href = svg_allow_local_href def __iter__(self): - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): token = self.sanitize_token(token) if token: yield token diff --git 
a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py index dfc60eeb..89210528 100644 --- a/html5lib/filters/whitespace.py +++ b/html5lib/filters/whitespace.py @@ -2,20 +2,20 @@ import re -from . import _base +from . import base from ..constants import rcdataElements, spaceCharacters spaceCharacters = "".join(spaceCharacters) SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) -class Filter(_base.Filter): +class Filter(base.Filter): spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) def __iter__(self): preserve = 0 - for token in _base.Filter.__iter__(self): + for token in base.Filter.__iter__(self): type = token["type"] if type == "StartTag" \ and (preserve or token["name"] in self.spacePreserveElements): diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 6a5c8bcb..470c8a7d 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -8,13 +8,13 @@ except ImportError: from ordereddict import OrderedDict -from . import inputstream -from . import tokenizer +from . import _inputstream +from . import _tokenizer from . import treebuilders -from .treebuilders._base import Marker +from .treebuilders.base import Marker -from . import utils +from . 
import _utils from .constants import ( spaceCharacters, asciiUpper2Lower, specialElements, headingElements, cdataElements, rcdataElements, @@ -82,7 +82,7 @@ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kw self.innerHTMLMode = innerHTML self.container = container self.scripting = scripting - self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) + self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) self.reset() try: @@ -344,7 +344,7 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] -@utils.memoize +@_utils.memoize def getPhases(debug): def log(function): """Logger that records which phase processes each token""" @@ -586,13 +586,13 @@ class BeforeHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ (("head", "body", "html", "br"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther @@ -632,7 +632,7 @@ class InHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), (("noframes", "style"), self.startTagNoFramesStyle), @@ -645,7 +645,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("head", self.endTagHead), (("br", "html", "body"), self.endTagHtmlBodyBr) ]) @@ -687,8 +687,8 @@ def startTagMeta(self, token): # the abstract Unicode string, and just use the # ContentAttrParser on that, 
but using UTF-8 allows all chars # to be encoded and as a ASCII-superset works. - data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) - parser = inputstream.ContentAttrParser(data) + data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = _inputstream.ContentAttrParser(data) codec = parser.parse() self.parser.tokenizer.stream.changeEncoding(codec) @@ -735,14 +735,14 @@ class InHeadNoscriptPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), (("head", "noscript"), self.startTagHeadNoscript), ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("noscript", self.endTagNoscript), ("br", self.endTagBr), ]) @@ -799,7 +799,7 @@ class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("body", self.startTagBody), ("frameset", self.startTagFrameset), @@ -809,8 +809,8 @@ def __init__(self, parser, tree): ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), - self.endTagHtmlBodyBr)]) + self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + self.endTagHtmlBodyBr)]) self.endTagHandler.default = self.endTagOther def processEOF(self): @@ -871,7 +871,7 @@ def __init__(self, parser, tree): # Set this to the default handler self.processSpaceCharacters = self.processSpaceCharactersNonPre - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", 
self.startTagHtml), (("base", "basefont", "bgsound", "command", "link", "meta", "script", "style", "title"), @@ -918,7 +918,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("body", self.endTagBody), ("html", self.endTagHtml), (("address", "article", "aside", "blockquote", "button", "center", @@ -1588,9 +1588,9 @@ def endTagOther(self, token): class TextPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([]) + self.startTagHandler = _utils.MethodDispatcher([]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("script", self.endTagScript)]) self.endTagHandler.default = self.endTagOther @@ -1622,7 +1622,7 @@ class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("caption", self.startTagCaption), ("colgroup", self.startTagColgroup), @@ -1636,7 +1636,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("table", self.endTagTable), (("body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr"), self.endTagIgnore) @@ -1813,14 +1813,14 @@ class InCaptionPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagTableElement) ]) 
self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("caption", self.endTagCaption), ("table", self.endTagTable), (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", @@ -1885,13 +1885,13 @@ class InColumnGroupPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("col", self.startTagCol) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("colgroup", self.endTagColgroup), ("col", self.endTagCol) ]) @@ -1949,7 +1949,7 @@ class InTableBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("tr", self.startTagTr), (("td", "th"), self.startTagTableCell), @@ -1958,7 +1958,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), ("table", self.endTagTable), (("body", "caption", "col", "colgroup", "html", "td", "th", @@ -2047,7 +2047,7 @@ class InRowPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-row def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("td", "th"), self.startTagTableCell), (("caption", "col", "colgroup", "tbody", "tfoot", "thead", @@ -2055,7 +2055,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = 
self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("tr", self.endTagTr), ("table", self.endTagTable), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), @@ -2136,14 +2136,14 @@ class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ (("td", "th"), self.endTagTableCell), (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) @@ -2212,7 +2212,7 @@ class InSelectPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("option", self.startTagOption), ("optgroup", self.startTagOptgroup), @@ -2222,7 +2222,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("option", self.endTagOption), ("optgroup", self.endTagOptgroup), ("select", self.endTagSelect) @@ -2312,13 +2312,13 @@ class InSelectInTablePhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), self.startTagTable) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + 
self.endTagHandler = _utils.MethodDispatcher([ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), self.endTagTable) ]) @@ -2466,12 +2466,12 @@ class AfterBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) + self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)]) self.endTagHandler.default = self.endTagOther def processEOF(self): @@ -2514,7 +2514,7 @@ class InFramesetPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("frameset", self.startTagFrameset), ("frame", self.startTagFrame), @@ -2522,7 +2522,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("frameset", self.endTagFrameset) ]) self.endTagHandler.default = self.endTagOther @@ -2571,13 +2571,13 @@ class AfterFramesetPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("noframes", self.startTagNoframes) ]) self.startTagHandler.default = self.startTagOther - self.endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = _utils.MethodDispatcher([ ("html", self.endTagHtml) ]) self.endTagHandler.default = self.endTagOther @@ -2607,7 +2607,7 @@ class AfterAfterBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = 
_utils.MethodDispatcher([ ("html", self.startTagHtml) ]) self.startTagHandler.default = self.startTagOther @@ -2645,7 +2645,7 @@ class AfterAfterFramesetPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - self.startTagHandler = utils.MethodDispatcher([ + self.startTagHandler = _utils.MethodDispatcher([ ("html", self.startTagHtml), ("noframes", self.startTagNoFrames) ]) @@ -2707,7 +2707,7 @@ def processEndTag(self, token): def adjust_attributes(token, replacements): - if PY3 or utils.PY27: + if PY3 or _utils.PY27: needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) else: needs_adjustment = frozenset(token['data']) & frozenset(replacements) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer.py similarity index 85% rename from html5lib/serializer/htmlserializer.py rename to html5lib/serializer.py index 8a9439df..103dd206 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer.py @@ -5,40 +5,38 @@ from codecs import register_error, xmlcharrefreplace_errors -from ..constants import voidElements, booleanAttributes, spaceCharacters -from ..constants import rcdataElements, entities, xmlEntities -from .. import utils +from .constants import voidElements, booleanAttributes, spaceCharacters +from .constants import rcdataElements, entities, xmlEntities +from . 
import treewalkers, _utils from xml.sax.saxutils import escape -spaceCharacters = "".join(spaceCharacters) +_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`" +_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]") +_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars + + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" + "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" + "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" + "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" + "\u3000]") -quoteAttributeSpecChars = spaceCharacters + "\"'=<>`" -quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]") -quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars + - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" - "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" - "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" - "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" - "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" - "\u3000]") - -encode_entity_map = {} -is_ucs4 = len("\U0010FFFF") == 1 +_encode_entity_map = {} +_is_ucs4 = len("\U0010FFFF") == 1 for k, v in list(entities.items()): # skip multi-character entities - if ((is_ucs4 and len(v) > 1) or - (not is_ucs4 and len(v) > 2)): + if ((_is_ucs4 and len(v) > 1) or + (not _is_ucs4 and len(v) > 2)): continue if v != "&": if len(v) == 2: - v = utils.surrogatePairToCodepoint(v) + v = _utils.surrogatePairToCodepoint(v) else: v = ord(v) - if v not in encode_entity_map or k.islower(): + if v not in _encode_entity_map or k.islower(): # prefer < over < and similarly for &, >, etc. 
- encode_entity_map[v] = k + _encode_entity_map[v] = k def htmlentityreplace_errors(exc): @@ -51,14 +49,14 @@ def htmlentityreplace_errors(exc): skip = False continue index = i + exc.start - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2]) skip = True else: codepoint = ord(c) codepoints.append(codepoint) for cp in codepoints: - e = encode_entity_map.get(cp) + e = _encode_entity_map.get(cp) if e: res.append("&") res.append(e) @@ -73,6 +71,13 @@ def htmlentityreplace_errors(exc): register_error("htmlentityreplace", htmlentityreplace_errors) +def serialize(input, tree="etree", encoding=None, **serializer_opts): + # XXX: Should we cache this? + walker = treewalkers.getTreeWalker(tree) + s = HTMLSerializer(**serializer_opts) + return s.render(walker(input), encoding) + + class HTMLSerializer(object): # attribute quoting options @@ -181,24 +186,24 @@ def serialize(self, treewalker, encoding=None): self.errors = [] if encoding and self.inject_meta_charset: - from ..filters.inject_meta_charset import Filter + from .filters.inject_meta_charset import Filter treewalker = Filter(treewalker, encoding) # Alphabetical attributes is here under the assumption that none of # the later filters add or change order of attributes; it needs to be # before the sanitizer so escaped elements come out correctly if self.alphabetical_attributes: - from ..filters.alphabeticalattributes import Filter + from .filters.alphabeticalattributes import Filter treewalker = Filter(treewalker) # WhitespaceFilter should be used before OptionalTagFilter # for maximum efficiently of this latter filter if self.strip_whitespace: - from ..filters.whitespace import Filter + from .filters.whitespace import Filter treewalker = Filter(treewalker) if self.sanitize: - from 
..filters.sanitizer import Filter + from .filters.sanitizer import Filter treewalker = Filter(treewalker) if self.omit_optional_tags: - from ..filters.optionaltags import Filter + from .filters.optionaltags import Filter treewalker = Filter(treewalker) for token in treewalker: @@ -251,9 +256,9 @@ def serialize(self, treewalker, encoding=None): if self.quote_attr_values == "always" or len(v) == 0: quote_attr = True elif self.quote_attr_values == "spec": - quote_attr = quoteAttributeSpec.search(v) is not None + quote_attr = _quoteAttributeSpec.search(v) is not None elif self.quote_attr_values == "legacy": - quote_attr = quoteAttributeLegacy.search(v) is not None + quote_attr = _quoteAttributeLegacy.search(v) is not None else: raise ValueError("quote_attr_values must be one of: " "'always', 'spec', or 'legacy'") diff --git a/html5lib/serializer/__init__.py b/html5lib/serializer/__init__.py deleted file mode 100644 index 8380839a..00000000 --- a/html5lib/serializer/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals - -from .. import treewalkers - -from .htmlserializer import HTMLSerializer - - -def serialize(input, tree="etree", format="html", encoding=None, - **serializer_opts): - # XXX: Should we cache this? 
- walker = treewalkers.getTreeWalker(tree) - if format == "html": - s = HTMLSerializer(**serializer_opts) - else: - raise ValueError("type must be html") - return s.render(walker(input), encoding) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index b6d20f24..9a411c77 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -5,7 +5,7 @@ import pytest from .support import get_data_files, test_dir, errorMessage, TestData as _TestData -from html5lib import HTMLParser, inputstream +from html5lib import HTMLParser, _inputstream def test_basic_prescan_length(): @@ -13,7 +13,7 @@ def test_basic_prescan_length(): pad = 1024 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 1024 # Sanity - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) + stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'utf-8' == stream.charEncoding[0].name @@ -22,7 +22,7 @@ def test_parser_reparse(): pad = 10240 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 10240 # Sanity - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) + stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'windows-1252' == stream.charEncoding[0].name p = HTMLParser(namespaceHTMLElements=False) doc = p.parse(data, useChardet=False) @@ -47,7 +47,7 @@ def test_parser_reparse(): ("windows-1252", b"", {}), ]) def test_parser_args(expected, data, kwargs): - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs) + stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs) assert expected == stream.charEncoding[0].name p = HTMLParser() p.parse(data, useChardet=False, **kwargs) @@ -85,7 +85,7 @@ def runParserEncodingTest(data, encoding): def runPreScanEncodingTest(data, encoding): - stream = inputstream.HTMLBinaryInputStream(data, useChardet=False) + stream = 
_inputstream.HTMLBinaryInputStream(data, useChardet=False) encoding = encoding.lower().decode("ascii") # Very crude way to ignore irrelevant tests @@ -111,6 +111,6 @@ def test_encoding(): else: def test_chardet(): with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: - encoding = inputstream.HTMLInputStream(fp.read()).charEncoding + encoding = _inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" # pylint:enable=wrong-import-position diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index b3cda7d7..9333286e 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -10,7 +10,7 @@ from html5lib import constants from html5lib.filters.lint import Filter as Lint from html5lib.serializer import HTMLSerializer, serialize -from html5lib.treewalkers._base import TreeWalker +from html5lib.treewalkers.base import TreeWalker # pylint:disable=wrong-import-position optionals_loaded = [] diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index e8d9fd86..27c39538 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -11,9 +11,9 @@ import six from six.moves import http_client, urllib -from html5lib.inputstream import (BufferedStream, HTMLInputStream, - HTMLUnicodeInputStream, HTMLBinaryInputStream) -from html5lib.utils import supports_lone_surrogates +from html5lib._inputstream import (BufferedStream, HTMLInputStream, + HTMLUnicodeInputStream, HTMLBinaryInputStream) +from html5lib._utils import supports_lone_surrogates def test_basic(): diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index 255c1859..1440a722 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -8,8 +8,8 @@ import pytest from six import unichr -from html5lib.tokenizer import HTMLTokenizer -from html5lib import constants, utils +from html5lib._tokenizer import HTMLTokenizer +from html5lib 
import constants, _utils class TokenizerTestParser(object): @@ -156,7 +156,7 @@ def repl(m): except ValueError: # This occurs when unichr throws ValueError, which should # only be for a lone-surrogate. - if utils.supports_lone_surrogates: + if _utils.supports_lone_surrogates: raise return None diff --git a/html5lib/treebuilders/__init__.py b/html5lib/treebuilders/__init__.py index 6a6b2a4c..e2328847 100644 --- a/html5lib/treebuilders/__init__.py +++ b/html5lib/treebuilders/__init__.py @@ -28,7 +28,7 @@ from __future__ import absolute_import, division, unicode_literals -from ..utils import default_etree +from .._utils import default_etree treeBuilderCache = {} diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/base.py similarity index 100% rename from html5lib/treebuilders/_base.py rename to html5lib/treebuilders/base.py diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 9d7f4824..dcfac220 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -5,10 +5,10 @@ from xml.dom import minidom, Node import weakref -from . import _base +from . import base from .. 
import constants from ..constants import namespaces -from ..utils import moduleFactoryFactory +from .._utils import moduleFactoryFactory def getDomBuilder(DomImplementation): @@ -50,9 +50,9 @@ def __delitem__(self, name): else: del self.element.attributes[name] - class NodeBuilder(_base.Node): + class NodeBuilder(base.Node): def __init__(self, element): - _base.Node.__init__(self, element.nodeName) + base.Node.__init__(self, element.nodeName) self.element = element namespace = property(lambda self: hasattr(self.element, "namespaceURI") and @@ -117,7 +117,7 @@ def getNameTuple(self): nameTuple = property(getNameTuple) - class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable + class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable def documentClass(self): self.dom = Dom.getDOMImplementation().createDocument(None, None, None) return weakref.proxy(self) @@ -157,12 +157,12 @@ def getDocument(self): return self.dom def getFragment(self): - return _base.TreeBuilder.getFragment(self).element + return base.TreeBuilder.getFragment(self).element def insertText(self, data, parent=None): data = data if parent != self: - _base.TreeBuilder.insertText(self, data, parent) + base.TreeBuilder.insertText(self, data, parent) else: # HACK: allow text nodes as children of the document node if hasattr(self.dom, '_child_node_types'): diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 4d12bd45..cb1d4aef 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -5,11 +5,11 @@ import re -from . import _base -from .. import ihatexml +from . import base +from .. import _ihatexml from .. 
import constants from ..constants import namespaces -from ..utils import moduleFactoryFactory +from .._utils import moduleFactoryFactory tag_regexp = re.compile("{([^}]*)}(.*)") @@ -18,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class Element(_base.Node): + class Element(base.Node): def __init__(self, name, namespace=None): self._name = name self._namespace = namespace @@ -142,7 +142,7 @@ def reparentChildren(self, newParent): if self._element.text is not None: newParent._element.text += self._element.text self._element.text = "" - _base.Node.reparentChildren(self, newParent) + base.Node.reparentChildren(self, newParent) class Comment(Element): def __init__(self, data): @@ -259,7 +259,7 @@ def serializeElement(element, indent=0): def tostring(element): # pylint:disable=unused-variable """Serialize an element and its child nodes to a string""" rv = [] - filter = ihatexml.InfosetFilter() + filter = _ihatexml.InfosetFilter() def serializeElement(element): if isinstance(element, ElementTree.ElementTree): @@ -310,7 +310,7 @@ def serializeElement(element): return "".join(rv) - class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable + class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable documentClass = Document doctypeClass = DocumentType elementClass = Element @@ -332,7 +332,7 @@ def getDocument(self): return self.document._element.find("html") def getFragment(self): - return _base.TreeBuilder.getFragment(self)._element + return base.TreeBuilder.getFragment(self)._element return locals() diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 2a69769b..908820c0 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -16,11 +16,11 @@ import re import sys -from . import _base +from . 
import base from ..constants import DataLossWarning from .. import constants from . import etree as etree_builders -from .. import ihatexml +from .. import _ihatexml import lxml.etree as etree @@ -54,7 +54,7 @@ def _getChildNodes(self): def testSerializer(element): rv = [] - infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) + infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True) def serializeElement(element, indent=0): if not hasattr(element, "tag"): @@ -172,7 +172,7 @@ def serializeElement(element): return "".join(rv) -class TreeBuilder(_base.TreeBuilder): +class TreeBuilder(base.TreeBuilder): documentClass = Document doctypeClass = DocumentType elementClass = None @@ -182,7 +182,7 @@ class TreeBuilder(_base.TreeBuilder): def __init__(self, namespaceHTMLElements, fullTree=False): builder = etree_builders.getETreeModule(etree, fullTree=fullTree) - infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) + infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True) self.namespaceHTMLElements = namespaceHTMLElements class Attributes(dict): @@ -254,10 +254,10 @@ def _getData(self): self.elementClass = Element self.commentClass = Comment # self.fragmentClass = builder.DocumentFragment - _base.TreeBuilder.__init__(self, namespaceHTMLElements) + base.TreeBuilder.__init__(self, namespaceHTMLElements) def reset(self): - _base.TreeBuilder.reset(self) + base.TreeBuilder.reset(self) self.insertComment = self.insertCommentInitial self.initial_comments = [] self.doctype = None diff --git a/html5lib/treewalkers/__init__.py b/html5lib/treewalkers/__init__.py index 00ae2804..9e19a559 100644 --- a/html5lib/treewalkers/__init__.py +++ b/html5lib/treewalkers/__init__.py @@ -11,9 +11,9 @@ from __future__ import absolute_import, division, unicode_literals from .. 
import constants -from ..utils import default_etree +from .._utils import default_etree -__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"] +__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"] treeWalkerCache = {} @@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs): from . import dom treeWalkerCache[treeType] = dom.TreeWalker elif treeType == "genshi": - from . import genshistream - treeWalkerCache[treeType] = genshistream.TreeWalker + from . import genshi + treeWalkerCache[treeType] = genshi.TreeWalker elif treeType == "lxml": - from . import lxmletree - treeWalkerCache[treeType] = lxmletree.TreeWalker + from . import etree_lxml + treeWalkerCache[treeType] = etree_lxml.TreeWalker elif treeType == "etree": from . import etree if implementation is None: diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/base.py similarity index 100% rename from html5lib/treewalkers/_base.py rename to html5lib/treewalkers/base.py diff --git a/html5lib/treewalkers/dom.py b/html5lib/treewalkers/dom.py index ac4dcf31..b0c89b00 100644 --- a/html5lib/treewalkers/dom.py +++ b/html5lib/treewalkers/dom.py @@ -2,16 +2,16 @@ from xml.dom import Node -from . import _base +from . 
import base -class TreeWalker(_base.NonRecursiveTreeWalker): +class TreeWalker(base.NonRecursiveTreeWalker): def getNodeDetails(self, node): if node.nodeType == Node.DOCUMENT_TYPE_NODE: - return _base.DOCTYPE, node.name, node.publicId, node.systemId + return base.DOCTYPE, node.name, node.publicId, node.systemId elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE): - return _base.TEXT, node.nodeValue + return base.TEXT, node.nodeValue elif node.nodeType == Node.ELEMENT_NODE: attrs = {} @@ -21,17 +21,17 @@ def getNodeDetails(self, node): attrs[(attr.namespaceURI, attr.localName)] = attr.value else: attrs[(None, attr.name)] = attr.value - return (_base.ELEMENT, node.namespaceURI, node.nodeName, + return (base.ELEMENT, node.namespaceURI, node.nodeName, attrs, node.hasChildNodes()) elif node.nodeType == Node.COMMENT_NODE: - return _base.COMMENT, node.nodeValue + return base.COMMENT, node.nodeValue elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE): - return (_base.DOCUMENT,) + return (base.DOCUMENT,) else: - return _base.UNKNOWN, node.nodeType + return base.UNKNOWN, node.nodeType def getFirstChild(self, node): return node.firstChild diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index d3b0c50e..8f30f078 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -12,8 +12,8 @@ from six import string_types -from . import _base -from ..utils import moduleFactoryFactory +from . 
import base +from .._utils import moduleFactoryFactory tag_regexp = re.compile("{([^}]*)}(.*)") @@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable + class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable """Given the particular ElementTree representation, this implementation, to avoid using recursion, returns "nodes" as tuples with the following content: @@ -40,7 +40,7 @@ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element elt, _, _, flag = node if flag in ("text", "tail"): - return _base.TEXT, getattr(elt, flag) + return base.TEXT, getattr(elt, flag) else: node = elt @@ -48,14 +48,14 @@ def getNodeDetails(self, node): node = node.getroot() if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"): - return (_base.DOCUMENT,) + return (base.DOCUMENT,) elif node.tag == "": - return (_base.DOCTYPE, node.text, + return (base.DOCTYPE, node.text, node.get("publicId"), node.get("systemId")) elif node.tag == ElementTreeCommentType: - return _base.COMMENT, node.text + return base.COMMENT, node.text else: assert isinstance(node.tag, string_types), type(node.tag) @@ -73,7 +73,7 @@ def getNodeDetails(self, node): attrs[(match.group(1), match.group(2))] = value else: attrs[(None, name)] = value - return (_base.ELEMENT, namespace, tag, + return (base.ELEMENT, namespace, tag, attrs, len(node) or node.text) def getFirstChild(self, node): diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/etree_lxml.py similarity index 90% rename from html5lib/treewalkers/lxmletree.py rename to html5lib/treewalkers/etree_lxml.py index ff31a44e..fb236311 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/etree_lxml.py @@ -4,9 +4,9 @@ from lxml import etree from ..treebuilders.etree import tag_regexp -from . 
import _base +from . import base -from .. import ihatexml +from .. import _ihatexml def ensure_str(s): @@ -122,7 +122,7 @@ def __len__(self): return len(self.obj) -class TreeWalker(_base.NonRecursiveTreeWalker): +class TreeWalker(base.NonRecursiveTreeWalker): def __init__(self, tree): # pylint:disable=redefined-variable-type if isinstance(tree, list): @@ -131,29 +131,29 @@ def __init__(self, tree): else: self.fragmentChildren = set() tree = Root(tree) - _base.NonRecursiveTreeWalker.__init__(self, tree) - self.filter = ihatexml.InfosetFilter() + base.NonRecursiveTreeWalker.__init__(self, tree) + self.filter = _ihatexml.InfosetFilter() def getNodeDetails(self, node): if isinstance(node, tuple): # Text node node, key = node assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key - return _base.TEXT, ensure_str(getattr(node, key)) + return base.TEXT, ensure_str(getattr(node, key)) elif isinstance(node, Root): - return (_base.DOCUMENT,) + return (base.DOCUMENT,) elif isinstance(node, Doctype): - return _base.DOCTYPE, node.name, node.public_id, node.system_id + return base.DOCTYPE, node.name, node.public_id, node.system_id elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): - return _base.TEXT, ensure_str(node.obj) + return base.TEXT, ensure_str(node.obj) elif node.tag == etree.Comment: - return _base.COMMENT, ensure_str(node.text) + return base.COMMENT, ensure_str(node.text) elif node.tag == etree.Entity: - return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &; + return base.ENTITY, ensure_str(node.text)[1:-1] # strip &; else: # This is assumed to be an ordinary element @@ -172,7 +172,7 @@ def getNodeDetails(self, node): attrs[(match.group(1), match.group(2))] = value else: attrs[(None, name)] = value - return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag), + return (base.ELEMENT, namespace, self.filter.fromXmlName(tag), attrs, len(node) > 0 or node.text) def getFirstChild(self, node): diff --git 
a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshi.py similarity index 97% rename from html5lib/treewalkers/genshistream.py rename to html5lib/treewalkers/genshi.py index 61cbfede..7483be27 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshi.py @@ -4,12 +4,12 @@ from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT -from . import _base +from . import base from ..constants import voidElements, namespaces -class TreeWalker(_base.TreeWalker): +class TreeWalker(base.TreeWalker): def __iter__(self): # Buffer the events so we can pass in the following one previous = None diff --git a/parse.py b/parse.py index d5087fb8..3e65c330 100755 --- a/parse.py +++ b/parse.py @@ -11,7 +11,7 @@ from html5lib import html5parser from html5lib import treebuilders, serializer, treewalkers from html5lib import constants -from html5lib import utils +from html5lib import _utils def parse(): @@ -116,7 +116,7 @@ def printOutput(parser, document, opts): import lxml.etree sys.stdout.write(lxml.etree.tostring(document, encoding="unicode")) elif tb == "etree": - sys.stdout.write(utils.default_etree.tostring(document, encoding="unicode")) + sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode")) elif opts.tree: if not hasattr(document, '__getitem__'): document = [document]
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: