Skip to content

Commit 0c8d7bc

Browse files
committed
Fix html5lib#6: dom2sax crash by replacing dom2sax with a generic to_sax
This moves the functionality to a new treeadapters module (where the adapters from test_treewalker.py will later be moved) and removes the previous dom2sax function.
1 parent f61e328 commit 0c8d7bc

File tree

6 files changed: 69 additions and 78 deletions.

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Change Log
66

77
Released on XXX, 2013
88

9+
* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
10+
``treeadapters.sax.to_sax`` which is generic and supports any
11+
treewalker; it also resolves all known bugs with ``dom2sax``.
12+
913

1014
1.0b1
1115
~~~~~

html5lib/constants.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,24 @@
433433
(namespaces["mathml"], "mtext")
434434
))
435435

436+
adjustForeignAttributes = {
437+
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
438+
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
439+
"xlink:href": ("xlink", "href", namespaces["xlink"]),
440+
"xlink:role": ("xlink", "role", namespaces["xlink"]),
441+
"xlink:show": ("xlink", "show", namespaces["xlink"]),
442+
"xlink:title": ("xlink", "title", namespaces["xlink"]),
443+
"xlink:type": ("xlink", "type", namespaces["xlink"]),
444+
"xml:base": ("xml", "base", namespaces["xml"]),
445+
"xml:lang": ("xml", "lang", namespaces["xml"]),
446+
"xml:space": ("xml", "space", namespaces["xml"]),
447+
"xmlns": (None, "xmlns", namespaces["xmlns"]),
448+
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
449+
}
450+
451+
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
452+
adjustForeignAttributes.items()])
453+
436454
spaceCharacters = frozenset((
437455
"\t",
438456
"\n",

html5lib/html5parser.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .constants import cdataElements, rcdataElements
1818
from .constants import tokenTypes, ReparseException, namespaces
1919
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
20+
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
2021

2122

2223
def parse(doc, treebuilder="etree", encoding=None,
@@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
333334
del token["data"][originalName]
334335

335336
def adjustForeignAttributes(self, token):
336-
replacements = {
337-
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
338-
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
339-
"xlink:href": ("xlink", "href", namespaces["xlink"]),
340-
"xlink:role": ("xlink", "role", namespaces["xlink"]),
341-
"xlink:show": ("xlink", "show", namespaces["xlink"]),
342-
"xlink:title": ("xlink", "title", namespaces["xlink"]),
343-
"xlink:type": ("xlink", "type", namespaces["xlink"]),
344-
"xml:base": ("xml", "base", namespaces["xml"]),
345-
"xml:lang": ("xml", "lang", namespaces["xml"]),
346-
"xml:space": ("xml", "space", namespaces["xml"]),
347-
"xmlns": (None, "xmlns", namespaces["xmlns"]),
348-
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
349-
}
337+
replacements = adjustForeignAttributesMap
350338

351339
for originalName in token["data"].keys():
352340
if originalName in replacements:

html5lib/treeadapters/__init__.py

Whitespace-only changes.

html5lib/treeadapters/sax.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
3+
from xml.sax.xmlreader import AttributesNSImpl
4+
5+
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
6+
7+
prefix_mapping = {}
8+
for prefix, localName, namespace in adjustForeignAttributes.values():
9+
if prefix is not None:
10+
prefix_mapping[prefix] = namespace
11+
12+
13+
def to_sax(walker, handler):
14+
"""Call SAX-like content handler based on treewalker walker"""
15+
handler.startDocument()
16+
for prefix, namespace in prefix_mapping.items():
17+
handler.startPrefixMapping(prefix, namespace)
18+
19+
for token in walker:
20+
type = token["type"]
21+
if type == "Doctype":
22+
continue
23+
elif type in ("StartTag", "EmptyTag"):
24+
attrs = AttributesNSImpl(token["data"],
25+
unadjustForeignAttributes)
26+
handler.startElementNS((token["namespace"], token["name"]),
27+
token["name"],
28+
attrs)
29+
if type == "EmptyTag":
30+
handler.endElementNS((token["namespace"], token["name"]),
31+
token["name"])
32+
elif type == "EndTag":
33+
handler.endElementNS((token["namespace"], token["name"]),
34+
token["name"])
35+
elif type in ("Characters", "SpaceCharacters"):
36+
handler.characters(token["data"])
37+
elif type == "Comment":
38+
pass
39+
else:
40+
assert False, "Unknown token type"
41+
42+
for prefix, namespace in prefix_mapping.items():
43+
handler.endPrefixMapping(prefix)
44+
handler.endDocument()

html5lib/treebuilders/dom.py

Lines changed: 1 addition & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import absolute_import, division, unicode_literals
22

33

4-
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
4+
from xml.dom import minidom, Node
55
import weakref
66

77
from . import _base
@@ -220,69 +220,6 @@ def serializeElement(element, indent=0):
220220

221221
return "\n".join(rv)
222222

223-
def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
224-
if node.nodeType == Node.ELEMENT_NODE:
225-
if not nsmap:
226-
handler.startElement(node.nodeName, node.attributes)
227-
for child in node.childNodes:
228-
dom2sax(child, handler, nsmap)
229-
handler.endElement(node.nodeName)
230-
else:
231-
attributes = dict(node.attributes.itemsNS())
232-
233-
# gather namespace declarations
234-
prefixes = []
235-
for attrname in list(node.attributes.keys()):
236-
attr = node.getAttributeNode(attrname)
237-
if (attr.namespaceURI == XMLNS_NAMESPACE or
238-
(attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
239-
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
240-
handler.startPrefixMapping(prefix, attr.nodeValue)
241-
prefixes.append(prefix)
242-
nsmap = nsmap.copy()
243-
nsmap[prefix] = attr.nodeValue
244-
del attributes[(attr.namespaceURI, attr.nodeName)]
245-
246-
# apply namespace declarations
247-
for attrname in list(node.attributes.keys()):
248-
attr = node.getAttributeNode(attrname)
249-
if attr.namespaceURI is None and ':' in attr.nodeName:
250-
prefix = attr.nodeName.split(':')[0]
251-
if prefix in nsmap:
252-
del attributes[(attr.namespaceURI, attr.nodeName)]
253-
attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue
254-
255-
# SAX events
256-
ns = node.namespaceURI or nsmap.get(None, None)
257-
handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
258-
for child in node.childNodes:
259-
dom2sax(child, handler, nsmap)
260-
handler.endElementNS((ns, node.nodeName), node.nodeName)
261-
for prefix in prefixes:
262-
handler.endPrefixMapping(prefix)
263-
264-
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
265-
handler.characters(node.nodeValue)
266-
267-
elif node.nodeType == Node.DOCUMENT_NODE:
268-
handler.startDocument()
269-
for child in node.childNodes:
270-
dom2sax(child, handler, nsmap)
271-
handler.endDocument()
272-
273-
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
274-
for child in node.childNodes:
275-
dom2sax(child, handler, nsmap)
276-
277-
else:
278-
# ATTRIBUTE_NODE
279-
# ENTITY_NODE
280-
# PROCESSING_INSTRUCTION_NODE
281-
# COMMENT_NODE
282-
# DOCUMENT_TYPE_NODE
283-
# NOTATION_NODE
284-
pass
285-
286223
return locals()
287224

288225

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy