Skip to content

Commit 5d493a5

Browse files
committed
Update sanitizer to html5; accommodate usage as a filter
1 parent a71989b commit 5d493a5

File tree

1 file changed

+87
-59
lines changed

1 file changed

+87
-59
lines changed

src/html5lib/sanitizer.py

Lines changed: 87 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -7,15 +7,19 @@
77
class HTMLSanitizerMixin(object):
88
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
99

10-
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
11-
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
12-
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
13-
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
14-
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
15-
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
16-
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
17-
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
18-
'ul', 'var']
10+
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
11+
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
12+
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
13+
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
14+
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
15+
'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4',
16+
'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd',
17+
'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol',
18+
'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre',
19+
'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound',
20+
'source', 'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
21+
'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr', 'tt',
22+
'u', 'ul', 'var', 'video']
1923

2024
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
2125
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
@@ -24,24 +28,35 @@ class HTMLSanitizerMixin(object):
2428
'munderover', 'none']
2529

2630
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
27-
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
28-
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
31+
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
32+
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2933
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
3034
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
3135
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
3236

3337
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
34-
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
35-
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
36-
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
37-
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
38-
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
39-
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
40-
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
41-
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
42-
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
43-
'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
44-
'xml:lang']
38+
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
39+
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
40+
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
41+
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
42+
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
43+
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
44+
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
45+
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
46+
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
47+
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
48+
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
49+
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
50+
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
51+
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
52+
'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
53+
'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
54+
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
55+
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
56+
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
57+
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
58+
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
59+
'width', 'wrap', 'xml:lang']
4560

4661
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
4762
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
@@ -54,43 +69,45 @@ class HTMLSanitizerMixin(object):
5469
'xlink:type', 'xmlns', 'xmlns:xlink']
5570

5671
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
57-
'arabic-form', 'ascent', 'attributeName', 'attributeType',
58-
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
59-
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
60-
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
61-
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
62-
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
63-
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
64-
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
65-
'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
66-
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
67-
'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
68-
'origin', 'overline-position', 'overline-thickness', 'panose-1',
69-
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
70-
'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
71-
'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
72-
'stemh', 'stemv', 'stop-color', 'stop-opacity',
73-
'strikethrough-position', 'strikethrough-thickness', 'stroke',
74-
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
75-
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
76-
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
77-
'transform', 'type', 'u1', 'u2', 'underline-position',
78-
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
79-
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
80-
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
81-
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
82-
'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
83-
'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
72+
'arabic-form', 'ascent', 'attributeName', 'attributeType',
73+
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
74+
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
75+
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
76+
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
77+
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
78+
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
79+
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
80+
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
81+
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
82+
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
83+
'opacity', 'orient', 'origin', 'overline-position',
84+
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
85+
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
86+
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
87+
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
88+
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
89+
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
90+
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
91+
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
92+
'transform', 'type', 'u1', 'u2', 'underline-position',
93+
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
94+
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
95+
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
96+
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
97+
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
98+
'y1', 'y2', 'zoomAndPan']
8499

85100
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
86-
'xlink:href', 'xml:base']
101+
'xlink:href', 'xml:base']
87102

88103
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
89-
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
104+
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
105+
'mask', 'stroke']
90106

91-
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
92-
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
93-
'radialGradient', 'textpath', 'tref', 'set', 'use']
107+
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
108+
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
109+
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
110+
'set', 'use']
94111

95112
acceptable_css_properties = ['azimuth', 'background-color',
96113
'border-bottom-color', 'border-collapse', 'border-color',
@@ -140,7 +157,13 @@ class HTMLSanitizerMixin(object):
140157
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
141158
# => <a>Click here for $100</a>
142159
def sanitize_token(self, token):
143-
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
160+
161+
# accommodate filters which use token_type differently
162+
token_type = token["type"]
163+
if token_type in tokenTypes.keys():
164+
token_type = tokenTypes[token_type]
165+
166+
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
144167
tokenTypes["EmptyTag"]):
145168
if token["name"] in self.allowed_elements:
146169
if token.has_key("data"):
@@ -172,19 +195,24 @@ def sanitize_token(self, token):
172195
token["data"] = [[name,val] for name,val in attrs.items()]
173196
return token
174197
else:
175-
if token["type"] == tokenTypes["EndTag"]:
198+
if token_type == tokenTypes["EndTag"]:
176199
token["data"] = "</%s>" % token["name"]
177200
elif token["data"]:
178201
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
179202
token["data"] = "<%s%s>" % (token["name"],attrs)
180203
else:
181204
token["data"] = "<%s>" % token["name"]
182-
if token["selfClosing"]:
205+
if token.get("selfClosing"):
183206
token["data"]=token["data"][:-1] + "/>"
184-
token["type"] = tokenTypes["Characters"]
207+
208+
if token["type"] in tokenTypes.keys():
209+
token["type"] = "Characters"
210+
else:
211+
token["type"] = tokenTypes["Characters"]
212+
185213
del token["name"]
186214
return token
187-
elif token["type"] == tokenTypes["Comment"]:
215+
elif token_type == tokenTypes["Comment"]:
188216
pass
189217
else:
190218
return token

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy