
Commit 93b8feb

Committed by Drew Hubl
Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs
1 parent f5fd711 commit 93b8feb
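A data URI carries its resource inline, in the form data:<content_type>[;charset=<charset>][;base64],<payload>. Whitelisting the data scheme alone would therefore admit arbitrary inline content such as text/html, so the commit pairs the new scheme with a content-type whitelist that initially allows only image/png.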

File tree: 2 files changed (+65, -12 lines)

html5lib/sanitizer.py

Lines changed: 48 additions & 5 deletions
@@ -2,6 +2,10 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
@@ -138,7 +142,9 @@ class HTMLSanitizerMixin(object):
     acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
         'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
         'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-        'ssh', 'sftp', 'rtsp', 'afs']
+        'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+    acceptable_content_types = ['image/png']
 
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +153,7 @@ class HTMLSanitizerMixin(object):
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse(val_unescaped)
+                if uri:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    rgx = re.compile(r'''
+                                     ^
+                                     # Match a content type <application>/<type>
+                                     (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                     # Match any character set and encoding
+                                     # Note that this does not prevent the
+                                     # same one being set twice
+                                     # The charset group is currently unused
+                                     (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
+                                     # Match the base64-encoded or urlencoded
+                                     # data
+                                     # The data group is currently unused
+                                     (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
+                                     $
+                                     ''',
+                                     re.VERBOSE)
+                    if uri.scheme == 'data':
+                        m = rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        if m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+                        if m.group('encoding'):
+                            if m.group('encoding') == 'base64':
+                                # If the encoding identifier is base64, then
+                                # make sure the data is encoded in base64
+                                if not m.group('base64_encoded_data'):
+                                    del attrs[attr]
+                            else:
+                                del attrs[attr]
+                        else:
+                            # If the encoding is not given, expect the data to
+                            # be urlencoded
+                            if not m.group('url_encoded_data'):
+                                del attrs[attr]
+
 
             for attr in self.svg_attr_val_allows_ref:
                 if attr in attrs:
                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
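
A minimal standalone sketch (not part of the commit) of the check the new code performs on a data URI. The regex and the image/png whitelist are copied from the diff above; the is_allowed helper and the sample URIs are illustrative assumptions:

import re
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

# The same pattern the commit compiles inside allowed_token()
DATA_URI = re.compile(r'''
    ^
    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
    (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
    (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
    $
    ''', re.VERBOSE)

ALLOWED_CONTENT_TYPES = ['image/png']  # mirrors acceptable_content_types

def is_allowed(value):
    # True if the content-type whitelist would keep the attribute,
    # False if the sanitizer would delete it.
    uri = urlparse(value)
    if uri.scheme != 'data':
        return False
    m = DATA_URI.match(uri.path)
    return bool(m) and m.group('content_type') in ALLOWED_CONTENT_TYPES

print(is_allowed('data:image/png;base64,aGVsbG8gd29ybGQ='))  # True
print(is_allowed('data:text/html;base64,aGVsbG8gd29ybGQ='))  # False: type not whitelisted
print(is_allowed('data:image/png;base64'))                    # False: payload missing

Unlike the committed code, the sketch checks that the match succeeded before calling m.group(), so a malformed data URI simply returns False.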

html5lib/tests/test_sanitizer.py

Lines changed: 17 additions & 7 deletions
@@ -80,9 +80,12 @@ def test_sanitizer():
             continue # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = 'http://sub.domain.tld/path/object.ext'
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
+               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
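
The base64 payload used in the new data-URI tests is ordinary text, as a quick check (illustrative, not part of the test suite) confirms:

import base64

print(base64.b64decode('aGVsbG8gd29ybGQ='))  # b'hello world'

A URI like data:image/png;base64,aGVsbG8gd29ybGQ= therefore satisfies the sanitizer's pattern even though the payload is not real PNG data: the whitelist validates only the declared content type, not the decoded bytes.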
