diff --git a/AUTHORS.rst b/AUTHORS.rst
index 4148a6ed..b9a8fc8b 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -32,3 +32,5 @@ Patches and suggestions
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Drew Hubl
+- Austin Kumbera
diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 469d9b40..6bbd872f 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -2,11 +2,26 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+from six.moves import urllib_parse as urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
 
 
+content_type_rgx = re.compile(r'''
+                               ^
+                               # Match a content type <application>/<type>
+                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                               # Match any character set and encoding
+                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                               # Assume the rest is data
+                               ,.*
+                               $
+                               ''',
+                              re.VERBOSE)
+
+
 class HTMLSanitizerMixin(object):
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
 
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
     acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', 'mailto',
                             'news', 'gopher', 'nntp', 'telnet', 'webcal',
                             'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
+                            'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
 
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse.urlparse(val_unescaped)
+                if uri and uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    if uri.scheme == 'data':
+                        m = content_type_rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
             for attr in self.svg_attr_val_allows_ref:
                 if attr in attrs:
                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index 1cc687df..4862570d 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -80,9 +80,12 @@ def test_sanitizer():
             continue  # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
-               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
+               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s:foo\">foo</a>" % protocol,
-               """<a href="%s:foo">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
+               "<a href=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<a href="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s:foo\">foo</a>" % protocol,
-               """<a href="%s:foo">foo</a>""" % protocol,
+               "<a href=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<a href="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)