Skip to content

Commit 52f9ca6

Browse files
mfagsnedders
authored and committed
refactor allowed_token and disallowed_token as new methods in HTMLSanitizerMixin for usage in subclass.
1 parent 90aa9f4 commit 52f9ca6

File tree

1 file changed

+51
-45
lines changed

1 file changed

+51
-45
lines changed

html5lib/sanitizer.py

Lines changed: 51 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -168,57 +168,63 @@ def sanitize_token(self, token):
168168
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
169169
tokenTypes["EmptyTag"]):
170170
if token["name"] in self.allowed_elements:
171-
if "data" in token:
172-
attrs = dict([(name,val) for name,val in
173-
token["data"][::-1]
174-
if name in self.allowed_attributes])
175-
for attr in self.attr_val_is_uri:
176-
if attr not in attrs:
177-
continue
178-
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
179-
unescape(attrs[attr])).lower()
180-
#remove replacement characters from unescaped characters
181-
val_unescaped = val_unescaped.replace("\ufffd", "")
182-
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
183-
(val_unescaped.split(':')[0] not in
184-
self.allowed_protocols)):
185-
del attrs[attr]
186-
for attr in self.svg_attr_val_allows_ref:
187-
if attr in attrs:
188-
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
189-
' ',
190-
unescape(attrs[attr]))
191-
if (token["name"] in self.svg_allow_local_href and
192-
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
193-
attrs['xlink:href'])):
194-
del attrs['xlink:href']
195-
if 'style' in attrs:
196-
attrs['style'] = self.sanitize_css(attrs['style'])
197-
token["data"] = [[name,val] for name,val in list(attrs.items())]
198-
return token
171+
return self.allowed_token(token, token_type)
199172
else:
200-
if token_type == tokenTypes["EndTag"]:
201-
token["data"] = "</%s>" % token["name"]
202-
elif token["data"]:
203-
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
204-
token["data"] = "<%s%s>" % (token["name"],attrs)
205-
else:
206-
token["data"] = "<%s>" % token["name"]
207-
if token.get("selfClosing"):
208-
token["data"]=token["data"][:-1] + "/>"
209-
210-
if token["type"] in list(tokenTypes.keys()):
211-
token["type"] = "Characters"
212-
else:
213-
token["type"] = tokenTypes["Characters"]
214-
215-
del token["name"]
216-
return token
173+
return self.disallowed_token(token, token_type)
217174
elif token_type == tokenTypes["Comment"]:
218175
pass
219176
else:
220177
return token
221178

179+
def allowed_token(self, token, token_type):
    """Sanitize the attributes of a token whose element is whitelisted.

    Filters the token's attribute list down to ``self.allowed_attributes``,
    strips URI attributes that use a non-whitelisted protocol, neutralizes
    SVG ``url(...)`` references and non-local ``xlink:href`` values, and
    sanitizes any inline ``style`` attribute via ``self.sanitize_css``.

    :arg token: the tag token (mutated in place)
    :arg token_type: the token's type constant (unused here; kept for the
        subclass-hook signature shared with ``disallowed_token``)
    :returns: the sanitized token
    """
    if "data" in token:
        # Reversed so that, for duplicated attributes, the FIRST
        # occurrence in the source wins when building the dict.
        attrs = {name: val for name, val in token["data"][::-1]
                 if name in self.allowed_attributes}
        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            # Collapse control characters, spaces and backticks that
            # browsers ignore inside protocol names, then lowercase.
            # Raw strings: \000-\040 / \177-\240 are regex octal ranges,
            # not (deprecated) Python string escapes.
            val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
                                   unescape(attrs[attr])).lower()
            # Remove replacement characters from unescaped characters.
            val_unescaped = val_unescaped.replace("\ufffd", "")
            if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                    (val_unescaped.split(':')[0] not in
                     self.allowed_protocols)):
                # Attribute has an explicit, non-whitelisted protocol.
                del attrs[attr]
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                # Blank out url(...) references that point outside the
                # document (anything not starting with '#').
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))
        if (token["name"] in self.svg_allow_local_href and
                'xlink:href' in attrs and
                re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
            # Only local (fragment) hrefs are allowed on these elements.
            del attrs['xlink:href']
        if 'style' in attrs:
            attrs['style'] = self.sanitize_css(attrs['style'])
        token["data"] = [[name, val] for name, val in attrs.items()]
    return token
208+
209+
def disallowed_token(self, token, token_type):
    """Re-emit a disallowed tag as escaped literal text.

    The tag token is serialized back to its textual form (attributes
    escaped) and converted into a Characters token so the markup is
    displayed rather than interpreted.

    :arg token: the tag token (mutated in place)
    :arg token_type: the token's type constant
    :returns: the token, rewritten as a Characters token
    """
    tag_name = token["name"]
    if token_type == tokenTypes["EndTag"]:
        token["data"] = "</%s>" % tag_name
    elif token["data"]:
        serialized_attrs = ''.join(
            ' %s="%s"' % (attr_name, escape(attr_value))
            for attr_name, attr_value in token["data"])
        token["data"] = "<%s%s>" % (tag_name, serialized_attrs)
    else:
        token["data"] = "<%s>" % tag_name
    if token.get("selfClosing"):
        # Re-insert the self-closing slash before the final '>'.
        token["data"] = token["data"][:-1] + "/>"

    # Token streams differ: some carry type NAMES, others the numeric
    # constants — normalize to a Characters token either way.
    if token["type"] in tokenTypes:
        token["type"] = "Characters"
    else:
        token["type"] = tokenTypes["Characters"]

    del token["name"]
    return token
227+
222228
def sanitize_css(self, style):
223229
# disallow urls
224230
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy