Skip to content

Commit f4f1fb8

Browse files
fantasai authored and gsnedders committed
Google Code Issue 157: Add "escape invisible characters" option
Vaguely updated, but basically working.
1 parent 073d792 commit f4f1fb8

File tree

3 files changed

+51
-1
lines changed

3 files changed

+51
-1
lines changed

html5lib/constants.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import gettext
55
_ = gettext.gettext
66

7+
from itertools import chain
8+
9+
710
EOF = None
811

912
E = {
@@ -3078,6 +3081,19 @@
30783081
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
30793082

30803083

3084+
# Code points treated as "invisible" characters. Tab (0x9), LF (0xA),
# CR (0xD) and the ASCII space (0x20) are deliberately not part of the set.
invisibleChars = frozenset(chain.from_iterable([
    # ASCII control chars
    range(0x0, 0x9),
    range(0xB, 0xD),
    range(0xE, 0x20),
    # Other control chars
    # (no code points are listed for this category)
    # fixed-width spaces, zero-width marks, bidi marks
    range(0x2000, 0x2010),
    # LS, PS, bidi control codes
    range(0x2028, 0x2030),
    # nbsp, mathsp, ideosp, WJ, interlinear
    (0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB),
]))
3095+
3096+
30813097
class DataLossWarning(UserWarning):
30823098
pass
30833099

html5lib/serializer/htmlserializer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class HTMLSerializer(object):
9494
# escaping options
9595
escape_lt_in_attrs = False
9696
escape_rcdata = False
97+
escape_invisible = False
9798
resolve_entities = True
9899

99100
# miscellaneous options
@@ -105,7 +106,8 @@ class HTMLSerializer(object):
105106
"minimize_boolean_attributes", "use_trailing_solidus",
106107
"space_before_trailing_solidus", "omit_optional_tags",
107108
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
108-
"escape_rcdata", "resolve_entities", "sanitize")
109+
"escape_rcdata", "escape_invisible", "resolve_entities",
110+
"sanitize")
109111

110112
def __init__(self, **kwargs):
111113
"""Initialize HTMLSerializer.
@@ -127,6 +129,10 @@ def __init__(self, **kwargs):
127129
escape_rcdata=False|True
128130
Whether to escape characters that need to be escaped within normal
129131
elements within rcdata elements such as style.
132+
escape_invisible=False|True|'numeric'|'named'
133+
Whether to escape invisible characters (such as nbsp, fixed-width
134+
spaces, and control codes). Uses named HTML escapes if 'named'
135+
is specified, otherwise uses numeric codes.
130136
resolve_entities=True|False
131137
Whether to resolve named character entities that appear in the
132138
source tree. The XML predefined entities < > & " '
@@ -160,6 +166,8 @@ def __init__(self, **kwargs):
160166

161167
def encode(self, string):
162168
assert(isinstance(string, text_type))
169+
if self.escape_invisible:
170+
text = utils.escapeInvisible(text, self.escape_invisible == 'named')
163171
if self.encoding:
164172
return string.encode(self.encoding, unicode_encode_errors)
165173
else:

html5lib/utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from types import ModuleType
44

5+
from .constants import invisibleChars
6+
57

68
class MethodDispatcher(dict):
79
"""Dict with 2 special properties:
@@ -71,3 +73,27 @@ def moduleFactory(baseModule, *args, **kwargs):
7173
return mod
7274

7375
return moduleFactory
76+
77+
78+
def escapeInvisible(text, useNamedEntities=False):
    """Escape invisible characters other than Tab, LF, CR, and ASCII space

    Every character of ``text`` whose code point is a member of
    ``invisibleChars`` is replaced by an HTML character reference.

    text
      The unicode string to escape.
    useNamedEntities=False|True
      When true, a named reference (``&name;``) is used for every code
      point that has an entry in ``codepoint2name``; code points without
      a name fall back to the numeric form. When false, the hexadecimal
      numeric form (``&#xHH;``) is always used.

    Returns the escaped string; ``text`` itself is returned when it
    contains no escapable character.
    """
    # Resolve the unicode string type locally so the check does not depend
    # on a module-level ``text_type`` name being in scope.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    assert isinstance(text, text_type)

    # This algorithm is O(MN) for M len(text) and N num escapable
    # But it doesn't modify the text when N is zero (common case) and
    # N is expected to be small (usually 1 or 2) in most other cases.
    escapable = set(c for c in text if ord(c) in invisibleChars)
    if not escapable:
        return text

    names = {}
    if useNamedEntities:
        # The entity table lives in a differently named module on
        # Python 2 and Python 3; import whichever one is available.
        try:
            from html.entities import codepoint2name as names
        except ImportError:
            from htmlentitydefs import codepoint2name as names

    # The replacement text consists of printable ASCII only, so one
    # replacement can never introduce a character that a later iteration
    # would escape again.
    for c in escapable:
        name = names.get(ord(c))
        escape = "&%s;" % name if name else "&#x%X;" % ord(c)
        text = text.replace(c, escape)

    return text

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy