Skip to content

Commit af0199c

Browse files
committed
Merge pull request #175 from gsnedders/fix_tokenizer_201411
Get rid of obsolete replacement of unpaired surrogates with U+FFFD.
2 parents 93ee3b3 + f27af70 commit af0199c

File tree

2 files changed

+1
-8
lines changed

2 files changed

+1
-8
lines changed

.pytest.expect

-228 Bytes
Binary file not shown.

html5lib/inputstream.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -185,14 +185,10 @@ def __init__(self, source):
185185
# Such platforms will have already checked for such
186186
# surrogate errors, so no need to do this checking.
187187
self.reportCharacterErrors = None
188-
self.replaceCharactersRegexp = None
189188
elif len("\U0010FFFF") == 1:
190189
self.reportCharacterErrors = self.characterErrorsUCS4
191-
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
192190
else:
193191
self.reportCharacterErrors = self.characterErrorsUCS2
194-
self.replaceCharactersRegexp = re.compile(
195-
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
196192

197193
# List of where new lines occur
198194
self.newLines = [0]
@@ -290,10 +286,7 @@ def readChunk(self, chunkSize=None):
290286
if self.reportCharacterErrors:
291287
self.reportCharacterErrors(data)
292288

293-
# Replace invalid characters
294-
# Note U+0000 is dealt with in the tokenizer
295-
data = self.replaceCharactersRegexp.sub("\ufffd", data)
296-
289+
# Replace invalid characters
297290
data = data.replace("\r\n", "\n")
298291
data = data.replace("\r", "\n")
299292

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy