Skip to content

Commit 362c648

Browse files
committed
Implemented and added tests for the new list of illegal numeric character references
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401238
1 parent b067b74 commit 362c648

File tree

1 file changed

+11
-7
lines changed

1 file changed

+11
-7
lines changed

src/html5lib/tokenizer.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,17 @@ def consumeNumberEntity(self, isHex):
173173

174174
charAsInt = entitiesWindows1252[charAsInt - 128]
175175

176-
# 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
177-
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
176+
# Certain characters get replaced with U+FFFD
177+
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
178+
or (0x007F <= charAsInt <= 0x009F)
179+
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDDF)
180+
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is hex 0..10 (i.e. planes 0 through 16)
181+
or (0x10FFFF < charAsInt)):
182+
char = u"\uFFFD"
183+
self.tokenQueue.append({"type": "ParseError", "data":
184+
"illegal-codepoint-for-numeric-entity",
185+
"datavars": {"charAsInt": charAsInt}})
186+
else:
178187
try:
179188
# XXX We should have a separate function that does "int" to
180189
# "unicodestring" conversion since this doesn't always work
@@ -187,11 +196,6 @@ def consumeNumberEntity(self, isHex):
187196
self.tokenQueue.append({"type": "ParseError", "data":
188197
"cant-convert-numeric-entity",
189198
"datavars": {"charAsInt": charAsInt}})
190-
else:
191-
char = u"\uFFFD"
192-
self.tokenQueue.append({"type": "ParseError", "data":
193-
"illegal-codepoint-for-numeric-entity",
194-
"datavars": {"charAsInt": charAsInt}})
195199

196200
# Discard the ; if present. Otherwise, put it back on the queue and
197201
# invoke parseError on parser.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy