Skip to content

Commit 6b40e76

Browse files
author
Edward Z. Yang
committed
Initial implementation of numeric entities and tests; not yet complete, needs spec clarification.
--HG-- branch : numeric-entities
1 parent 1cbacc5 commit 6b40e76

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed

library/HTML5/Data.php

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@ class HTML5_Data
88
// at some point this should be moved to a .ser file. Another
99
// possible optimization is to give UTF-8 bytes, not Unicode
1010
// codepoints
11+
// XXX: Not quite sure why it's named this; this is
12+
// actually the numeric entity dereference table.
1113
protected static $realCodepointTable = array(
14+
0x00 => 0xFFFD, // REPLACEMENT CHARACTER
1215
0x0D => 0x000A, // LINE FEED (LF)
1316
0x80 => 0x20AC, // EURO SIGN ('€')
14-
0x81 => 0xFFFD, // REPLACEMENT CHARACTER
17+
0x81 => 0x0081, // <control>
1518
0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
1619
0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
1720
0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
@@ -23,10 +26,10 @@ class HTML5_Data
2326
0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
2427
0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
2528
0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
26-
0x8D => 0xFFFD, // REPLACEMENT CHARACTER
29+
0x8D => 0x008D, // <control>
2730
0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
28-
0x8F => 0xFFFD, // REPLACEMENT CHARACTER
29-
0x90 => 0xFFFD, // REPLACEMENT CHARACTER
31+
0x8F => 0x008F, // <control>
32+
0x90 => 0x0090, // <control>
3033
0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
3134
0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
3235
0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
@@ -39,7 +42,7 @@ class HTML5_Data
3942
0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
4043
0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
4144
0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
42-
0x9D => 0xFFFD, // REPLACEMENT CHARACTER
45+
0x9D => 0x009D, // <control>
4346
0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
4447
0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
4548
);
@@ -71,12 +74,13 @@ public static function getNamedCharacterReferences() {
7174
* shamelessly stolen from Feyd (which is in public domain).
7275
*/
7376
public static function utf8chr($code) {
74-
if($code > 0x10FFFF or $code < 0x0 or
77+
/* We don't care: we live dangerously
78+
* if($code > 0x10FFFF or $code < 0x0 or
7579
($code >= 0xD800 and $code <= 0xDFFF) ) {
7680
// bits are set outside the "valid" range as defined
7781
// by UNICODE 4.1.0
7882
return "\xEF\xBF\xBD";
79-
}
83+
}*/
8084

8185
$x = $y = $z = $w = 0;
8286
if ($code < 0x80) {

library/HTML5/Tokenizer.php

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2213,6 +2213,16 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22132213
));
22142214
$codepoint = $new_codepoint;
22152215
} else {
2216+
// our logic is structured a little differently from the
2217+
// spec's but they're equivalent. The transform is:
2218+
// spec:
2219+
// return character for codepoint
2220+
// if in range:
2221+
// parse error
2222+
// ours:
2223+
// if in range:
2224+
// parse error
2225+
// return character for codepoint
22162226
/* Otherwise, if the number is in the range 0x0000 to 0x0008,
22172227
U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF ,
22182228
0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
@@ -2238,7 +2248,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22382248
'type' => self::PARSEERROR,
22392249
'data' => 'illegal-codepoint-for-numeric-entity'
22402250
));
2241-
$codepoint = 0xFFFD;
22422251
}
22432252
}
22442253

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy