Skip to content

Commit b95d51c

Browse files
committed
Fix test-case failures due to character reference parsing bugs.
More completely, change the entire structure of the NCR array to be a multi-dimensional array, with each dimension having one character as its key and, as its value, the possible further steps plus, sometimes, 'codepoint': the codepoint that the path up to that point represents. Hence, to get &amp; now, you would need $ncrs['a']['m']['p'][';']['codepoint']. 'codepoint' is required so we can cope with both &amp and &amp; (which means the codepoint can't just be the value stored at &amp, since that entry must also hold the further step ';'). This also removes Data::getNamedCharacterReferenceMaxLength(), as it is now useless.
1 parent ef63fc9 commit b95d51c

File tree

4 files changed

+63
-36
lines changed

4 files changed

+63
-36
lines changed

library/HTML5/Data.php

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,6 @@ public static function getNamedCharacterReferences() {
6565
return self::$namedCharacterReferences;
6666
}
6767

68-
public static function getNamedCharacterReferenceMaxLength() {
69-
if (!self::$namedCharacterReferenceMaxLength) {
70-
$namedCharacterReferences = self::getNamedCharacterReferences();
71-
$lengths = array_map('strlen', array_keys($namedCharacterReferences));
72-
self::$namedCharacterReferenceMaxLength = max($lengths);
73-
}
74-
return self::$namedCharacterReferenceMaxLength;
75-
}
76-
77-
7868
/**
7969
* Converts a Unicode codepoint to sequence of UTF-8 bytes.
8070
* @note Shamelessly stolen from HTML Purifier, which is also

library/HTML5/Tokenizer.php

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2197,21 +2197,32 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
21972197
with the consumed characters matching one of the
21982198
identifiers in the first column of the named character
21992199
references table (in a case-sensitive manner). */
2200-
2201-
// we will implement this by matching the longest
2202-
// alphanumeric + semicolon string, and then working
2203-
// our way backwards
2204-
$chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
2205-
$len = strlen($chars);
2200+
// What we actually do here is consume as much as we can while it
2201+
// matches the start of one of the identifiers in the first column.
22062202

22072203
$refs = HTML5_Data::getNamedCharacterReferences();
2204+
2205+
// Get the longest string which is the start of an identifier
2206+
// ($chars) as well as the longest identifier which matches ($id)
2207+
// and its codepoint ($codepoint).
22082208
$codepoint = false;
2209-
for($c = $len; $c > 0; $c--) {
2210-
$id = substr($chars, 0, $c);
2211-
if(isset($refs[$id])) {
2212-
$codepoint = $refs[$id];
2213-
break;
2209+
$char = $chars;
2210+
while ($char !== false && isset($refs[$char])) {
2211+
$refs = $refs[$char];
2212+
if (isset($refs['codepoint'])) {
2213+
$id = $chars;
2214+
$codepoint = $refs['codepoint'];
22142215
}
2216+
$chars .= $char = $this->stream->char();
2217+
}
2218+
2219+
// Unconsume the one character we just took which caused the while
2220+
// statement to fail. This could be anything and could cause state
2221+
// changes (as if it matches the while loop it must be
2222+
// alphanumeric so we can just concat it to whatever we get later).
2223+
$this->stream->unget();
2224+
if ($char !== false) {
2225+
$chars = substr($chars, 0, -1);
22152226
}
22162227

22172228
/* If no match can be made, then this is a parse error.
@@ -2235,7 +2246,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22352246
$semicolon = false;
22362247
}
22372248

2238-
22392249
/* If the character reference is being consumed as part of
22402250
an attribute, and the last character matched is not a
22412251
U+003B SEMICOLON (;), and the next character is in the
@@ -2245,17 +2255,27 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22452255
then, for historical reasons, all the characters that were
22462256
matched after the U+0026 AMPERSAND (&) must be unconsumed,
22472257
and nothing is returned. */
2248-
if (
2249-
$inattr && !$semicolon &&
2250-
strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
2251-
) {
2252-
return '&' . $chars;
2258+
if ($inattr && !$semicolon) {
2259+
// The next character is either the next character in $chars or in the stream.
2260+
if (strlen($chars) > strlen($id)) {
2261+
$next = substr($chars, strlen($id), 1);
2262+
} else {
2263+
$next = $this->stream->char();
2264+
$this->stream->unget();
2265+
}
2266+
if (
2267+
'0' <= $next && $next <= '9' ||
2268+
'A' <= $next && $next <= 'Z' ||
2269+
'a' <= $next && $next <= 'z'
2270+
) {
2271+
return '&' . $chars;
2272+
}
22532273
}
22542274

22552275
/* Otherwise, return a character token for the character
22562276
corresponding to the character reference name (as given
22572277
by the second column of the named character references table). */
2258-
return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
2278+
return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
22592279
}
22602280
}
22612281

library/HTML5/named-character-references.ser

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

maintenance/scrape-ncr.php

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,36 @@
1212
}
1313

1414
$url = 'http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html';
15-
$request = new HttpRequest($url);
16-
$request->send();
17-
$html = $request->getResponseBody();
15+
if (extension_loaded('pecl_http')) {
16+
$request = new HttpRequest($url);
17+
$request->send();
18+
$html = $request->getResponseBody();
19+
} else {
20+
$html = file_get_contents($url);
21+
}
1822

1923
preg_match_all(
20-
'#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U+([^<]+?)\s*<#',
24+
'#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U\+([^<]+?)\s*<#',
2125
$html, $matches, PREG_SET_ORDER);
2226

2327
$table = array();
2428
foreach ($matches as $match) {
25-
$ncr = $match[1];
26-
$codepoint = hexdec($match[2]);
27-
$table[$ncr] = $codepoint;
29+
list(, $name, $codepoint) = $match;
30+
31+
// Set the subtable we're working with initially to the whole table.
32+
$subtable =& $table;
33+
34+
// Loop over each character in the name, creating an array key for it if it
35+
// doesn't already exist
36+
for ($i = 0, $len = strlen($name); $i < $len; $i++) {
37+
if (!isset($subtable[$name[$i]])) {
38+
$subtable[$name[$i]] = null;
39+
}
40+
$subtable =& $subtable[$name[$i]];
41+
}
42+
43+
// Set the key codepoint to the codepoint.
44+
$subtable['codepoint'] = hexdec($codepoint);
2845
}
2946

3047
file_put_contents($output, serialize($table));

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy