@@ -117,7 +117,7 @@ public static function unescapeString(string $str, string $quoteChar): string
117
117
't ' => "\t" ,
118
118
'u ' => self ::unescapeUnicodeSequence ($ str , $ i ),
119
119
$ quoteChar => $ quoteChar ,
120
- default => throw new JsonCrawlerException ('' , \sprintf ('Invalid escape sequence " \\%s" in %s-quoted string ' , $ str [$ i + 1 ], "' " === $ quoteChar ? 'single ' : 'double ' )),
120
+ default => throw new JsonCrawlerException ('' , \sprintf ('Invalid escape sequence " \\%s" in %s-quoted string. ' , $ str [$ i + 1 ], "' " === $ quoteChar ? 'single ' : 'double ' )),
121
121
};
122
122
123
123
++$ i ;
@@ -132,30 +132,33 @@ public static function unescapeString(string $str, string $quoteChar): string
132
132
private static function unescapeUnicodeSequence (string $ str , int &$ i ): string
133
133
{
134
134
if (!isset ($ str [$ i + 5 ]) || !ctype_xdigit (substr ($ str , $ i + 2 , 4 ))) {
135
- throw new JsonCrawlerException ('' , 'Invalid unicode escape sequence ' );
135
+ throw new JsonCrawlerException ('' , 'Invalid unicode escape sequence. ' );
136
136
}
137
137
138
- $ hex = substr ($ str , $ i + 2 , 4 );
138
+ $ codepoint = hexdec ( substr ($ str , $ i + 2 , 4 ) );
139
139
140
- $ codepoint = hexdec ($ hex );
141
140
// looks like a valid Unicode codepoint, string length is sufficient and it starts with \u
142
- if (0xD800 <= $ codepoint && $ codepoint <= 0xDBFF && isset ($ str [$ i + 11 ]) && '\\' === $ str [$ i + 6 ] && 'u ' === $ str [$ i + 7 ]) {
143
- $ lowHex = substr ($ str , $ i + 8 , 4 );
144
- if (ctype_xdigit ($ lowHex )) {
145
- $ lowSurrogate = hexdec ($ lowHex );
146
- if (0xDC00 <= $ lowSurrogate && $ lowSurrogate <= 0xDFFF ) {
147
- $ codepoint = 0x10000 + (($ codepoint & 0x3FF ) << 10 ) + ($ lowSurrogate & 0x3FF );
148
- $ i += 10 ; // skip surrogate pair
149
-
150
- return mb_chr ($ codepoint , 'UTF-8 ' );
151
- }
152
- }
141
+ if (0xD800 <= $ codepoint
142
+ && $ codepoint <= 0xDBFF
143
+ && isset ($ str [$ i + 11 ])
144
+ && '\\' === $ str [$ i + 6 ]
145
+ && 'u ' === $ str [$ i + 7 ]
146
+ && ctype_xdigit ($ lowSurrogate = substr ($ str , $ i + 8 , 4 ))
147
+ && 0xDC00 <= ($ lowSurrogate = hexdec ($ lowSurrogate ))
148
+ && $ lowSurrogate <= 0xDFFF
149
+ ) {
150
+ $ codepoint = 0x10000 + (($ codepoint & 0x3FF ) << 10 ) + ($ lowSurrogate & 0x3FF );
151
+ $ i += 10 ; // skip surrogate pair
152
+ } else {
153
+ // single Unicode character or invalid surrogate, skip the sequence
154
+ $ i += 4 ;
153
155
}
154
156
155
- // single Unicode character or invalid surrogate, skip the sequence
156
- $ i += 4 ;
157
+ if (false === $ chr = mb_chr ($ codepoint , 'UTF-8 ' )) {
158
+ throw new JsonCrawlerException ('' , \sprintf ('Invalid Unicode codepoint: U+%04X. ' , $ codepoint ));
159
+ }
157
160
158
- return mb_chr ( $ codepoint , ' UTF-8 ' ) ;
161
+ return $ chr ;
159
162
}
160
163
161
164
/**
0 commit comments