Skip to content

Commit cefc5d9

Browse files
[JsonPath] Better handling of Unicode chars in expressions
1 parent 0795d65 commit cefc5d9

File tree

4 files changed

+357
-2
lines changed

4 files changed

+357
-2
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ private function evaluateBracket(string $expr, mixed $value): array
230230

231231
// quoted strings for object keys
232232
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
233-
$key = stripslashes($matches[2]);
233+
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
234234

235235
return \array_key_exists($key, $value) ? [$value[$key]] : [];
236236
}
@@ -335,7 +335,7 @@ private function evaluateScalar(string $expr, array $context): mixed
335335

336336
// string literals
337337
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
338-
return $matches[2];
338+
return JsonPathUtils::unescapeString($matches[2], $matches[1]);
339339
}
340340

341341
// current node references

src/Symfony/Component/JsonPath/JsonPathUtils.php

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,80 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi
8585
'tokens' => $remainingTokens,
8686
];
8787
}
88+
89+
/**
 * Resolves the backslash escape sequences of a quoted string literal.
 *
 * Double-quoted literals are first handed to json_decode(); when that fails, and for
 * any other quote character, escapes are resolved one by one. An unrecognised escape
 * is kept verbatim, backslash included.
 *
 * @param string $str       the literal's content, without its surrounding quotes
 * @param string $quoteChar the quote character that delimited the literal
 */
public static function unescapeString(string $str, string $quoteChar): string
{
    if ('"' === $quoteChar) {
        // try JSON decoding first for unicode sequences
        $decoded = json_decode('"'.$str.'"', true);

        if (null !== $decoded) {
            return $decoded;
        }
    }

    $result = '';
    $length = strlen($str);

    for ($i = 0; $i < $length; ++$i) {
        // anything that is not a backslash followed by at least one character is copied as is
        if ('\\' !== $str[$i] || $i + 1 >= $length) {
            $result .= $str[$i];

            continue;
        }

        $nextChar = $str[$i + 1];

        $result .= match ($nextChar) {
            '"', "'", '\\', '/' => $nextChar,
            // PHP has no "\b" escape ("\b" is a backslash followed by "b"), so the backspace byte is spelled out
            'b' => "\x08",
            'f' => "\f",
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            // receives $i by reference and may move it past the hex digits it consumes
            'u' => self::unescapeUnicodeSequence($str, $length, $i),
            default => $str[$i].$nextChar, // keep the backslash
        };

        ++$i; // step over the escaped character
    }

    return $result;
}
130+
131+
/**
 * Decodes the "\uXXXX" escape, or the "\uXXXX\uXXXX" surrogate pair, whose backslash sits at offset $i.
 *
 * When a character is decoded, $i is moved forward by 4 (single escape) or by 10 (surrogate pair).
 * When the escape is malformed (too short, non-hex digits, lone surrogate), the literal "\u" is
 * returned and $i is left untouched, so the characters that follow are not skipped.
 *
 * @param string $str    the string being unescaped
 * @param int    $length the byte length of $str
 * @param int    $i      offset of the backslash in $str; updated by reference
 */
private static function unescapeUnicodeSequence(string $str, int $length, int &$i): string
{
    // Both characters are returned for malformed escapes: returning only the backslash
    // loses the "u" when the calling code steps over the escaped character.
    $literal = '\\u';

    if ($i + 5 >= $length) {
        // not enough characters for Unicode escape, treat as literal
        return $literal;
    }

    $hex = substr($str, $i + 2, 4);
    if (!ctype_xdigit($hex)) {
        // invalid hex, treat as literal
        return $literal;
    }

    $codepoint = (int) hexdec($hex);

    // high surrogate followed by enough room for a second "\uXXXX" escape
    $isHighSurrogate = 0xD800 <= $codepoint && $codepoint <= 0xDBFF;
    if ($isHighSurrogate && $i + 11 < $length && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) {
        $lowHex = substr($str, $i + 8, 4);
        if (ctype_xdigit($lowHex)) {
            $lowSurrogate = (int) hexdec($lowHex);
            if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) {
                $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
                $i += 10; // skip surrogate pair

                return mb_chr($codepoint, 'UTF-8');
            }
        }
    }

    if (0xD800 <= $codepoint && $codepoint <= 0xDFFF) {
        // lone surrogate: not a valid character on its own (mb_chr() would return false), treat as literal
        return $literal;
    }

    // single Unicode character, skip the sequence
    $i += 4;

    return mb_chr($codepoint, 'UTF-8');
}
88164
}

src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php

Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,260 @@ public function testAcceptsJsonPath()
404404
$this->assertSame('red', $result[0]['color']);
405405
}
406406

407+
/**
408+
* @dataProvider provideUnicodeEscapeSequencesProvider
409+
*/
410+
public function testUnicodeEscapeSequences(string $jsonPath, array $expected)
{
    // every case is evaluated against the shared Unicode fixture document
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
414+
415+
/**
 * Double-quoted selectors, one case per row: [JSONPath expression, expected matches].
 */
public static function provideUnicodeEscapeSequencesProvider(): array
{
    return [
        // \uXXXX escapes inside object keys
        ['$["caf\u00e9"]', ['coffee']],
        ['$["\u65e5\u672c"]', ['Japan']],
        // expects no match for this key
        ['$["M\u00fcller"]', []],

        // surrogate pair
        ['$["emoji\ud83d\ude00"]', ['smiley']],

        // single-character escapes
        ['$["tab\there"]', ['with tab']],
        ['$["new\nline"]', ['with newline']],
        ['$["quote\"here"]', ['with quote']],
        ['$["backslash\\\\here"]', ['with backslash']],
        ['$["apostrophe\'here"]', ['with apostrophe']],

        // escaped control character
        ['$["control\u0001char"]', ['with control char']],

        // escaped ASCII letter combined with a non-ASCII escape
        ['$["\u0063af\u00e9"]', ['coffee']],
    ];
}
468+
469+
/**
470+
* @dataProvider provideSingleQuotedStringProvider
471+
*/
472+
public function testSingleQuotedStrings(string $jsonPath, array $expected)
{
    // same fixture document as the double-quoted cases, queried with single-quoted selectors
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
476+
477+
/**
 * Single-quoted selectors, one case per row: [JSONPath expression, expected matches].
 */
public static function provideSingleQuotedStringProvider(): array
{
    return [
        // \uXXXX escapes inside object keys
        ["$['caf\\u00e9']", ['coffee']],
        ["$['\\u65e5\\u672c']", ['Japan']],
        // unescaped double quote inside a single-quoted selector
        ["$['quote\"here']", ['with quote']],
        // expects no match for this key
        ["$['M\\u00fcller']", []],

        // surrogate pair
        ["$['emoji\\ud83d\\ude00']", ['smiley']],

        // single-character escapes
        ["$['tab\\there']", ['with tab']],
        ["$['quote\\\"here']", ['with quote']],
        ["$['backslash\\\\here']", ['with backslash']],
        ["$['apostrophe\\'here']", ['with apostrophe']],

        // escaped control character
        ["$['control\\u0001char']", ['with control char']],

        // escaped ASCII letter combined with a non-ASCII escape
        ["$['\\u0063af\\u00e9']", ['coffee']],
    ];
}
530+
531+
/**
532+
* @dataProvider provideFilterWithUnicodeProvider
533+
*/
534+
public function testFilterWithUnicodeStrings(string $jsonPath, int $expectedCount, string $expectedCountry)
{
    $matches = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertCount($expectedCount, $matches);

    // nothing else to inspect when the filter is expected to match no entry
    if ($expectedCount <= 0) {
        return;
    }

    $firstMatch = $matches[0];
    $this->assertSame($expectedCountry, $firstMatch['country']);
}
544+
545+
/**
 * Filter expressions, one case per row: [JSONPath expression, expected match count, country of the first match].
 */
public static function provideFilterWithUnicodeProvider(): array
{
    return [
        ['$.users[?(@.name == "caf\u00e9")]', 1, 'France'],
        ['$.users[?(@.name == "\u65e5\u672c\u592a\u90ce")]', 1, 'Japan'],
        ['$.users[?(@.name == "Jos\u00e9")]', 1, 'Spain'],
        // plain ASCII literal, no escape involved
        ['$.users[?(@.name == "John")]', 1, 'USA'],
        // no match expected, so the country is not checked
        ['$.users[?(@.name == "NonExistent\u0020Name")]', 0, ''],
    ];
}
575+
576+
/**
577+
* @dataProvider provideInvalidUnicodeSequenceProvider
578+
*/
579+
public function testInvalidUnicodeSequencesAreProcessedAsLiterals(string $jsonPath)
{
    // only checks that the lookup completes and yields an array, not what it contains
    $matches = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertIsArray($matches, 'invalid unicode sequence should be treated as literal and not throw');
}
583+
584+
/**
 * Selectors holding a malformed "\u" escape, one JSONPath expression per row.
 */
public static function provideInvalidUnicodeSequenceProvider(): array
{
    return [
        // non-hexadecimal digits
        ['$["test\uZZZZ"]'],
        // only three hex digits
        ['$["test\u123"]'],
        // no digits at all
        ['$["test\u"]'],
    ];
}
598+
599+
/**
600+
* @dataProvider provideComplexUnicodePath
601+
*/
602+
public function testComplexUnicodePaths(string $jsonPath, array $expected)
{
    // nested document whose keys and values are non-ASCII, including emoji
    $users = [
        ['名前' => 'テスト', 'ID' => 1],
        ['名前' => 'サンプル', 'ID' => 2],
    ];
    $document = [
        'データ' => ['ユーザー' => $users],
        'special🔑' => ['value💎' => 'treasure'],
    ];

    $crawler = new JsonCrawler(json_encode($document));
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
620+
621+
/**
 * Multi-segment selectors written with escapes only, one case per row: [JSONPath expression, expected matches].
 */
public static function provideComplexUnicodePath(): array
{
    return [
        // index access below two escaped keys
        ['$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][0]["\u540d\u524d"]', ['テスト']],
        // keys ending with an emoji written as a surrogate pair
        ['$["special\ud83d\udd11"]["value\ud83d\udc8e"]', ['treasure']],
        // wildcard below two escaped keys
        ['$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][*]["\u540d\u524d"]', ['テスト', 'サンプル']],
    ];
}
638+
639+
public function testSurrogatePairHandling()
{
    $document = ['𝒽𝑒𝓁𝓁𝑜' => 'mathematical script hello'];
    $crawler = new JsonCrawler(json_encode($document));

    // mathematical script "hello" requires surrogate pairs for each character
    $escapedPath = '$["\ud835\udcbd\ud835\udc52\ud835\udcc1\ud835\udcc1\ud835\udc5c"]';

    $this->assertSame(['mathematical script hello'], $crawler->find($escapedPath));
}
648+
649+
public function testMixedQuoteTypes()
{
    $crawler = new JsonCrawler(json_encode([
        'key"with"quotes' => 'value1',
        "key'with'apostrophes" => 'value2',
    ]));

    // single-quoted selector containing unescaped double quotes
    $this->assertSame(['value1'], $crawler->find('$[\'key"with"quotes\']'));

    // double-quoted selector containing unescaped apostrophes
    $this->assertSame(['value2'], $crawler->find('$["key\'with\'apostrophes"]'));
}
660+
407661
private static function getBookstoreCrawler(): JsonCrawler
408662
{
409663
return new JsonCrawler(<<<JSON
@@ -453,4 +707,28 @@ private static function getSimpleCollectionCrawler(): JsonCrawler
453707
{"a": [3, 5, 1, 2, 4, 6]}
454708
JSON);
455709
}
710+
711+
/**
 * Builds the crawler over the fixture document used by the Unicode and escape-sequence tests.
 *
 * Top-level keys hold non-ASCII text, an emoji and characters that need escaping in a
 * selector; "users" is a list of name/country records used by the filter tests.
 */
private static function getUnicodeDocumentCrawler(): JsonCrawler
{
    $json = [
        'café' => 'coffee',
        '日本' => 'Japan',
        'emoji😀' => 'smiley',
        // explicit escape rather than a raw tab character, which is invisible and easily turned into a space
        "tab\there" => 'with tab',
        "new\nline" => 'with newline',
        'quote"here' => 'with quote',
        'backslash\\here' => 'with backslash',
        'apostrophe\'here' => 'with apostrophe',
        "control\x01char" => 'with control char',
        'users' => [
            ['name' => 'café', 'country' => 'France'],
            ['name' => '日本太郎', 'country' => 'Japan'],
            ['name' => 'John', 'country' => 'USA'],
            ['name' => 'Müller', 'country' => 'Germany'],
            ['name' => 'José', 'country' => 'Spain'],
        ],
    ];

    return new JsonCrawler(json_encode($json));
}
456734
}

src/Symfony/Component/JsonPath/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
],
1818
"require": {
1919
"php": ">=8.2",
20+
"symfony/polyfill-ctype": "~1.8",
2021
"symfony/polyfill-mbstring": "~1.0"
2122
},
2223
"require-dev": {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy