From d56b14862e4d2a01073634362862cf2a22c14135 Mon Sep 17 00:00:00 2001 From: Alexandre Daubois Date: Wed, 4 Jun 2025 10:02:35 +0200 Subject: [PATCH] [JsonPath] Better handling of Unicode chars in expressions --- .../Component/JsonPath/JsonCrawler.php | 4 +- .../JsonPath/JsonCrawlerInterface.php | 2 +- src/Symfony/Component/JsonPath/JsonPath.php | 6 +- .../Component/JsonPath/JsonPathUtils.php | 74 +++++ .../JsonPath/Tests/JsonCrawlerTest.php | 269 ++++++++++++++++++ .../Test/JsonPathAssertionsTraitTest.php | 9 + src/Symfony/Component/JsonPath/composer.json | 1 + 7 files changed, 359 insertions(+), 6 deletions(-) diff --git a/src/Symfony/Component/JsonPath/JsonCrawler.php b/src/Symfony/Component/JsonPath/JsonCrawler.php index b1d7ef0bf94d8..492f56e77bba7 100644 --- a/src/Symfony/Component/JsonPath/JsonCrawler.php +++ b/src/Symfony/Component/JsonPath/JsonCrawler.php @@ -230,7 +230,7 @@ private function evaluateBracket(string $expr, mixed $value): array // quoted strings for object keys if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) { - $key = stripslashes($matches[2]); + $key = JsonPathUtils::unescapeString($matches[2], $matches[1]); return \array_key_exists($key, $value) ? [$value[$key]] : []; } @@ -335,7 +335,7 @@ private function evaluateScalar(string $expr, array $context): mixed // string literals if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) { - return $matches[2]; + return JsonPathUtils::unescapeString($matches[2], $matches[1]); } // current node references diff --git a/src/Symfony/Component/JsonPath/JsonCrawlerInterface.php b/src/Symfony/Component/JsonPath/JsonCrawlerInterface.php index 3e8a222f0ba8e..4859c2bde076b 100644 --- a/src/Symfony/Component/JsonPath/JsonCrawlerInterface.php +++ b/src/Symfony/Component/JsonPath/JsonCrawlerInterface.php @@ -25,7 +25,7 @@ interface JsonCrawlerInterface * @return list * * @throws InvalidArgumentException When the JSON string provided to the crawler cannot be decoded - * @throws JsonCrawlerException When a syntax error occurs in the provided JSON path + * @throws JsonCrawlerException When a syntax error occurs in the provided JSON path */ public function find(string|JsonPath $query): array; } diff --git a/src/Symfony/Component/JsonPath/JsonPath.php b/src/Symfony/Component/JsonPath/JsonPath.php index e716167eb3f64..e36fc9ffd2ef1 100644 --- a/src/Symfony/Component/JsonPath/JsonPath.php +++ b/src/Symfony/Component/JsonPath/JsonPath.php @@ -92,12 +92,12 @@ private function escapeKey(string $key): string "\r" => '\\r', "\t" => '\\t', "\b" => '\\b', - "\f" => '\\f' + "\f" => '\\f', ]); - for ($i = 0; $i <= 31; $i++) { + for ($i = 0; $i <= 31; ++$i) { if ($i < 8 || $i > 13) { - $key = str_replace(chr($i), sprintf('\\u%04x', $i), $key); + $key = str_replace(\chr($i), \sprintf('\\u%04x', $i), $key); } } diff --git a/src/Symfony/Component/JsonPath/JsonPathUtils.php b/src/Symfony/Component/JsonPath/JsonPathUtils.php index b5ac2ae6b8d0a..6f971d20115b2 100644 --- a/src/Symfony/Component/JsonPath/JsonPathUtils.php +++ b/src/Symfony/Component/JsonPath/JsonPathUtils.php @@ -85,4 +85,78 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi 'tokens' => $remainingTokens, ]; } + + public static function unescapeString(string $str, string $quoteChar): string + { + if ('"' === $quoteChar) { + // try JSON decoding first for unicode sequences + $jsonStr = '"'.$str.'"'; + $decoded = json_decode($jsonStr, true); + + if (null !== $decoded) { + return $decoded; + } + } + + $result = ''; + $length = \strlen($str); + + for ($i = 0; $i < $length; ++$i) { + if ('\\' === $str[$i] && $i + 1 < $length) { + $result .= match ($str[$i + 1]) { + '"' => '"', + "'" => "'", + '\\' => '\\', + '/' => '/', + 'b' => "\b", + 'f' => "\f", + 'n' => "\n", + 'r' => "\r", + 't' => "\t", + 'u' => self::unescapeUnicodeSequence($str, $length, $i), + default => $str[$i].$str[$i + 1], // keep the backslash + }; + + ++$i; + } else { + $result .= $str[$i]; + } + } + + return $result; + } + + private static function unescapeUnicodeSequence(string $str, int $length, int &$i): string + { + if ($i + 5 >= $length) { + // not enough characters for Unicode escape, treat as literal + return $str[$i]; + } + + $hex = substr($str, $i + 2, 4); + if (!ctype_xdigit($hex)) { + // invalid hex, treat as literal + return $str[$i]; + } + + $codepoint = hexdec($hex); + // looks like a valid Unicode codepoint, string length is sufficient and it starts with \u + if (0xD800 <= $codepoint && $codepoint <= 0xDBFF && $i + 11 < $length && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) { + $lowHex = substr($str, $i + 8, 4); + if (ctype_xdigit($lowHex)) { + $lowSurrogate = hexdec($lowHex); + if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) { + $codepoint = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF); + $i += 10; // skip surrogate pair + + return mb_chr($codepoint, 'UTF-8'); + } + } + } + + // single Unicode character or invalid surrogate, skip the sequence + $i += 4; + + return mb_chr($codepoint, 'UTF-8'); + } } diff --git a/src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php b/src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php index 66ccfc2642141..213ae06afa7db 100644 --- a/src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php +++ b/src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php @@ -465,6 +465,251 @@ public function testStarAsKey() $this->assertSame(['a' => 1, 'b' => 2], $result[0]); } + /** + * @dataProvider provideUnicodeEscapeSequencesProvider + */ + public function testUnicodeEscapeSequences(string $jsonPath, array $expected) + { + $this->assertSame($expected, self::getUnicodeDocumentCrawler()->find($jsonPath)); + } + + public static function provideUnicodeEscapeSequencesProvider(): array + { + return [ + [ + '$["caf\u00e9"]', + ['coffee'], + ], + [ + '$["\u65e5\u672c"]', + ['Japan'], + ], + [ + '$["M\u00fcller"]', + [], + ], + [ + '$["emoji\ud83d\ude00"]', + ['smiley'], + ], + [ + '$["tab\there"]', + ['with tab'], + ], + [ + '$["new\nline"]', + ['with newline'], + ], + [ + '$["quote\"here"]', + ['with quote'], + ], + [ + '$["backslash\\\\here"]', + ['with backslash'], + ], + [ + '$["apostrophe\'here"]', + ['with apostrophe'], + ], + [ + '$["control\u0001char"]', + ['with control char'], + ], + [ + '$["\u0063af\u00e9"]', + ['coffee'], + ], + ]; + } + + /** + * @dataProvider provideSingleQuotedStringProvider + */ + public function testSingleQuotedStrings(string $jsonPath, array $expected) + { + $this->assertSame($expected, self::getUnicodeDocumentCrawler()->find($jsonPath)); + } + + public static function provideSingleQuotedStringProvider(): array + { + return [ + [ + "$['caf\\u00e9']", + ['coffee'], + ], + [ + "$['\\u65e5\\u672c']", + ['Japan'], + ], + [ + "$['quote\"here']", + ['with quote'], + ], + [ + "$['M\\u00fcller']", + [], + ], + [ + "$['emoji\\ud83d\\ude00']", + ['smiley'], + ], + [ + "$['tab\\there']", + ['with tab'], + ], + [ + "$['quote\\\"here']", + ['with quote'], + ], + [ + "$['backslash\\\\here']", + ['with backslash'], + ], + [ + "$['apostrophe\\'here']", + ['with apostrophe'], + ], + [ + "$['control\\u0001char']", + ['with control char'], + ], + [ + "$['\\u0063af\\u00e9']", + ['coffee'], + ], + ]; + } + + /** + * @dataProvider provideFilterWithUnicodeProvider + */ + public function testFilterWithUnicodeStrings(string $jsonPath, int $expectedCount, string $expectedCountry) + { + $result = self::getUnicodeDocumentCrawler()->find($jsonPath); + + $this->assertCount($expectedCount, $result); + + if ($expectedCount > 0) { + $this->assertSame($expectedCountry, $result[0]['country']); + } + } + + public static function provideFilterWithUnicodeProvider(): array + { + return [ + [ + '$.users[?(@.name == "caf\u00e9")]', + 1, + 'France', + ], + [ + '$.users[?(@.name == "\u65e5\u672c\u592a\u90ce")]', + 1, + 'Japan', + ], + [ + '$.users[?(@.name == "Jos\u00e9")]', + 1, + 'Spain', + ], + [ + '$.users[?(@.name == "John")]', + 1, + 'USA', + ], + [ + '$.users[?(@.name == "NonExistent\u0020Name")]', + 0, + '', + ], + ]; + } + + /** + * @dataProvider provideInvalidUnicodeSequenceProvider + */ + public function testInvalidUnicodeSequencesAreProcessedAsLiterals(string $jsonPath) + { + $this->assertIsArray(self::getUnicodeDocumentCrawler()->find($jsonPath), 'invalid unicode sequence should be treated as literal and not throw'); + } + + public static function provideInvalidUnicodeSequenceProvider(): array + { + return [ + [ + '$["test\uZZZZ"]', + ], + [ + '$["test\u123"]', + ], + [ + '$["test\u"]', + ], + ]; + } + + /** + * @dataProvider provideComplexUnicodePath + */ + public function testComplexUnicodePaths(string $jsonPath, array $expected) + { + $complexJson = [ + 'データ' => [ + 'ユーザー' => [ + ['名前' => 'テスト', 'ID' => 1], + ['名前' => 'サンプル', 'ID' => 2], + ], + ], + 'special🔑' => [ + 'value💎' => 'treasure', + ], + ]; + + $crawler = new JsonCrawler(json_encode($complexJson)); + + $this->assertSame($expected, $crawler->find($jsonPath)); + } + + public static function provideComplexUnicodePath(): array + { + return [ + [ + '$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][0]["\u540d\u524d"]', + ['テスト'], + ], + [ + '$["special\ud83d\udd11"]["value\ud83d\udc8e"]', + ['treasure'], + ], + [ + '$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][*]["\u540d\u524d"]', + ['テスト', 'サンプル'], + ], + ]; + } + + public function testSurrogatePairHandling() + { + $json = ['𝒽𝑒𝓁𝓁𝑜' => 'mathematical script hello']; + $crawler = new JsonCrawler(json_encode($json)); + + // mathematical script "hello" requires surrogate pairs for each character + $result = $crawler->find('$["\ud835\udcbd\ud835\udc52\ud835\udcc1\ud835\udcc1\ud835\udc5c"]'); + $this->assertSame(['mathematical script hello'], $result); + } + + public function testMixedQuoteTypes() + { + $json = ['key"with"quotes' => 'value1', "key'with'apostrophes" => 'value2']; + $crawler = new JsonCrawler(json_encode($json)); + + $result = $crawler->find('$[\'key"with"quotes\']'); + $this->assertSame(['value1'], $result); + + $result = $crawler->find('$["key\'with\'apostrophes"]'); + $this->assertSame(['value2'], $result); + } private static function getBookstoreCrawler(): JsonCrawler { @@ -515,4 +760,28 @@ private static function getSimpleCollectionCrawler(): JsonCrawler {"a": [3, 5, 1, 2, 4, 6]} JSON); } + + private static function getUnicodeDocumentCrawler(): JsonCrawler + { + $json = [ + 'café' => 'coffee', + '日本' => 'Japan', + 'emoji😀' => 'smiley', + 'tab here' => 'with tab', + "new\nline" => 'with newline', + 'quote"here' => 'with quote', + 'backslash\\here' => 'with backslash', + 'apostrophe\'here' => 'with apostrophe', + "control\x01char" => 'with control char', + 'users' => [ + ['name' => 'café', 'country' => 'France'], + ['name' => '日本太郎', 'country' => 'Japan'], + ['name' => 'John', 'country' => 'USA'], + ['name' => 'Müller', 'country' => 'Germany'], + ['name' => 'José', 'country' => 'Spain'], + ], + ]; + + return new JsonCrawler(json_encode($json)); + } } diff --git a/src/Symfony/Component/JsonPath/Tests/Test/JsonPathAssertionsTraitTest.php b/src/Symfony/Component/JsonPath/Tests/Test/JsonPathAssertionsTraitTest.php index 62d64b53e1e8d..1044e7658672b 100644 --- a/src/Symfony/Component/JsonPath/Tests/Test/JsonPathAssertionsTraitTest.php +++ b/src/Symfony/Component/JsonPath/Tests/Test/JsonPathAssertionsTraitTest.php @@ -1,5 +1,14 @@ + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + namespace Symfony\Component\JsonPath\Tests\Test; use PHPUnit\Framework\AssertionFailedError; diff --git a/src/Symfony/Component/JsonPath/composer.json b/src/Symfony/Component/JsonPath/composer.json index fe8ddf84dd82d..feb8158aa5be2 100644 --- a/src/Symfony/Component/JsonPath/composer.json +++ b/src/Symfony/Component/JsonPath/composer.json @@ -17,6 +17,7 @@ ], "require": { "php": ">=8.2", + "symfony/polyfill-ctype": "^1.8", "symfony/polyfill-mbstring": "~1.0" }, "require-dev": { pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy