Skip to content

Commit 84193ab

Browse files
[JsonPath] Handle special whitespaces in filters
1 parent 488ff19 commit 84193ab

File tree

6 files changed

+523
-300
lines changed

6 files changed

+523
-300
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 170 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,14 @@ private function evaluateBracket(string $expr, mixed $value): array
133133
return [];
134134
}
135135

136+
if (str_contains($expr, ',')) {
137+
$trimmed = trim($expr);
138+
if (str_starts_with($trimmed, ',') || str_ends_with($trimmed, ',')) {
139+
throw new JsonCrawlerException($expr, 'Expression cannot have leading or trailing commas');
140+
}
141+
}
142+
143+
$expr = JsonPathUtils::normalizeWhitespace($expr);
136144
if ('*' === $expr) {
137145
return array_values($value);
138146
}
@@ -168,8 +176,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168176
return $result;
169177
}
170178

171-
// start, end and step
172-
if (preg_match('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/', $expr, $matches)) {
179+
if (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $expr, $matches)) {
173180
if (!array_is_list($value)) {
174181
return [];
175182
}
@@ -217,14 +224,14 @@ private function evaluateBracket(string $expr, mixed $value): array
217224

218225
// filter expressions
219226
if (preg_match('/^\?(.*)$/', $expr, $matches)) {
220-
$filterExpr = $matches[1];
227+
$filterExpr = trim($matches[1]);
221228

222229
if (preg_match('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/', $filterExpr)) {
223230
$filterExpr = "($filterExpr)";
224231
}
225232

226233
if (!str_starts_with($filterExpr, '(')) {
227-
throw new JsonCrawlerException($expr, 'Invalid filter expression');
234+
$filterExpr = "($filterExpr)";
228235
}
229236

230237
// remove outer filter parentheses
@@ -238,28 +245,31 @@ private function evaluateBracket(string $expr, mixed $value): array
238245
$parts = $this->parseCommaSeparatedValues($expr);
239246

240247
$result = [];
241-
$keysIndices = array_keys($value);
242-
$isList = array_is_list($value);
243248

244249
foreach ($parts as $part) {
245250
$part = trim($part);
246251

247-
if (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
252+
if ('*' === $part) {
253+
$result = array_merge($result, array_values($value));
254+
} elseif (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $part, $matches)) {
255+
// slice notation
256+
$sliceResult = $this->evaluateBracket($part, $value);
257+
$result = array_merge($result, $sliceResult);
258+
} elseif (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
248259
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
249260

250-
if ($isList) {
261+
if (array_is_list($value)) {
262+
// for arrays, find ALL objects that contain this key
251263
foreach ($value as $item) {
252264
if (\is_array($item) && \array_key_exists($key, $item)) {
253265
$result[] = $item;
254-
break;
255266
}
256267
}
257-
258-
continue; // no results here
259-
}
260-
261-
if (\array_key_exists($key, $value)) {
262-
$result[] = $value[$key];
268+
} else {
269+
// for objects, get the value for this key
270+
if (\array_key_exists($key, $value)) {
271+
$result[] = $value[$key];
272+
}
263273
}
264274
} elseif (preg_match('/^-?\d+$/', $part)) {
265275
// numeric index
@@ -268,14 +278,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268278
$index = \count($value) + $index;
269279
}
270280

271-
if ($isList && \array_key_exists($index, $value)) {
281+
if (array_is_list($value) && \array_key_exists($index, $value)) {
272282
$result[] = $value[$index];
273-
continue;
274-
}
275-
276-
// numeric index on a hashmap
277-
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
278-
$result[] = $value[$keysIndices[$index]];
283+
} else {
284+
// numeric index on a hashmap
285+
$keysIndices = array_keys($value);
286+
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
287+
$result[] = $value[$keysIndices[$index]];
288+
}
279289
}
280290
}
281291
}
@@ -310,7 +320,32 @@ private function evaluateFilter(string $expr, mixed $value): array
310320

311321
private function evaluateFilterExpression(string $expr, mixed $context): bool
312322
{
313-
$expr = trim($expr);
323+
$expr = JsonPathUtils::normalizeWhitespace($expr);
324+
325+
// remove outer parentheses if they wrap the entire expression
326+
if (str_starts_with($expr, '(') && str_ends_with($expr, ')')) {
327+
$depth = 0;
328+
$isWrapped = true;
329+
for ($i = 0; $i < strlen($expr); $i++) {
330+
if ($expr[$i] === '(') {
331+
$depth++;
332+
} elseif ($expr[$i] === ')') {
333+
$depth--;
334+
if ($depth === 0 && $i < strlen($expr) - 1) {
335+
$isWrapped = false;
336+
break;
337+
}
338+
}
339+
}
340+
if ($isWrapped) {
341+
$expr = trim(substr($expr, 1, -1));
342+
}
343+
}
344+
345+
if (str_starts_with($expr, '!')) {
346+
$innerExpr = trim(substr($expr, 1));
347+
return !$this->evaluateFilterExpression($innerExpr, $context);
348+
}
314349

315350
if (str_contains($expr, '&&')) {
316351
$parts = array_map('trim', explode('&&', $expr));
@@ -353,8 +388,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353388
}
354389

355390
// function calls
356-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
357-
$functionName = $matches[1];
391+
if (preg_match('/^(\w++)\s*+\((.*)\)$/', $expr, $matches)) {
392+
$functionName = trim($matches[1]);
358393
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
359394
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
360395
}
@@ -369,8 +404,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369404

370405
private function evaluateScalar(string $expr, mixed $context): mixed
371406
{
372-
if (is_numeric($expr)) {
373-
return str_contains($expr, '.') ? (float) $expr : (int) $expr;
407+
$expr = JsonPathUtils::normalizeWhitespace($expr);
408+
409+
if (JsonPathUtils::isJsonNumber($expr)) {
410+
return str_contains($expr, '.') || str_contains(strtolower($expr), 'e') ? (float) $expr : (int) $expr;
411+
}
412+
413+
// only validate tokens that look like standalone numbers
414+
if (preg_match('/^[\d+\-.eE]+$/', $expr) && preg_match('/\d/', $expr)) {
415+
throw new JsonCrawlerException($expr, \sprintf('Invalid number format "%s"', $expr));
374416
}
375417

376418
if ('@' === $expr) {
@@ -404,8 +446,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404446
}
405447

406448
// function calls
407-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
408-
$functionName = $matches[1];
449+
if (preg_match('/^(\w++)\((.*)\)$/', $expr, $matches)) {
450+
$functionName = trim($matches[1]);
409451
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
410452
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
411453
}
@@ -416,31 +458,65 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416458
return null;
417459
}
418460

419-
private function evaluateFunction(string $name, string $args, array $context): mixed
461+
/**
 * Evaluates one of the filter functions ("length", "count", "match", "search", "value").
 *
 * Each argument is resolved to a value and, in parallel, to the size of the nodelist it
 * came from, because "count" and "value" depend on how many nodes a path selected rather
 * than on the selected value itself.
 *
 * @param string $name    the function name; an unknown name yields null
 * @param string $args    the raw, comma-separated argument list found between the parentheses
 * @param mixed  $context the current node ("@") the filter is being evaluated against
 *
 * @return mixed int for "length" and "count", bool for "match" and "search",
 *               the single selected value (or null) for "value"
 */
private function evaluateFunction(string $name, string $args, mixed $context): mixed
{
    $argList = [];
    $nodelistSizes = [];

    // compare against '' explicitly: a plain truthiness check would drop the literal argument "0"
    if ('' !== $args = trim($args)) {
        foreach ($this->parseCommaSeparatedValues($args) as $arg) {
            $arg = trim($arg);

            if (str_starts_with($arg, '@')) { // special handling for @ to track nodelist size
                if ('@' === $arg) {
                    $argList[] = $context;
                    $nodelistSizes[] = 1;
                } elseif (!\is_array($context)) {
                    // a relative path cannot select anything below a scalar node
                    $argList[] = null;
                    $nodelistSizes[] = 0;
                } else {
                    $pathPart = substr($arg, 1);
                    if (str_starts_with($pathPart, '[')) {
                        // handle bracket expressions like @['a','d']: the whole nodelist is the argument
                        $results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
                        $argList[] = $results;
                        $nodelistSizes[] = \count($results);
                    } else {
                        // handle dot notation like @.a: only the first selected node is the argument
                        $results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
                        $argList[] = $results[0] ?? null;
                        $nodelistSizes[] = \count($results);
                    }
                }
            } elseif (str_starts_with($arg, '$')) { // special handling for absolute paths
                $results = $this->evaluate(new JsonPath($arg));
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            } else {
                // literal argument: counts as exactly one value
                $argList[] = $this->evaluateScalar($arg, $context);
                $nodelistSizes[] = 1;
            }
        }
    }

    $value = $argList[0] ?? null;
    $nodelistSize = $nodelistSizes[0] ?? 0;

    return match ($name) {
        'length' => match (true) {
            \is_string($value) => mb_strlen($value),
            \is_array($value) => \count($value),
            default => 0,
        },
        'count' => $nodelistSize,
        'match' => match (true) {
            // the non-capturing group keeps a top-level alternation ("a|b") anchored on both sides,
            // and the "D" modifier stops "$" from matching before a trailing newline
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/^(?:%s)$/Du', $this->transformJsonPathRegex($argList[1])), $value),
            default => false,
        },
        'search' => match (true) {
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match("/{$this->transformJsonPathRegex($argList[1])}/u", $value),
            default => false,
        },
        // more than one selected node has no single value; exactly one node is unwrapped when it is an array
        'value' => 1 < $nodelistSize ? null : (1 === $nodelistSize ? (\is_array($value) ? ($value[0] ?? null) : $value) : $value),
        default => null,
    };
}
@@ -480,6 +556,7 @@ private function parseCommaSeparatedValues(string $expr): array
480556
$current = '';
481557
$inQuotes = false;
482558
$quoteChar = null;
559+
$bracketDepth = 0;
483560

484561
for ($i = 0; $i < \strlen($expr); ++$i) {
485562
$char = $expr[$i];
@@ -497,7 +574,11 @@ private function parseCommaSeparatedValues(string $expr): array
497574
$inQuotes = false;
498575
$quoteChar = null;
499576
}
500-
} elseif (!$inQuotes && ',' === $char) {
577+
} elseif (!$inQuotes && '[' === $char) {
578+
++$bracketDepth;
579+
} elseif (!$inQuotes && ']' === $char) {
580+
--$bracketDepth;
581+
} elseif (!$inQuotes && 0 === $bracketDepth && ',' === $char) {
501582
$parts[] = trim($current);
502583
$current = '';
503584

@@ -513,4 +594,53 @@ private function parseCommaSeparatedValues(string $expr): array
513594

514595
return $parts;
515596
}
597+
598+
/**
 * Rewrites a JSONPath regular expression so it can be used as the body of a
 * "/"-delimited PCRE pattern.
 *
 * The pattern is scanned once, byte by byte:
 *  - a backslash and the byte that follows it are copied verbatim;
 *  - an unescaped "." outside a "[...]" character class is replaced by a group that
 *    matches anything except "\r" and "\n" (U+2028 and U+2029 are still matched);
 *  - an unescaped "/" is backslash-escaped, otherwise it would end the pattern early
 *    once the result is wrapped in "/" delimiters;
 *  - every other byte is copied unchanged.
 *
 * @param string $pattern the regular expression as written in the query
 *
 * @return string the transformed pattern body, without delimiters or modifiers
 */
private function transformJsonPathRegex(string $pattern): string
{
    $result = '';
    $inCharClass = false;
    $length = \strlen($pattern);

    for ($i = 0; $i < $length; ++$i) {
        $char = $pattern[$i];

        if ('\\' === $char) {
            // copy the escape sequence as a unit so the escaped byte is never reinterpreted;
            // a trailing lone backslash is copied as is
            $result .= $char.($pattern[++$i] ?? '');

            continue;
        }

        if ($inCharClass) {
            // "." is already literal inside a class; only the closing bracket and "/" matter here
            $inCharClass = ']' !== $char;
            $result .= '/' === $char ? '\/' : $char;

            continue;
        }

        if ('[' === $char) {
            $inCharClass = true;
        }

        $result .= match ($char) {
            '.' => '(?:[^\r\n]|\x{2028}|\x{2029})',
            '/' => '\/',
            default => $char,
        };
    }

    return $result;
}
516646
}

src/Symfony/Component/JsonPath/JsonPathUtils.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,4 +159,26 @@ private static function unescapeUnicodeSequence(string $str, int $length, int &$
159159

160160
return mb_chr($codepoint, 'UTF-8');
161161
}
162+
163+
/**
 * Turns every tab, line feed and carriage return into a regular space, then
 * strips the whitespace surrounding the result.
 *
 * @see https://datatracker.ietf.org/doc/rfc9535/, section 2.1.1
 */
public static function normalizeWhitespace(string $input): string
{
    return trim(str_replace(["\t", "\n", "\r"], ' ', $input));
}
176+
177+
/**
 * Check a number is RFC 9535 compliant using strict JSON number format.
 *
 * Accepted form: an optional minus sign, an integer part without leading zeros,
 * an optional fraction and an optional exponent.
 *
 * @param string $value the candidate token, without surrounding whitespace
 *
 * @return bool true only when the whole string is a JSON number
 */
public static function isJsonNumber(string $value): bool
{
    // "D" keeps "$" from matching before a trailing newline; comparing with 1 turns
    // preg_match()'s int|false result into the boolean the signature promises
    return 1 === preg_match('/^-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?$/D', $value);
}
162184
}

src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,28 @@ public function testLengthFunctionWithOuterParentheses()
500500
$this->assertSame('J. R. R. Tolkien', $result[1]['author']);
501501
}
502502

503+
public function testMatchFunctionWithMultipleSpacesTrimmed()
504+
{
505+
$result = self::getBookstoreCrawler()->find("$.store.book[?(match(@.title, 'Sword of Honour'))]");
506+
507+
$this->assertSame([], $result);
508+
}
509+
510+
public function testFilterMultiline()
511+
{
512+
$result = self::getBookstoreCrawler()->find(
513+
'$
514+
.store
515+
.book[?
516+
length(@.author)>12
517+
]'
518+
);
519+
520+
$this->assertCount(2, $result);
521+
$this->assertSame('Herman Melville', $result[0]['author']);
522+
$this->assertSame('J. R. R. Tolkien', $result[1]['author']);
523+
}
524+
503525
public function testCountFunction()
504526
{
505527
$result = self::getBookstoreCrawler()->find('$.store.book[?count(@.extra) != 0]');
@@ -577,10 +599,6 @@ public static function provideUnicodeEscapeSequencesProvider(): array
577599
'$["tab\there"]',
578600
['with tab'],
579601
],
580-
[
581-
'$["new\nline"]',
582-
['with newline'],
583-
],
584602
[
585603
'$["quote\"here"]',
586604
['with quote'],

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy