Skip to content

Commit 9453d33

Browse files
[JsonPath] Handle special whitespaces in filters
1 parent 488ff19 commit 9453d33

File tree

6 files changed

+515
-260
lines changed

6 files changed

+515
-260
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 170 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,14 @@ private function evaluateBracket(string $expr, mixed $value): array
133133
return [];
134134
}
135135

136+
if (str_contains($expr, ',')) {
137+
$trimmed = trim($expr);
138+
if (str_starts_with($trimmed, ',') || str_ends_with($trimmed, ',')) {
139+
throw new JsonCrawlerException($expr, 'Expression cannot have leading or trailing commas');
140+
}
141+
}
142+
143+
$expr = JsonPathUtils::normalizeWhitespace($expr);
136144
if ('*' === $expr) {
137145
return array_values($value);
138146
}
@@ -168,8 +176,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168176
return $result;
169177
}
170178

171-
// start, end and step
172-
if (preg_match('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/', $expr, $matches)) {
179+
if (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $expr, $matches)) {
173180
if (!array_is_list($value)) {
174181
return [];
175182
}
@@ -217,14 +224,14 @@ private function evaluateBracket(string $expr, mixed $value): array
217224

218225
// filter expressions
219226
if (preg_match('/^\?(.*)$/', $expr, $matches)) {
220-
$filterExpr = $matches[1];
227+
$filterExpr = trim($matches[1]);
221228

222229
if (preg_match('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/', $filterExpr)) {
223230
$filterExpr = "($filterExpr)";
224231
}
225232

226233
if (!str_starts_with($filterExpr, '(')) {
227-
throw new JsonCrawlerException($expr, 'Invalid filter expression');
234+
$filterExpr = "($filterExpr)";
228235
}
229236

230237
// remove outer filter parentheses
@@ -238,28 +245,31 @@ private function evaluateBracket(string $expr, mixed $value): array
238245
$parts = $this->parseCommaSeparatedValues($expr);
239246

240247
$result = [];
241-
$keysIndices = array_keys($value);
242-
$isList = array_is_list($value);
243248

244249
foreach ($parts as $part) {
245250
$part = trim($part);
246251

247-
if (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
252+
if ('*' === $part) {
253+
$result = array_merge($result, array_values($value));
254+
} elseif (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $part, $matches)) {
255+
// slice notation
256+
$sliceResult = $this->evaluateBracket($part, $value);
257+
$result = array_merge($result, $sliceResult);
258+
} elseif (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
248259
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
249260

250-
if ($isList) {
261+
if (array_is_list($value)) {
262+
// for arrays, find ALL objects that contain this key
251263
foreach ($value as $item) {
252264
if (\is_array($item) && \array_key_exists($key, $item)) {
253265
$result[] = $item;
254-
break;
255266
}
256267
}
257-
258-
continue; // no results here
259-
}
260-
261-
if (\array_key_exists($key, $value)) {
262-
$result[] = $value[$key];
268+
} else {
269+
// for objects, get the value for this key
270+
if (\array_key_exists($key, $value)) {
271+
$result[] = $value[$key];
272+
}
263273
}
264274
} elseif (preg_match('/^-?\d+$/', $part)) {
265275
// numeric index
@@ -268,14 +278,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268278
$index = \count($value) + $index;
269279
}
270280

271-
if ($isList && \array_key_exists($index, $value)) {
281+
if (array_is_list($value) && \array_key_exists($index, $value)) {
272282
$result[] = $value[$index];
273-
continue;
274-
}
275-
276-
// numeric index on a hashmap
277-
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
278-
$result[] = $value[$keysIndices[$index]];
283+
} else {
284+
// numeric index on a hashmap
285+
$keysIndices = array_keys($value);
286+
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
287+
$result[] = $value[$keysIndices[$index]];
288+
}
279289
}
280290
}
281291
}
@@ -310,7 +320,32 @@ private function evaluateFilter(string $expr, mixed $value): array
310320

311321
private function evaluateFilterExpression(string $expr, mixed $context): bool
312322
{
313-
$expr = trim($expr);
323+
$expr = JsonPathUtils::normalizeWhitespace($expr);
324+
325+
// remove outer parentheses if they wrap the entire expression
326+
if (str_starts_with($expr, '(') && str_ends_with($expr, ')')) {
327+
$depth = 0;
328+
$isWrapped = true;
329+
for ($i = 0; $i < strlen($expr); $i++) {
330+
if ($expr[$i] === '(') {
331+
$depth++;
332+
} elseif ($expr[$i] === ')') {
333+
$depth--;
334+
if ($depth === 0 && $i < strlen($expr) - 1) {
335+
$isWrapped = false;
336+
break;
337+
}
338+
}
339+
}
340+
if ($isWrapped) {
341+
$expr = trim(substr($expr, 1, -1));
342+
}
343+
}
344+
345+
if (str_starts_with($expr, '!')) {
346+
$innerExpr = trim(substr($expr, 1));
347+
return !$this->evaluateFilterExpression($innerExpr, $context);
348+
}
314349

315350
if (str_contains($expr, '&&')) {
316351
$parts = array_map('trim', explode('&&', $expr));
@@ -353,8 +388,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353388
}
354389

355390
// function calls
356-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
357-
$functionName = $matches[1];
391+
if (preg_match('/^(\w++)\s*+\((.*)\)$/', $expr, $matches)) {
392+
$functionName = trim($matches[1]);
358393
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
359394
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
360395
}
@@ -369,8 +404,16 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369404

370405
private function evaluateScalar(string $expr, mixed $context): mixed
371406
{
372-
if (is_numeric($expr)) {
373-
return str_contains($expr, '.') ? (float) $expr : (int) $expr;
407+
$expr = JsonPathUtils::normalizeWhitespace($expr);
408+
409+
// RFC 9535 compliant number validation using strict JSON number format
410+
if (preg_match('/^-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?$/', $expr)) {
411+
return str_contains($expr, '.') || str_contains(strtolower($expr), 'e') ? (float) $expr : (int) $expr;
412+
}
413+
414+
// only validate tokens that look like standalone numbers
415+
if (preg_match('/^[\d+\-.eE]+$/', $expr) && preg_match('/\d/', $expr)) {
416+
throw new JsonCrawlerException($expr, \sprintf('Invalid number format "%s"', $expr));
374417
}
375418

376419
if ('@' === $expr) {
@@ -404,8 +447,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404447
}
405448

406449
// function calls
407-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
408-
$functionName = $matches[1];
450+
if (preg_match('/^(\w++)\((.*)\)$/', $expr, $matches)) {
451+
$functionName = trim($matches[1]);
409452
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
410453
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
411454
}
@@ -416,31 +459,65 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416459
return null;
417460
}
418461

/**
 * Evaluates one of the filter functions handled below ("length", "count",
 * "match", "search", "value") against the current filter context.
 *
 * Each argument is resolved to a value and, alongside it, the size of the
 * nodelist it came from, because "count" and "value" depend on how many
 * nodes an argument selected rather than on the value itself:
 *  - "@"            the context itself (size 1)
 *  - "@[...]"       whatever evaluateBracket() returns for the bracket content
 *  - "@.path"       first result of evaluating the relative path on the context
 *  - "$..."         first result of evaluating the absolute path
 *  - anything else  delegated to evaluateScalar() (size 1)
 *
 * @param string $name    function name; unknown names yield null
 * @param string $args    raw text between the function's parentheses
 * @param mixed  $context the node the filter is currently testing
 *
 * @return mixed int for "length"/"count", bool for "match"/"search",
 *               the selected value or null for "value"
 */
private function evaluateFunction(string $name, string $args, mixed $context): mixed
{
    $argList = [];
    $nodelistSizes = [];

    // Compare against '' explicitly: a bare truthiness check would treat
    // the single argument "0" as "no arguments at all".
    $args = trim($args);
    if ('' !== $args) {
        foreach ($this->parseCommaSeparatedValues($args) as $arg) {
            $arg = trim($arg);

            if (str_starts_with($arg, '@')) {
                // relative to the current node: track the nodelist size too
                if ('@' === $arg) {
                    $argList[] = $context;
                    $nodelistSizes[] = 1;
                } elseif (!\is_array($context)) {
                    // a path cannot be followed into a non-array context
                    $argList[] = null;
                    $nodelistSizes[] = 0;
                } else {
                    $pathPart = substr($arg, 1);
                    if (str_starts_with($pathPart, '[')) {
                        // bracket expressions like @['a','d']: strip the
                        // surrounding "[" and "]" and keep the whole result list
                        $results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
                        $argList[] = $results;
                        $nodelistSizes[] = \count($results);
                    } else {
                        // dot notation like @.a: evaluated as "$.a" on the context
                        $results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
                        $argList[] = $results[0] ?? null;
                        $nodelistSizes[] = \count($results);
                    }
                }
            } elseif (str_starts_with($arg, '$')) {
                // absolute path
                $results = $this->evaluate(new JsonPath($arg));
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            } else {
                $argList[] = $this->evaluateScalar($arg, $context);
                $nodelistSizes[] = 1;
            }
        }
    }

    $value = $argList[0] ?? null;
    $nodelistSize = $nodelistSizes[0] ?? 0;
    $regex = $argList[1] ?? null;

    return match ($name) {
        'length' => match (true) {
            \is_string($value) => mb_strlen($value),
            \is_array($value) => \count($value),
            default => 0,
        },
        'count' => $nodelistSize,
        // match() must cover the entire string: the pattern is grouped so a
        // top-level alternation ("a|b") stays inside the anchors, and \A/\z
        // are used because "$" would also accept a trailing newline.
        // "@" is kept on purpose: an invalid user regex simply does not match.
        'match' => match (true) {
            \is_string($value) && \is_string($regex) => (bool) @preg_match(\sprintf('/\A(?:%s)\z/u', $this->transformJsonPathRegex($regex)), $value),
            default => false,
        },
        'search' => match (true) {
            \is_string($value) && \is_string($regex) => (bool) @preg_match(\sprintf('/%s/u', $this->transformJsonPathRegex($regex)), $value),
            default => false,
        },
        // more than one node: no single value; exactly one node held in a
        // list: unwrap it; otherwise hand the value back unchanged
        'value' => match (true) {
            1 < $nodelistSize => null,
            1 === $nodelistSize && \is_array($value) => $value[0] ?? null,
            default => $value,
        },
        default => null,
    };
}
@@ -480,6 +557,7 @@ private function parseCommaSeparatedValues(string $expr): array
480557
$current = '';
481558
$inQuotes = false;
482559
$quoteChar = null;
560+
$bracketDepth = 0;
483561

484562
for ($i = 0; $i < \strlen($expr); ++$i) {
485563
$char = $expr[$i];
@@ -497,7 +575,11 @@ private function parseCommaSeparatedValues(string $expr): array
497575
$inQuotes = false;
498576
$quoteChar = null;
499577
}
500-
} elseif (!$inQuotes && ',' === $char) {
578+
} elseif (!$inQuotes && '[' === $char) {
579+
++$bracketDepth;
580+
} elseif (!$inQuotes && ']' === $char) {
581+
--$bracketDepth;
582+
} elseif (!$inQuotes && 0 === $bracketDepth && ',' === $char) {
501583
$parts[] = trim($current);
502584
$current = '';
503585

@@ -513,4 +595,52 @@ private function parseCommaSeparatedValues(string $expr): array
513595

514596
return $parts;
515597
}
598+
/*
 * Rewrites a JSONPath regular expression so that an unescaped "." outside of
 * a character class becomes an explicit group that excludes \r and \n and
 * lists U+2028 and U+2029 (per the RFC 9535 regex rules).
 *
 * The pattern is walked byte by byte; backslash-escaped bytes and everything
 * between "[" and the next unescaped "]" are copied through untouched.
 */
private function transformJsonPathRegex(string $pattern): string
{
    $dotReplacement = '(?:[^\r\n]|\x{2028}|\x{2029})';

    $rewritten = '';
    $insideClass = false;
    $pendingEscape = false;

    foreach (str_split($pattern) as $byte) {
        if ($pendingEscape) {
            // the byte right after a backslash is always literal
            $pendingEscape = false;
        } elseif ('\\' === $byte) {
            $pendingEscape = true;
        } elseif ('[' === $byte && !$insideClass) {
            $insideClass = true;
        } elseif (']' === $byte && $insideClass) {
            $insideClass = false;
        } elseif ('.' === $byte && !$insideClass) {
            $rewritten .= $dotReplacement;

            continue;
        }

        $rewritten .= $byte;
    }

    return $rewritten;
}
516646
}

src/Symfony/Component/JsonPath/JsonPathUtils.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,4 +159,18 @@ private static function unescapeUnicodeSequence(string $str, int $length, int &$
159159

160160
return mb_chr($codepoint, 'UTF-8');
161161
}
162+
/**
 * Turns every tab, line feed and carriage return into a plain space, then
 * trims the result.
 *
 * @see https://datatracker.ietf.org/doc/rfc9535/, section 2.1.1
 */
public static function normalizeWhitespace(string $input): string
{
    return trim(str_replace(["\t", "\n", "\r"], ' ', $input));
}
162176
}

src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,28 @@ public function testLengthFunctionWithOuterParentheses()
500500
$this->assertSame('J. R. R. Tolkien', $result[1]['author']);
501501
}
502502

503+
public function testMatchFunctionWithMultipleSpacesTrimmed()
504+
{
505+
$result = self::getBookstoreCrawler()->find("$.store.book[?(match(@.title, 'Sword of Honour'))]");
506+
507+
$this->assertSame([], $result);
508+
}
509+
510+
public function testFilterMultiline()
511+
{
512+
$result = self::getBookstoreCrawler()->find(
513+
'$
514+
.store
515+
.book[?
516+
length(@.author)>12
517+
]'
518+
);
519+
520+
$this->assertCount(2, $result);
521+
$this->assertSame('Herman Melville', $result[0]['author']);
522+
$this->assertSame('J. R. R. Tolkien', $result[1]['author']);
523+
}
524+
503525
public function testCountFunction()
504526
{
505527
$result = self::getBookstoreCrawler()->find('$.store.book[?count(@.extra) != 0]');
@@ -577,10 +599,6 @@ public static function provideUnicodeEscapeSequencesProvider(): array
577599
'$["tab\there"]',
578600
['with tab'],
579601
],
580-
[
581-
'$["new\nline"]',
582-
['with newline'],
583-
],
584602
[
585603
'$["quote\"here"]',
586604
['with quote'],

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy