@@ -133,7 +133,11 @@ private function evaluateBracket(string $expr, mixed $value): array
133
133
return [];
134
134
}
135
135
136
- if ('* ' === $ expr ) {
136
+ if (str_contains ($ expr , ', ' ) && (str_starts_with ($ trimmed = trim ($ expr ), ', ' ) || str_ends_with ($ trimmed , ', ' ))) {
137
+ throw new JsonCrawlerException ($ expr , 'Expression cannot have leading or trailing commas ' );
138
+ }
139
+
140
+ if ('* ' === $ expr = JsonPathUtils::normalizeWhitespace ($ expr )) {
137
141
return array_values ($ value );
138
142
}
139
143
@@ -168,8 +172,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168
172
return $ result ;
169
173
}
170
174
171
- // start, end and step
172
- if (preg_match ('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/ ' , $ expr , $ matches )) {
175
+ if (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ expr , $ matches )) {
173
176
if (!array_is_list ($ value )) {
174
177
return [];
175
178
}
@@ -217,14 +220,12 @@ private function evaluateBracket(string $expr, mixed $value): array
217
220
218
221
// filter expressions
219
222
if (preg_match ('/^\?(.*)$/ ' , $ expr , $ matches )) {
220
- $ filterExpr = $ matches [1 ];
221
-
222
- if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr )) {
223
+ if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr = trim ($ matches [1 ]))) {
223
224
$ filterExpr = "( $ filterExpr) " ;
224
225
}
225
226
226
227
if (!str_starts_with ($ filterExpr , '( ' )) {
227
- throw new JsonCrawlerException ( $ expr , ' Invalid filter expression ' ) ;
228
+ $ filterExpr = " ( $ filterExpr ) " ;
228
229
}
229
230
230
231
// remove outer filter parentheses
@@ -235,30 +236,30 @@ private function evaluateBracket(string $expr, mixed $value): array
235
236
236
237
// comma-separated values, e.g. `['key1', 'key2', 123]` or `[0, 1, 'key']`
237
238
if (str_contains ($ expr , ', ' )) {
238
- $ parts = $ this -> parseCommaSeparatedValues ($ expr );
239
+ $ parts = JsonPathUtils:: parseCommaSeparatedValues ($ expr );
239
240
240
241
$ result = [];
241
- $ keysIndices = array_keys ($ value );
242
- $ isList = array_is_list ($ value );
243
242
244
243
foreach ($ parts as $ part ) {
245
244
$ part = trim ($ part );
246
245
247
- if (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
246
+ if ('* ' === $ part ) {
247
+ $ result = array_merge ($ result , array_values ($ value ));
248
+ } elseif (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ part , $ matches )) {
249
+ // slice notation
250
+ $ sliceResult = $ this ->evaluateBracket ($ part , $ value );
251
+ $ result = array_merge ($ result , $ sliceResult );
252
+ } elseif (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
248
253
$ key = JsonPathUtils::unescapeString ($ matches [2 ], $ matches [1 ]);
249
254
250
- if ($ isList ) {
255
+ if (array_is_list ($ value )) {
256
+ // for arrays, find ALL objects that contain this key
251
257
foreach ($ value as $ item ) {
252
258
if (\is_array ($ item ) && \array_key_exists ($ key , $ item )) {
253
259
$ result [] = $ item ;
254
- break ;
255
260
}
256
261
}
257
-
258
- continue ; // no results here
259
- }
260
-
261
- if (\array_key_exists ($ key , $ value )) {
262
+ } elseif (\array_key_exists ($ key , $ value )) { // for objects, get the value for this key
262
263
$ result [] = $ value [$ key ];
263
264
}
264
265
} elseif (preg_match ('/^-?\d+$/ ' , $ part )) {
@@ -268,14 +269,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268
269
$ index = \count ($ value ) + $ index ;
269
270
}
270
271
271
- if ($ isList && \array_key_exists ($ index , $ value )) {
272
+ if (array_is_list ( $ value ) && \array_key_exists ($ index , $ value )) {
272
273
$ result [] = $ value [$ index ];
273
- continue ;
274
- }
275
-
276
- // numeric index on a hashmap
277
- if ( isset ( $ keysIndices [ $ index ]) && isset ( $ value [$ keysIndices [$ index ]])) {
278
- $ result [] = $ value [ $ keysIndices [ $ index ]];
274
+ } else {
275
+ // numeric index on a hashmap
276
+ $ keysIndices = array_keys ( $ value );
277
+ if ( isset ( $ keysIndices [ $ index]) && isset ( $ value [ $ keysIndices [ $ index ]])) {
278
+ $ result [] = $ value [$ keysIndices [$ index ]];
279
+ }
279
280
}
280
281
}
281
282
}
@@ -310,7 +311,28 @@ private function evaluateFilter(string $expr, mixed $value): array
310
311
311
312
private function evaluateFilterExpression (string $ expr , mixed $ context ): bool
312
313
{
313
- $ expr = trim ($ expr );
314
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
315
+
316
+ // remove outer parentheses if they wrap the entire expression
317
+ if (str_starts_with ($ expr , '( ' ) && str_ends_with ($ expr , ') ' )) {
318
+ $ depth = 0 ;
319
+ $ isWrapped = true ;
320
+ for ($ i = 0 ; $ i < \strlen ($ expr ); ++$ i ) {
321
+ if ('( ' === $ expr [$ i ]) {
322
+ ++$ depth ;
323
+ } elseif (') ' === $ expr [$ i ] && 0 === --$ depth && $ i < \strlen ($ expr ) - 1 ) {
324
+ $ isWrapped = false ;
325
+ break ;
326
+ }
327
+ }
328
+ if ($ isWrapped ) {
329
+ $ expr = trim (substr ($ expr , 1 , -1 ));
330
+ }
331
+ }
332
+
333
+ if (str_starts_with ($ expr , '! ' )) {
334
+ return !$ this ->evaluateFilterExpression (trim (substr ($ expr , 1 )), $ context );
335
+ }
314
336
315
337
if (str_contains ($ expr , '&& ' )) {
316
338
$ parts = array_map ('trim ' , explode ('&& ' , $ expr ));
@@ -353,8 +375,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353
375
}
354
376
355
377
// function calls
356
- if (preg_match ('/^(\w+) \((.*)\)$/ ' , $ expr , $ matches )) {
357
- $ functionName = $ matches [1 ];
378
+ if (preg_match ('/^(\w++)\s*+ \((.*)\)$/ ' , $ expr , $ matches )) {
379
+ $ functionName = trim ( $ matches [1 ]) ;
358
380
if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
359
381
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
360
382
}
@@ -369,8 +391,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369
391
370
392
private function evaluateScalar (string $ expr , mixed $ context ): mixed
371
393
{
372
- if (is_numeric ($ expr )) {
373
- return str_contains ($ expr , '. ' ) ? (float ) $ expr : (int ) $ expr ;
394
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
395
+
396
+ if (JsonPathUtils::isJsonNumber ($ expr )) {
397
+ return str_contains ($ expr , '. ' ) || str_contains (strtolower ($ expr ), 'e ' ) ? (float ) $ expr : (int ) $ expr ;
398
+ }
399
+
400
+ // only validate tokens that look like standalone numbers
401
+ if (preg_match ('/^[\d+\-.eE]+$/ ' , $ expr ) && preg_match ('/\d/ ' , $ expr )) {
402
+ throw new JsonCrawlerException ($ expr , \sprintf ('Invalid number format "%s" ' , $ expr ));
374
403
}
375
404
376
405
if ('@ ' === $ expr ) {
@@ -404,9 +433,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404
433
}
405
434
406
435
// function calls
407
- if (preg_match ('/^(\w+)\((.*)\)$/ ' , $ expr , $ matches )) {
408
- $ functionName = $ matches [1 ];
409
- if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
436
+ if (preg_match ('/^(\w++)\((.*)\)$/ ' , $ expr , $ matches )) {
437
+ if (!isset (self ::RFC9535_FUNCTIONS [$ functionName = trim ($ matches [1 ])])) {
410
438
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
411
439
}
412
440
@@ -416,31 +444,60 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416
444
return null ;
417
445
}
418
446
419
/**
 * Evaluates a JSONPath function extension (length, count, match, search, value).
 *
 * Every argument is resolved to a value and, in parallel, to the size of the
 * nodelist it came from:
 *  - arguments starting with "$" are evaluated as absolute queries;
 *  - arguments not starting with "@" are evaluated as scalars (size 1);
 *  - "@" alone is the current context (size 1);
 *  - "@..." on a non-array context yields null (size 0);
 *  - "@[...]" is delegated to evaluateBracket(), "@.xxx" to the tokenizer.
 *
 * Only the first argument's nodelist size is used (by "count" and "value").
 *
 * @param string $name    function name as written in the expression
 * @param string $args    raw, comma-separated argument list (without the parentheses)
 * @param mixed  $context the node the filter is currently being evaluated against
 *
 * @return mixed the function result, or null for an unknown function name
 */
private function evaluateFunction(string $name, string $args, mixed $context): mixed
{
    $argList = [];
    $nodelistSizes = [];

    // Compare against '' explicitly: a plain truthiness check would treat
    // the single literal argument "0" as "no arguments at all".
    $args = trim($args);
    if ('' !== $args) {
        foreach (JsonPathUtils::parseCommaSeparatedValues($args) as $arg) {
            $arg = trim($arg);

            if (str_starts_with($arg, '$')) {
                // absolute path: evaluated from the document root
                $results = $this->evaluate(new JsonPath($arg));
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            } elseif (!str_starts_with($arg, '@')) {
                // literal or nested expression
                $argList[] = $this->evaluateScalar($arg, $context);
                $nodelistSizes[] = 1;
            } elseif ('@' === $arg) {
                $argList[] = $context;
                $nodelistSizes[] = 1;
            } elseif (!\is_array($context)) {
                // a relative path cannot select anything below a non-array node
                $argList[] = null;
                $nodelistSizes[] = 0;
            } elseif (str_starts_with($pathPart = substr($arg, 1), '[')) {
                // bracket expressions like @['a','d']; the surrounding [ ] are stripped
                // NOTE(review): unlike the other branches this stores the whole result
                // list rather than its first element - confirm this is intended.
                $results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
                $argList[] = $results;
                $nodelistSizes[] = \count($results);
            } else {
                // dot notation like @.a, re-rooted on the current context
                $results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            }
        }
    }

    $value = $argList[0] ?? null;
    $nodelistSize = $nodelistSizes[0] ?? 0;

    return match ($name) {
        'length' => match (true) {
            \is_string($value) => mb_strlen($value),
            \is_array($value) => \count($value),
            default => 0,
        },
        'count' => $nodelistSize,
        'match' => match (true) {
            // The pattern is wrapped in a non-capturing group so that a top-level
            // alternation ("a|b") is anchored as a whole, and \A...\z is used so a
            // trailing newline in the subject does not count as a full match.
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/\A(?:%s)\z/u', $this->transformJsonPathRegex($argList[1])), $value),
            default => false,
        },
        'search' => match (true) {
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match("/{$this->transformJsonPathRegex($argList[1])}/u", $value),
            default => false,
        },
        'value' => match (true) {
            // more than one node: no single value can be returned
            1 < $nodelistSize => null,
            1 === $nodelistSize && \is_array($value) => $value[0] ?? null,
            default => $value,
        },
        default => null,
    };
}
@@ -474,43 +531,52 @@ private function compare(mixed $left, mixed $right, string $operator): bool
474
531
};
475
532
}
476
533
477
/**
 * Transforms a JSONPath regex pattern so it can be embedded in a "/.../u" PCRE
 * expression while complying with RFC 9535.
 *
 * Two rewrites are applied while scanning the pattern:
 *  - an unescaped "." outside a character class is replaced by a group that
 *    does not match \r or \n but does match the Unicode line separators
 *    U+2028 and U+2029;
 *  - an unescaped "/" is escaped, because the callers use "/" as the regex
 *    delimiter and a bare slash would otherwise terminate the pattern early.
 *
 * Escaped characters (anything following a backslash) and everything inside a
 * [...] character class other than "/" are copied through unchanged.
 *
 * @param string $pattern the regex as written in the JSONPath expression
 *
 * @return string the pattern body, without delimiters or modifiers
 */
private function transformJsonPathRegex(string $pattern): string
{
    $result = '';
    $inCharClass = false;
    $escaped = false;
    $length = \strlen($pattern);

    for ($i = 0; $i < $length; ++$i) {
        $char = $pattern[$i];

        // the character right after a backslash is always copied verbatim
        if ($escaped) {
            $result .= $char;
            $escaped = false;

            continue;
        }

        if ('\\' === $char) {
            $result .= $char;
            $escaped = true;

            continue;
        }

        // keep the result safe to wrap in "/" delimiters
        if ('/' === $char) {
            $result .= '\\/';

            continue;
        }

        if ('[' === $char && !$inCharClass) {
            $inCharClass = true;
            $result .= $char;

            continue;
        }

        if (']' === $char && $inCharClass) {
            $inCharClass = false;
            $result .= $char;

            continue;
        }

        if ('.' === $char && !$inCharClass) {
            $result .= '(?:[^\r\n]|\x{2028}|\x{2029})';
        } else {
            $result .= $char;
        }
    }

    return $result;
}
516
582
}
0 commit comments