@@ -133,7 +133,11 @@ private function evaluateBracket(string $expr, mixed $value): array
133
133
return [];
134
134
}
135
135
136
- if ('* ' === $ expr ) {
136
+ if (str_contains ($ expr , ', ' ) && (str_starts_with ($ trimmed = trim ($ expr ), ', ' ) || str_ends_with ($ trimmed , ', ' ))) {
137
+ throw new JsonCrawlerException ($ expr , 'Expression cannot have leading or trailing commas ' );
138
+ }
139
+
140
+ if ('* ' === $ expr = JsonPathUtils::normalizeWhitespace ($ expr )) {
137
141
return array_values ($ value );
138
142
}
139
143
@@ -168,8 +172,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168
172
return $ result ;
169
173
}
170
174
171
- // start, end and step
172
- if (preg_match ('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/ ' , $ expr , $ matches )) {
175
+ if (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ expr , $ matches )) {
173
176
if (!array_is_list ($ value )) {
174
177
return [];
175
178
}
@@ -217,14 +220,12 @@ private function evaluateBracket(string $expr, mixed $value): array
217
220
218
221
// filter expressions
219
222
if (preg_match ('/^\?(.*)$/ ' , $ expr , $ matches )) {
220
- $ filterExpr = $ matches [1 ];
221
-
222
- if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr )) {
223
+ if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr = trim ($ matches [1 ]))) {
223
224
$ filterExpr = "( $ filterExpr) " ;
224
225
}
225
226
226
227
if (!str_starts_with ($ filterExpr , '( ' )) {
227
- throw new JsonCrawlerException ( $ expr , ' Invalid filter expression ' ) ;
228
+ $ filterExpr = " ( $ filterExpr ) " ;
228
229
}
229
230
230
231
// remove outer filter parentheses
@@ -235,30 +236,30 @@ private function evaluateBracket(string $expr, mixed $value): array
235
236
236
237
// comma-separated values, e.g. `['key1', 'key2', 123]` or `[0, 1, 'key']`
237
238
if (str_contains ($ expr , ', ' )) {
238
- $ parts = $ this -> parseCommaSeparatedValues ($ expr );
239
+ $ parts = JsonPathUtils:: parseCommaSeparatedValues ($ expr );
239
240
240
241
$ result = [];
241
- $ keysIndices = array_keys ($ value );
242
- $ isList = array_is_list ($ value );
243
242
244
243
foreach ($ parts as $ part ) {
245
244
$ part = trim ($ part );
246
245
247
- if (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
246
+ if ('* ' === $ part ) {
247
+ $ result = array_merge ($ result , array_values ($ value ));
248
+ } elseif (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ part , $ matches )) {
249
+ // slice notation
250
+ $ sliceResult = $ this ->evaluateBracket ($ part , $ value );
251
+ $ result = array_merge ($ result , $ sliceResult );
252
+ } elseif (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
248
253
$ key = JsonPathUtils::unescapeString ($ matches [2 ], $ matches [1 ]);
249
254
250
- if ($ isList ) {
255
+ if (array_is_list ($ value )) {
256
+ // for arrays, find ALL objects that contain this key
251
257
foreach ($ value as $ item ) {
252
258
if (\is_array ($ item ) && \array_key_exists ($ key , $ item )) {
253
259
$ result [] = $ item ;
254
- break ;
255
260
}
256
261
}
257
-
258
- continue ; // no results here
259
- }
260
-
261
- if (\array_key_exists ($ key , $ value )) {
262
+ } elseif (\array_key_exists ($ key , $ value )) { // for objects, get the value for this key
262
263
$ result [] = $ value [$ key ];
263
264
}
264
265
} elseif (preg_match ('/^-?\d+$/ ' , $ part )) {
@@ -268,14 +269,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268
269
$ index = \count ($ value ) + $ index ;
269
270
}
270
271
271
- if ($ isList && \array_key_exists ($ index , $ value )) {
272
+ if (array_is_list ( $ value ) && \array_key_exists ($ index , $ value )) {
272
273
$ result [] = $ value [$ index ];
273
- continue ;
274
- }
275
-
276
- // numeric index on a hashmap
277
- if ( isset ( $ keysIndices [ $ index ]) && isset ( $ value [$ keysIndices [$ index ]])) {
278
- $ result [] = $ value [ $ keysIndices [ $ index ]];
274
+ } else {
275
+ // numeric index on a hashmap
276
+ $ keysIndices = array_keys ( $ value );
277
+ if ( isset ( $ keysIndices [ $ index]) && isset ( $ value [ $ keysIndices [ $ index ]])) {
278
+ $ result [] = $ value [$ keysIndices [$ index ]];
279
+ }
279
280
}
280
281
}
281
282
}
@@ -310,7 +311,31 @@ private function evaluateFilter(string $expr, mixed $value): array
310
311
311
312
private function evaluateFilterExpression (string $ expr , mixed $ context ): bool
312
313
{
313
- $ expr = trim ($ expr );
314
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
315
+
316
+ // remove outer parentheses if they wrap the entire expression
317
+ if (str_starts_with ($ expr , '( ' ) && str_ends_with ($ expr , ') ' )) {
318
+ $ depth = 0 ;
319
+ $ isWrapped = true ;
320
+ for ($ i = 0 ; $ i < \strlen ($ expr ); ++$ i ) {
321
+ if ('( ' === $ expr [$ i ]) {
322
+ ++$ depth ;
323
+ } elseif (') ' === $ expr [$ i ]) {
324
+ --$ depth ;
325
+ if (0 === $ depth && $ i < \strlen ($ expr ) - 1 ) {
326
+ $ isWrapped = false ;
327
+ break ;
328
+ }
329
+ }
330
+ }
331
+ if ($ isWrapped ) {
332
+ $ expr = trim (substr ($ expr , 1 , -1 ));
333
+ }
334
+ }
335
+
336
+ if (str_starts_with ($ expr , '! ' )) {
337
+ return !$ this ->evaluateFilterExpression (trim (substr ($ expr , 1 )), $ context );
338
+ }
314
339
315
340
if (str_contains ($ expr , '&& ' )) {
316
341
$ parts = array_map ('trim ' , explode ('&& ' , $ expr ));
@@ -353,8 +378,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353
378
}
354
379
355
380
// function calls
356
- if (preg_match ('/^(\w+) \((.*)\)$/ ' , $ expr , $ matches )) {
357
- $ functionName = $ matches [1 ];
381
+ if (preg_match ('/^(\w++)\s*+ \((.*)\)$/ ' , $ expr , $ matches )) {
382
+ $ functionName = trim ( $ matches [1 ]) ;
358
383
if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
359
384
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
360
385
}
@@ -369,8 +394,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369
394
370
395
private function evaluateScalar (string $ expr , mixed $ context ): mixed
371
396
{
372
- if (is_numeric ($ expr )) {
373
- return str_contains ($ expr , '. ' ) ? (float ) $ expr : (int ) $ expr ;
397
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
398
+
399
+ if (JsonPathUtils::isJsonNumber ($ expr )) {
400
+ return str_contains ($ expr , '. ' ) || str_contains (strtolower ($ expr ), 'e ' ) ? (float ) $ expr : (int ) $ expr ;
401
+ }
402
+
403
+ // only validate tokens that look like standalone numbers
404
+ if (preg_match ('/^[\d+\-.eE]+$/ ' , $ expr ) && preg_match ('/\d/ ' , $ expr )) {
405
+ throw new JsonCrawlerException ($ expr , \sprintf ('Invalid number format "%s" ' , $ expr ));
374
406
}
375
407
376
408
if ('@ ' === $ expr ) {
@@ -404,9 +436,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404
436
}
405
437
406
438
// function calls
407
- if (preg_match ('/^(\w+)\((.*)\)$/ ' , $ expr , $ matches )) {
408
- $ functionName = $ matches [1 ];
409
- if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
439
+ if (preg_match ('/^(\w++)\((.*)\)$/ ' , $ expr , $ matches )) {
440
+ if (!isset (self ::RFC9535_FUNCTIONS [$ functionName = trim ($ matches [1 ])])) {
410
441
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
411
442
}
412
443
@@ -416,31 +447,60 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416
447
return null ;
417
448
}
418
449
419
- private function evaluateFunction (string $ name , string $ args , array $ context ): mixed
450
/**
 * Evaluates one of the RFC 9535 function extensions (length, count, match, search, value).
 *
 * Each argument is resolved to a value and, alongside it, the size of the
 * node list it came from: literals and a bare `@` count as one node, while
 * `$...`, `@.x` and `@[...]` queries report how many nodes they selected.
 * Only the first argument's node-list size is used (by count() and value()).
 *
 * @param string $name    the function name as written in the expression
 * @param string $args    the raw text between the function's parentheses
 * @param mixed  $context the current node (`@`) of the enclosing filter
 *
 * @return mixed the function result, or null when $name is not a handled function
 */
private function evaluateFunction(string $name, string $args, mixed $context): mixed
{
    $argList = [];
    $nodelistSizes = [];

    // Compare against '' explicitly: a lone "0" argument is falsy in PHP and
    // would otherwise be treated as "no arguments at all".
    if ('' !== ($args = trim($args))) {
        foreach (JsonPathUtils::parseCommaSeparatedValues($args) as $arg) {
            $arg = trim($arg);

            if (str_starts_with($arg, '$')) {
                // absolute path: evaluated against the crawler's own document
                $results = $this->evaluate(new JsonPath($arg));
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            } elseif (!str_starts_with($arg, '@')) {
                // literal or nested function call
                $argList[] = $this->evaluateScalar($arg, $context);
                $nodelistSizes[] = 1;
            } elseif ('@' === $arg) {
                $argList[] = $context;
                $nodelistSizes[] = 1;
            } elseif (!\is_array($context)) {
                // a relative path cannot select anything below a scalar node
                $argList[] = null;
                $nodelistSizes[] = 0;
            } elseif (str_starts_with($pathPart = substr($arg, 1), '[')) {
                // bracket expressions like @['a','d']
                $results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
                // A single selected node is stored as that node's value, like the
                // other branches do, so length()/match()/value() see the value
                // itself rather than a one-element node list.
                $argList[] = 1 === \count($results) ? reset($results) : $results;
                $nodelistSizes[] = \count($results);
            } else {
                // dot notation like @.a
                $results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            }
        }
    }

    $value = $argList[0] ?? null;
    $pattern = $argList[1] ?? null;
    $nodelistSize = $nodelistSizes[0] ?? 0;

    // Builds the PCRE string for match()/search(). Every "/" preceded by an even
    // number of backslashes is escaped so it cannot terminate the delimited
    // pattern; match() additionally wraps the pattern in a group with \A...\z so
    // alternations are anchored on both sides and no trailing newline slips through.
    $toRegex = function (string $jsonPathPattern, bool $anchored): string {
        $body = $this->transformJsonPathRegex($jsonPathPattern);
        $body = preg_replace('~(?<!\\\\)(?:\\\\\\\\)*+\K/~', '\\\\/', $body) ?? $body;

        return $anchored ? '/\A(?:'.$body.')\z/u' : '/'.$body.'/u';
    };

    return match ($name) {
        'length' => match (true) {
            \is_string($value) => mb_strlen($value),
            \is_array($value) => \count($value),
            default => 0,
        },
        'count' => $nodelistSize,
        // an invalid pattern makes preg_match() return false; the warning is silenced on purpose
        'match' => \is_string($value) && \is_string($pattern) && (bool) @preg_match($toRegex($pattern, true), $value),
        'search' => \is_string($value) && \is_string($pattern) && (bool) @preg_match($toRegex($pattern, false), $value),
        // exactly one node yields its value as is (arrays included); anything else yields null
        'value' => 1 === $nodelistSize ? $value : null,
        default => null,
    };
}
@@ -474,43 +534,52 @@ private function compare(mixed $left, mixed $right, string $operator): bool
474
534
};
475
535
}
476
536
477
- private function parseCommaSeparatedValues (string $ expr ): array
537
/*
 * Rewrites a JSONPath regular expression so that it follows RFC 9535:
 * an unescaped '.' outside of a character class is replaced by a group
 * that rejects \r and \n while still accepting the Unicode line
 * separators U+2028 and U+2029. Everything else is copied verbatim.
 */
private function transformJsonPathRegex(string $pattern): string
{
    $out = '';
    $insideClass = false;
    $size = \strlen($pattern);
    $pos = 0;

    while ($pos < $size) {
        $current = $pattern[$pos];

        if ('\\' === $current) {
            // copy the backslash together with the character it escapes
            // (or alone, when it is the last character of the pattern)
            $out .= substr($pattern, $pos, 2);
            $pos += 2;
            continue;
        }

        // track whether we are between an opening '[' and its closing ']'
        if ('[' === $current && !$insideClass) {
            $insideClass = true;
        } elseif (']' === $current && $insideClass) {
            $insideClass = false;
        }

        $out .= ('.' === $current && !$insideClass)
            ? '(?:[^\r\n]|\x{2028}|\x{2029})'
            : $current;
        ++$pos;
    }

    return $out;
}
516
585
}
0 commit comments