@@ -133,6 +133,14 @@ private function evaluateBracket(string $expr, mixed $value): array
133
133
return [];
134
134
}
135
135
136
+ if (str_contains ($ expr , ', ' )) {
137
+ $ trimmed = trim ($ expr );
138
+ if (str_starts_with ($ trimmed , ', ' ) || str_ends_with ($ trimmed , ', ' )) {
139
+ throw new JsonCrawlerException ($ expr , 'Expression cannot have leading or trailing commas ' );
140
+ }
141
+ }
142
+
143
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
136
144
if ('* ' === $ expr ) {
137
145
return array_values ($ value );
138
146
}
@@ -168,8 +176,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168
176
return $ result ;
169
177
}
170
178
171
- // start, end and step
172
- if (preg_match ('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/ ' , $ expr , $ matches )) {
179
+ if (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ expr , $ matches )) {
173
180
if (!array_is_list ($ value )) {
174
181
return [];
175
182
}
@@ -217,14 +224,14 @@ private function evaluateBracket(string $expr, mixed $value): array
217
224
218
225
// filter expressions
219
226
if (preg_match ('/^\?(.*)$/ ' , $ expr , $ matches )) {
220
- $ filterExpr = $ matches [1 ];
227
+ $ filterExpr = trim ( $ matches [1 ]) ;
221
228
222
229
if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr )) {
223
230
$ filterExpr = "( $ filterExpr) " ;
224
231
}
225
232
226
233
if (!str_starts_with ($ filterExpr , '( ' )) {
227
- throw new JsonCrawlerException ( $ expr , ' Invalid filter expression ' ) ;
234
+ $ filterExpr = " ( $ filterExpr ) " ;
228
235
}
229
236
230
237
// remove outer filter parentheses
@@ -238,28 +245,31 @@ private function evaluateBracket(string $expr, mixed $value): array
238
245
$ parts = $ this ->parseCommaSeparatedValues ($ expr );
239
246
240
247
$ result = [];
241
- $ keysIndices = array_keys ($ value );
242
- $ isList = array_is_list ($ value );
243
248
244
249
foreach ($ parts as $ part ) {
245
250
$ part = trim ($ part );
246
251
247
- if (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
252
+ if ('* ' === $ part ) {
253
+ $ result = array_merge ($ result , array_values ($ value ));
254
+ } elseif (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ part , $ matches )) {
255
+ // slice notation
256
+ $ sliceResult = $ this ->evaluateBracket ($ part , $ value );
257
+ $ result = array_merge ($ result , $ sliceResult );
258
+ } elseif (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
248
259
$ key = JsonPathUtils::unescapeString ($ matches [2 ], $ matches [1 ]);
249
260
250
- if ($ isList ) {
261
+ if (array_is_list ($ value )) {
262
+ // for arrays, find ALL objects that contain this key
251
263
foreach ($ value as $ item ) {
252
264
if (\is_array ($ item ) && \array_key_exists ($ key , $ item )) {
253
265
$ result [] = $ item ;
254
- break ;
255
266
}
256
267
}
257
-
258
- continue ; // no results here
259
- }
260
-
261
- if (\array_key_exists ($ key , $ value )) {
262
- $ result [] = $ value [$ key ];
268
+ } else {
269
+ // for objects, get the value for this key
270
+ if (\array_key_exists ($ key , $ value )) {
271
+ $ result [] = $ value [$ key ];
272
+ }
263
273
}
264
274
} elseif (preg_match ('/^-?\d+$/ ' , $ part )) {
265
275
// numeric index
@@ -268,14 +278,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268
278
$ index = \count ($ value ) + $ index ;
269
279
}
270
280
271
- if ($ isList && \array_key_exists ($ index , $ value )) {
281
+ if (array_is_list ( $ value ) && \array_key_exists ($ index , $ value )) {
272
282
$ result [] = $ value [$ index ];
273
- continue ;
274
- }
275
-
276
- // numeric index on a hashmap
277
- if ( isset ( $ keysIndices [ $ index ]) && isset ( $ value [$ keysIndices [$ index ]])) {
278
- $ result [] = $ value [ $ keysIndices [ $ index ]];
283
+ } else {
284
+ // numeric index on a hashmap
285
+ $ keysIndices = array_keys ( $ value );
286
+ if ( isset ( $ keysIndices [ $ index]) && isset ( $ value [ $ keysIndices [ $ index ]])) {
287
+ $ result [] = $ value [$ keysIndices [$ index ]];
288
+ }
279
289
}
280
290
}
281
291
}
@@ -310,7 +320,32 @@ private function evaluateFilter(string $expr, mixed $value): array
310
320
311
321
private function evaluateFilterExpression (string $ expr , mixed $ context ): bool
312
322
{
313
- $ expr = trim ($ expr );
323
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
324
+
325
+ // remove outer parentheses if they wrap the entire expression
326
+ if (str_starts_with ($ expr , '( ' ) && str_ends_with ($ expr , ') ' )) {
327
+ $ depth = 0 ;
328
+ $ isWrapped = true ;
329
+ for ($ i = 0 ; $ i < strlen ($ expr ); $ i ++) {
330
+ if ($ expr [$ i ] === '( ' ) {
331
+ $ depth ++;
332
+ } elseif ($ expr [$ i ] === ') ' ) {
333
+ $ depth --;
334
+ if ($ depth === 0 && $ i < strlen ($ expr ) - 1 ) {
335
+ $ isWrapped = false ;
336
+ break ;
337
+ }
338
+ }
339
+ }
340
+ if ($ isWrapped ) {
341
+ $ expr = trim (substr ($ expr , 1 , -1 ));
342
+ }
343
+ }
344
+
345
+ if (str_starts_with ($ expr , '! ' )) {
346
+ $ innerExpr = trim (substr ($ expr , 1 ));
347
+ return !$ this ->evaluateFilterExpression ($ innerExpr , $ context );
348
+ }
314
349
315
350
if (str_contains ($ expr , '&& ' )) {
316
351
$ parts = array_map ('trim ' , explode ('&& ' , $ expr ));
@@ -353,8 +388,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353
388
}
354
389
355
390
// function calls
356
- if (preg_match ('/^(\w+) \((.*)\)$/ ' , $ expr , $ matches )) {
357
- $ functionName = $ matches [1 ];
391
+ if (preg_match ('/^(\w++)\s*+ \((.*)\)$/ ' , $ expr , $ matches )) {
392
+ $ functionName = trim ( $ matches [1 ]) ;
358
393
if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
359
394
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
360
395
}
@@ -369,8 +404,16 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369
404
370
405
private function evaluateScalar (string $ expr , mixed $ context ): mixed
371
406
{
372
- if (is_numeric ($ expr )) {
373
- return str_contains ($ expr , '. ' ) ? (float ) $ expr : (int ) $ expr ;
407
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
408
+
409
+ // RFC 9535 compliant number validation using strict JSON number format
410
+ if (preg_match ('/^-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?$/ ' , $ expr )) {
411
+ return str_contains ($ expr , '. ' ) || str_contains (strtolower ($ expr ), 'e ' ) ? (float ) $ expr : (int ) $ expr ;
412
+ }
413
+
414
+ // only validate tokens that look like standalone numbers
415
+ if (preg_match ('/^[\d+\-.eE]+$/ ' , $ expr ) && preg_match ('/\d/ ' , $ expr )) {
416
+ throw new JsonCrawlerException ($ expr , \sprintf ('Invalid number format "%s" ' , $ expr ));
374
417
}
375
418
376
419
if ('@ ' === $ expr ) {
@@ -404,8 +447,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404
447
}
405
448
406
449
// function calls
407
- if (preg_match ('/^(\w+)\((.*)\)$/ ' , $ expr , $ matches )) {
408
- $ functionName = $ matches [1 ];
450
+ if (preg_match ('/^(\w++ )\((.*)\)$/ ' , $ expr , $ matches )) {
451
+ $ functionName = trim ( $ matches [1 ]) ;
409
452
if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
410
453
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
411
454
}
@@ -416,31 +459,65 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416
459
return null ;
417
460
}
418
461
419
/**
 * Evaluates one of the RFC 9535 function extensions (length, count, match, search, value).
 *
 * Every argument is resolved to a value and, next to it, the size of the nodelist it was
 * taken from is recorded; count() and value() rely on that size rather than on the value.
 * Only the first argument's nodelist size is consulted.
 *
 * @param string $name    the function name as written in the query
 * @param string $args    the raw argument list found between the parentheses
 * @param mixed  $context the current node the "@" arguments are resolved against
 *
 * @return mixed an int for length/count, a bool for match/search, the selected value
 *               for value(), and null for any other function name
 */
private function evaluateFunction(string $name, string $args, mixed $context): mixed
{
    $argList = [];
    $nodelistSizes = [];

    // compare with '' explicitly: the string "0" is falsy in PHP, so a plain
    // truthiness check would silently drop a lone "0" argument
    $args = trim($args);
    if ('' !== $args) {
        foreach ($this->parseCommaSeparatedValues($args) as $arg) {
            $arg = trim($arg);

            if (str_starts_with($arg, '@')) { // special handling for @ to track nodelist size
                if ('@' === $arg) {
                    $argList[] = $context;
                    $nodelistSizes[] = 1;
                } elseif (!\is_array($context)) {
                    // a path cannot be followed into a non-array context
                    $argList[] = null;
                    $nodelistSizes[] = 0;
                } else {
                    $pathPart = substr($arg, 1);
                    if (str_starts_with($pathPart, '[')) {
                        // handle bracket expressions like @['a','d']; the whole result list is kept
                        $results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
                        $argList[] = $results;
                        $nodelistSizes[] = \count($results);
                    } else {
                        // handle dot notation like @.a; only the first result is kept as the value
                        $results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
                        $argList[] = $results[0] ?? null;
                        $nodelistSizes[] = \count($results);
                    }
                }
            } elseif (str_starts_with($arg, '$')) { // special handling for absolute paths
                $results = $this->evaluate(new JsonPath($arg));
                $argList[] = $results[0] ?? null;
                $nodelistSizes[] = \count($results);
            } else {
                $argList[] = $this->evaluateScalar($arg, $context);
                $nodelistSizes[] = 1;
            }
        }
    }

    $value = $argList[0] ?? null;
    $nodelistSize = $nodelistSizes[0] ?? 0;

    // "/" is used as the PCRE delimiter below, so every "/" of the user pattern that is not
    // already escaped (i.e. preceded by an even number of backslashes) has to be escaped,
    // otherwise the pattern fails to compile and the function always answers false
    $toPcre = function (string $pattern): string {
        $pattern = $this->transformJsonPathRegex($pattern);

        return preg_replace('~(?<!\\\\)(?:\\\\\\\\)*+\K/~', '\\\\/', $pattern) ?? $pattern;
    };

    return match ($name) {
        'length' => match (true) {
            \is_string($value) => mb_strlen($value),
            \is_array($value) => \count($value),
            default => 0,
        },
        'count' => $nodelistSize,
        // match() must cover the whole string, hence the anchors
        'match' => match (true) {
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/^%s$/u', $toPcre($argList[1])), $value),
            default => false,
        },
        // search() succeeds on any substring
        'search' => match (true) {
            \is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/%s/u', $toPcre($argList[1])), $value),
            default => false,
        },
        'value' => match (true) {
            // more than one node: there is no single value to return
            1 < $nodelistSize => null,
            1 === $nodelistSize && \is_array($value) => $value[0] ?? null,
            default => $value,
        },
        default => null,
    };
}
@@ -480,6 +557,7 @@ private function parseCommaSeparatedValues(string $expr): array
480
557
$ current = '' ;
481
558
$ inQuotes = false ;
482
559
$ quoteChar = null ;
560
+ $ bracketDepth = 0 ;
483
561
484
562
for ($ i = 0 ; $ i < \strlen ($ expr ); ++$ i ) {
485
563
$ char = $ expr [$ i ];
@@ -497,7 +575,11 @@ private function parseCommaSeparatedValues(string $expr): array
497
575
$ inQuotes = false ;
498
576
$ quoteChar = null ;
499
577
}
500
- } elseif (!$ inQuotes && ', ' === $ char ) {
578
+ } elseif (!$ inQuotes && '[ ' === $ char ) {
579
+ ++$ bracketDepth ;
580
+ } elseif (!$ inQuotes && '] ' === $ char ) {
581
+ --$ bracketDepth ;
582
+ } elseif (!$ inQuotes && 0 === $ bracketDepth && ', ' === $ char ) {
501
583
$ parts [] = trim ($ current );
502
584
$ current = '' ;
503
585
@@ -513,4 +595,52 @@ private function parseCommaSeparatedValues(string $expr): array
513
595
514
596
return $ parts ;
515
597
}
598
+
599
/**
 * Adapts a JSONPath regular expression to the "." semantics of RFC 9535.
 *
 * Each unescaped "." located outside of a character class is expanded into a group
 * that refuses \r and \n and explicitly lists the Unicode line separators U+2028
 * and U+2029. Everything else is copied through unchanged: the character right after
 * a backslash, and anything written between "[" and "]".
 *
 * @param string $pattern the regular expression to adapt
 *
 * @return string the adapted pattern
 */
private function transformJsonPathRegex(string $pattern): string
{
    $output = '';
    $insideClass = false;
    $afterBackslash = false;

    for ($pos = 0, $end = strlen($pattern); $pos < $end; ++$pos) {
        $symbol = $pattern[$pos];

        if ($afterBackslash) {
            // whatever follows a backslash is copied verbatim and never interpreted
            $afterBackslash = false;
            $output .= $symbol;
            continue;
        }

        $afterBackslash = '\\' === $symbol;

        // track whether the cursor sits inside a [...] character class
        if ('[' === $symbol && !$insideClass) {
            $insideClass = true;
        } elseif (']' === $symbol && $insideClass) {
            $insideClass = false;
        }

        // only a bare dot outside of a character class is rewritten
        $output .= ('.' === $symbol && !$insideClass)
            ? '(?:[^\r\n]|\x{2028}|\x{2029})'
            : $symbol;
    }

    return $output;
}
516
646
}
0 commit comments