Skip to content

Commit 053a7d6

Browse files
committed
Ruby/Python: regex parser: group sequences of 'normal' characters
1 parent 36e02ae commit 053a7d6

File tree

8 files changed

+159
-230
lines changed

8 files changed

+159
-230
lines changed

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,12 @@ newtype TRegExpParent =
3939
/** A special character */
4040
TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
4141
/** A normal character */
42-
TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
42+
TRegExpNormalChar(Regex re, int start, int end) {
43+
re.normalCharacterSequence(start, end)
44+
or
45+
re.escapedCharacter(start, end) and
46+
not re.specialCharacter(start, end, _)
47+
} or
4348
/** A back reference */
4449
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
4550

python/ql/lib/semmle/python/regex.qll

Lines changed: 41 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -446,6 +446,42 @@ abstract class RegexString extends Expr {
446446
)
447447
}
448448

449+
predicate normalCharacterSequence(int start, int end) {
450+
this.normalCharacter(start, end) and
451+
end = start + 1 and
452+
exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
453+
or
454+
exists(int s, int e |
455+
e = max(int i | normalCharacterSub(s, i)) and
456+
not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
457+
|
458+
if qualifier(e, _, _, _)
459+
then
460+
end = e and start = e - 1
461+
or
462+
end = e - 1 and start = s and start < end
463+
else (
464+
end = e and
465+
start = s
466+
)
467+
)
468+
}
469+
470+
private predicate normalCharacterSub(int start, int end) {
471+
(
472+
normalCharacterSub(start, end - 1)
473+
or
474+
start = end - 1 and not normalCharacter(start - 1, start)
475+
) and
476+
this.normalCharacter(end - 1, end)
477+
}
478+
479+
private predicate characterItem(int start, int end) {
480+
this.normalCharacterSequence(start, end) or
481+
this.escapedCharacter(start, end) or
482+
this.specialCharacter(start, end, _)
483+
}
484+
449485
/** Whether the text in the range start,end is a group */
450486
predicate group(int start, int end) {
451487
this.groupContents(start, end, _, _)
@@ -717,7 +753,7 @@ abstract class RegexString extends Expr {
717753
string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }
718754

719755
private predicate baseItem(int start, int end) {
720-
this.character(start, end) and
756+
this.characterItem(start, end) and
721757
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
722758
or
723759
this.group(start, end)
@@ -837,14 +873,14 @@ abstract class RegexString extends Expr {
837873
}
838874

839875
private predicate item_start(int start) {
840-
this.character(start, _) or
876+
this.characterItem(start, _) or
841877
this.isGroupStart(start) or
842878
this.charSet(start, _) or
843879
this.backreference(start, _)
844880
}
845881

846882
private predicate item_end(int end) {
847-
this.character(_, end)
883+
this.characterItem(_, end)
848884
or
849885
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
850886
or
@@ -953,7 +989,7 @@ abstract class RegexString extends Expr {
953989
*/
954990
predicate firstItem(int start, int end) {
955991
(
956-
this.character(start, end)
992+
this.characterItem(start, end)
957993
or
958994
this.qualifiedItem(start, end, _, _)
959995
or
@@ -968,7 +1004,7 @@ abstract class RegexString extends Expr {
9681004
*/
9691005
predicate lastItem(int start, int end) {
9701006
(
971-
this.character(start, end)
1007+
this.characterItem(start, end)
9721008
or
9731009
this.qualifiedItem(start, end, _, _)
9741010
or

python/ql/test/library-tests/regex/FirstLast.expected

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1-
| 012345678 | first | 0 | 1 |
2-
| 012345678 | last | 8 | 9 |
3-
| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
1+
| 012345678 | first | 0 | 9 |
2+
| 012345678 | last | 0 | 9 |
3+
| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
44
| (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
55
| (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
66
| (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
@@ -27,9 +27,9 @@
2727
| (?m)^(?!$) | last | 4 | 5 |
2828
| (?m)^(?!$) | last | 8 | 9 |
2929
| (\\033\|~{) | first | 1 | 5 |
30-
| (\\033\|~{) | first | 6 | 7 |
30+
| (\\033\|~{) | first | 6 | 8 |
3131
| (\\033\|~{) | last | 1 | 5 |
32-
| (\\033\|~{) | last | 7 | 8 |
32+
| (\\033\|~{) | last | 6 | 8 |
3333
| [\ufffd-\ufffd] | first | 0 | 5 |
3434
| [\ufffd-\ufffd] | last | 0 | 5 |
3535
| [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
@@ -52,8 +52,8 @@
5252
| \\A[+-]?\\d+ | last | 7 | 9 |
5353
| \\A[+-]?\\d+ | last | 7 | 10 |
5454
| \\Afoo\\Z | first | 0 | 2 |
55-
| \\Afoo\\Z | first | 2 | 3 |
56-
| \\Afoo\\Z | last | 4 | 5 |
55+
| \\Afoo\\Z | first | 2 | 5 |
56+
| \\Afoo\\Z | last | 2 | 5 |
5757
| \\Afoo\\Z | last | 5 | 7 |
5858
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
5959
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
@@ -86,30 +86,30 @@
8686
| ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
8787
| ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
8888
| ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
89-
| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
89+
| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
9090
| ax{01,3} | first | 0 | 1 |
9191
| ax{01,3} | last | 1 | 2 |
9292
| ax{01,3} | last | 1 | 8 |
93-
| ax{01,3} | last | 7 | 8 |
93+
| ax{01,3} | last | 3 | 8 |
9494
| ax{3,} | first | 0 | 1 |
9595
| ax{3,} | last | 1 | 2 |
9696
| ax{3,} | last | 1 | 6 |
97-
| ax{3,} | last | 5 | 6 |
97+
| ax{3,} | last | 3 | 6 |
9898
| ax{3} | first | 0 | 1 |
9999
| ax{3} | last | 1 | 2 |
100100
| ax{3} | last | 1 | 5 |
101-
| ax{3} | last | 4 | 5 |
101+
| ax{3} | last | 3 | 5 |
102102
| ax{,3} | first | 0 | 1 |
103103
| ax{,3} | last | 0 | 1 |
104104
| ax{,3} | last | 1 | 2 |
105105
| ax{,3} | last | 1 | 6 |
106-
| ax{,3} | last | 5 | 6 |
106+
| ax{,3} | last | 3 | 6 |
107107
| x\| | first | 0 | 1 |
108108
| x\| | last | 0 | 1 |
109109
| x\|(?<!\\w)l | first | 0 | 1 |
110110
| x\|(?<!\\w)l | first | 6 | 8 |
111111
| x\|(?<!\\w)l | first | 9 | 10 |
112112
| x\|(?<!\\w)l | last | 0 | 1 |
113113
| x\|(?<!\\w)l | last | 9 | 10 |
114-
| x{Not qual} | first | 0 | 1 |
115-
| x{Not qual} | last | 10 | 11 |
114+
| x{Not qual} | first | 0 | 11 |
115+
| x{Not qual} | last | 0 | 11 |

ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll

Lines changed: 41 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -401,6 +401,42 @@ class RegExp extends AST::RegExpLiteral {
401401
)
402402
}
403403

404+
predicate normalCharacterSequence(int start, int end) {
405+
this.normalCharacter(start, end) and
406+
end = start + 1 and
407+
exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
408+
or
409+
exists(int s, int e |
410+
e = max(int i | normalCharacterSub(s, i)) and
411+
not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
412+
|
413+
if qualifier(e, _, _, _)
414+
then
415+
end = e and start = e - 1
416+
or
417+
end = e - 1 and start = s and start < end
418+
else (
419+
end = e and
420+
start = s
421+
)
422+
)
423+
}
424+
425+
private predicate normalCharacterSub(int start, int end) {
426+
(
427+
normalCharacterSub(start, end - 1)
428+
or
429+
start = end - 1 and not normalCharacter(start - 1, start)
430+
) and
431+
this.normalCharacter(end - 1, end)
432+
}
433+
434+
private predicate characterItem(int start, int end) {
435+
this.normalCharacterSequence(start, end) or
436+
this.escapedCharacter(start, end) or
437+
this.specialCharacter(start, end, _)
438+
}
439+
404440
/** Whether the text in the range `start,end` is a group */
405441
predicate group(int start, int end) {
406442
this.groupContents(start, end, _, _)
@@ -639,7 +675,7 @@ class RegExp extends AST::RegExpLiteral {
639675
string getBackRefName(int start, int end) { this.namedBackreference(start, end, result) }
640676

641677
private predicate baseItem(int start, int end) {
642-
this.character(start, end) and
678+
this.characterItem(start, end) and
643679
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
644680
or
645681
this.group(start, end)
@@ -746,15 +782,15 @@ class RegExp extends AST::RegExpLiteral {
746782
}
747783

748784
private predicate itemStart(int start) {
749-
this.character(start, _) or
785+
this.characterItem(start, _) or
750786
this.isGroupStart(start) or
751787
this.charSet(start, _) or
752788
this.backreference(start, _) or
753789
this.namedCharacterProperty(start, _, _)
754790
}
755791

756792
private predicate itemEnd(int end) {
757-
this.character(_, end)
793+
this.characterItem(_, end)
758794
or
759795
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
760796
or
@@ -865,7 +901,7 @@ class RegExp extends AST::RegExpLiteral {
865901
*/
866902
predicate firstItem(int start, int end) {
867903
(
868-
this.character(start, end)
904+
this.characterItem(start, end)
869905
or
870906
this.qualifiedItem(start, end, _, _)
871907
or
@@ -880,7 +916,7 @@ class RegExp extends AST::RegExpLiteral {
880916
*/
881917
predicate lastItem(int start, int end) {
882918
(
883-
this.character(start, end)
919+
this.characterItem(start, end)
884920
or
885921
this.qualifiedItem(start, end, _, _)
886922
or

ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,12 @@ newtype TRegExpParent =
228228
TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or
229229
TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
230230
TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
231-
TRegExpNormalChar(RegExp re, int start, int end) { re.normalCharacter(start, end) } or
231+
TRegExpNormalChar(RegExp re, int start, int end) {
232+
re.normalCharacterSequence(start, end)
233+
or
234+
re.escapedCharacter(start, end) and
235+
not re.specialCharacter(start, end, _)
236+
} or
232237
TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or
233238
TRegExpNamedCharacterProperty(RegExp re, int start, int end) {
234239
re.namedCharacterProperty(start, end, _)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy