Skip to content

Commit 2246c96

Browse files
authored
Merge pull request #15244 from Marcono1234/marcono1234/regex-flags
Java: Improve Regex flag parsing
2 parents 6c9f79c + 3edfdc5 commit 2246c96

File tree

5 files changed

+60
-15
lines changed

5 files changed

+60
-15
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: fix
3+
---
4+
* Fixed regular expressions containing flags not being parsed correctly in some cases.

java/ql/lib/semmle/code/java/regex/regex.qll

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ abstract class RegexString extends StringLiteral {
479479
private predicate flagGroupStartNoModes(int start, int end) {
480480
this.isGroupStart(start) and
481481
this.getChar(start + 1) = "?" and
482-
this.getChar(start + 2) in ["i", "m", "s", "u", "x", "U"] and
482+
this.getChar(start + 2) in ["-", "i", "d", "m", "s", "u", "x", "U"] and
483483
end = start + 2
484484
}
485485

@@ -491,15 +491,18 @@ abstract class RegexString extends StringLiteral {
491491
this.flagGroupStartNoModes(start, pos)
492492
or
493493
this.modeCharacter(start, pos - 1) and
494-
this.getChar(pos) in ["i", "m", "s", "u", "x", "U"]
494+
this.getChar(pos) in ["-", "i", "d", "m", "s", "u", "x", "U"]
495495
}
496496

497497
/**
498498
* Holds if a parse mode group is between `start` and `end`.
499499
*/
500500
private predicate flagGroupStart(int start, int end) {
501501
this.flagGroupStartNoModes(start, _) and
502-
end = max(int i | this.modeCharacter(start, i) | i + 1)
502+
// Check if this is a capturing group with flags, and therefore the `:` should be excluded
503+
exists(int maybeEnd | maybeEnd = max(int i | this.modeCharacter(start, i) | i + 1) |
504+
if this.getChar(maybeEnd) = ":" then end = maybeEnd + 1 else end = maybeEnd
505+
)
503506
}
504507

505508
/**
@@ -510,9 +513,15 @@ abstract class RegexString extends StringLiteral {
510513
* ```
511514
*/
512515
private predicate flag(string c) {
513-
exists(int pos |
514-
this.modeCharacter(_, pos) and
515-
this.getChar(pos) = c
516+
exists(int start, int pos |
517+
this.modeCharacter(start, pos) and
518+
this.getChar(pos) = c and
519+
// Ignore if flag is disabled; use `<=` to also exclude `-` itself
520+
// This does not properly handle the (contrived) case where a flag is both enabled and
521+
// disabled, e.g. `(?i-i)a+`, in which case the flag seems to acts as if it was disabled
522+
not exists(int minusPos |
523+
this.modeCharacter(start, minusPos) and this.getChar(minusPos) = "-" and minusPos <= pos
524+
)
516525
)
517526
}
518527

@@ -524,6 +533,8 @@ abstract class RegexString extends StringLiteral {
524533
exists(string c | this.flag(c) |
525534
c = "i" and result = "IGNORECASE"
526535
or
536+
c = "d" and result = "UNIXLINES"
537+
or
527538
c = "m" and result = "MULTILINE"
528539
or
529540
c = "s" and result = "DOTALL"
@@ -930,13 +941,13 @@ class Regex extends RegexString {
930941

931942
/**
932943
* Gets a mode (if any) of this regular expression. Can be any of:
933-
* DEBUG
934-
* IGNORECASE
935-
* MULTILINE
936-
* DOTALL
937-
* UNICODE
938-
* VERBOSE
939-
* UNICODECLASS
944+
* - IGNORECASE
945+
* - UNIXLINES
946+
* - MULTILINE
947+
* - DOTALL
948+
* - UNICODE
949+
* - VERBOSE
950+
* - UNICODECLASS
940951
*/
941952
string getAMode() {
942953
result != "None" and
@@ -946,7 +957,7 @@ class Regex extends RegexString {
946957
}
947958

948959
/**
949-
* Holds if this regex is used to match against a full string,
960+
* Holds if this regex is used to match against a full string,
950961
* as though it was implicitly surrounded by ^ and $.
951962
*/
952963
predicate matchesFullString() { matches_full_string = true }

java/ql/test/library-tests/regex/parser/RegexParseTests.expected

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
parseFailures
2+
modes
3+
| Test.java:17:9:17:37 | "(?i)(?=a)(?!b)(?<=c)(?<!d)+" | IGNORECASE |
4+
| Test.java:22:9:22:85 | "(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+" | DOTALL,IGNORECASE,MULTILINE,UNICODE,UNICODECLASS,UNIXLINES,VERBOSE |
5+
| Test.java:23:9:23:24 | "(?idms-iuxU)a+" | DOTALL,IGNORECASE,MULTILINE,UNIXLINES |
26
#select
37
| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] |
48
| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] |
@@ -205,3 +209,25 @@ parseFailures
205209
| Test.java:21:62:21:62 | b | [RegExpConstant,RegExpNormalChar] |
206210
| Test.java:21:64:21:64 | b | [RegExpConstant,RegExpNormalChar] |
207211
| Test.java:21:66:21:66 | b | [RegExpConstant,RegExpNormalChar] |
212+
| Test.java:22:10:22:27 | (?idmsuxU-idmsuxU) | [RegExpZeroWidthMatch] |
213+
| Test.java:22:10:22:84 | (?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+ | [RegExpSequence] |
214+
| Test.java:22:28:22:28 | a | [RegExpConstant,RegExpNormalChar] |
215+
| Test.java:22:28:22:29 | a+ | [RegExpPlus] |
216+
| Test.java:22:30:22:40 | (?-idmsuxU) | [RegExpZeroWidthMatch] |
217+
| Test.java:22:41:22:41 | b | [RegExpConstant,RegExpNormalChar] |
218+
| Test.java:22:41:22:42 | b+ | [RegExpPlus] |
219+
| Test.java:22:43:22:54 | (?idmsuxU:c) | [RegExpGroup] |
220+
| Test.java:22:53:22:53 | c | [RegExpConstant,RegExpNormalChar] |
221+
| Test.java:22:55:22:55 | d | [RegExpConstant,RegExpNormalChar] |
222+
| Test.java:22:55:22:56 | d+ | [RegExpPlus] |
223+
| Test.java:22:57:22:69 | (?-idmsuxU:e) | [RegExpGroup] |
224+
| Test.java:22:68:22:68 | e | [RegExpConstant,RegExpNormalChar] |
225+
| Test.java:22:70:22:70 | f | [RegExpConstant,RegExpNormalChar] |
226+
| Test.java:22:70:22:71 | f+ | [RegExpPlus] |
227+
| Test.java:22:72:22:82 | (?idmsuxU:) | [RegExpZeroWidthMatch] |
228+
| Test.java:22:83:22:83 | g | [RegExpConstant,RegExpNormalChar] |
229+
| Test.java:22:83:22:84 | g+ | [RegExpPlus] |
230+
| Test.java:23:10:23:21 | (?idms-iuxU) | [RegExpZeroWidthMatch] |
231+
| Test.java:23:10:23:23 | (?idms-iuxU)a+ | [RegExpSequence] |
232+
| Test.java:23:22:23:22 | a | [RegExpConstant,RegExpNormalChar] |
233+
| Test.java:23:22:23:23 | a+ | [RegExpPlus] |

java/ql/test/library-tests/regex/parser/RegexParseTests.ql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,7 @@ string getQLClases(RegexTreeView::RegExpTerm t) {
88

99
query predicate parseFailures(Regex::Regex r, int i) { r.failedToParse(i) }
1010

11+
query predicate modes(Regex::Regex r, string modes) { modes = strictconcat(r.getAMode(), ",") }
12+
1113
from RegexTreeView::RegExpTerm t
1214
select t, getQLClases(t)

java/ql/test/library-tests/regex/parser/Test.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ class Test {
1818
"a||b|c(d|e|)f|g+",
1919
"\\018\\033\\0377\\0777\u1337+",
2020
"[|]+",
21-
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+"
21+
"(a(a(a(a(a(a((((c))))a))))))((((((b(((((d)))))b)b)b)b)b)b)+",
22+
"(?idmsuxU-idmsuxU)a+(?-idmsuxU)b+(?idmsuxU:c)d+(?-idmsuxU:e)f+(?idmsuxU:)g+",
23+
"(?idms-iuxU)a+",
2224
};
2325

2426
void test() {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy