From 69ed121ecb3e8f0425dc73bf0c6a8a2cf48c1250 Mon Sep 17 00:00:00 2001 From: Arthur Baars Date: Tue, 22 Feb 2022 10:51:47 +0100 Subject: [PATCH 1/4] Ruby/Python: regex parser: group sequences of 'normal' characters --- python/ql/lib/semmle/python/RegexTreeView.qll | 7 +- python/ql/lib/semmle/python/regex.qll | 49 +++++- .../library-tests/regex/FirstLast.expected | 28 +-- .../Security/CWE-730-ReDoS/ReDoS.expected | 2 +- .../ruby/security/performance/ParseRegExp.qll | 49 +++++- .../security/performance/RegExpTreeView.qll | 7 +- ruby/ql/test/library-tests/ast/Ast.expected | 92 +++------- .../test/library-tests/regexp/parse.expected | 161 ++++-------------- .../cwe-1333-exponential-redos/ReDoS.expected | 2 +- 9 files changed, 166 insertions(+), 231 deletions(-) diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index 808bb265b697..95d983f5e88f 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -39,7 +39,12 @@ newtype TRegExpParent = /** A special character */ TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or /** A normal character */ - TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or + TRegExpNormalChar(Regex re, int start, int end) { + re.normalCharacterSequence(start, end) + or + re.escapedCharacter(start, end) and + not re.specialCharacter(start, end, _) + } or /** A back reference */ TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) } diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll index 001b3bdc635c..910169d20836 100644 --- a/python/ql/lib/semmle/python/regex.qll +++ b/python/ql/lib/semmle/python/regex.qll @@ -446,6 +446,45 @@ abstract class RegexString extends Expr { ) } + /** + * A sequence of 'normal' characters. + */ + predicate normalCharacterSequence(int start, int end) { + this.normalCharacter(start, end) and + end = start + 1 and + exists(int x, int y | this.charSet(x, y) and x <= start and y >= end) + or + exists(int s, int e | + e = max(int i | normalCharacterSub(s, i)) and + not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e) + | + if qualifier(e, _, _, _) + then + end = e and start = e - 1 + or + end = e - 1 and start = s and start < end + else ( + end = e and + start = s + ) + ) + } + + private predicate normalCharacterSub(int start, int end) { + ( + normalCharacterSub(start, end - 1) + or + start = end - 1 and not normalCharacter(start - 1, start) + ) and + this.normalCharacter(end - 1, end) + } + + private predicate characterItem(int start, int end) { + this.normalCharacterSequence(start, end) or + this.escapedCharacter(start, end) or + this.specialCharacter(start, end, _) + } + /** Whether the text in the range start,end is a group */ predicate group(int start, int end) { this.groupContents(start, end, _, _) @@ -717,7 +756,7 @@ abstract class RegexString extends Expr { string getBackrefName(int start, int end) { this.named_backreference(start, end, result) } private predicate baseItem(int start, int end) { - this.character(start, end) and + this.characterItem(start, end) and not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end) or this.group(start, end) @@ -837,14 +876,14 @@ abstract class RegexString extends Expr { } private predicate item_start(int start) { - this.character(start, _) or + this.characterItem(start, _) or this.isGroupStart(start) or this.charSet(start, _) or this.backreference(start, _) } private predicate item_end(int end) { - this.character(_, end) + this.characterItem(_, end) or exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1) or @@ -953,7 +992,7 @@ abstract class RegexString extends Expr { */ predicate firstItem(int start, int end) { ( - this.character(start, end) + this.characterItem(start, end) or this.qualifiedItem(start, end, _, _) or @@ -968,7 +1007,7 @@ abstract class RegexString extends Expr { */ predicate lastItem(int start, int end) { ( - this.character(start, end) + this.characterItem(start, end) or this.qualifiedItem(start, end, _, _) or diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected index 5c393547a53c..e388e0d1fdf7 100644 --- a/python/ql/test/library-tests/regex/FirstLast.expected +++ b/python/ql/test/library-tests/regex/FirstLast.expected @@ -1,6 +1,6 @@ -| 012345678 | first | 0 | 1 | -| 012345678 | last | 8 | 9 | -| (?!not-this)^[A-Z_]+$ | first | 3 | 4 | +| 012345678 | first | 0 | 9 | +| 012345678 | last | 0 | 9 | +| (?!not-this)^[A-Z_]+$ | first | 3 | 11 | | (?!not-this)^[A-Z_]+$ | first | 12 | 13 | | (?!not-this)^[A-Z_]+$ | first | 13 | 19 | | (?!not-this)^[A-Z_]+$ | first | 13 | 20 | @@ -27,9 +27,9 @@ | (?m)^(?!$) | last | 4 | 5 | | (?m)^(?!$) | last | 8 | 9 | | (\\033\|~{) | first | 1 | 5 | -| (\\033\|~{) | first | 6 | 7 | +| (\\033\|~{) | first | 6 | 8 | | (\\033\|~{) | last | 1 | 5 | -| (\\033\|~{) | last | 7 | 8 | +| (\\033\|~{) | last | 6 | 8 | | [\ufffd-\ufffd] | first | 0 | 5 | | [\ufffd-\ufffd] | last | 0 | 5 | | [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 | @@ -52,8 +52,8 @@ | \\A[+-]?\\d+ | last | 7 | 9 | | \\A[+-]?\\d+ | last | 7 | 10 | | \\Afoo\\Z | first | 0 | 2 | -| \\Afoo\\Z | first | 2 | 3 | -| \\Afoo\\Z | last | 4 | 5 | +| \\Afoo\\Z | first | 2 | 5 | +| \\Afoo\\Z | last | 2 | 5 | | \\Afoo\\Z | last | 5 | 7 | | \\[(?P[^[]*)\\]\\((?P[^)]*) | first | 0 | 2 | | \\[(?P[^[]*)\\]\\((?P[^)]*) | last | 28 | 32 | @@ -86,24 +86,24 @@ | ^[A-Z_]+$(?= end) + or + exists(int s, int e | + e = max(int i | normalCharacterSub(s, i)) and + not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e) + | + if qualifier(e, _, _, _) + then + end = e and start = e - 1 + or + end = e - 1 and start = s and start < end + else ( + end = e and + start = s + ) + ) + } + + private predicate normalCharacterSub(int start, int end) { + ( + normalCharacterSub(start, end - 1) + or + start = end - 1 and not normalCharacter(start - 1, start) + ) and + this.normalCharacter(end - 1, end) + } + + private predicate characterItem(int start, int end) { + this.normalCharacterSequence(start, end) or + this.escapedCharacter(start, end) or + this.specialCharacter(start, end, _) + } + /** Whether the text in the range `start,end` is a group */ predicate group(int start, int end) { this.groupContents(start, end, _, _) @@ -639,7 +678,7 @@ class RegExp extends AST::RegExpLiteral { string getBackRefName(int start, int end) { this.namedBackreference(start, end, result) } private predicate baseItem(int start, int end) { - this.character(start, end) and + this.characterItem(start, end) and not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end) or this.group(start, end) @@ -746,7 +785,7 @@ class RegExp extends AST::RegExpLiteral { } private predicate itemStart(int start) { - this.character(start, _) or + this.characterItem(start, _) or this.isGroupStart(start) or this.charSet(start, _) or this.backreference(start, _) or @@ -754,7 +793,7 @@ class RegExp extends AST::RegExpLiteral { } private predicate itemEnd(int end) { - this.character(_, end) + this.characterItem(_, end) or exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1) or @@ -865,7 +904,7 @@ class RegExp extends AST::RegExpLiteral { */ predicate firstItem(int start, int end) { ( - this.character(start, end) + this.characterItem(start, end) or this.qualifiedItem(start, end, _, _) or @@ -880,7 +919,7 @@ class RegExp extends AST::RegExpLiteral { */ predicate lastItem(int start, int end) { ( - this.character(start, end) + this.characterItem(start, end) or this.qualifiedItem(start, end, _, _) or diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll index 9c8e39e56cea..7c2df79abef4 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll @@ -228,7 +228,12 @@ newtype TRegExpParent = TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or - TRegExpNormalChar(RegExp re, int start, int end) { re.normalCharacter(start, end) } or + TRegExpNormalChar(RegExp re, int start, int end) { + re.normalCharacterSequence(start, end) + or + re.escapedCharacter(start, end) and + not re.specialCharacter(start, end, _) + } or TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or TRegExpNamedCharacterProperty(RegExp re, int start, int end) { re.namedCharacterProperty(start, end, _) diff --git a/ruby/ql/test/library-tests/ast/Ast.expected b/ruby/ql/test/library-tests/ast/Ast.expected index a3e090324ba4..422af8141d5a 100644 --- a/ruby/ql/test/library-tests/ast/Ast.expected +++ b/ruby/ql/test/library-tests/ast/Ast.expected @@ -973,10 +973,8 @@ control/cases.rb: # 92| getParsed: [RegExpSequence] .*abc[0-9] # 92| 0: [RegExpStar] .* # 92| 0: [RegExpDot] . -# 92| 1: [RegExpConstant, RegExpNormalChar] a -# 92| 2: [RegExpConstant, RegExpNormalChar] b -# 92| 3: [RegExpConstant, RegExpNormalChar] c -# 92| 4: [RegExpCharacterClass] [0-9] +# 92| 1: [RegExpConstant, RegExpNormalChar] abc +# 92| 2: [RegExpCharacterClass] [0-9] # 92| 0: [RegExpCharacterRange] 0-9 # 92| 0: [RegExpConstant, RegExpNormalChar] 0 # 92| 1: [RegExpConstant, RegExpNormalChar] 9 @@ -1823,47 +1821,25 @@ literals/literals.rb: # 133| getAnOperand/getArgument/getRightOperand: [IntegerLiteral] 4 # 136| getStmt: [RegExpLiteral] // # 137| getStmt: [RegExpLiteral] /foo/ -# 137| getParsed: [RegExpSequence] foo -# 137| 0: [RegExpConstant, RegExpNormalChar] f -# 137| 1: [RegExpConstant, RegExpNormalChar] o -# 137| 2: [RegExpConstant, RegExpNormalChar] o +# 137| getParsed: [RegExpConstant, RegExpNormalChar] foo # 137| getComponent: [RegExpTextComponent] foo # 138| getStmt: [RegExpLiteral] /foo/ -# 138| getParsed: [RegExpSequence] foo -# 138| 0: [RegExpConstant, RegExpNormalChar] f -# 138| 1: [RegExpConstant, RegExpNormalChar] o -# 138| 2: [RegExpConstant, RegExpNormalChar] o +# 138| getParsed: [RegExpConstant, RegExpNormalChar] foo # 138| getComponent: [RegExpTextComponent] foo # 139| getStmt: [RegExpLiteral] /foo+\sbar\S/ # 139| getParsed: [RegExpSequence] foo+\sbar\S -# 139| 0: [RegExpConstant, RegExpNormalChar] f -# 139| 1: [RegExpConstant, RegExpNormalChar] o -# 139| 2: [RegExpPlus] o+ +# 139| 0: [RegExpConstant, RegExpNormalChar] fo +# 139| 1: [RegExpPlus] o+ # 139| 0: [RegExpConstant, RegExpNormalChar] o -# 139| 3: [RegExpCharacterClassEscape] \s -# 139| 4: [RegExpConstant, RegExpNormalChar] b -# 139| 5: [RegExpConstant, RegExpNormalChar] a -# 139| 6: [RegExpConstant, RegExpNormalChar] r -# 139| 7: [RegExpCharacterClassEscape] \S +# 139| 2: [RegExpCharacterClassEscape] \s +# 139| 3: [RegExpConstant, RegExpNormalChar] bar +# 139| 4: [RegExpCharacterClassEscape] \S # 139| getComponent: [RegExpTextComponent] foo+ # 139| getComponent: [RegExpEscapeSequenceComponent] \s # 139| getComponent: [RegExpTextComponent] bar # 139| getComponent: [RegExpEscapeSequenceComponent] \S # 140| getStmt: [RegExpLiteral] /foo#{...}bar#{...}#{...}/ -# 140| getParsed: [RegExpSequence] foo2barbarbar -# 140| 0: [RegExpConstant, RegExpNormalChar] f -# 140| 1: [RegExpConstant, RegExpNormalChar] o -# 140| 2: [RegExpConstant, RegExpNormalChar] o -# 140| 3: [RegExpConstant, RegExpNormalChar] 2 -# 140| 4: [RegExpConstant, RegExpNormalChar] b -# 140| 5: [RegExpConstant, RegExpNormalChar] a -# 140| 6: [RegExpConstant, RegExpNormalChar] r -# 140| 7: [RegExpConstant, RegExpNormalChar] b -# 140| 8: [RegExpConstant, RegExpNormalChar] a -# 140| 9: [RegExpConstant, RegExpNormalChar] r -# 140| 10: [RegExpConstant, RegExpNormalChar] b -# 140| 11: [RegExpConstant, RegExpNormalChar] a -# 140| 12: [RegExpConstant, RegExpNormalChar] r +# 140| getParsed: [RegExpConstant, RegExpNormalChar] foo2barbarbar # 140| getComponent: [RegExpTextComponent] foo # 140| getComponent: [RegExpInterpolationComponent] #{...} # 140| getStmt: [AddExpr] ... + ... @@ -1878,47 +1854,25 @@ literals/literals.rb: # 141| getComponent: [RegExpTextComponent] foo # 142| getStmt: [RegExpLiteral] // # 143| getStmt: [RegExpLiteral] /foo/ -# 143| getParsed: [RegExpSequence] foo -# 143| 0: [RegExpConstant, RegExpNormalChar] f -# 143| 1: [RegExpConstant, RegExpNormalChar] o -# 143| 2: [RegExpConstant, RegExpNormalChar] o +# 143| getParsed: [RegExpConstant, RegExpNormalChar] foo # 143| getComponent: [RegExpTextComponent] foo # 144| getStmt: [RegExpLiteral] /foo/ -# 144| getParsed: [RegExpSequence] foo -# 144| 0: [RegExpConstant, RegExpNormalChar] f -# 144| 1: [RegExpConstant, RegExpNormalChar] o -# 144| 2: [RegExpConstant, RegExpNormalChar] o +# 144| getParsed: [RegExpConstant, RegExpNormalChar] foo # 144| getComponent: [RegExpTextComponent] foo # 145| getStmt: [RegExpLiteral] /foo+\sbar\S/ # 145| getParsed: [RegExpSequence] foo+\sbar\S -# 145| 0: [RegExpConstant, RegExpNormalChar] f -# 145| 1: [RegExpConstant, RegExpNormalChar] o -# 145| 2: [RegExpPlus] o+ +# 145| 0: [RegExpConstant, RegExpNormalChar] fo +# 145| 1: [RegExpPlus] o+ # 145| 0: [RegExpConstant, RegExpNormalChar] o -# 145| 3: [RegExpCharacterClassEscape] \s -# 145| 4: [RegExpConstant, RegExpNormalChar] b -# 145| 5: [RegExpConstant, RegExpNormalChar] a -# 145| 6: [RegExpConstant, RegExpNormalChar] r -# 145| 7: [RegExpCharacterClassEscape] \S +# 145| 2: [RegExpCharacterClassEscape] \s +# 145| 3: [RegExpConstant, RegExpNormalChar] bar +# 145| 4: [RegExpCharacterClassEscape] \S # 145| getComponent: [RegExpTextComponent] foo+ # 145| getComponent: [RegExpEscapeSequenceComponent] \s # 145| getComponent: [RegExpTextComponent] bar # 145| getComponent: [RegExpEscapeSequenceComponent] \S # 146| getStmt: [RegExpLiteral] /foo#{...}bar#{...}#{...}/ -# 146| getParsed: [RegExpSequence] foo2barbarbar -# 146| 0: [RegExpConstant, RegExpNormalChar] f -# 146| 1: [RegExpConstant, RegExpNormalChar] o -# 146| 2: [RegExpConstant, RegExpNormalChar] o -# 146| 3: [RegExpConstant, RegExpNormalChar] 2 -# 146| 4: [RegExpConstant, RegExpNormalChar] b -# 146| 5: [RegExpConstant, RegExpNormalChar] a -# 146| 6: [RegExpConstant, RegExpNormalChar] r -# 146| 7: [RegExpConstant, RegExpNormalChar] b -# 146| 8: [RegExpConstant, RegExpNormalChar] a -# 146| 9: [RegExpConstant, RegExpNormalChar] r -# 146| 10: [RegExpConstant, RegExpNormalChar] b -# 146| 11: [RegExpConstant, RegExpNormalChar] a -# 146| 12: [RegExpConstant, RegExpNormalChar] r +# 146| getParsed: [RegExpConstant, RegExpNormalChar] foo2barbarbar # 146| getComponent: [RegExpTextComponent] foo # 146| getComponent: [RegExpInterpolationComponent] #{...} # 146| getStmt: [AddExpr] ... + ... @@ -2469,10 +2423,8 @@ operations/operations.rb: # 65| getAnOperand/getLeftOperand/getReceiver: [LocalVariableAccess] name # 65| getAnOperand/getArgument/getRightOperand: [RegExpLiteral] /foo.*/ # 65| getParsed: [RegExpSequence] foo.* -# 65| 0: [RegExpConstant, RegExpNormalChar] f -# 65| 1: [RegExpConstant, RegExpNormalChar] o -# 65| 2: [RegExpConstant, RegExpNormalChar] o -# 65| 3: [RegExpStar] .* +# 65| 0: [RegExpConstant, RegExpNormalChar] foo +# 65| 1: [RegExpStar] .* # 65| 0: [RegExpDot] . # 65| getComponent: [RegExpTextComponent] foo.* # 66| getStmt: [NoRegExpMatchExpr] ... !~ ... @@ -2481,9 +2433,7 @@ operations/operations.rb: # 66| getParsed: [RegExpSequence] .*bar # 66| 0: [RegExpStar] .* # 66| 0: [RegExpDot] . -# 66| 1: [RegExpConstant, RegExpNormalChar] b -# 66| 2: [RegExpConstant, RegExpNormalChar] a -# 66| 3: [RegExpConstant, RegExpNormalChar] r +# 66| 1: [RegExpConstant, RegExpNormalChar] bar # 66| getComponent: [RegExpTextComponent] .*bar # 69| getStmt: [AssignAddExpr] ... += ... # 69| getAnOperand/getLeftOperand: [LocalVariableAccess] x diff --git a/ruby/ql/test/library-tests/regexp/parse.expected b/ruby/ql/test/library-tests/regexp/parse.expected index c42b90d1ab81..3241ce25388f 100644 --- a/ruby/ql/test/library-tests/regexp/parse.expected +++ b/ruby/ql/test/library-tests/regexp/parse.expected @@ -1,14 +1,5 @@ regexp.rb: -# 5| [RegExpConstant, RegExpNormalChar] a - -# 5| [RegExpSequence] abc -#-----| 0 -> [RegExpConstant, RegExpNormalChar] a -#-----| 1 -> [RegExpConstant, RegExpNormalChar] b -#-----| 2 -> [RegExpConstant, RegExpNormalChar] c - -# 5| [RegExpConstant, RegExpNormalChar] b - -# 5| [RegExpConstant, RegExpNormalChar] c +# 5| [RegExpConstant, RegExpNormalChar] abc # 8| [RegExpConstant, RegExpNormalChar] a @@ -38,70 +29,36 @@ regexp.rb: # 9| [RegExpRange] a{4,8} #-----| 0 -> [RegExpConstant, RegExpNormalChar] a -# 9| [RegExpNormalChar] 4 - -# 9| [RegExpNormalChar] , - -# 9| [RegExpNormalChar] 8 - -# 9| [RegExpNormalChar] } +# 9| [RegExpNormalChar] 4,8} # 10| [RegExpConstant, RegExpNormalChar] a # 10| [RegExpRange] a{,8} #-----| 0 -> [RegExpConstant, RegExpNormalChar] a -# 10| [RegExpNormalChar] , - -# 10| [RegExpNormalChar] 8 - -# 10| [RegExpNormalChar] } +# 10| [RegExpNormalChar] ,8} # 11| [RegExpConstant, RegExpNormalChar] a # 11| [InfiniteRepetitionQuantifier, RegExpRange] a{3,} #-----| 0 -> [RegExpConstant, RegExpNormalChar] a -# 11| [RegExpNormalChar] 3 - -# 11| [RegExpNormalChar] , - -# 11| [RegExpNormalChar] } +# 11| [RegExpNormalChar] 3,} # 12| [RegExpConstant, RegExpNormalChar] a # 12| [RegExpRange] a{7} #-----| 0 -> [RegExpConstant, RegExpNormalChar] a -# 12| [RegExpNormalChar] 7 +# 12| [RegExpNormalChar] 7} -# 12| [RegExpNormalChar] } - -# 15| [RegExpConstant, RegExpNormalChar] f - -# 15| [RegExpSequence] foo -#-----| 0 -> [RegExpConstant, RegExpNormalChar] f -#-----| 1 -> [RegExpConstant, RegExpNormalChar] o -#-----| 2 -> [RegExpConstant, RegExpNormalChar] o +# 15| [RegExpConstant, RegExpNormalChar] foo # 15| [RegExpAlt] foo|bar -#-----| 0 -> [RegExpSequence] foo -#-----| 1 -> [RegExpSequence] bar - -# 15| [RegExpConstant, RegExpNormalChar] o +#-----| 0 -> [RegExpConstant, RegExpNormalChar] foo +#-----| 1 -> [RegExpConstant, RegExpNormalChar] bar -# 15| [RegExpConstant, RegExpNormalChar] o - -# 15| [RegExpConstant, RegExpNormalChar] b - -# 15| [RegExpSequence] bar -#-----| 0 -> [RegExpConstant, RegExpNormalChar] b -#-----| 1 -> [RegExpConstant, RegExpNormalChar] a -#-----| 2 -> [RegExpConstant, RegExpNormalChar] r - -# 15| [RegExpConstant, RegExpNormalChar] a - -# 15| [RegExpConstant, RegExpNormalChar] r +# 15| [RegExpConstant, RegExpNormalChar] bar # 18| [RegExpCharacterClass] [abc] #-----| 0 -> [RegExpConstant, RegExpNormalChar] a @@ -229,10 +186,7 @@ regexp.rb: # 29| [RegExpSequence] [[a-f]A-F] #-----| 0 -> [RegExpCharacterClass] [[a-f] -#-----| 1 -> [RegExpConstant, RegExpNormalChar] A -#-----| 2 -> [RegExpConstant, RegExpNormalChar] - -#-----| 3 -> [RegExpConstant, RegExpNormalChar] F -#-----| 4 -> [RegExpConstant, RegExpNormalChar] ] +#-----| 1 -> [RegExpConstant, RegExpNormalChar] A-F] # 29| [RegExpConstant, RegExpNormalChar] [ @@ -244,13 +198,7 @@ regexp.rb: # 29| [RegExpConstant, RegExpNormalChar] f -# 29| [RegExpConstant, RegExpNormalChar] A - -# 29| [RegExpConstant, RegExpNormalChar] - - -# 29| [RegExpConstant, RegExpNormalChar] F - -# 29| [RegExpConstant, RegExpNormalChar] ] +# 29| [RegExpConstant, RegExpNormalChar] A-F] # 32| [RegExpDot] . @@ -312,69 +260,41 @@ regexp.rb: # 41| [RegExpSequence] \Gabc #-----| 0 -> [RegExpSpecialChar] \G -#-----| 1 -> [RegExpConstant, RegExpNormalChar] a -#-----| 2 -> [RegExpConstant, RegExpNormalChar] b -#-----| 3 -> [RegExpConstant, RegExpNormalChar] c - -# 41| [RegExpConstant, RegExpNormalChar] a +#-----| 1 -> [RegExpConstant, RegExpNormalChar] abc -# 41| [RegExpConstant, RegExpNormalChar] b - -# 41| [RegExpConstant, RegExpNormalChar] c +# 41| [RegExpConstant, RegExpNormalChar] abc # 42| [RegExpSpecialChar] \b # 42| [RegExpSequence] \b!a\B #-----| 0 -> [RegExpSpecialChar] \b -#-----| 1 -> [RegExpConstant, RegExpNormalChar] ! -#-----| 2 -> [RegExpConstant, RegExpNormalChar] a -#-----| 3 -> [RegExpSpecialChar] \B - -# 42| [RegExpConstant, RegExpNormalChar] ! +#-----| 1 -> [RegExpConstant, RegExpNormalChar] !a +#-----| 2 -> [RegExpSpecialChar] \B -# 42| [RegExpConstant, RegExpNormalChar] a +# 42| [RegExpConstant, RegExpNormalChar] !a # 42| [RegExpSpecialChar] \B # 45| [RegExpGroup] (foo) -#-----| 0 -> [RegExpSequence] foo +#-----| 0 -> [RegExpConstant, RegExpNormalChar] foo # 45| [RegExpStar] (foo)* #-----| 0 -> [RegExpGroup] (foo) # 45| [RegExpSequence] (foo)*bar #-----| 0 -> [RegExpStar] (foo)* -#-----| 1 -> [RegExpConstant, RegExpNormalChar] b -#-----| 2 -> [RegExpConstant, RegExpNormalChar] a -#-----| 3 -> [RegExpConstant, RegExpNormalChar] r - -# 45| [RegExpConstant, RegExpNormalChar] f - -# 45| [RegExpSequence] foo -#-----| 0 -> [RegExpConstant, RegExpNormalChar] f -#-----| 1 -> [RegExpConstant, RegExpNormalChar] o -#-----| 2 -> [RegExpConstant, RegExpNormalChar] o - -# 45| [RegExpConstant, RegExpNormalChar] o +#-----| 1 -> [RegExpConstant, RegExpNormalChar] bar -# 45| [RegExpConstant, RegExpNormalChar] o +# 45| [RegExpConstant, RegExpNormalChar] foo -# 45| [RegExpConstant, RegExpNormalChar] b +# 45| [RegExpConstant, RegExpNormalChar] bar -# 45| [RegExpConstant, RegExpNormalChar] a - -# 45| [RegExpConstant, RegExpNormalChar] r - -# 46| [RegExpConstant, RegExpNormalChar] f +# 46| [RegExpConstant, RegExpNormalChar] fo # 46| [RegExpSequence] fo(o|b)ar -#-----| 0 -> [RegExpConstant, RegExpNormalChar] f -#-----| 1 -> [RegExpConstant, RegExpNormalChar] o -#-----| 2 -> [RegExpGroup] (o|b) -#-----| 3 -> [RegExpConstant, RegExpNormalChar] a -#-----| 4 -> [RegExpConstant, RegExpNormalChar] r - -# 46| [RegExpConstant, RegExpNormalChar] o +#-----| 0 -> [RegExpConstant, RegExpNormalChar] fo +#-----| 1 -> [RegExpGroup] (o|b) +#-----| 2 -> [RegExpConstant, RegExpNormalChar] ar # 46| [RegExpGroup] (o|b) #-----| 0 -> [RegExpAlt] o|b @@ -387,9 +307,7 @@ regexp.rb: # 46| [RegExpConstant, RegExpNormalChar] b -# 46| [RegExpConstant, RegExpNormalChar] a - -# 46| [RegExpConstant, RegExpNormalChar] r +# 46| [RegExpConstant, RegExpNormalChar] ar # 47| [RegExpGroup] (a|b|cd) #-----| 0 -> [RegExpAlt] a|b|cd @@ -403,17 +321,11 @@ regexp.rb: # 47| [RegExpAlt] a|b|cd #-----| 0 -> [RegExpConstant, RegExpNormalChar] a #-----| 1 -> [RegExpConstant, RegExpNormalChar] b -#-----| 2 -> [RegExpSequence] cd +#-----| 2 -> [RegExpConstant, RegExpNormalChar] cd # 47| [RegExpConstant, RegExpNormalChar] b -# 47| [RegExpConstant, RegExpNormalChar] c - -# 47| [RegExpSequence] cd -#-----| 0 -> [RegExpConstant, RegExpNormalChar] c -#-----| 1 -> [RegExpConstant, RegExpNormalChar] d - -# 47| [RegExpConstant, RegExpNormalChar] d +# 47| [RegExpConstant, RegExpNormalChar] cd # 47| [RegExpConstant, RegExpNormalChar] e @@ -511,13 +423,7 @@ regexp.rb: # 61| [RegExpRange] \p{^Alnum}{2,3} #-----| 0 -> [RegExpNamedCharacterProperty] \p{^Alnum} -# 61| [RegExpNormalChar] 2 - -# 61| [RegExpNormalChar] , - -# 61| [RegExpNormalChar] 3 - -# 61| [RegExpNormalChar] } +# 61| [RegExpNormalChar] 2,3} # 62| [RegExpCharacterClass] [a-f\p{Digit}] #-----| 0 -> [RegExpCharacterRange] a-f @@ -583,13 +489,4 @@ regexp.rb: # 74| [RegExpNamedCharacterProperty] [:digit:] -# 78| [RegExpConstant, RegExpNormalChar] a - -# 78| [RegExpSequence] abc -#-----| 0 -> [RegExpConstant, RegExpNormalChar] a -#-----| 1 -> [RegExpConstant, RegExpNormalChar] b -#-----| 2 -> [RegExpConstant, RegExpNormalChar] c - -# 78| [RegExpConstant, RegExpNormalChar] b - -# 78| [RegExpConstant, RegExpNormalChar] c +# 78| [RegExpConstant, RegExpNormalChar] abc diff --git a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected index 213f0e11189a..572ac08887d3 100644 --- a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected +++ b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected @@ -54,7 +54,7 @@ | tst.rb:218:11:218:15 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. | | tst.rb:221:16:221:16 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. | | tst.rb:227:16:227:16 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. | -| tst.rb:239:13:239:13 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. | +| tst.rb:239:12:239:13 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. | | tst.rb:245:11:245:17 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. | | tst.rb:254:11:254:13 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. | | tst.rb:254:23:254:25 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. | From 9d9abaf1f902f812a9adf73e0873b564f1c246cb Mon Sep 17 00:00:00 2001 From: Arthur Baars Date: Fri, 25 Feb 2022 12:18:37 +0100 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: yoff --- python/ql/lib/semmle/python/RegexTreeView.qll | 2 +- python/ql/lib/semmle/python/regex.qll | 28 +++++++++++-------- .../ruby/security/performance/ParseRegExp.qll | 28 +++++++++++-------- .../security/performance/RegExpTreeView.qll | 2 +- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index 95d983f5e88f..428ad3c3e63c 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -40,7 +40,7 @@ newtype TRegExpParent = TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or /** A normal character */ TRegExpNormalChar(Regex re, int start, int end) { - re.normalCharacterSequence(start, end) + re.simpleCharacterSequence(start, end) or re.escapedCharacter(start, end) and not re.specialCharacter(start, end, _) diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll index 910169d20836..24e47af3d8f8 100644 --- a/python/ql/lib/semmle/python/regex.qll +++ b/python/ql/lib/semmle/python/regex.qll @@ -447,18 +447,22 @@ abstract class RegexString extends Expr { } /** - * A sequence of 'normal' characters. + * A sequence of 'simple' characters. */ - predicate normalCharacterSequence(int start, int end) { - this.normalCharacter(start, end) and - end = start + 1 and - exists(int x, int y | this.charSet(x, y) and x <= start and y >= end) + predicate simpleCharacterSequence(int start, int end) { + // a simple character inside a character set is interpreted on its own + this.simpleCharacter(start, end) and + this.inCharSet(start) or + // a maximal run of simple characters is considered as one constant exists(int s, int e | - e = max(int i | normalCharacterSub(s, i)) and - not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e) + e = max(int i | simpleCharacterRun(s, i)) and + not this.inCharSet(s) | - if qualifier(e, _, _, _) + // 'abc' can be considered one constant, but + // 'abc+' has to be broken up into 'ab' and 'c+', + // as the qualifier only applies to 'c'. + if this.qualifier(e, _, _, _) then end = e and start = e - 1 or @@ -470,17 +474,17 @@ abstract class RegexString extends Expr { ) } - private predicate normalCharacterSub(int start, int end) { + private predicate simpleCharacterRun(int start, int end) { ( - normalCharacterSub(start, end - 1) + simpleCharacterRun(start, end - 1) or start = end - 1 and not normalCharacter(start - 1, start) ) and - this.normalCharacter(end - 1, end) + this.simpleCharacter(end - 1, end) } private predicate characterItem(int start, int end) { - this.normalCharacterSequence(start, end) or + this.simpleCharacterSequence(start, end) or this.escapedCharacter(start, end) or this.specialCharacter(start, end, _) } diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll index 11dc890b89a7..ea364d4eb318 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll @@ -402,18 +402,22 @@ class RegExp extends AST::RegExpLiteral { } /** - * A sequence of 'normal' characters. + * A sequence of 'simple' characters. */ - predicate normalCharacterSequence(int start, int end) { - this.normalCharacter(start, end) and - end = start + 1 and - exists(int x, int y | this.charSet(x, y) and x <= start and y >= end) + predicate simpleCharacterSequence(int start, int end) { + // a simple character inside a character set is interpreted on its own + this.simpleCharacter(start, end) and + this.inCharSet(start) or + // a maximal run of simple characters is considered as one constant exists(int s, int e | - e = max(int i | normalCharacterSub(s, i)) and - not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e) + e = max(int i | simpleCharacterRun(s, i)) and + not this.inCharSet(s) | - if qualifier(e, _, _, _) + // 'abc' can be considered one constant, but + // 'abc+' has to be broken up into 'ab' and 'c+', + // as the qualifier only applies to 'c'. + if this.qualifier(e, _, _, _) then end = e and start = e - 1 or @@ -425,17 +429,17 @@ class RegExp extends AST::RegExpLiteral { ) } - private predicate normalCharacterSub(int start, int end) { + private predicate simpleCharacterRun(int start, int end) { ( - normalCharacterSub(start, end - 1) + simpleCharacterRun(start, end - 1) or start = end - 1 and not normalCharacter(start - 1, start) ) and - this.normalCharacter(end - 1, end) + this.simpleCharacter(end - 1, end) } private predicate characterItem(int start, int end) { - this.normalCharacterSequence(start, end) or + this.simpleCharacterSequence(start, end) or this.escapedCharacter(start, end) or this.specialCharacter(start, end, _) } diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll index 7c2df79abef4..1ae338e45b60 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll @@ -229,7 +229,7 @@ newtype TRegExpParent = TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or TRegExpNormalChar(RegExp re, int start, int end) { - re.normalCharacterSequence(start, end) + re.simpleCharacterSequence(start, end) or re.escapedCharacter(start, end) and not re.specialCharacter(start, end, _) From 5044f8910516d548f1316816963d04ba98632d60 Mon Sep 17 00:00:00 2001 From: Arthur Baars Date: Fri, 25 Feb 2022 12:57:59 +0100 Subject: [PATCH 3/4] Ruby/Python re-introduce normalCharacterSequence --- python/ql/lib/semmle/python/RegexTreeView.qll | 2 +- python/ql/lib/semmle/python/regex.qll | 23 ++++++++++--------- python/ql/test/library-tests/regex/Regex.ql | 4 ++++ .../ruby/security/performance/ParseRegExp.qll | 23 ++++++++++--------- .../security/performance/RegExpTreeView.qll | 2 +- 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index 428ad3c3e63c..95d983f5e88f 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -40,7 +40,7 @@ newtype TRegExpParent = TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or /** A normal character */ TRegExpNormalChar(Regex re, int start, int end) { - re.simpleCharacterSequence(start, end) + re.normalCharacterSequence(start, end) or re.escapedCharacter(start, end) and not re.specialCharacter(start, end, _) diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll index 24e47af3d8f8..fb56fa0ab118 100644 --- a/python/ql/lib/semmle/python/regex.qll +++ b/python/ql/lib/semmle/python/regex.qll @@ -427,6 +427,7 @@ abstract class RegexString extends Expr { } predicate normalCharacter(int start, int end) { + end = start + 1 and this.character(start, end) and not this.specialCharacter(start, end, _) } @@ -447,16 +448,16 @@ abstract class RegexString extends Expr { } /** - * A sequence of 'simple' characters. + * Holds if the range [start:end) consists of only 'normal' characters. */ - predicate simpleCharacterSequence(int start, int end) { - // a simple character inside a character set is interpreted on its own - this.simpleCharacter(start, end) and + predicate normalCharacterSequence(int start, int end) { + // a normal character inside a character set is interpreted on its own + this.normalCharacter(start, end) and this.inCharSet(start) or - // a maximal run of simple characters is considered as one constant + // a maximal run of normal characters is considered as one constant exists(int s, int e | - e = max(int i | simpleCharacterRun(s, i)) and + e = max(int i | this.normalCharacterRun(s, i)) and not this.inCharSet(s) | // 'abc' can be considered one constant, but @@ -474,17 +475,17 @@ abstract class RegexString extends Expr { ) } - private predicate simpleCharacterRun(int start, int end) { + private predicate normalCharacterRun(int start, int end) { ( - simpleCharacterRun(start, end - 1) + this.normalCharacterRun(start, end - 1) or - start = end - 1 and not normalCharacter(start - 1, start) + start = end - 1 and not this.normalCharacter(start - 1, start) ) and - this.simpleCharacter(end - 1, end) + this.normalCharacter(end - 1, end) } private predicate characterItem(int start, int end) { - this.simpleCharacterSequence(start, end) or + this.normalCharacterSequence(start, end) or this.escapedCharacter(start, end) or this.specialCharacter(start, end, _) } diff --git a/python/ql/test/library-tests/regex/Regex.ql b/python/ql/test/library-tests/regex/Regex.ql index eb0628bfcde7..4c799ac25741 100644 --- a/python/ql/test/library-tests/regex/Regex.ql +++ b/python/ql/test/library-tests/regex/Regex.ql @@ -6,6 +6,10 @@ predicate part(Regex r, int start, int end, string kind) { or r.normalCharacter(start, end) and kind = "char" or + r.escapedCharacter(start, end) and + kind = "char" and + not r.specialCharacter(start, end, _) + or r.specialCharacter(start, end, kind) or r.sequence(start, end) and kind = "sequence" diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll index ea364d4eb318..397381e6a7fe 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll @@ -382,6 +382,7 @@ class RegExp extends AST::RegExpLiteral { } predicate normalCharacter(int start, int end) { + end = start + 1 and this.character(start, end) and not this.specialCharacter(start, end, _) } @@ -402,16 +403,16 @@ class RegExp extends AST::RegExpLiteral { } /** - * A sequence of 'simple' characters. + * Holds if the range [start:end) consists of only 'normal' characters. */ - predicate simpleCharacterSequence(int start, int end) { - // a simple character inside a character set is interpreted on its own - this.simpleCharacter(start, end) and + predicate normalCharacterSequence(int start, int end) { + // a normal character inside a character set is interpreted on its own + this.normalCharacter(start, end) and this.inCharSet(start) or - // a maximal run of simple characters is considered as one constant + // a maximal run of normal characters is considered as one constant exists(int s, int e | - e = max(int i | simpleCharacterRun(s, i)) and + e = max(int i | this.normalCharacterRun(s, i)) and not this.inCharSet(s) | // 'abc' can be considered one constant, but @@ -429,17 +430,17 @@ class RegExp extends AST::RegExpLiteral { ) } - private predicate simpleCharacterRun(int start, int end) { + private predicate normalCharacterRun(int start, int end) { ( - simpleCharacterRun(start, end - 1) + this.normalCharacterRun(start, end - 1) or - start = end - 1 and not normalCharacter(start - 1, start) + start = end - 1 and not this.normalCharacter(start - 1, start) ) and - this.simpleCharacter(end - 1, end) + this.normalCharacter(end - 1, end) } private predicate characterItem(int start, int end) { - this.simpleCharacterSequence(start, end) or + this.normalCharacterSequence(start, end) or this.escapedCharacter(start, end) or this.specialCharacter(start, end, _) } diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll index 1ae338e45b60..7c2df79abef4 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll @@ -229,7 +229,7 @@ newtype TRegExpParent = TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or TRegExpNormalChar(RegExp re, int start, int end) { - re.simpleCharacterSequence(start, end) + re.normalCharacterSequence(start, end) or re.escapedCharacter(start, end) and not re.specialCharacter(start, end, _) From 0c23f5815fbe221222592f5670037462ed9d901c Mon Sep 17 00:00:00 2001 From: Arthur Baars Date: Fri, 25 Feb 2022 17:36:34 +0100 Subject: [PATCH 4/4] Add change note --- .../ql/lib/change-notes/2022-02-25-regex-group-characters.md | 4 ++++ ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 python/ql/lib/change-notes/2022-02-25-regex-group-characters.md create mode 100644 ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md diff --git a/python/ql/lib/change-notes/2022-02-25-regex-group-characters.md b/python/ql/lib/change-notes/2022-02-25-regex-group-characters.md new file mode 100644 index 000000000000..615fe0023133 --- /dev/null +++ b/python/ql/lib/change-notes/2022-02-25-regex-group-characters.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`. diff --git a/ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md b/ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md new file mode 100644 index 000000000000..615fe0023133 --- /dev/null +++ b/ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy