From 69ed121ecb3e8f0425dc73bf0c6a8a2cf48c1250 Mon Sep 17 00:00:00 2001
From: Arthur Baars <aibaars@github.com>
Date: Tue, 22 Feb 2022 10:51:47 +0100
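Subject: [PATCH 1/4] Ruby/Python: regex parser: group sequences of 'normal'
 characters

Outside character sets, parse each maximal run of adjacent 'normal'
characters as a single RegExpNormalChar term instead of one term per
character. Inside a character set every normal character remains its
own term. When a run is directly followed by a qualifier, the last
character is split off so that the qualifier applies to that character
only (e.g. 'foo+' becomes 'fo' and 'o+').

The expected test output is updated to match the new term boundaries.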
---
 python/ql/lib/semmle/python/RegexTreeView.qll |   7 +-
 python/ql/lib/semmle/python/regex.qll         |  49 +++++-
 .../library-tests/regex/FirstLast.expected    |  28 +--
 .../Security/CWE-730-ReDoS/ReDoS.expected     |   2 +-
 .../ruby/security/performance/ParseRegExp.qll |  49 +++++-
 .../security/performance/RegExpTreeView.qll   |   7 +-
 ruby/ql/test/library-tests/ast/Ast.expected   |  92 +++-------
 .../test/library-tests/regexp/parse.expected  | 161 ++++--------------
 .../cwe-1333-exponential-redos/ReDoS.expected |   2 +-
 9 files changed, 166 insertions(+), 231 deletions(-)

diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll
index 808bb265b697..95d983f5e88f 100644
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -39,7 +39,12 @@ newtype TRegExpParent =
   /** A special character */
   TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
   /** A normal character */
-  TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
+  TRegExpNormalChar(Regex re, int start, int end) {
+    re.normalCharacterSequence(start, end)
+    or
+    re.escapedCharacter(start, end) and
+    not re.specialCharacter(start, end, _)
+  } or
   /** A back reference */
   TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
 
diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll
index 001b3bdc635c..910169d20836 100644
--- a/python/ql/lib/semmle/python/regex.qll
+++ b/python/ql/lib/semmle/python/regex.qll
@@ -446,6 +446,45 @@ abstract class RegexString extends Expr {
     )
   }
 
+  /**
+   * A sequence of 'normal' characters.
+   */
+  predicate normalCharacterSequence(int start, int end) {
+    this.normalCharacter(start, end) and
+    end = start + 1 and
+    exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
+    or
+    exists(int s, int e |
+      e = max(int i | normalCharacterSub(s, i)) and
+      not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
+    |
+      if qualifier(e, _, _, _)
+      then
+        end = e and start = e - 1
+        or
+        end = e - 1 and start = s and start < end
+      else (
+        end = e and
+        start = s
+      )
+    )
+  }
+
+  private predicate normalCharacterSub(int start, int end) {
+    (
+      normalCharacterSub(start, end - 1)
+      or
+      start = end - 1 and not normalCharacter(start - 1, start)
+    ) and
+    this.normalCharacter(end - 1, end)
+  }
+
+  private predicate characterItem(int start, int end) {
+    this.normalCharacterSequence(start, end) or
+    this.escapedCharacter(start, end) or
+    this.specialCharacter(start, end, _)
+  }
+
   /** Whether the text in the range start,end is a group */
   predicate group(int start, int end) {
     this.groupContents(start, end, _, _)
@@ -717,7 +756,7 @@ abstract class RegexString extends Expr {
   string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }
 
   private predicate baseItem(int start, int end) {
-    this.character(start, end) and
+    this.characterItem(start, end) and
     not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
     or
     this.group(start, end)
@@ -837,14 +876,14 @@ abstract class RegexString extends Expr {
   }
 
   private predicate item_start(int start) {
-    this.character(start, _) or
+    this.characterItem(start, _) or
     this.isGroupStart(start) or
     this.charSet(start, _) or
     this.backreference(start, _)
   }
 
   private predicate item_end(int end) {
-    this.character(_, end)
+    this.characterItem(_, end)
     or
     exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
     or
@@ -953,7 +992,7 @@ abstract class RegexString extends Expr {
    */
   predicate firstItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or
@@ -968,7 +1007,7 @@ abstract class RegexString extends Expr {
    */
   predicate lastItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or
diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected
index 5c393547a53c..e388e0d1fdf7 100644
--- a/python/ql/test/library-tests/regex/FirstLast.expected
+++ b/python/ql/test/library-tests/regex/FirstLast.expected
@@ -1,6 +1,6 @@
-| 012345678 | first | 0 | 1 |
-| 012345678 | last | 8 | 9 |
-| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
+| 012345678 | first | 0 | 9 |
+| 012345678 | last | 0 | 9 |
+| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
 | (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
 | (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
 | (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
@@ -27,9 +27,9 @@
 | (?m)^(?!$) | last | 4 | 5 |
 | (?m)^(?!$) | last | 8 | 9 |
 | (\\033\|~{) | first | 1 | 5 |
-| (\\033\|~{) | first | 6 | 7 |
+| (\\033\|~{) | first | 6 | 8 |
 | (\\033\|~{) | last | 1 | 5 |
-| (\\033\|~{) | last | 7 | 8 |
+| (\\033\|~{) | last | 6 | 8 |
 | [\ufffd-\ufffd] | first | 0 | 5 |
 | [\ufffd-\ufffd] | last | 0 | 5 |
 | [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
@@ -52,8 +52,8 @@
 | \\A[+-]?\\d+ | last | 7 | 9 |
 | \\A[+-]?\\d+ | last | 7 | 10 |
 | \\Afoo\\Z | first | 0 | 2 |
-| \\Afoo\\Z | first | 2 | 3 |
-| \\Afoo\\Z | last | 4 | 5 |
+| \\Afoo\\Z | first | 2 | 5 |
+| \\Afoo\\Z | last | 2 | 5 |
 | \\Afoo\\Z | last | 5 | 7 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
@@ -86,24 +86,24 @@
 | ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
 | ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
 | ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
-| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
+| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
 | ax{01,3} | first | 0 | 1 |
 | ax{01,3} | last | 1 | 2 |
 | ax{01,3} | last | 1 | 8 |
-| ax{01,3} | last | 7 | 8 |
+| ax{01,3} | last | 3 | 8 |
 | ax{3,} | first | 0 | 1 |
 | ax{3,} | last | 1 | 2 |
 | ax{3,} | last | 1 | 6 |
-| ax{3,} | last | 5 | 6 |
+| ax{3,} | last | 3 | 6 |
 | ax{3} | first | 0 | 1 |
 | ax{3} | last | 1 | 2 |
 | ax{3} | last | 1 | 5 |
-| ax{3} | last | 4 | 5 |
+| ax{3} | last | 3 | 5 |
 | ax{,3} | first | 0 | 1 |
 | ax{,3} | last | 0 | 1 |
 | ax{,3} | last | 1 | 2 |
 | ax{,3} | last | 1 | 6 |
-| ax{,3} | last | 5 | 6 |
+| ax{,3} | last | 3 | 6 |
 | x\| | first | 0 | 1 |
 | x\| | last | 0 | 1 |
 | x\|(?<!\\w)l | first | 0 | 1 |
@@ -111,5 +111,5 @@
 | x\|(?<!\\w)l | first | 9 | 10 |
 | x\|(?<!\\w)l | last | 0 | 1 |
 | x\|(?<!\\w)l | last | 9 | 10 |
-| x{Not qual} | first | 0 | 1 |
-| x{Not qual} | last | 10 | 11 |
+| x{Not qual} | first | 0 | 11 |
+| x{Not qual} | last | 0 | 11 |
diff --git a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
index 487650e216b8..76cc5992bdb8 100644
--- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
+++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@@ -59,7 +59,7 @@
 | redos.py:220:25:220:29 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
 | redos.py:223:30:223:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
 | redos.py:229:30:229:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
-| redos.py:241:27:241:27 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
+| redos.py:241:26:241:27 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
 | redos.py:247:25:247:31 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
 | redos.py:256:25:256:27 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
 | redos.py:256:37:256:39 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
index a4c7e1cd03ac..11dc890b89a7 100644
--- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
@@ -401,6 +401,45 @@ class RegExp extends AST::RegExpLiteral {
     )
   }
 
+  /**
+   * A sequence of 'normal' characters.
+   */
+  predicate normalCharacterSequence(int start, int end) {
+    this.normalCharacter(start, end) and
+    end = start + 1 and
+    exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
+    or
+    exists(int s, int e |
+      e = max(int i | normalCharacterSub(s, i)) and
+      not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
+    |
+      if qualifier(e, _, _, _)
+      then
+        end = e and start = e - 1
+        or
+        end = e - 1 and start = s and start < end
+      else (
+        end = e and
+        start = s
+      )
+    )
+  }
+
+  private predicate normalCharacterSub(int start, int end) {
+    (
+      normalCharacterSub(start, end - 1)
+      or
+      start = end - 1 and not normalCharacter(start - 1, start)
+    ) and
+    this.normalCharacter(end - 1, end)
+  }
+
+  private predicate characterItem(int start, int end) {
+    this.normalCharacterSequence(start, end) or
+    this.escapedCharacter(start, end) or
+    this.specialCharacter(start, end, _)
+  }
+
   /** Whether the text in the range `start,end` is a group */
   predicate group(int start, int end) {
     this.groupContents(start, end, _, _)
@@ -639,7 +678,7 @@ class RegExp extends AST::RegExpLiteral {
   string getBackRefName(int start, int end) { this.namedBackreference(start, end, result) }
 
   private predicate baseItem(int start, int end) {
-    this.character(start, end) and
+    this.characterItem(start, end) and
     not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
     or
     this.group(start, end)
@@ -746,7 +785,7 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   private predicate itemStart(int start) {
-    this.character(start, _) or
+    this.characterItem(start, _) or
     this.isGroupStart(start) or
     this.charSet(start, _) or
     this.backreference(start, _) or
@@ -754,7 +793,7 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   private predicate itemEnd(int end) {
-    this.character(_, end)
+    this.characterItem(_, end)
     or
     exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
     or
@@ -865,7 +904,7 @@ class RegExp extends AST::RegExpLiteral {
    */
   predicate firstItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or
@@ -880,7 +919,7 @@ class RegExp extends AST::RegExpLiteral {
    */
   predicate lastItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
index 9c8e39e56cea..7c2df79abef4 100644
--- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
@@ -228,7 +228,12 @@ newtype TRegExpParent =
   TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or
   TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
   TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
-  TRegExpNormalChar(RegExp re, int start, int end) { re.normalCharacter(start, end) } or
+  TRegExpNormalChar(RegExp re, int start, int end) {
+    re.normalCharacterSequence(start, end)
+    or
+    re.escapedCharacter(start, end) and
+    not re.specialCharacter(start, end, _)
+  } or
   TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or
   TRegExpNamedCharacterProperty(RegExp re, int start, int end) {
     re.namedCharacterProperty(start, end, _)
diff --git a/ruby/ql/test/library-tests/ast/Ast.expected b/ruby/ql/test/library-tests/ast/Ast.expected
index a3e090324ba4..422af8141d5a 100644
--- a/ruby/ql/test/library-tests/ast/Ast.expected
+++ b/ruby/ql/test/library-tests/ast/Ast.expected
@@ -973,10 +973,8 @@ control/cases.rb:
 #   92|         getParsed: [RegExpSequence] .*abc[0-9]
 #   92|           0: [RegExpStar] .*
 #   92|             0: [RegExpDot] .
-#   92|           1: [RegExpConstant, RegExpNormalChar] a
-#   92|           2: [RegExpConstant, RegExpNormalChar] b
-#   92|           3: [RegExpConstant, RegExpNormalChar] c
-#   92|           4: [RegExpCharacterClass] [0-9]
+#   92|           1: [RegExpConstant, RegExpNormalChar] abc
+#   92|           2: [RegExpCharacterClass] [0-9]
 #   92|             0: [RegExpCharacterRange] 0-9
 #   92|               0: [RegExpConstant, RegExpNormalChar] 0
 #   92|               1: [RegExpConstant, RegExpNormalChar] 9
@@ -1823,47 +1821,25 @@ literals/literals.rb:
 #  133|         getAnOperand/getArgument/getRightOperand: [IntegerLiteral] 4
 #  136|   getStmt: [RegExpLiteral] //
 #  137|   getStmt: [RegExpLiteral] /foo/
-#  137|     getParsed: [RegExpSequence] foo
-#  137|       0: [RegExpConstant, RegExpNormalChar] f
-#  137|       1: [RegExpConstant, RegExpNormalChar] o
-#  137|       2: [RegExpConstant, RegExpNormalChar] o
+#  137|     getParsed: [RegExpConstant, RegExpNormalChar] foo
 #  137|     getComponent: [RegExpTextComponent] foo
 #  138|   getStmt: [RegExpLiteral] /foo/
-#  138|     getParsed: [RegExpSequence] foo
-#  138|       0: [RegExpConstant, RegExpNormalChar] f
-#  138|       1: [RegExpConstant, RegExpNormalChar] o
-#  138|       2: [RegExpConstant, RegExpNormalChar] o
+#  138|     getParsed: [RegExpConstant, RegExpNormalChar] foo
 #  138|     getComponent: [RegExpTextComponent] foo
 #  139|   getStmt: [RegExpLiteral] /foo+\sbar\S/
 #  139|     getParsed: [RegExpSequence] foo+\sbar\S
-#  139|       0: [RegExpConstant, RegExpNormalChar] f
-#  139|       1: [RegExpConstant, RegExpNormalChar] o
-#  139|       2: [RegExpPlus] o+
+#  139|       0: [RegExpConstant, RegExpNormalChar] fo
+#  139|       1: [RegExpPlus] o+
 #  139|         0: [RegExpConstant, RegExpNormalChar] o
-#  139|       3: [RegExpCharacterClassEscape] \s
-#  139|       4: [RegExpConstant, RegExpNormalChar] b
-#  139|       5: [RegExpConstant, RegExpNormalChar] a
-#  139|       6: [RegExpConstant, RegExpNormalChar] r
-#  139|       7: [RegExpCharacterClassEscape] \S
+#  139|       2: [RegExpCharacterClassEscape] \s
+#  139|       3: [RegExpConstant, RegExpNormalChar] bar
+#  139|       4: [RegExpCharacterClassEscape] \S
 #  139|     getComponent: [RegExpTextComponent] foo+
 #  139|     getComponent: [RegExpEscapeSequenceComponent] \s
 #  139|     getComponent: [RegExpTextComponent] bar
 #  139|     getComponent: [RegExpEscapeSequenceComponent] \S
 #  140|   getStmt: [RegExpLiteral] /foo#{...}bar#{...}#{...}/
-#  140|     getParsed: [RegExpSequence] foo2barbarbar
-#  140|       0: [RegExpConstant, RegExpNormalChar] f
-#  140|       1: [RegExpConstant, RegExpNormalChar] o
-#  140|       2: [RegExpConstant, RegExpNormalChar] o
-#  140|       3: [RegExpConstant, RegExpNormalChar] 2
-#  140|       4: [RegExpConstant, RegExpNormalChar] b
-#  140|       5: [RegExpConstant, RegExpNormalChar] a
-#  140|       6: [RegExpConstant, RegExpNormalChar] r
-#  140|       7: [RegExpConstant, RegExpNormalChar] b
-#  140|       8: [RegExpConstant, RegExpNormalChar] a
-#  140|       9: [RegExpConstant, RegExpNormalChar] r
-#  140|       10: [RegExpConstant, RegExpNormalChar] b
-#  140|       11: [RegExpConstant, RegExpNormalChar] a
-#  140|       12: [RegExpConstant, RegExpNormalChar] r
+#  140|     getParsed: [RegExpConstant, RegExpNormalChar] foo2barbarbar
 #  140|     getComponent: [RegExpTextComponent] foo
 #  140|     getComponent: [RegExpInterpolationComponent] #{...}
 #  140|       getStmt: [AddExpr] ... + ...
@@ -1878,47 +1854,25 @@ literals/literals.rb:
 #  141|     getComponent: [RegExpTextComponent] foo
 #  142|   getStmt: [RegExpLiteral] //
 #  143|   getStmt: [RegExpLiteral] /foo/
-#  143|     getParsed: [RegExpSequence] foo
-#  143|       0: [RegExpConstant, RegExpNormalChar] f
-#  143|       1: [RegExpConstant, RegExpNormalChar] o
-#  143|       2: [RegExpConstant, RegExpNormalChar] o
+#  143|     getParsed: [RegExpConstant, RegExpNormalChar] foo
 #  143|     getComponent: [RegExpTextComponent] foo
 #  144|   getStmt: [RegExpLiteral] /foo/
-#  144|     getParsed: [RegExpSequence] foo
-#  144|       0: [RegExpConstant, RegExpNormalChar] f
-#  144|       1: [RegExpConstant, RegExpNormalChar] o
-#  144|       2: [RegExpConstant, RegExpNormalChar] o
+#  144|     getParsed: [RegExpConstant, RegExpNormalChar] foo
 #  144|     getComponent: [RegExpTextComponent] foo
 #  145|   getStmt: [RegExpLiteral] /foo+\sbar\S/
 #  145|     getParsed: [RegExpSequence] foo+\sbar\S
-#  145|       0: [RegExpConstant, RegExpNormalChar] f
-#  145|       1: [RegExpConstant, RegExpNormalChar] o
-#  145|       2: [RegExpPlus] o+
+#  145|       0: [RegExpConstant, RegExpNormalChar] fo
+#  145|       1: [RegExpPlus] o+
 #  145|         0: [RegExpConstant, RegExpNormalChar] o
-#  145|       3: [RegExpCharacterClassEscape] \s
-#  145|       4: [RegExpConstant, RegExpNormalChar] b
-#  145|       5: [RegExpConstant, RegExpNormalChar] a
-#  145|       6: [RegExpConstant, RegExpNormalChar] r
-#  145|       7: [RegExpCharacterClassEscape] \S
+#  145|       2: [RegExpCharacterClassEscape] \s
+#  145|       3: [RegExpConstant, RegExpNormalChar] bar
+#  145|       4: [RegExpCharacterClassEscape] \S
 #  145|     getComponent: [RegExpTextComponent] foo+
 #  145|     getComponent: [RegExpEscapeSequenceComponent] \s
 #  145|     getComponent: [RegExpTextComponent] bar
 #  145|     getComponent: [RegExpEscapeSequenceComponent] \S
 #  146|   getStmt: [RegExpLiteral] /foo#{...}bar#{...}#{...}/
-#  146|     getParsed: [RegExpSequence] foo2barbarbar
-#  146|       0: [RegExpConstant, RegExpNormalChar] f
-#  146|       1: [RegExpConstant, RegExpNormalChar] o
-#  146|       2: [RegExpConstant, RegExpNormalChar] o
-#  146|       3: [RegExpConstant, RegExpNormalChar] 2
-#  146|       4: [RegExpConstant, RegExpNormalChar] b
-#  146|       5: [RegExpConstant, RegExpNormalChar] a
-#  146|       6: [RegExpConstant, RegExpNormalChar] r
-#  146|       7: [RegExpConstant, RegExpNormalChar] b
-#  146|       8: [RegExpConstant, RegExpNormalChar] a
-#  146|       9: [RegExpConstant, RegExpNormalChar] r
-#  146|       10: [RegExpConstant, RegExpNormalChar] b
-#  146|       11: [RegExpConstant, RegExpNormalChar] a
-#  146|       12: [RegExpConstant, RegExpNormalChar] r
+#  146|     getParsed: [RegExpConstant, RegExpNormalChar] foo2barbarbar
 #  146|     getComponent: [RegExpTextComponent] foo
 #  146|     getComponent: [RegExpInterpolationComponent] #{...}
 #  146|       getStmt: [AddExpr] ... + ...
@@ -2469,10 +2423,8 @@ operations/operations.rb:
 #   65|     getAnOperand/getLeftOperand/getReceiver: [LocalVariableAccess] name
 #   65|     getAnOperand/getArgument/getRightOperand: [RegExpLiteral] /foo.*/
 #   65|       getParsed: [RegExpSequence] foo.*
-#   65|         0: [RegExpConstant, RegExpNormalChar] f
-#   65|         1: [RegExpConstant, RegExpNormalChar] o
-#   65|         2: [RegExpConstant, RegExpNormalChar] o
-#   65|         3: [RegExpStar] .*
+#   65|         0: [RegExpConstant, RegExpNormalChar] foo
+#   65|         1: [RegExpStar] .*
 #   65|           0: [RegExpDot] .
 #   65|       getComponent: [RegExpTextComponent] foo.*
 #   66|   getStmt: [NoRegExpMatchExpr] ... !~ ...
@@ -2481,9 +2433,7 @@ operations/operations.rb:
 #   66|       getParsed: [RegExpSequence] .*bar
 #   66|         0: [RegExpStar] .*
 #   66|           0: [RegExpDot] .
-#   66|         1: [RegExpConstant, RegExpNormalChar] b
-#   66|         2: [RegExpConstant, RegExpNormalChar] a
-#   66|         3: [RegExpConstant, RegExpNormalChar] r
+#   66|         1: [RegExpConstant, RegExpNormalChar] bar
 #   66|       getComponent: [RegExpTextComponent] .*bar
 #   69|   getStmt: [AssignAddExpr] ... += ...
 #   69|     getAnOperand/getLeftOperand: [LocalVariableAccess] x
diff --git a/ruby/ql/test/library-tests/regexp/parse.expected b/ruby/ql/test/library-tests/regexp/parse.expected
index c42b90d1ab81..3241ce25388f 100644
--- a/ruby/ql/test/library-tests/regexp/parse.expected
+++ b/ruby/ql/test/library-tests/regexp/parse.expected
@@ -1,14 +1,5 @@
 regexp.rb:
-#    5| [RegExpConstant, RegExpNormalChar] a
-
-#    5| [RegExpSequence] abc
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] b
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] c
-
-#    5| [RegExpConstant, RegExpNormalChar] b
-
-#    5| [RegExpConstant, RegExpNormalChar] c
+#    5| [RegExpConstant, RegExpNormalChar] abc
 
 #    8| [RegExpConstant, RegExpNormalChar] a
 
@@ -38,70 +29,36 @@ regexp.rb:
 #    9| [RegExpRange] a{4,8}
 #-----| 0 -> [RegExpConstant, RegExpNormalChar] a
 
-#    9| [RegExpNormalChar] 4
-
-#    9| [RegExpNormalChar] ,
-
-#    9| [RegExpNormalChar] 8
-
-#    9| [RegExpNormalChar] }
+#    9| [RegExpNormalChar] 4,8}
 
 #   10| [RegExpConstant, RegExpNormalChar] a
 
 #   10| [RegExpRange] a{,8}
 #-----| 0 -> [RegExpConstant, RegExpNormalChar] a
 
-#   10| [RegExpNormalChar] ,
-
-#   10| [RegExpNormalChar] 8
-
-#   10| [RegExpNormalChar] }
+#   10| [RegExpNormalChar] ,8}
 
 #   11| [RegExpConstant, RegExpNormalChar] a
 
 #   11| [InfiniteRepetitionQuantifier, RegExpRange] a{3,}
 #-----| 0 -> [RegExpConstant, RegExpNormalChar] a
 
-#   11| [RegExpNormalChar] 3
-
-#   11| [RegExpNormalChar] ,
-
-#   11| [RegExpNormalChar] }
+#   11| [RegExpNormalChar] 3,}
 
 #   12| [RegExpConstant, RegExpNormalChar] a
 
 #   12| [RegExpRange] a{7}
 #-----| 0 -> [RegExpConstant, RegExpNormalChar] a
 
-#   12| [RegExpNormalChar] 7
+#   12| [RegExpNormalChar] 7}
 
-#   12| [RegExpNormalChar] }
-
-#   15| [RegExpConstant, RegExpNormalChar] f
-
-#   15| [RegExpSequence] foo
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] f
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] o
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] o
+#   15| [RegExpConstant, RegExpNormalChar] foo
 
 #   15| [RegExpAlt] foo|bar
-#-----| 0 -> [RegExpSequence] foo
-#-----| 1 -> [RegExpSequence] bar
-
-#   15| [RegExpConstant, RegExpNormalChar] o
+#-----| 0 -> [RegExpConstant, RegExpNormalChar] foo
+#-----| 1 -> [RegExpConstant, RegExpNormalChar] bar
 
-#   15| [RegExpConstant, RegExpNormalChar] o
-
-#   15| [RegExpConstant, RegExpNormalChar] b
-
-#   15| [RegExpSequence] bar
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] b
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] r
-
-#   15| [RegExpConstant, RegExpNormalChar] a
-
-#   15| [RegExpConstant, RegExpNormalChar] r
+#   15| [RegExpConstant, RegExpNormalChar] bar
 
 #   18| [RegExpCharacterClass] [abc]
 #-----| 0 -> [RegExpConstant, RegExpNormalChar] a
@@ -229,10 +186,7 @@ regexp.rb:
 
 #   29| [RegExpSequence] [[a-f]A-F]
 #-----| 0 -> [RegExpCharacterClass] [[a-f]
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] A
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] -
-#-----| 3 -> [RegExpConstant, RegExpNormalChar] F
-#-----| 4 -> [RegExpConstant, RegExpNormalChar] ]
+#-----| 1 -> [RegExpConstant, RegExpNormalChar] A-F]
 
 #   29| [RegExpConstant, RegExpNormalChar] [
 
@@ -244,13 +198,7 @@ regexp.rb:
 
 #   29| [RegExpConstant, RegExpNormalChar] f
 
-#   29| [RegExpConstant, RegExpNormalChar] A
-
-#   29| [RegExpConstant, RegExpNormalChar] -
-
-#   29| [RegExpConstant, RegExpNormalChar] F
-
-#   29| [RegExpConstant, RegExpNormalChar] ]
+#   29| [RegExpConstant, RegExpNormalChar] A-F]
 
 #   32| [RegExpDot] .
 
@@ -312,69 +260,41 @@ regexp.rb:
 
 #   41| [RegExpSequence] \Gabc
 #-----| 0 -> [RegExpSpecialChar] \G
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] b
-#-----| 3 -> [RegExpConstant, RegExpNormalChar] c
-
-#   41| [RegExpConstant, RegExpNormalChar] a
+#-----| 1 -> [RegExpConstant, RegExpNormalChar] abc
 
-#   41| [RegExpConstant, RegExpNormalChar] b
-
-#   41| [RegExpConstant, RegExpNormalChar] c
+#   41| [RegExpConstant, RegExpNormalChar] abc
 
 #   42| [RegExpSpecialChar] \b
 
 #   42| [RegExpSequence] \b!a\B
 #-----| 0 -> [RegExpSpecialChar] \b
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] !
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 3 -> [RegExpSpecialChar] \B
-
-#   42| [RegExpConstant, RegExpNormalChar] !
+#-----| 1 -> [RegExpConstant, RegExpNormalChar] !a
+#-----| 2 -> [RegExpSpecialChar] \B
 
-#   42| [RegExpConstant, RegExpNormalChar] a
+#   42| [RegExpConstant, RegExpNormalChar] !a
 
 #   42| [RegExpSpecialChar] \B
 
 #   45| [RegExpGroup] (foo)
-#-----| 0 -> [RegExpSequence] foo
+#-----| 0 -> [RegExpConstant, RegExpNormalChar] foo
 
 #   45| [RegExpStar] (foo)*
 #-----| 0 -> [RegExpGroup] (foo)
 
 #   45| [RegExpSequence] (foo)*bar
 #-----| 0 -> [RegExpStar] (foo)*
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] b
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 3 -> [RegExpConstant, RegExpNormalChar] r
-
-#   45| [RegExpConstant, RegExpNormalChar] f
-
-#   45| [RegExpSequence] foo
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] f
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] o
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] o
-
-#   45| [RegExpConstant, RegExpNormalChar] o
+#-----| 1 -> [RegExpConstant, RegExpNormalChar] bar
 
-#   45| [RegExpConstant, RegExpNormalChar] o
+#   45| [RegExpConstant, RegExpNormalChar] foo
 
-#   45| [RegExpConstant, RegExpNormalChar] b
+#   45| [RegExpConstant, RegExpNormalChar] bar
 
-#   45| [RegExpConstant, RegExpNormalChar] a
-
-#   45| [RegExpConstant, RegExpNormalChar] r
-
-#   46| [RegExpConstant, RegExpNormalChar] f
+#   46| [RegExpConstant, RegExpNormalChar] fo
 
 #   46| [RegExpSequence] fo(o|b)ar
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] f
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] o
-#-----| 2 -> [RegExpGroup] (o|b)
-#-----| 3 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 4 -> [RegExpConstant, RegExpNormalChar] r
-
-#   46| [RegExpConstant, RegExpNormalChar] o
+#-----| 0 -> [RegExpConstant, RegExpNormalChar] fo
+#-----| 1 -> [RegExpGroup] (o|b)
+#-----| 2 -> [RegExpConstant, RegExpNormalChar] ar
 
 #   46| [RegExpGroup] (o|b)
 #-----| 0 -> [RegExpAlt] o|b
@@ -387,9 +307,7 @@ regexp.rb:
 
 #   46| [RegExpConstant, RegExpNormalChar] b
 
-#   46| [RegExpConstant, RegExpNormalChar] a
-
-#   46| [RegExpConstant, RegExpNormalChar] r
+#   46| [RegExpConstant, RegExpNormalChar] ar
 
 #   47| [RegExpGroup] (a|b|cd)
 #-----| 0 -> [RegExpAlt] a|b|cd
@@ -403,17 +321,11 @@ regexp.rb:
 #   47| [RegExpAlt] a|b|cd
 #-----| 0 -> [RegExpConstant, RegExpNormalChar] a
 #-----| 1 -> [RegExpConstant, RegExpNormalChar] b
-#-----| 2 -> [RegExpSequence] cd
+#-----| 2 -> [RegExpConstant, RegExpNormalChar] cd
 
 #   47| [RegExpConstant, RegExpNormalChar] b
 
-#   47| [RegExpConstant, RegExpNormalChar] c
-
-#   47| [RegExpSequence] cd
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] c
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] d
-
-#   47| [RegExpConstant, RegExpNormalChar] d
+#   47| [RegExpConstant, RegExpNormalChar] cd
 
 #   47| [RegExpConstant, RegExpNormalChar] e
 
@@ -511,13 +423,7 @@ regexp.rb:
 #   61| [RegExpRange] \p{^Alnum}{2,3}
 #-----| 0 -> [RegExpNamedCharacterProperty] \p{^Alnum}
 
-#   61| [RegExpNormalChar] 2
-
-#   61| [RegExpNormalChar] ,
-
-#   61| [RegExpNormalChar] 3
-
-#   61| [RegExpNormalChar] }
+#   61| [RegExpNormalChar] 2,3}
 
 #   62| [RegExpCharacterClass] [a-f\p{Digit}]
 #-----| 0 -> [RegExpCharacterRange] a-f
@@ -583,13 +489,4 @@ regexp.rb:
 
 #   74| [RegExpNamedCharacterProperty] [:digit:]
 
-#   78| [RegExpConstant, RegExpNormalChar] a
-
-#   78| [RegExpSequence] abc
-#-----| 0 -> [RegExpConstant, RegExpNormalChar] a
-#-----| 1 -> [RegExpConstant, RegExpNormalChar] b
-#-----| 2 -> [RegExpConstant, RegExpNormalChar] c
-
-#   78| [RegExpConstant, RegExpNormalChar] b
-
-#   78| [RegExpConstant, RegExpNormalChar] c
+#   78| [RegExpConstant, RegExpNormalChar] abc
diff --git a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
index 213f0e11189a..572ac08887d3 100644
--- a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
+++ b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
@@ -54,7 +54,7 @@
 | tst.rb:218:11:218:15 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
 | tst.rb:221:16:221:16 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
 | tst.rb:227:16:227:16 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
-| tst.rb:239:13:239:13 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
+| tst.rb:239:12:239:13 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
 | tst.rb:245:11:245:17 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
 | tst.rb:254:11:254:13 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
 | tst.rb:254:23:254:25 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |

From 9d9abaf1f902f812a9adf73e0873b564f1c246cb Mon Sep 17 00:00:00 2001
From: Arthur Baars <aibaars@github.com>
Date: Fri, 25 Feb 2022 12:18:37 +0100
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: yoff <lerchedahl@gmail.com>
---
 python/ql/lib/semmle/python/RegexTreeView.qll |  2 +-
 python/ql/lib/semmle/python/regex.qll         | 28 +++++++++++--------
 .../ruby/security/performance/ParseRegExp.qll | 28 +++++++++++--------
 .../security/performance/RegExpTreeView.qll   |  2 +-
 4 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll
index 95d983f5e88f..428ad3c3e63c 100644
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -40,7 +40,7 @@ newtype TRegExpParent =
   TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
   /** A normal character */
   TRegExpNormalChar(Regex re, int start, int end) {
-    re.normalCharacterSequence(start, end)
+    re.simpleCharacterSequence(start, end)
     or
     re.escapedCharacter(start, end) and
     not re.specialCharacter(start, end, _)
diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll
index 910169d20836..24e47af3d8f8 100644
--- a/python/ql/lib/semmle/python/regex.qll
+++ b/python/ql/lib/semmle/python/regex.qll
@@ -447,18 +447,22 @@ abstract class RegexString extends Expr {
   }
 
   /**
-   * A sequence of 'normal' characters.
+   * A sequence of 'simple' characters.
    */
-  predicate normalCharacterSequence(int start, int end) {
-    this.normalCharacter(start, end) and
-    end = start + 1 and
-    exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
+  predicate simpleCharacterSequence(int start, int end) {
+    // a simple character inside a character set is interpreted on its own
+    this.simpleCharacter(start, end) and
+    this.inCharSet(start)
     or
+    // a maximal run of simple characters is considered as one constant
     exists(int s, int e |
-      e = max(int i | normalCharacterSub(s, i)) and
-      not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
+      e = max(int i | simpleCharacterRun(s, i)) and
+      not this.inCharSet(s)
     |
-      if qualifier(e, _, _, _)
+      // 'abc' can be considered one constant, but
+      // 'abc+' has to be broken up into 'ab' and 'c+',
+      // as the qualifier only applies to 'c'.
+      if this.qualifier(e, _, _, _)
       then
         end = e and start = e - 1
         or
@@ -470,17 +474,17 @@ abstract class RegexString extends Expr {
     )
   }
 
-  private predicate normalCharacterSub(int start, int end) {
+  private predicate simpleCharacterRun(int start, int end) {
     (
-      normalCharacterSub(start, end - 1)
+      simpleCharacterRun(start, end - 1)
       or
       start = end - 1 and not normalCharacter(start - 1, start)
     ) and
-    this.normalCharacter(end - 1, end)
+    this.simpleCharacter(end - 1, end)
   }
 
   private predicate characterItem(int start, int end) {
-    this.normalCharacterSequence(start, end) or
+    this.simpleCharacterSequence(start, end) or
     this.escapedCharacter(start, end) or
     this.specialCharacter(start, end, _)
   }
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
index 11dc890b89a7..ea364d4eb318 100644
--- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
@@ -402,18 +402,22 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   /**
-   * A sequence of 'normal' characters.
+   * A sequence of 'simple' characters.
    */
-  predicate normalCharacterSequence(int start, int end) {
-    this.normalCharacter(start, end) and
-    end = start + 1 and
-    exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
+  predicate simpleCharacterSequence(int start, int end) {
+    // a simple character inside a character set is interpreted on its own
+    this.simpleCharacter(start, end) and
+    this.inCharSet(start)
     or
+    // a maximal run of simple characters is considered as one constant
     exists(int s, int e |
-      e = max(int i | normalCharacterSub(s, i)) and
-      not exists(int x, int y | this.charSet(x, y) and x <= s and y >= e)
+      e = max(int i | simpleCharacterRun(s, i)) and
+      not this.inCharSet(s)
     |
-      if qualifier(e, _, _, _)
+      // 'abc' can be considered one constant, but
+      // 'abc+' has to be broken up into 'ab' and 'c+',
+      // as the qualifier only applies to 'c'.
+      if this.qualifier(e, _, _, _)
       then
         end = e and start = e - 1
         or
@@ -425,17 +429,17 @@ class RegExp extends AST::RegExpLiteral {
     )
   }
 
-  private predicate normalCharacterSub(int start, int end) {
+  private predicate simpleCharacterRun(int start, int end) {
     (
-      normalCharacterSub(start, end - 1)
+      simpleCharacterRun(start, end - 1)
       or
       start = end - 1 and not normalCharacter(start - 1, start)
     ) and
-    this.normalCharacter(end - 1, end)
+    this.simpleCharacter(end - 1, end)
   }
 
   private predicate characterItem(int start, int end) {
-    this.normalCharacterSequence(start, end) or
+    this.simpleCharacterSequence(start, end) or
     this.escapedCharacter(start, end) or
     this.specialCharacter(start, end, _)
   }
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
index 7c2df79abef4..1ae338e45b60 100644
--- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
@@ -229,7 +229,7 @@ newtype TRegExpParent =
   TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
   TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
   TRegExpNormalChar(RegExp re, int start, int end) {
-    re.normalCharacterSequence(start, end)
+    re.simpleCharacterSequence(start, end)
     or
     re.escapedCharacter(start, end) and
     not re.specialCharacter(start, end, _)

From 5044f8910516d548f1316816963d04ba98632d60 Mon Sep 17 00:00:00 2001
From: Arthur Baars <aibaars@github.com>
Date: Fri, 25 Feb 2022 12:57:59 +0100
Subject: [PATCH 3/4] Ruby/Python re-introduce normalCharacterSequence

---
 python/ql/lib/semmle/python/RegexTreeView.qll |  2 +-
 python/ql/lib/semmle/python/regex.qll         | 23 ++++++++++---------
 python/ql/test/library-tests/regex/Regex.ql   |  4 ++++
 .../ruby/security/performance/ParseRegExp.qll | 23 ++++++++++---------
 .../security/performance/RegExpTreeView.qll   |  2 +-
 5 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll
index 428ad3c3e63c..95d983f5e88f 100644
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -40,7 +40,7 @@ newtype TRegExpParent =
   TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
   /** A normal character */
   TRegExpNormalChar(Regex re, int start, int end) {
-    re.simpleCharacterSequence(start, end)
+    re.normalCharacterSequence(start, end)
     or
     re.escapedCharacter(start, end) and
     not re.specialCharacter(start, end, _)
diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll
index 24e47af3d8f8..fb56fa0ab118 100644
--- a/python/ql/lib/semmle/python/regex.qll
+++ b/python/ql/lib/semmle/python/regex.qll
@@ -427,6 +427,7 @@ abstract class RegexString extends Expr {
   }
 
   predicate normalCharacter(int start, int end) {
+    end = start + 1 and
     this.character(start, end) and
     not this.specialCharacter(start, end, _)
   }
@@ -447,16 +448,16 @@ abstract class RegexString extends Expr {
   }
 
   /**
-   * A sequence of 'simple' characters.
+   * Holds if the range [start:end) consists of only 'normal' characters.
    */
-  predicate simpleCharacterSequence(int start, int end) {
-    // a simple character inside a character set is interpreted on its own
-    this.simpleCharacter(start, end) and
+  predicate normalCharacterSequence(int start, int end) {
+    // a normal character inside a character set is interpreted on its own
+    this.normalCharacter(start, end) and
     this.inCharSet(start)
     or
-    // a maximal run of simple characters is considered as one constant
+    // a maximal run of normal characters is considered as one constant
     exists(int s, int e |
-      e = max(int i | simpleCharacterRun(s, i)) and
+      e = max(int i | this.normalCharacterRun(s, i)) and
       not this.inCharSet(s)
     |
       // 'abc' can be considered one constant, but
@@ -474,17 +475,17 @@ abstract class RegexString extends Expr {
     )
   }
 
-  private predicate simpleCharacterRun(int start, int end) {
+  private predicate normalCharacterRun(int start, int end) {
     (
-      simpleCharacterRun(start, end - 1)
+      this.normalCharacterRun(start, end - 1)
       or
-      start = end - 1 and not normalCharacter(start - 1, start)
+      start = end - 1 and not this.normalCharacter(start - 1, start)
     ) and
-    this.simpleCharacter(end - 1, end)
+    this.normalCharacter(end - 1, end)
   }
 
   private predicate characterItem(int start, int end) {
-    this.simpleCharacterSequence(start, end) or
+    this.normalCharacterSequence(start, end) or
     this.escapedCharacter(start, end) or
     this.specialCharacter(start, end, _)
   }
diff --git a/python/ql/test/library-tests/regex/Regex.ql b/python/ql/test/library-tests/regex/Regex.ql
index eb0628bfcde7..4c799ac25741 100644
--- a/python/ql/test/library-tests/regex/Regex.ql
+++ b/python/ql/test/library-tests/regex/Regex.ql
@@ -6,6 +6,10 @@ predicate part(Regex r, int start, int end, string kind) {
   or
   r.normalCharacter(start, end) and kind = "char"
   or
+  r.escapedCharacter(start, end) and
+  kind = "char" and
+  not r.specialCharacter(start, end, _)
+  or
   r.specialCharacter(start, end, kind)
   or
   r.sequence(start, end) and kind = "sequence"
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
index ea364d4eb318..397381e6a7fe 100644
--- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
@@ -382,6 +382,7 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   predicate normalCharacter(int start, int end) {
+    end = start + 1 and
     this.character(start, end) and
     not this.specialCharacter(start, end, _)
   }
@@ -402,16 +403,16 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   /**
-   * A sequence of 'simple' characters.
+   * Holds if the range [start:end) consists of only 'normal' characters.
    */
-  predicate simpleCharacterSequence(int start, int end) {
-    // a simple character inside a character set is interpreted on its own
-    this.simpleCharacter(start, end) and
+  predicate normalCharacterSequence(int start, int end) {
+    // a normal character inside a character set is interpreted on its own
+    this.normalCharacter(start, end) and
     this.inCharSet(start)
     or
-    // a maximal run of simple characters is considered as one constant
+    // a maximal run of normal characters is considered as one constant
     exists(int s, int e |
-      e = max(int i | simpleCharacterRun(s, i)) and
+      e = max(int i | this.normalCharacterRun(s, i)) and
       not this.inCharSet(s)
     |
       // 'abc' can be considered one constant, but
@@ -429,17 +430,17 @@ class RegExp extends AST::RegExpLiteral {
     )
   }
 
-  private predicate simpleCharacterRun(int start, int end) {
+  private predicate normalCharacterRun(int start, int end) {
     (
-      simpleCharacterRun(start, end - 1)
+      this.normalCharacterRun(start, end - 1)
       or
-      start = end - 1 and not normalCharacter(start - 1, start)
+      start = end - 1 and not this.normalCharacter(start - 1, start)
     ) and
-    this.simpleCharacter(end - 1, end)
+    this.normalCharacter(end - 1, end)
   }
 
   private predicate characterItem(int start, int end) {
-    this.simpleCharacterSequence(start, end) or
+    this.normalCharacterSequence(start, end) or
     this.escapedCharacter(start, end) or
     this.specialCharacter(start, end, _)
   }
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
index 1ae338e45b60..7c2df79abef4 100644
--- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
@@ -229,7 +229,7 @@ newtype TRegExpParent =
   TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
   TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
   TRegExpNormalChar(RegExp re, int start, int end) {
-    re.simpleCharacterSequence(start, end)
+    re.normalCharacterSequence(start, end)
     or
     re.escapedCharacter(start, end) and
     not re.specialCharacter(start, end, _)

From 0c23f5815fbe221222592f5670037462ed9d901c Mon Sep 17 00:00:00 2001
From: Arthur Baars <aibaars@github.com>
Date: Fri, 25 Feb 2022 17:36:34 +0100
Subject: [PATCH 4/4] Add change note

---
 .../ql/lib/change-notes/2022-02-25-regex-group-characters.md  | 4 ++++
 ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md | 4 ++++
 2 files changed, 8 insertions(+)
 create mode 100644 python/ql/lib/change-notes/2022-02-25-regex-group-characters.md
 create mode 100644 ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md

diff --git a/python/ql/lib/change-notes/2022-02-25-regex-group-characters.md b/python/ql/lib/change-notes/2022-02-25-regex-group-characters.md
new file mode 100644
index 000000000000..615fe0023133
--- /dev/null
+++ b/python/ql/lib/change-notes/2022-02-25-regex-group-characters.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.
diff --git a/ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md b/ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md
new file mode 100644
index 000000000000..615fe0023133
--- /dev/null
+++ b/ruby/ql/lib/change-notes/2022-02-25-regex-group-characters.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.

<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>
<html xmlns='http://www.w3.org/1999/xhtml'>
<head>
<title>pFad - Phonifier reborn</title>
<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />
</head>
<body>
<h1>pFad - The Proxy pFad of &#169; 2024 Garber Painting. All rights reserved.</h1>


<!-- Disclaimer -->
<p>Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.</p>
<br>
<p>Alternative Proxies:</p><p><a href="http://rainy.clevelandohioweatherforecast.com/php-proxy/index.php?q=https://patch-diff.githubusercontent.com/raw/github/codeql/pull/8166.patch" target="_blank">Alternative Proxy</a></p><p><a href="http://rainy.clevelandohioweatherforecast.com/pFad/index.php?u=https://patch-diff.githubusercontent.com/raw/github/codeql/pull/8166.patch" target="_blank">pFad Proxy</a></p><p><a href="http://rainy.clevelandohioweatherforecast.com/pFad/v3index.php?u=https://patch-diff.githubusercontent.com/raw/github/codeql/pull/8166.patch" target="_blank">pFad v3 Proxy</a></p><p><a href="http://rainy.clevelandohioweatherforecast.com/pFad/v4index.php?u=https://patch-diff.githubusercontent.com/raw/github/codeql/pull/8166.patch" target="_blank">pFad v4 Proxy</a></p></body>
</html>