postgrespro
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
Lines changed: 12 additions & 13 deletions
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n
Lines changed: 4 additions & 9 deletions
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
Lines changed: 30 additions & 4 deletions
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
Lines changed: 16 additions & 150 deletions
@@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
     non-ASCII characters to belong to any of these classes.)
     In addition to these standard character
     classes, <productname>PostgreSQL</productname> defines
+    the <literal>word</literal> character class, which is the same as
+    <literal>alnum</literal> plus the underscore (<literal>_</literal>)
+    character, and
     the <literal>ascii</literal> character class, which contains exactly
     the 7-bit ASCII set.
    </para>
@@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
     matching empty strings at the beginning
     and end of a word respectively.  A word is defined as a sequence
     of word characters that is neither preceded nor followed by word
-    characters.  A word character is an <literal>alnum</literal> character (as
-    defined by the <acronym>POSIX</acronym> character class described above)
-    or an underscore.  This is an extension, compatible with but not
+    characters.  A word character is any character belonging to the
+    <literal>word</literal> character class, that is, any letter, digit,
+    or underscore.  This is an extension, compatible with but not
     specified by <acronym>POSIX</acronym> 1003.2, and should be used with
     caution in software intended to be portable to other systems.
     The constraint escapes described below are usually preferable; they
@@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
 
        <row>
        <entry> <literal>\w</literal> </entry>
-       <entry> <literal>[[:alnum:]_]</literal>
-       (note underscore is included) </entry>
+       <entry> <literal>[[:word:]]</literal> </entry>
        </row>
 
        <row>
@@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
 
        <row>
        <entry> <literal>\W</literal> </entry>
-       <entry> <literal>[^[:alnum:]_]</literal>
-       (note underscore is included) </entry>
+       <entry> <literal>[^[:word:]]</literal> </entry>
        </row>
       </tbody>
      </tgroup>
     </table>
 
    <para>
-    Within bracket expressions, <literal>\d</literal>, <literal>\s</literal>,
-    and <literal>\w</literal> lose their outer brackets,
-    and <literal>\D</literal>, <literal>\S</literal>, and <literal>\W</literal> are illegal.
-    (So, for example, <literal>[a-c\d]</literal> is equivalent to
+    The class-shorthand escapes also work within bracket expressions,
+    although the definitions shown above are not quite syntactically
+    valid in that context.
+    For example, <literal>[a-c\d]</literal> is equivalent to
     <literal>[a-c[:digit:]]</literal>.
-    Also, <literal>[a-c\D]</literal>, which is equivalent to
-    <literal>[a-c^[:digit:]]</literal>, is illegal.)
    </para>
 
    <table id="posix-constraint-escapes-table">
 
@@ -519,15 +519,10 @@ character classes:
 (note underscore)
 .RE
 .PP
-Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
-and `\fB\ew\fR'\&
-lose their outer brackets,
-and `\fB\eD\fR', `\fB\eS\fR',
-and `\fB\eW\fR'\&
-are illegal.
-.VS 8.2
-(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
-Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
+The class-shorthand escapes also work within bracket expressions,
+although the definitions shown above are not quite syntactically
+valid in that context.
+For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
 .VE 8.2
 .PP
 A constraint escape (AREs only) is a constraint,
 
@@ -936,7 +936,16 @@ okcolors(struct nfa *nfa,
 		}
 		else if (cd->nschrs == 0 && cd->nuchrs == 0)
 		{
-			/* parent empty, its arcs change color to subcolor */
+			/*
+			 * Parent is now empty, so just change all its arcs to the
+			 * subcolor, then free the parent.
+			 *
+			 * It is not obvious that simply relabeling the arcs like this is
+			 * OK; it appears to risk creating duplicate arcs.  We are
+			 * basically relying on the assumption that processing of a
+			 * bracket expression can't create arcs of both a color and its
+			 * subcolor between the bracket's endpoints.
+			 */
 			cd->sub = NOSUB;
 			scd = &cm->cd[sco];
 			assert(scd->nschrs > 0 || scd->nuchrs > 0);
@@ -1062,17 +1071,34 @@ colorcomplement(struct nfa *nfa,
 	struct colordesc *cd;
 	struct colordesc *end = CDEND(cm);
 	color		co;
+	struct arc *a;
 
 	assert(of != from);
 
 	/* A RAINBOW arc matches all colors, making the complement empty */
 	if (findarc(of, PLAIN, RAINBOW) != NULL)
 		return;
 
+	/* Otherwise, transiently mark the colors that appear in of's out-arcs */
+	for (a = of->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type == PLAIN)
+		{
+			assert(a->co >= 0);
+			cd = &cm->cd[a->co];
+			assert(!UNUSEDCOLOR(cd));
+			cd->flags |= COLMARK;
+		}
+	}
+
+	/* Scan colors, clear transient marks, add arcs for unmarked colors */
 	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
-		if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
-			if (findarc(of, PLAIN, co) == NULL)
-				newarc(nfa, type, co, from, to);
+	{
+		if (cd->flags & COLMARK)
+			cd->flags &= ~COLMARK;
+		else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
+			newarc(nfa, type, co, from, to);
+	}
 }
 
 
 
@@ -193,83 +193,6 @@ prefixes(struct vars *v)
 	}
 }
 
-/*
- * lexnest - "call a subroutine", interpolating string at the lexical level
- *
- * Note, this is not a very general facility.  There are a number of
- * implicit assumptions about what sorts of strings can be subroutines.
- */
-static void
-lexnest(struct vars *v,
-		const chr *beginp,		/* start of interpolation */
-		const chr *endp)		/* one past end of interpolation */
-{
-	assert(v->savenow == NULL); /* only one level of nesting */
-	v->savenow = v->now;
-	v->savestop = v->stop;
-	v->now = beginp;
-	v->stop = endp;
-}
-
-/*
- * string constants to interpolate as expansions of things like \d
- */
-static const chr backd[] = {	/* \d */
-	CHR('['), CHR('['), CHR(':'),
-	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr backD[] = {	/* \D */
-	CHR('['), CHR('^'), CHR('['), CHR(':'),
-	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr brbackd[] = {	/* \d within brackets */
-	CHR('['), CHR(':'),
-	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
-	CHR(':'), CHR(']')
-};
-static const chr backs[] = {	/* \s */
-	CHR('['), CHR('['), CHR(':'),
-	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr backS[] = {	/* \S */
-	CHR('['), CHR('^'), CHR('['), CHR(':'),
-	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
-	CHR(':'), CHR(']'), CHR(']')
-};
-static const chr brbacks[] = {	/* \s within brackets */
-	CHR('['), CHR(':'),
-	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
-	CHR(':'), CHR(']')
-};
-static const chr backw[] = {	/* \w */
-	CHR('['), CHR('['), CHR(':'),
-	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-	CHR(':'), CHR(']'), CHR('_'), CHR(']')
-};
-static const chr backW[] = {	/* \W */
-	CHR('['), CHR('^'), CHR('['), CHR(':'),
-	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-	CHR(':'), CHR(']'), CHR('_'), CHR(']')
-};
-static const chr brbackw[] = {	/* \w within brackets */
-	CHR('['), CHR(':'),
-	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
-	CHR(':'), CHR(']'), CHR('_')
-};
-
-/*
- * lexword - interpolate a bracket expression for word characters
- * Possibly ought to inquire whether there is a "word" character class.
- */
-static void
-lexword(struct vars *v)
-{
-	lexnest(v, backw, ENDOF(backw));
-}
-
 /*
  * next - get next token
  */
@@ -292,14 +215,6 @@ next(struct vars *v)
 		RETV(SBEGIN, 0);		/* same as \A */
 	}
 
-	/* if we're nested and we've hit end, return to outer level */
-	if (v->savenow != NULL && ATEOS())
-	{
-		v->now = v->savenow;
-		v->stop = v->savestop;
-		v->savenow = v->savestop = NULL;
-	}
-
 	/* skip white space etc. if appropriate (not in literal or []) */
 	if (v->cflags & REG_EXPANDED)
 		switch (v->lexcon)
@@ -420,32 +335,15 @@ next(struct vars *v)
 					NOTE(REG_UNONPOSIX);
 					if (ATEOS())
 						FAILW(REG_EESCAPE);
-					(DISCARD) lexescape(v);
+					if (!lexescape(v))
+						return 0;
 					switch (v->nexttype)
 					{			/* not all escapes okay here */
 						case PLAIN:
+						case CCLASSS:
+						case CCLASSC:
 							return 1;
 							break;
-						case CCLASS:
-							switch (v->nextvalue)
-							{
-								case 'd':
-									lexnest(v, brbackd, ENDOF(brbackd));
-									break;
-								case 's':
-									lexnest(v, brbacks, ENDOF(brbacks));
-									break;
-								case 'w':
-									lexnest(v, brbackw, ENDOF(brbackw));
-									break;
-								default:
-									FAILW(REG_EESCAPE);
-									break;
-							}
-							/* lexnest done, back up and try again */
-							v->nexttype = v->lasttype;
-							return next(v);
-							break;
 					}
 					/* not one of the acceptable escapes */
 					FAILW(REG_EESCAPE);
@@ -691,49 +589,17 @@ next(struct vars *v)
 		}
 		RETV(PLAIN, *v->now++);
 	}
-	(DISCARD) lexescape(v);
-	if (ISERR())
-		FAILW(REG_EESCAPE);
-	if (v->nexttype == CCLASS)
-	{							/* fudge at lexical level */
-		switch (v->nextvalue)
-		{
-			case 'd':
-				lexnest(v, backd, ENDOF(backd));
-				break;
-			case 'D':
-				lexnest(v, backD, ENDOF(backD));
-				break;
-			case 's':
-				lexnest(v, backs, ENDOF(backs));
-				break;
-			case 'S':
-				lexnest(v, backS, ENDOF(backS));
-				break;
-			case 'w':
-				lexnest(v, backw, ENDOF(backw));
-				break;
-			case 'W':
-				lexnest(v, backW, ENDOF(backW));
-				break;
-			default:
-				assert(NOTREACHED);
-				FAILW(REG_ASSERT);
-				break;
-		}
-		/* lexnest done, back up and try again */
-		v->nexttype = v->lasttype;
-		return next(v);
-	}
-	/* otherwise, lexescape has already done the work */
-	return !ISERR();
+	return lexescape(v);
 }
 
 /*
  * lexescape - parse an ARE backslash escape (backslash already eaten)
- * Note slightly nonstandard use of the CCLASS type code.
+ *
+ * This is used for ARE backslashes both normally and inside bracket
+ * expressions.  In the latter case, not all escape types are allowed,
+ * but the caller must reject unwanted ones after we return.
  */
-static int						/* not actually used, but convenient for RETV */
+static int
 lexescape(struct vars *v)
 {
 	chr			c;
@@ -775,11 +641,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('d'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'd');
+			RETV(CCLASSS, CC_DIGIT);
 			break;
 		case CHR('D'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'D');
+			RETV(CCLASSC, CC_DIGIT);
 			break;
 		case CHR('e'):
 			NOTE(REG_UUNPORT);
@@ -802,11 +668,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('s'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 's');
+			RETV(CCLASSS, CC_SPACE);
 			break;
 		case CHR('S'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'S');
+			RETV(CCLASSC, CC_SPACE);
 			break;
 		case CHR('t'):
 			RETV(PLAIN, CHR('\t'));
@@ -828,11 +694,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('w'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'w');
+			RETV(CCLASSS, CC_WORD);
 			break;
 		case CHR('W'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'W');
+			RETV(CCLASSC, CC_WORD);
 			break;
 		case CHR('x'):
 			NOTE(REG_UUNPORT);