Skip to content

Commit 0d32342

Browse files
committed
Teach the regular expression functions to do case-insensitive matching and
locale-dependent character classification properly when the database encoding is UTF8. The previous coding worked okay in single-byte encodings, or in any case for ASCII characters, but failed entirely on multibyte characters. The fix assumes that the <wctype.h> functions use Unicode code points as the wchar_t representation for Unicode, i.e., wchar_t matches pg_wchar. This is only a partial solution, since we're still stupid about non-ASCII characters in multibyte encodings other than UTF8. The practical effect of that is limited, however, since those cases are generally Far Eastern glyphs for which concepts like case-folding don't apply anyway. Certainly all or nearly all of the field reports of problems have been about UTF8. A more general solution would require switching to the platform's wchar_t representation for all regex operations, which is possible but would have substantial disadvantages. Let's try this and see if it's sufficient in practice.
1 parent ef51395 commit 0d32342

File tree

2 files changed

+117
-14
lines changed

2 files changed

+117
-14
lines changed

src/backend/regex/regc_locale.c

Lines changed: 105 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
* permission to use and distribute the software in accordance with the
4848
* terms specified in this license.
4949
*
50-
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.9 2008/02/14 17:33:37 tgl Exp $
50+
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.10 2009/12/01 21:00:24 tgl Exp $
5151
*/
5252

5353
/* ASCII character-name table */
@@ -349,75 +349,167 @@ static const struct cname
349349
}
350350
};
351351

352+
352353
/*
353-
* some ctype functions with non-ascii-char guard
354+
* ctype functions adapted to work on pg_wchar (a/k/a chr)
355+
*
356+
* When working in UTF8 encoding, we use the <wctype.h> functions if
357+
* available. This assumes that every platform uses Unicode codepoints
358+
* directly as the wchar_t representation of Unicode. On some platforms
359+
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
360+
*
361+
* In all other encodings, we use the <ctype.h> functions for pg_wchar
362+
* values up to 255, and punt for values above that. This is only 100%
363+
* correct in single-byte encodings such as LATINn. However, non-Unicode
364+
* multibyte encodings are mostly Far Eastern character sets for which the
365+
* properties being tested here aren't relevant for higher code values anyway.
366+
*
367+
* NB: the coding here assumes pg_wchar is an unsigned type.
354368
*/
369+
355370
static int
356371
pg_wc_isdigit(pg_wchar c)
357372
{
358-
return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
373+
#ifdef USE_WIDE_UPPER_LOWER
374+
if (GetDatabaseEncoding() == PG_UTF8)
375+
{
376+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
377+
return iswdigit((wint_t) c);
378+
}
379+
#endif
380+
return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
359381
}
360382

361383
static int
362384
pg_wc_isalpha(pg_wchar c)
363385
{
364-
return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
386+
#ifdef USE_WIDE_UPPER_LOWER
387+
if (GetDatabaseEncoding() == PG_UTF8)
388+
{
389+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
390+
return iswalpha((wint_t) c);
391+
}
392+
#endif
393+
return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
365394
}
366395

367396
static int
368397
pg_wc_isalnum(pg_wchar c)
369398
{
370-
return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
399+
#ifdef USE_WIDE_UPPER_LOWER
400+
if (GetDatabaseEncoding() == PG_UTF8)
401+
{
402+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
403+
return iswalnum((wint_t) c);
404+
}
405+
#endif
406+
return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
371407
}
372408

373409
static int
374410
pg_wc_isupper(pg_wchar c)
375411
{
376-
return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
412+
#ifdef USE_WIDE_UPPER_LOWER
413+
if (GetDatabaseEncoding() == PG_UTF8)
414+
{
415+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
416+
return iswupper((wint_t) c);
417+
}
418+
#endif
419+
return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
377420
}
378421

379422
static int
380423
pg_wc_islower(pg_wchar c)
381424
{
382-
return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
425+
#ifdef USE_WIDE_UPPER_LOWER
426+
if (GetDatabaseEncoding() == PG_UTF8)
427+
{
428+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
429+
return iswlower((wint_t) c);
430+
}
431+
#endif
432+
return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
383433
}
384434

385435
static int
386436
pg_wc_isgraph(pg_wchar c)
387437
{
388-
return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
438+
#ifdef USE_WIDE_UPPER_LOWER
439+
if (GetDatabaseEncoding() == PG_UTF8)
440+
{
441+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
442+
return iswgraph((wint_t) c);
443+
}
444+
#endif
445+
return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
389446
}
390447

391448
static int
392449
pg_wc_isprint(pg_wchar c)
393450
{
394-
return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
451+
#ifdef USE_WIDE_UPPER_LOWER
452+
if (GetDatabaseEncoding() == PG_UTF8)
453+
{
454+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
455+
return iswprint((wint_t) c);
456+
}
457+
#endif
458+
return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
395459
}
396460

397461
static int
398462
pg_wc_ispunct(pg_wchar c)
399463
{
400-
return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
464+
#ifdef USE_WIDE_UPPER_LOWER
465+
if (GetDatabaseEncoding() == PG_UTF8)
466+
{
467+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
468+
return iswpunct((wint_t) c);
469+
}
470+
#endif
471+
return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
401472
}
402473

403474
static int
404475
pg_wc_isspace(pg_wchar c)
405476
{
406-
return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
477+
#ifdef USE_WIDE_UPPER_LOWER
478+
if (GetDatabaseEncoding() == PG_UTF8)
479+
{
480+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
481+
return iswspace((wint_t) c);
482+
}
483+
#endif
484+
return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
407485
}
408486

409487
static pg_wchar
410488
pg_wc_toupper(pg_wchar c)
411489
{
412-
if (c >= 0 && c <= UCHAR_MAX)
490+
#ifdef USE_WIDE_UPPER_LOWER
491+
if (GetDatabaseEncoding() == PG_UTF8)
492+
{
493+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
494+
return towupper((wint_t) c);
495+
}
496+
#endif
497+
if (c <= (pg_wchar) UCHAR_MAX)
413498
return toupper((unsigned char) c);
414499
return c;
415500
}
416501

417502
static pg_wchar
418503
pg_wc_tolower(pg_wchar c)
419504
{
420-
if (c >= 0 && c <= UCHAR_MAX)
505+
#ifdef USE_WIDE_UPPER_LOWER
506+
if (GetDatabaseEncoding() == PG_UTF8)
507+
{
508+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
509+
return towlower((wint_t) c);
510+
}
511+
#endif
512+
if (c <= (pg_wchar) UCHAR_MAX)
421513
return tolower((unsigned char) c);
422514
return c;
423515
}

src/include/regex/regcustom.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
2626
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
*
28-
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.7 2008/02/14 17:33:37 tgl Exp $
28+
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.8 2009/12/01 21:00:24 tgl Exp $
2929
*/
3030

3131
/* headers if any */
@@ -34,6 +34,17 @@
3434
#include <ctype.h>
3535
#include <limits.h>
3636

37+
/*
38+
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
39+
* declare them in <wchar.h>.
40+
*/
41+
#ifdef HAVE_WCHAR_H
42+
#include <wchar.h>
43+
#endif
44+
#ifdef HAVE_WCTYPE_H
45+
#include <wctype.h>
46+
#endif
47+
3748
#include "mb/pg_wchar.h"
3849

3950

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy