Skip to content

Commit ed87e19

Browse files
committed
Mop-up for commit 85feb77.
Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having <wctype.h> functions, since we no longer consider that. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.)
1 parent 85feb77 commit ed87e19

File tree

2 files changed

+40
-97
lines changed

2 files changed

+40
-97
lines changed

src/backend/regex/regc_pg_locale.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,20 @@
2929
*
3030
* 2. In the "default" collation (which is supposed to obey LC_CTYPE):
3131
*
32-
* 2a. When working in UTF8 encoding, we use the <wctype.h> functions if
33-
* available. This assumes that every platform uses Unicode codepoints
34-
* directly as the wchar_t representation of Unicode. On some platforms
32+
* 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
33+
* This assumes that every platform uses Unicode codepoints directly
34+
* as the wchar_t representation of Unicode. On some platforms
3535
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
3636
*
37-
* 2b. In all other encodings, or on machines that lack <wctype.h>, we use
38-
* the <ctype.h> functions for pg_wchar values up to 255, and punt for values
39-
* above that. This is only 100% correct in single-byte encodings such as
40-
* LATINn. However, non-Unicode multibyte encodings are mostly Far Eastern
41-
* character sets for which the properties being tested here aren't very
42-
* relevant for higher code values anyway. The difficulty with using the
43-
* <wctype.h> functions with non-Unicode multibyte encodings is that we can
44-
* have no certainty that the platform's wchar_t representation matches
45-
* what we do in pg_wchar conversions.
37+
* 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
38+
* values up to 255, and punt for values above that. This is 100% correct
39+
* only in single-byte encodings such as LATINn. However, non-Unicode
40+
* multibyte encodings are mostly Far Eastern character sets for which the
41+
* properties being tested here aren't very relevant for higher code values
42+
* anyway. The difficulty with using the <wctype.h> functions with
43+
* non-Unicode multibyte encodings is that we can have no certainty that
44+
* the platform's wchar_t representation matches what we do in pg_wchar
45+
* conversions.
4646
*
4747
* 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
4848
* Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>

src/backend/tsearch/wparser_def.c

Lines changed: 28 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -427,94 +427,45 @@ TParserCopyClose(TParser *prs)
427427
* - if locale is C then we use pgwstr instead of wstr.
428428
*/
429429

430-
#define p_iswhat(type) \
430+
#define p_iswhat(type, nonascii) \
431+
\
431432
static int \
432-
p_is##type(TParser *prs) { \
433-
Assert( prs->state ); \
434-
if ( prs->usewide ) \
433+
p_is##type(TParser *prs) \
434+
{ \
435+
Assert(prs->state); \
436+
if (prs->usewide) \
435437
{ \
436-
if ( prs->pgwstr ) \
438+
if (prs->pgwstr) \
437439
{ \
438440
unsigned int c = *(prs->pgwstr + prs->state->poschar); \
439-
if ( c > 0x7f ) \
440-
return 0; \
441-
return is##type( c ); \
441+
if (c > 0x7f) \
442+
return nonascii; \
443+
return is##type(c); \
442444
} \
443-
return isw##type( *( prs->wstr + prs->state->poschar ) ); \
445+
return isw##type(*(prs->wstr + prs->state->poschar)); \
444446
} \
445-
\
446-
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
447-
} \
447+
return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
448+
} \
448449
\
449450
static int \
450-
p_isnot##type(TParser *prs) { \
451+
p_isnot##type(TParser *prs) \
452+
{ \
451453
return !p_is##type(prs); \
452454
}
453455

454-
static int
455-
p_isalnum(TParser *prs)
456-
{
457-
Assert(prs->state);
458-
459-
if (prs->usewide)
460-
{
461-
if (prs->pgwstr)
462-
{
463-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
464-
465-
/*
466-
* any non-ascii symbol with multibyte encoding with C-locale is
467-
* an alpha character
468-
*/
469-
if (c > 0x7f)
470-
return 1;
471-
472-
return isalnum(c);
473-
}
474-
475-
return iswalnum(*(prs->wstr + prs->state->poschar));
476-
}
477-
478-
return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
479-
}
480-
static int
481-
p_isnotalnum(TParser *prs)
482-
{
483-
return !p_isalnum(prs);
484-
}
485-
486-
static int
487-
p_isalpha(TParser *prs)
488-
{
489-
Assert(prs->state);
490-
491-
if (prs->usewide)
492-
{
493-
if (prs->pgwstr)
494-
{
495-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
496-
497-
/*
498-
* any non-ascii symbol with multibyte encoding with C-locale is
499-
* an alpha character
500-
*/
501-
if (c > 0x7f)
502-
return 1;
503-
504-
return isalpha(c);
505-
}
506-
507-
return iswalpha(*(prs->wstr + prs->state->poschar));
508-
}
509-
510-
return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
511-
}
512-
513-
static int
514-
p_isnotalpha(TParser *prs)
515-
{
516-
return !p_isalpha(prs);
517-
}
456+
/*
457+
* In C locale with a multibyte encoding, any non-ASCII symbol is considered
458+
* an alpha (and therefore also alnum) character, but not a member of any other char class.
459+
*/
460+
p_iswhat(alnum, 1)
461+
p_iswhat(alpha, 1)
462+
p_iswhat(digit, 0)
463+
p_iswhat(lower, 0)
464+
p_iswhat(print, 0)
465+
p_iswhat(punct, 0)
466+
p_iswhat(space, 0)
467+
p_iswhat(upper, 0)
468+
p_iswhat(xdigit, 0)
518469

519470
/* p_iseq should be used only for ascii symbols */
520471

@@ -525,14 +476,6 @@ p_iseq(TParser *prs, char c)
525476
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
526477
}
527478

528-
p_iswhat(digit)
529-
p_iswhat(lower)
530-
p_iswhat(print)
531-
p_iswhat(punct)
532-
p_iswhat(space)
533-
p_iswhat(upper)
534-
p_iswhat(xdigit)
535-
536479
static int
537480
p_isEOF(TParser *prs)
538481
{

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy