Skip to content

Commit a13fdc4

Browse files
committed
Mop-up for commit 85feb77aa09cda9ff3e12cf95c757c499dc25343.
Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having <wctype.h> functions, since we no longer consider that. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.)
1 parent d7c4fa3 commit a13fdc4

File tree

3 files changed

+59
-125
lines changed

3 files changed

+59
-125
lines changed

expected/pg_tsparser.out

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,31 @@ SELECT to_tsvector('english_ts', 'test2.com');
236236
'com':3 'test2':2 'test2.com':1
237237
(1 row)
238238

239+
-- Test non-ASCII symbols
240+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');
241+
tokid | token
242+
-------+--------
243+
17 | аб_вгд
244+
10 | аб
245+
12 | _
246+
10 | вгд
247+
12 |
248+
15 | 12_абв
249+
9 | 12
250+
12 | _
251+
10 | абв
252+
12 |
253+
15 | 12-абв
254+
9 | 12
255+
12 | -
256+
10 | абв
257+
12 |
258+
2 | абв
259+
12 | .
260+
2 | рф
261+
12 |
262+
3 | абв2
263+
12 | .
264+
2 | рф
265+
(22 rows)
266+

sql/pg_tsparser.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ SELECT to_tsvector('english_ts', '12_abc');
2626
SELECT to_tsvector('english_ts', '12-abc');
2727
SELECT to_tsvector('english_ts', 'test.com');
2828
SELECT to_tsvector('english_ts', 'test2.com');
29+
30+
-- Test non-ASCII symbols
31+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');

tsparser.c

Lines changed: 28 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,9 @@ typedef struct TParser
249249
/* string and position information */
250250
char *str; /* multibyte string */
251251
int lenstr; /* length of mbstring */
252-
#ifdef USE_WIDE_UPPER_LOWER
253252
wchar_t *wstr; /* wide character string */
254253
pg_wchar *pgwstr; /* wide character string for C-locale */
255254
bool usewide;
256-
#endif
257255

258256
/* State of parse */
259257
int charmaxlen;
@@ -302,8 +300,6 @@ TParserInit(char *str, int len)
302300
prs->str = str;
303301
prs->lenstr = len;
304302

305-
#ifdef USE_WIDE_UPPER_LOWER
306-
307303
/*
308304
* Use wide char code only when max encoding length > 1.
309305
*/
@@ -331,7 +327,6 @@ TParserInit(char *str, int len)
331327
}
332328
else
333329
prs->usewide = false;
334-
#endif
335330

336331
prs->state = newTParserPosition(NULL);
337332
prs->state->state = TPS_Base;
@@ -368,15 +363,12 @@ TParserCopyInit(const TParser *orig)
368363
prs->charmaxlen = orig->charmaxlen;
369364
prs->str = orig->str + orig->state->posbyte;
370365
prs->lenstr = orig->lenstr - orig->state->posbyte;
371-
372-
#ifdef USE_WIDE_UPPER_LOWER
373366
prs->usewide = orig->usewide;
374367

375368
if (orig->pgwstr)
376369
prs->pgwstr = orig->pgwstr + orig->state->poschar;
377370
if (orig->wstr)
378371
prs->wstr = orig->wstr + orig->state->poschar;
379-
#endif
380372

381373
prs->state = newTParserPosition(NULL);
382374
prs->state->state = TPS_Base;
@@ -401,12 +393,10 @@ TParserClose(TParser *prs)
401393
prs->state = ptr;
402394
}
403395

404-
#ifdef USE_WIDE_UPPER_LOWER
405396
if (prs->wstr)
406397
pfree(prs->wstr);
407398
if (prs->pgwstr)
408399
pfree(prs->pgwstr);
409-
#endif
410400

411401
#ifdef WPARSER_TRACE
412402
fprintf(stderr, "closing parser\n");
@@ -445,96 +435,45 @@ TParserCopyClose(TParser *prs)
445435
* - if locale is C then we use pgwstr instead of wstr.
446436
*/
447437

448-
#ifdef USE_WIDE_UPPER_LOWER
449-
450-
#define p_iswhat(type) \
438+
#define p_iswhat(type, nonascii) \
439+
\
451440
static int \
452-
p_is##type(TParser *prs) { \
453-
Assert( prs->state ); \
454-
if ( prs->usewide ) \
441+
p_is##type(TParser *prs) \
442+
{ \
443+
Assert(prs->state); \
444+
if (prs->usewide) \
455445
{ \
456-
if ( prs->pgwstr ) \
446+
if (prs->pgwstr) \
457447
{ \
458448
unsigned int c = *(prs->pgwstr + prs->state->poschar); \
459-
if ( c > 0x7f ) \
460-
return 0; \
461-
return is##type( c ); \
449+
if (c > 0x7f) \
450+
return nonascii; \
451+
return is##type(c); \
462452
} \
463-
return isw##type( *( prs->wstr + prs->state->poschar ) ); \
453+
return isw##type(*(prs->wstr + prs->state->poschar)); \
464454
} \
465-
\
466-
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
467-
} \
455+
return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
456+
} \
468457
\
469458
static int \
470-
p_isnot##type(TParser *prs) { \
459+
p_isnot##type(TParser *prs) \
460+
{ \
471461
return !p_is##type(prs); \
472462
}
473463

474-
static int
475-
p_isalnum(TParser *prs)
476-
{
477-
Assert(prs->state);
478-
479-
if (prs->usewide)
480-
{
481-
if (prs->pgwstr)
482-
{
483-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
484-
485-
/*
486-
* any non-ascii symbol with multibyte encoding with C-locale is
487-
* an alpha character
488-
*/
489-
if (c > 0x7f)
490-
return 1;
491-
492-
return isalnum(c);
493-
}
494-
495-
return iswalnum(*(prs->wstr + prs->state->poschar));
496-
}
497-
498-
return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
499-
}
500-
static int
501-
p_isnotalnum(TParser *prs)
502-
{
503-
return !p_isalnum(prs);
504-
}
505-
506-
static int
507-
p_isalpha(TParser *prs)
508-
{
509-
Assert(prs->state);
510-
511-
if (prs->usewide)
512-
{
513-
if (prs->pgwstr)
514-
{
515-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
516-
517-
/*
518-
* any non-ascii symbol with multibyte encoding with C-locale is
519-
* an alpha character
520-
*/
521-
if (c > 0x7f)
522-
return 1;
523-
524-
return isalpha(c);
525-
}
526-
527-
return iswalpha(*(prs->wstr + prs->state->poschar));
528-
}
529-
530-
return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
531-
}
532-
533-
static int
534-
p_isnotalpha(TParser *prs)
535-
{
536-
return !p_isalpha(prs);
537-
}
464+
/*
465+
* In C locale with a multibyte encoding, any non-ASCII symbol is considered
466+
* an alpha character, but not a member of other char classes.
467+
*/
468+
p_iswhat(alnum, 1)
469+
p_iswhat(alpha, 1)
470+
p_iswhat(digit, 0)
471+
p_iswhat(lower, 0)
472+
p_iswhat(print, 0)
473+
p_iswhat(punct, 0)
474+
p_iswhat(space, 0)
475+
p_iswhat(upper, 0)
476+
p_iswhat(xdigit, 0)
538477

539478
/* p_iseq should be used only for ascii symbols */
540479

@@ -544,39 +483,6 @@ p_iseq(TParser *prs, char c)
544483
Assert(prs->state);
545484
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
546485
}
547-
#else /* USE_WIDE_UPPER_LOWER */
548-
549-
#define p_iswhat(type) \
550-
static int \
551-
p_is##type(TParser *prs) { \
552-
Assert( prs->state ); \
553-
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
554-
} \
555-
\
556-
static int \
557-
p_isnot##type(TParser *prs) { \
558-
return !p_is##type(prs); \
559-
}
560-
561-
562-
static int
563-
p_iseq(TParser *prs, char c)
564-
{
565-
Assert(prs->state);
566-
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
567-
}
568-
569-
p_iswhat(alnum)
570-
p_iswhat(alpha)
571-
#endif /* USE_WIDE_UPPER_LOWER */
572-
573-
p_iswhat(digit)
574-
p_iswhat(lower)
575-
p_iswhat(print)
576-
p_iswhat(punct)
577-
p_iswhat(space)
578-
p_iswhat(upper)
579-
p_iswhat(xdigit)
580486

581487
static int
582488
p_isEOF(TParser *prs)
@@ -793,8 +699,6 @@ p_isspecial(TParser *prs)
793699
if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
794700
return 1;
795701

796-
#ifdef USE_WIDE_UPPER_LOWER
797-
798702
/*
799703
* Unicode characters in the 'Mark, Spacing Combining' category. These
800704
* characters are not alpha, although they are not word breakers either.
@@ -1058,7 +962,6 @@ p_isspecial(TParser *prs)
1058962
StopHigh = StopMiddle;
1059963
}
1060964
}
1061-
#endif
1062965

1063966
return 0;
1064967
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy