diff --git a/.gitignore b/.gitignore index 8a9a6c9..1167d7c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.so results +/log/ diff --git a/Makefile b/Makefile index 9ddd3fc..bbc36cb 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,9 @@ DATA = pg_tsparser--1.0.sql PGFILEDESC = "pg_tsparser - parser for text search" REGRESS = pg_tsparser +# We need a UTF8 database +ENCODING = UTF8 +NO_LOCALE = 1 ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/expected/pg_tsparser.out b/expected/pg_tsparser.out index 537d2df..23dbb8c 100644 --- a/expected/pg_tsparser.out +++ b/expected/pg_tsparser.out @@ -236,3 +236,39 @@ SELECT to_tsvector('english_ts', 'test2.com'); 'com':3 'test2':2 'test2.com':1 (1 row) +-- Test non-ASCII symbols +-- must have a UTF8 database +SELECT getdatabaseencoding(); + getdatabaseencoding +--------------------- + UTF8 +(1 row) + +SET client_encoding TO 'UTF8'; +SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); + tokid | token +-------+-------- + 17 | аб_вгд + 10 | аб + 12 | _ + 10 | вгд + 12 | + 15 | 12_абв + 9 | 12 + 12 | _ + 10 | абв + 12 | + 15 | 12-абв + 9 | 12 + 12 | - + 10 | абв + 12 | + 2 | абв + 12 | . + 2 | рф + 12 | + 3 | абв2 + 12 | . + 2 | рф +(22 rows) + diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..9dc5c8f --- /dev/null +++ b/meson.build @@ -0,0 +1,37 @@ +# Copyright (c) 2025, Postgres Professional + +# Does not support the PGXS infrastructure at this time. Please, compile as part +# of the contrib source tree. + +pg_tsparser_sources = files( + 'tsparser.c' +) + +if host_system == 'windows' + pg_tsparser_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tsparser', + '--FILEDESC', 'pg_tsparser - modifies the default text parsing strategy.',]) +endif + +pg_tsparser = shared_module('pg_tsparser', + pg_tsparser_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_tsparser + +install_data( + 'pg_tsparser.control', + 'pg_tsparser--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tsparser', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_tsparser', + ], + }, +} diff --git a/sql/pg_tsparser.sql b/sql/pg_tsparser.sql index 7cd9b1b..6f27d8f 100644 --- a/sql/pg_tsparser.sql +++ b/sql/pg_tsparser.sql @@ -26,3 +26,11 @@ SELECT to_tsvector('english_ts', '12_abc'); SELECT to_tsvector('english_ts', '12-abc'); SELECT to_tsvector('english_ts', 'test.com'); SELECT to_tsvector('english_ts', 'test2.com'); + +-- Test non-ASCII symbols + +-- must have a UTF8 database +SELECT getdatabaseencoding(); +SET client_encoding TO 'UTF8'; + +SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); diff --git a/tsparser.c b/tsparser.c index 3cdafc5..e821dce 100644 --- a/tsparser.c +++ b/tsparser.c @@ -249,11 +249,9 @@ typedef struct TParser /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ -#ifdef USE_WIDE_UPPER_LOWER wchar_t *wstr; /* wide character string */ pg_wchar *pgwstr; /* wide character string for C-locale */ bool usewide; -#endif /* State of parse */ int charmaxlen; @@ -271,6 +269,10 @@ typedef struct TParser int type; } TParser; +#if PG_VERSION_NUM < 120000 +#define pg_strtoint32(value) pg_atoi((value), sizeof(int32), 0) +#endif + /* forward decls here */ static bool TParserGet(TParser *prs); @@ -302,18 +304,19 @@ TParserInit(char *str, int len) prs->str = str; prs->lenstr = len; -#ifdef USE_WIDE_UPPER_LOWER - /* * Use wide char code only when max encoding length > 1. */ if (prs->charmaxlen > 1) { - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ pg_locale_t mylocale = 0; /* TODO */ prs->usewide = true; - if (lc_ctype_is_c(collation)) +#if PG_VERSION_NUM >= 150000 || (defined(PGPRO_STD) && PG_VERSION_NUM >= 120000) + if (database_ctype_is_c) +#else + if (lc_ctype_is_c(DEFAULT_COLLATION_OID)) +#endif { /* * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could @@ -331,7 +334,6 @@ TParserInit(char *str, int len) } else prs->usewide = false; -#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -368,15 +370,12 @@ TParserCopyInit(const TParser *orig) prs->charmaxlen = orig->charmaxlen; prs->str = orig->str + orig->state->posbyte; prs->lenstr = orig->lenstr - orig->state->posbyte; - -#ifdef USE_WIDE_UPPER_LOWER prs->usewide = orig->usewide; if (orig->pgwstr) prs->pgwstr = orig->pgwstr + orig->state->poschar; if (orig->wstr) prs->wstr = orig->wstr + orig->state->poschar; -#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -401,12 +400,10 @@ TParserClose(TParser *prs) prs->state = ptr; } -#ifdef USE_WIDE_UPPER_LOWER if (prs->wstr) pfree(prs->wstr); if (prs->pgwstr) pfree(prs->pgwstr); -#endif #ifdef WPARSER_TRACE fprintf(stderr, "closing parser\n"); @@ -445,96 +442,45 @@ TParserCopyClose(TParser *prs) * - if locale is C then we use pgwstr instead of wstr. */ -#ifdef USE_WIDE_UPPER_LOWER - -#define p_iswhat(type) \ +#define p_iswhat(type, nonascii) \ + \ static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - if ( prs->usewide ) \ +p_is##type(TParser *prs) \ +{ \ + Assert(prs->state); \ + if (prs->usewide) \ { \ - if ( prs->pgwstr ) \ + if (prs->pgwstr) \ { \ unsigned int c = *(prs->pgwstr + prs->state->poschar); \ - if ( c > 0x7f ) \ - return 0; \ - return is##type( c ); \ + if (c > 0x7f) \ + return nonascii; \ + return is##type(c); \ } \ - return isw##type( *( prs->wstr + prs->state->poschar ) ); \ + return isw##type(*(prs->wstr + prs->state->poschar)); \ } \ - \ - return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ -} \ + return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ +} \ \ static int \ -p_isnot##type(TParser *prs) { \ +p_isnot##type(TParser *prs) \ +{ \ return !p_is##type(prs); \ } -static int -p_isalnum(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalnum(c); - } - - return iswalnum(*(prs->wstr + prs->state->poschar)); - } - - return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte)); -} -static int -p_isnotalnum(TParser *prs) -{ - return !p_isalnum(prs); -} - -static int -p_isalpha(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalpha(c); - } - - return iswalpha(*(prs->wstr + prs->state->poschar)); - } - - return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte)); -} - -static int -p_isnotalpha(TParser *prs) -{ - return !p_isalpha(prs); -} +/* + * In C locale with a multibyte encoding, any non-ASCII symbol is considered + * an alpha character, but not a member of other char classes. + */ +p_iswhat(alnum, 1) +p_iswhat(alpha, 1) +p_iswhat(digit, 0) +p_iswhat(lower, 0) +p_iswhat(print, 0) +p_iswhat(punct, 0) +p_iswhat(space, 0) +p_iswhat(upper, 0) +p_iswhat(xdigit, 0) /* p_iseq should be used only for ascii symbols */ @@ -544,39 +490,6 @@ p_iseq(TParser *prs, char c) Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } -#else /* USE_WIDE_UPPER_LOWER */ - -#define p_iswhat(type) \ -static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ -} \ - \ -static int \ -p_isnot##type(TParser *prs) { \ - return !p_is##type(prs); \ -} - - -static int -p_iseq(TParser *prs, char c) -{ - Assert(prs->state); - return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; -} - -p_iswhat(alnum) -p_iswhat(alpha) -#endif /* USE_WIDE_UPPER_LOWER */ - -p_iswhat(digit) -p_iswhat(lower) -p_iswhat(print) -p_iswhat(punct) -p_iswhat(space) -p_iswhat(upper) -p_iswhat(xdigit) static int p_isEOF(TParser *prs) @@ -793,8 +706,6 @@ p_isspecial(TParser *prs) if (pg_dsplen(prs->str + prs->state->posbyte) == 0) return 1; -#ifdef USE_WIDE_UPPER_LOWER - /* * Unicode Characters in the 'Mark, Spacing Combining' Category That * characters are not alpha although they are not breakers of word too. @@ -1058,7 +969,6 @@ p_isspecial(TParser *prs) StopHigh = StopMiddle; } } -#endif return 0; } @@ -2070,7 +1980,11 @@ typedef struct #undef USE_PHRASE_SEARCH #endif +#if PG_VERSION_NUM >= 130000 +static TSTernaryValue +#else static bool +#endif #ifdef USE_PHRASE_SEARCH checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) #else @@ -2087,7 +2001,11 @@ checkcondition_HL(void *opaque, QueryOperand *val) { /* don't need to find all positions */ if (!data) +#if PG_VERSION_NUM >= 130000 + return TS_YES; +#else return true; +#endif if (!data->pos) { @@ -2102,17 +2020,29 @@ checkcondition_HL(void *opaque, QueryOperand *val) data->pos[data->npos++] = checkval->words[i].pos; } } +#else +#if PG_VERSION_NUM >= 130000 + return TS_YES; #else return true; +#endif #endif } #ifdef USE_PHRASE_SEARCH if (data && data->npos > 0) +#if PG_VERSION_NUM >= 130000 + return TS_YES; +#else return true; #endif +#endif +#if PG_VERSION_NUM >= 130000 + return TS_NO; +#else return false; +#endif } @@ -2610,13 +2540,13 @@ tsparser_headline(PG_FUNCTION_ARGS) char *val = defGetString(defel); if (pg_strcasecmp(defel->defname, "MaxWords") == 0) - max_words = pg_atoi(val, sizeof(int32), 0); + max_words = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "MinWords") == 0) - min_words = pg_atoi(val, sizeof(int32), 0); + min_words = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) - shortword = pg_atoi(val, sizeof(int32), 0); + shortword = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0) - max_fragments = pg_atoi(val, sizeof(int32), 0); + max_fragments = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "StartSel") == 0) prs->startsel = pstrdup(val); else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: