From a13fdc4f84a6bda90a5629acab101793c75c5458 Mon Sep 17 00:00:00 2001 From: Arthur Zakirov Date: Fri, 22 Feb 2019 14:33:03 +0300 Subject: [PATCH 1/8] Mop-up for commit 85feb77aa09cda9ff3e12cf95c757c499dc25343. Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having <wctype.h> functions, since we no longer consider that. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.) --- expected/pg_tsparser.out | 28 +++++++ sql/pg_tsparser.sql | 3 + tsparser.c | 153 +++++++-------------------------------- 3 files changed, 59 insertions(+), 125 deletions(-) diff --git a/expected/pg_tsparser.out b/expected/pg_tsparser.out index 537d2df..0fb681f 100644 --- a/expected/pg_tsparser.out +++ b/expected/pg_tsparser.out @@ -236,3 +236,31 @@ SELECT to_tsvector('english_ts', 'test2.com'); 'com':3 'test2':2 'test2.com':1 (1 row) +-- Test non-ASCII symbols +SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); + tokid | token +-------+-------- + 17 | аб_вгд + 10 | аб + 12 | _ + 10 | вгд + 12 | + 15 | 12_абв + 9 | 12 + 12 | _ + 10 | абв + 12 | + 15 | 12-абв + 9 | 12 + 12 | - + 10 | абв + 12 | + 2 | абв + 12 | . + 2 | рф + 12 | + 3 | абв2 + 12 | . 
+ 2 | рф +(22 rows) + diff --git a/sql/pg_tsparser.sql b/sql/pg_tsparser.sql index 7cd9b1b..98f42eb 100644 --- a/sql/pg_tsparser.sql +++ b/sql/pg_tsparser.sql @@ -26,3 +26,6 @@ SELECT to_tsvector('english_ts', '12_abc'); SELECT to_tsvector('english_ts', '12-abc'); SELECT to_tsvector('english_ts', 'test.com'); SELECT to_tsvector('english_ts', 'test2.com'); + +-- Test non-ASCII symbols +SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); diff --git a/tsparser.c b/tsparser.c index 3cdafc5..53ff6ac 100644 --- a/tsparser.c +++ b/tsparser.c @@ -249,11 +249,9 @@ typedef struct TParser /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ -#ifdef USE_WIDE_UPPER_LOWER wchar_t *wstr; /* wide character string */ pg_wchar *pgwstr; /* wide character string for C-locale */ bool usewide; -#endif /* State of parse */ int charmaxlen; @@ -302,8 +300,6 @@ TParserInit(char *str, int len) prs->str = str; prs->lenstr = len; -#ifdef USE_WIDE_UPPER_LOWER - /* * Use wide char code only when max encoding length > 1. 
*/ @@ -331,7 +327,6 @@ TParserInit(char *str, int len) } else prs->usewide = false; -#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -368,15 +363,12 @@ TParserCopyInit(const TParser *orig) prs->charmaxlen = orig->charmaxlen; prs->str = orig->str + orig->state->posbyte; prs->lenstr = orig->lenstr - orig->state->posbyte; - -#ifdef USE_WIDE_UPPER_LOWER prs->usewide = orig->usewide; if (orig->pgwstr) prs->pgwstr = orig->pgwstr + orig->state->poschar; if (orig->wstr) prs->wstr = orig->wstr + orig->state->poschar; -#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -401,12 +393,10 @@ TParserClose(TParser *prs) prs->state = ptr; } -#ifdef USE_WIDE_UPPER_LOWER if (prs->wstr) pfree(prs->wstr); if (prs->pgwstr) pfree(prs->pgwstr); -#endif #ifdef WPARSER_TRACE fprintf(stderr, "closing parser\n"); @@ -445,96 +435,45 @@ TParserCopyClose(TParser *prs) * - if locale is C then we use pgwstr instead of wstr. */ -#ifdef USE_WIDE_UPPER_LOWER - -#define p_iswhat(type) \ +#define p_iswhat(type, nonascii) \ + \ static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - if ( prs->usewide ) \ +p_is##type(TParser *prs) \ +{ \ + Assert(prs->state); \ + if (prs->usewide) \ { \ - if ( prs->pgwstr ) \ + if (prs->pgwstr) \ { \ unsigned int c = *(prs->pgwstr + prs->state->poschar); \ - if ( c > 0x7f ) \ - return 0; \ - return is##type( c ); \ + if (c > 0x7f) \ + return nonascii; \ + return is##type(c); \ } \ - return isw##type( *( prs->wstr + prs->state->poschar ) ); \ + return isw##type(*(prs->wstr + prs->state->poschar)); \ } \ - \ - return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ -} \ + return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ +} \ \ static int \ -p_isnot##type(TParser *prs) { \ +p_isnot##type(TParser *prs) \ +{ \ return !p_is##type(prs); \ } -static int -p_isalnum(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int 
c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalnum(c); - } - - return iswalnum(*(prs->wstr + prs->state->poschar)); - } - - return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte)); -} -static int -p_isnotalnum(TParser *prs) -{ - return !p_isalnum(prs); -} - -static int -p_isalpha(TParser *prs) -{ - Assert(prs->state); - - if (prs->usewide) - { - if (prs->pgwstr) - { - unsigned int c = *(prs->pgwstr + prs->state->poschar); - - /* - * any non-ascii symbol with multibyte encoding with C-locale is - * an alpha character - */ - if (c > 0x7f) - return 1; - - return isalpha(c); - } - - return iswalpha(*(prs->wstr + prs->state->poschar)); - } - - return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte)); -} - -static int -p_isnotalpha(TParser *prs) -{ - return !p_isalpha(prs); -} +/* + * In C locale with a multibyte encoding, any non-ASCII symbol is considered + * an alpha character, but not a member of other char classes. + */ +p_iswhat(alnum, 1) +p_iswhat(alpha, 1) +p_iswhat(digit, 0) +p_iswhat(lower, 0) +p_iswhat(print, 0) +p_iswhat(punct, 0) +p_iswhat(space, 0) +p_iswhat(upper, 0) +p_iswhat(xdigit, 0) /* p_iseq should be used only for ascii symbols */ @@ -544,39 +483,6 @@ p_iseq(TParser *prs, char c) Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } -#else /* USE_WIDE_UPPER_LOWER */ - -#define p_iswhat(type) \ -static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ -} \ - \ -static int \ -p_isnot##type(TParser *prs) { \ - return !p_is##type(prs); \ -} - - -static int -p_iseq(TParser *prs, char c) -{ - Assert(prs->state); - return (*(prs->str + prs->state->posbyte) == c) ? 
1 : 0; -} - -p_iswhat(alnum) -p_iswhat(alpha) -#endif /* USE_WIDE_UPPER_LOWER */ - -p_iswhat(digit) -p_iswhat(lower) -p_iswhat(print) -p_iswhat(punct) -p_iswhat(space) -p_iswhat(upper) -p_iswhat(xdigit) static int p_isEOF(TParser *prs) @@ -793,8 +699,6 @@ p_isspecial(TParser *prs) if (pg_dsplen(prs->str + prs->state->posbyte) == 0) return 1; -#ifdef USE_WIDE_UPPER_LOWER - /* * Unicode Characters in the 'Mark, Spacing Combining' Category That * characters are not alpha although they are not breakers of word too. @@ -1058,7 +962,6 @@ p_isspecial(TParser *prs) StopHigh = StopMiddle; } } -#endif return 0; } From 6081a71c9404e8d2d2af5c1c12b92b11c83d1701 Mon Sep 17 00:00:00 2001 From: Arthur Zakirov Date: Mon, 25 Feb 2019 16:06:44 +0300 Subject: [PATCH 2/8] Use UTF8 database for tests --- Makefile | 3 +++ expected/pg_tsparser.out | 8 ++++++++ sql/pg_tsparser.sql | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/Makefile b/Makefile index 9ddd3fc..bbc36cb 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,9 @@ DATA = pg_tsparser--1.0.sql PGFILEDESC = "pg_tsparser - parser for text search" REGRESS = pg_tsparser +# We need a UTF8 database +ENCODING = UTF8 +NO_LOCALE = 1 ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/expected/pg_tsparser.out b/expected/pg_tsparser.out index 0fb681f..23dbb8c 100644 --- a/expected/pg_tsparser.out +++ b/expected/pg_tsparser.out @@ -237,6 +237,14 @@ SELECT to_tsvector('english_ts', 'test2.com'); (1 row) -- Test non-ASCII symbols +-- must have a UTF8 database +SELECT getdatabaseencoding(); + getdatabaseencoding +--------------------- + UTF8 +(1 row) + +SET client_encoding TO 'UTF8'; SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); tokid | token -------+-------- diff --git a/sql/pg_tsparser.sql b/sql/pg_tsparser.sql index 98f42eb..6f27d8f 100644 --- a/sql/pg_tsparser.sql +++ b/sql/pg_tsparser.sql @@ -28,4 +28,9 @@ SELECT to_tsvector('english_ts', 'test.com'); SELECT to_tsvector('english_ts', 'test2.com'); -- Test 
non-ASCII symbols + +-- must have a UTF8 database +SELECT getdatabaseencoding(); +SET client_encoding TO 'UTF8'; + SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф'); From edf322e4f0b968c15cbc6bac05484eb4add19cf5 Mon Sep 17 00:00:00 2001 From: Daria Lepikhova Date: Thu, 8 Oct 2020 13:00:49 +0500 Subject: [PATCH 3/8] Support 13 psql. Added returning type TSTernaryValue for checkcondition_HL() function. --- tsparser.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tsparser.c b/tsparser.c index 53ff6ac..78bf357 100644 --- a/tsparser.c +++ b/tsparser.c @@ -1973,7 +1973,11 @@ typedef struct #undef USE_PHRASE_SEARCH #endif +#if PG_VERSION_NUM >= 130000 +static TSTernaryValue +#else static bool +#endif #ifdef USE_PHRASE_SEARCH checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) #else @@ -1990,7 +1994,11 @@ checkcondition_HL(void *opaque, QueryOperand *val) { /* don't need to find all positions */ if (!data) +#if PG_VERSION_NUM >= 130000 + return TS_YES; +#else return true; +#endif if (!data->pos) { @@ -2005,17 +2013,29 @@ checkcondition_HL(void *opaque, QueryOperand *val) data->pos[data->npos++] = checkval->words[i].pos; } } +#else +#if PG_VERSION_NUM >= 130000 + return TS_YES; #else return true; +#endif #endif } #ifdef USE_PHRASE_SEARCH if (data && data->npos > 0) +#if PG_VERSION_NUM >= 130000 + return TS_YES; +#else return true; #endif +#endif +#if PG_VERSION_NUM >= 130000 + return TS_NO; +#else return false; +#endif } From ff61b616273e53c738c01ace6045e640f9a41cab Mon Sep 17 00:00:00 2001 From: Roman Zharkov Date: Thu, 20 May 2021 10:50:40 +0600 Subject: [PATCH 4/8] [refer #PGPRO-4978] Update the .gitignore file. 
tags: pg_tsparser --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8a9a6c9..1167d7c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.o *.so results +/log/ From ef4120fa145a0bc67eb77a833a1bfc9cf6519c8e Mon Sep 17 00:00:00 2001 From: Marina Polyakova Date: Wed, 29 Jun 2022 18:47:20 +0300 Subject: [PATCH 5/8] PGPRO-6866: do not use the function pg_atoi if possible In PostgreSQL version 12 or higher it's more efficient to use the function pg_strtoint32 instead (see the commit 86eaf208ea048936df6be77276a246d3f92e9620). And in PostgreSQL 15 the function pg_atoi was removed altogether (see the commit 73508475d69e90f98ebd9b7e1a5933a26a49c5e9). Therefore if possible use the function pg_strtoint32 instead. --- tsparser.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tsparser.c b/tsparser.c index 78bf357..0546e53 100644 --- a/tsparser.c +++ b/tsparser.c @@ -269,6 +269,10 @@ typedef struct TParser int type; } TParser; +#if PG_VERSION_NUM < 120000 +#define pg_strtoint32(value) pg_atoi((value), sizeof(int32), 0) +#endif + /* forward decls here */ static bool TParserGet(TParser *prs); @@ -2533,13 +2537,13 @@ tsparser_headline(PG_FUNCTION_ARGS) char *val = defGetString(defel); if (pg_strcasecmp(defel->defname, "MaxWords") == 0) - max_words = pg_atoi(val, sizeof(int32), 0); + max_words = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "MinWords") == 0) - min_words = pg_atoi(val, sizeof(int32), 0); + min_words = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) - shortword = pg_atoi(val, sizeof(int32), 0); + shortword = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0) - max_fragments = pg_atoi(val, sizeof(int32), 0); + max_fragments = pg_strtoint32(val); else if (pg_strcasecmp(defel->defname, "StartSel") == 0) prs->startsel = pstrdup(val); else if (pg_strcasecmp(defel->defname, "StopSel") == 0) From 
381bc16417009ae279e271c94e8c7e8f258e918f Mon Sep 17 00:00:00 2001 From: Marina Polyakova Date: Tue, 29 Aug 2023 17:12:48 +0300 Subject: [PATCH 6/8] PGPRO-8706: Fix t_isspace(), etc., when datlocprovider=i and datctype=C for 16+ See the commit f413941f41d370a7893caa3e6ed384b89a0577fd (Fix t_isspace(), etc., when datlocprovider=i and datctype=C.) in PostgreSQL 16+. A fix for previous major versions will be added later. --- tsparser.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tsparser.c b/tsparser.c index 0546e53..8d7f8f7 100644 --- a/tsparser.c +++ b/tsparser.c @@ -309,11 +309,14 @@ TParserInit(char *str, int len) */ if (prs->charmaxlen > 1) { - Oid collation = DEFAULT_COLLATION_OID; /* TODO */ pg_locale_t mylocale = 0; /* TODO */ prs->usewide = true; - if (lc_ctype_is_c(collation)) +#if PG_VERSION_NUM >= 160000 + if (database_ctype_is_c) +#else + if (lc_ctype_is_c(DEFAULT_COLLATION_OID)) +#endif { /* * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could From 7f2b5600f5d298e76553ec4c0846c038890f9d1f Mon Sep 17 00:00:00 2001 From: CourteousSleet Date: Wed, 28 Aug 2024 14:44:46 +0300 Subject: [PATCH 7/8] [PGPRO-8706] Backport pg_tsparser fix Tags: icu, pg_tsparser --- tsparser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsparser.c b/tsparser.c index 8d7f8f7..e821dce 100644 --- a/tsparser.c +++ b/tsparser.c @@ -312,7 +312,7 @@ TParserInit(char *str, int len) pg_locale_t mylocale = 0; /* TODO */ prs->usewide = true; -#if PG_VERSION_NUM >= 160000 +#if PG_VERSION_NUM >= 150000 || (defined(PGPRO_STD) && PG_VERSION_NUM >= 120000) if (database_ctype_is_c) #else if (lc_ctype_is_c(DEFAULT_COLLATION_OID)) From f15c01d0151ea7ced8fe16f71f0db1c65d759bcb Mon Sep 17 00:00:00 2001 From: Zharkov Roman Date: Tue, 21 Jan 2025 16:04:49 +0300 Subject: [PATCH 8/8] Add meson.build file to support building from the contrib source tree. 
--- meson.build | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 meson.build diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..9dc5c8f --- /dev/null +++ b/meson.build @@ -0,0 +1,37 @@ +# Copyright (c) 2025, Postgres Professional + +# Does not support the PGXS infrastructure at this time. Please, compile as part +# of the contrib source tree. + +pg_tsparser_sources = files( + 'tsparser.c' +) + +if host_system == 'windows' + pg_tsparser_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_tsparser', + '--FILEDESC', 'pg_tsparser - modifies the default text parsing strategy.',]) +endif + +pg_tsparser = shared_module('pg_tsparser', + pg_tsparser_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_tsparser + +install_data( + 'pg_tsparser.control', + 'pg_tsparser--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'pg_tsparser', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'pg_tsparser', + ], + }, +} pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy