Skip to content

Commit f10eab7

Browse files
committed
Make array_to_tsvector() sort and de-duplicate the given strings.
This is required for the result to be a legal tsvector value. Noted while fooling with Andreas Seltenreich's ts_delete() crash.

Discussion: <87invhoj6e.fsf@credativ.de>
1 parent c50d192 commit f10eab7

File tree

4 files changed

+52
-8
lines changed

4 files changed

+52
-8
lines changed

doc/src/sgml/func.sgml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9294,7 +9294,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
92949294
<entry><type>tsvector</type></entry>
92959295
<entry>convert array of lexemes to <type>tsvector</type></entry>
92969296
<entry><literal>array_to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
9297-
<entry><literal>'fat' 'cat' 'rat'</literal></entry>
9297+
<entry><literal>'cat' 'fat' 'rat'</literal></entry>
92989298
</row>
92999299
<row>
93009300
<entry>

src/backend/utils/adt/tsvector_op.c

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -416,17 +416,34 @@ tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
416416
return -1;
417417
}
418418

419+
/*
420+
* qsort comparator functions
421+
*/
422+
419423
static int
420-
compareint(const void *va, const void *vb)
424+
compare_int(const void *va, const void *vb)
421425
{
422-
int32 a = *((const int32 *) va);
423-
int32 b = *((const int32 *) vb);
426+
int a = *((const int *) va);
427+
int b = *((const int *) vb);
424428

425429
if (a == b)
426430
return 0;
427431
return (a > b) ? 1 : -1;
428432
}
429433

434+
static int
435+
compare_text_lexemes(const void *va, const void *vb)
436+
{
437+
Datum a = *((const Datum *) va);
438+
Datum b = *((const Datum *) vb);
439+
char *alex = VARDATA_ANY(a);
440+
int alex_len = VARSIZE_ANY_EXHDR(a);
441+
char *blex = VARDATA_ANY(b);
442+
int blex_len = VARSIZE_ANY_EXHDR(b);
443+
444+
return tsCompareString(alex, alex_len, blex, blex_len, false);
445+
}
446+
430447
/*
431448
* Internal routine to delete lexemes from TSVector by array of offsets.
432449
*
@@ -459,7 +476,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
459476
{
460477
int kp;
461478

462-
qsort(indices_to_delete, indices_count, sizeof(int), compareint);
479+
qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
463480
kp = 0;
464481
for (k = 1; k < indices_count; k++)
465482
{
@@ -743,32 +760,50 @@ array_to_tsvector(PG_FUNCTION_ARGS)
743760
bool *nulls;
744761
int nitems,
745762
i,
763+
j,
746764
tslen,
747765
datalen = 0;
748766
char *cur;
749767

750768
deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
751769

770+
/* Reject nulls (maybe we should just ignore them, instead?) */
752771
for (i = 0; i < nitems; i++)
753772
{
754773
if (nulls[i])
755774
ereport(ERROR,
756775
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
757776
errmsg("lexeme array may not contain nulls")));
777+
}
758778

759-
datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
779+
/* Sort and de-dup, because this is required for a valid tsvector. */
780+
if (nitems > 1)
781+
{
782+
qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
783+
j = 0;
784+
for (i = 1; i < nitems; i++)
785+
{
786+
if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
787+
dlexemes[++j] = dlexemes[i];
788+
}
789+
nitems = ++j;
760790
}
761791

792+
/* Calculate space needed for surviving lexemes. */
793+
for (i = 0; i < nitems; i++)
794+
datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
762795
tslen = CALCDATASIZE(nitems, datalen);
796+
797+
/* Allocate and fill tsvector. */
763798
tsout = (TSVector) palloc0(tslen);
764799
SET_VARSIZE(tsout, tslen);
765800
tsout->size = nitems;
801+
766802
arrout = ARRPTR(tsout);
767803
cur = STRPTR(tsout);
768-
769804
for (i = 0; i < nitems; i++)
770805
{
771-
char *lex = VARDATA(dlexemes[i]);
806+
char *lex = VARDATA_ANY(dlexemes[i]);
772807
int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
773808

774809
memcpy(cur, lex, lex_len);

src/test/regress/expected/tstypes.out

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,6 +1165,13 @@ SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
11651165

11661166
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
11671167
ERROR: lexeme array may not contain nulls
1168+
-- array_to_tsvector must sort and de-dup
1169+
SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
1170+
array_to_tsvector
1171+
-------------------
1172+
'bar' 'baz' 'foo'
1173+
(1 row)
1174+
11681175
SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
11691176
setweight
11701177
----------------------------------------------------------

src/test/regress/sql/tstypes.sql

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector);
226226

227227
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
228228
SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
229+
-- array_to_tsvector must sort and de-dup
230+
SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
229231

230232
SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
231233
SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy