Skip to content

Commit af01c25

Browse files
feodordmpgpro
authored andcommitted
Make exact distance match for FTS phrase operator
Phrase operator now requires exact distance betweens lexems instead of less-or-equal. Per discussion c19fcfec308e6ccd952cdde9e648b505@mail.gmail.com Conflicts: doc/src/sgml/textsearch.sgml src/backend/utils/adt/tsvector_op.c
1 parent f4e9a02 commit af01c25

File tree

4 files changed

+122
-52
lines changed

4 files changed

+122
-52
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,57 @@ text @@ text
344344
The form <type>text</type> <literal>@@</literal> <type>text</type>
345345
is equivalent to <literal>to_tsvector(x) @@ plainto_tsquery(y)</literal>.
346346
</para>
347+
348+
<para>
349+
Within a <type>tsquery</>, the <literal>&amp;</literal> (AND) operator
350+
specifies that both its arguments must appear in the document to have a
351+
match. Similarly, the <literal>|</literal> (OR) operator specifies that
352+
at least one of its arguments must appear, while the <literal>!</> (NOT)
353+
operator specifies that its argument must <emphasis>not</> appear in
354+
order to have a match. Parentheses can be used to control nesting of
355+
these operators.
356+
</para>
357+
358+
<para>
359+
Searching for phrases is possible with the help of
360+
the <literal>&lt;-&gt;</> (FOLLOWED BY) <type>tsquery</> operator, which
361+
matches only if its arguments have matches that are adjacent and in the
362+
given order. For example:
363+
364+
<programlisting>
365+
SELECT to_tsvector('fatal error') @@ to_tsquery('fatal &lt;-&gt; error');
366+
?column?
367+
----------
368+
t
369+
370+
SELECT to_tsvector('error is not fatal') @@ to_tsquery('fatal &lt;-&gt; error');
371+
?column?
372+
----------
373+
f
374+
</programlisting>
375+
376+
There is a more general version of the FOLLOWED BY operator having the
377+
form <literal>&lt;<replaceable>N</>&gt;</literal>,
378+
where <replaceable>N</> is an integer standing for the exact distance
379+
allowed between the matching lexemes. <literal>&lt;1&gt;</literal> is
380+
the same as <literal>&lt;-&gt;</>, while <literal>&lt;2&gt;</literal>
381+
allows one other lexeme to appear between the matches, and so
382+
on. The <literal>phraseto_tsquery</> function makes use of this
383+
operator to construct a <literal>tsquery</> that can match a multi-word
384+
phrase when some of the words are stop words. For example:
385+
386+
<programlisting>
387+
SELECT phraseto_tsquery('cats ate rats');
388+
phraseto_tsquery
389+
-------------------------------
390+
( 'cat' &lt;-&gt; 'ate' ) &lt;-&gt; 'rat'
391+
392+
SELECT phraseto_tsquery('the cats ate the rats');
393+
phraseto_tsquery
394+
-------------------------------
395+
( 'cat' &lt;-&gt; 'ate' ) &lt;2&gt; 'rat'
396+
</programlisting>
397+
</para>
347398
</sect2>
348399

349400
<sect2 id="textsearch-intro-configurations">
@@ -1501,8 +1552,11 @@ SELECT to_tsquery('fat') &lt;-&gt; to_tsquery('cat | rat');
15011552

15021553
<listitem>
15031554
<para>
1504-
Returns the distanced phrase-concatenation of the two given queries.
1505-
This function lies in the implementation of the <literal>&lt;-&gt;</> operator.
1555+
Returns a query that searches for a match to the first given query
1556+
followed by a match to the second given query at a distance of at
1557+
<replaceable>distance</replaceable> lexemes, using
1558+
the <literal>&lt;<replaceable>N</>&gt;</literal>
1559+
<type>tsquery</> operator. For example:
15061560

15071561
<screen>
15081562
SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10);

src/backend/utils/adt/tsvector_op.c

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -788,11 +788,12 @@ TS_phrase_execute(QueryItem *curitem,
788788
}
789789
else
790790
{
791-
ExecPhraseData Ldata = {0, false, NULL},
792-
Rdata = {0, false, NULL};
793-
WordEntryPos *Lpos,
794-
*Rpos,
795-
*pos_iter = NULL;
791+
ExecPhraseData Ldata = {0, false, NULL},
792+
Rdata = {0, false, NULL};
793+
WordEntryPos *Lpos,
794+
*LposStart,
795+
*Rpos,
796+
*pos_iter = NULL;
796797

797798
Assert(curitem->qoperator.oper == OP_PHRASE);
798799

@@ -830,52 +831,60 @@ TS_phrase_execute(QueryItem *curitem,
830831
pos_iter = data->pos;
831832
}
832833

833-
Lpos = Ldata.pos;
834-
Rpos = Rdata.pos;
835-
836834
/*
837835
* Find matches by distance, WEP_GETPOS() is needed because
838836
* ExecPhraseData->data can point to the tsvector's WordEntryPosVector
839837
*/
840838

839+
Rpos = Rdata.pos;
840+
LposStart = Ldata.pos;
841841
while (Rpos < Rdata.pos + Rdata.npos)
842842
{
843+
/*
844+
* We need to check all possible distances, so reset Lpos
845+
* to guranteed not yet satisfied position.
846+
*/
847+
Lpos = LposStart;
843848
while (Lpos < Ldata.pos + Ldata.npos)
844849
{
845-
if (WEP_GETPOS(*Lpos) <= WEP_GETPOS(*Rpos))
850+
if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) ==
851+
curitem->qoperator.distance)
846852
{
847-
/*
848-
* Lpos is behind the Rpos, so we have to check the
849-
* distance condition
850-
*/
851-
if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <= curitem->qoperator.distance)
853+
/* MATCH! */
854+
if (data)
852855
{
853-
/* MATCH! */
854-
if (data)
855-
{
856-
*pos_iter = WEP_GETPOS(*Rpos);
857-
pos_iter++;
858-
859-
break; /* We need to build a unique result
860-
* array, so go to the next Rpos */
861-
}
862-
else
863-
{
864-
/*
865-
* We are in the root of the phrase tree and hence
866-
* we don't have to store the resulting positions
867-
*/
868-
return true;
869-
}
856+
/* Store position for upper phrase operator */
857+
*pos_iter = WEP_GETPOS(*Rpos);
858+
pos_iter++;
859+
860+
/*
861+
* Set left start position to next, because current one
862+
* could not satisfy distance for any other right
863+
* position
864+
*/
865+
LposStart = Lpos + 1;
866+
break;
870867
}
868+
else
869+
{
870+
/*
871+
* We are in the root of the phrase tree and hence
872+
* we don't have to store the resulting positions
873+
*/
874+
return true;
875+
}
876+
871877
}
872-
else
878+
else if (WEP_GETPOS(*Rpos) <= WEP_GETPOS(*Lpos) ||
879+
WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <
880+
curitem->qoperator.distance)
873881
{
874882
/*
875-
* Go to the next Rpos, because Lpos
876-
* is ahead of the current Rpos
883+
* Go to the next Rpos, because Lpos is ahead or on less
884+
* distance than required by current operator
877885
*/
878886
break;
887+
879888
}
880889

881890
Lpos++;

src/test/regress/expected/tstypes.out

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -683,10 +683,10 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true";
683683
t
684684
(1 row)
685685

686-
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true";
687-
true
688-
------
689-
t
686+
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false";
687+
false
688+
-------
689+
f
690690
(1 row)
691691

692692
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false";
@@ -701,6 +701,12 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true";
701701
t
702702
(1 row)
703703

704+
SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true";
705+
true
706+
------
707+
t
708+
(1 row)
709+
704710
SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false";
705711
false
706712
-------
@@ -915,7 +921,7 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:*');
915921
SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:A');
916922
ts_rank_cd
917923
------------
918-
0.0714286
924+
0
919925
(1 row)
920926

921927
SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B');
@@ -942,10 +948,10 @@ SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true";
942948
t
943949
(1 row)
944950

945-
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true";
946-
true
947-
------
948-
t
951+
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false";
952+
false
953+
-------
954+
f
949955
(1 row)
950956

951957
SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false";
@@ -972,9 +978,9 @@ SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
972978
t
973979
(1 row)
974980

975-
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true";
976-
true
977-
------
978-
t
981+
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
982+
false
983+
-------
984+
f
979985
(1 row)
980986

src/test/regress/sql/tstypes.sql

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,10 @@ SELECT 'supeznova supernova'::tsvector @@ 'super:*'::tsquery AS "true";
133133

134134
--phrase search
135135
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true";
136-
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true";
136+
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false";
137137
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false";
138138
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true";
139+
SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true";
139140

140141
SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false";
141142
SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true";
@@ -183,10 +184,10 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B');
183184
SELECT 'a:1 b:2'::tsvector @@ 'a <-> b'::tsquery AS "true";
184185
SELECT 'a:1 b:2'::tsvector @@ 'a <0> b'::tsquery AS "false";
185186
SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true";
186-
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true";
187+
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false";
187188
SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false";
188189
SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false";
189190
SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false";
190191
SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
191-
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true";
192+
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
192193

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy