Skip to content

Commit 866bad9

Browse files
committed
Add a rank/(rank+1) normalization option to ts_rank().
While the usefulness of this seems a bit marginal, if it's useful enough to be shown in the manual, then we probably ought to support doing it without double evaluation of the ts_rank function. Per my proposal earlier today.
1 parent 5858990 commit 866bad9

File tree

2 files changed

+32
-15
lines changed

2 files changed

+32
-15
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.34 2007/11/14 23:43:27 tgl Exp $ -->
22

33
<chapter id="textsearch">
44
<title id="textsearch-title">Full Text Search</title>
@@ -940,6 +940,7 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
940940
<listitem>
941941
<para>
942942
4 divides the rank by the mean harmonic distance between extents
943+
(this is implemented only by <function>ts_rank_cd</>)
943944
</para>
944945
</listitem>
945946
<listitem>
@@ -953,17 +954,24 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
953954
of unique words in document
954955
</para>
955956
</listitem>
957+
<listitem>
958+
<para>
959+
32 divides the rank by itself + 1
960+
</para>
961+
</listitem>
956962
</itemizedlist>
957963

964+
If more than one flag bit is specified, the transformations are
965+
applied in the order listed.
958966
</para>
959967

960968
<para>
961969
It is important to note that the ranking functions do not use any global
962-
information so it is impossible to produce a fair normalization to 1% or
963-
100%, as sometimes desired. However, a simple technique like
964-
<literal>rank/(rank+1)</literal> can be applied. Of course, this is just
965-
a cosmetic change, i.e., the ordering of the search results will not
966-
change.
970+
information, so it is impossible to produce a fair normalization to 1% or
971+
100% as sometimes desired. Normalization option 32
972+
(<literal>rank/(rank+1)</literal>) can be applied to scale all ranks
973+
into the range zero to one, but of course this is just a cosmetic change;
974+
it will not affect the ordering of the search results.
967975
</para>
968976

969977
<para>
@@ -991,7 +999,7 @@ ORDER BY rank DESC LIMIT 10;
991999
This is the same example using normalized ranking:
9921000

9931001
<programlisting>
994-
SELECT title, ts_rank_cd(textsearch, query)/(ts_rank_cd(textsearch, query) + 1) AS rank
1002+
SELECT title, ts_rank_cd(textsearch, query, 32 /* rank/(rank+1) */ ) AS rank
9951003
FROM apod, to_tsquery('neutrino|(dark &amp; matter)') query
9961004
WHERE query @@ textsearch
9971005
ORDER BY rank DESC LIMIT 10;

src/backend/utils/adt/tsrank.c

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.8 2007/09/20 18:10:57 teodor Exp $
10+
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.9 2007/11/14 23:43:27 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -25,13 +25,14 @@ static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
2525

2626
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
2727

28-
#define RANK_NO_NORM 0x00
28+
#define RANK_NO_NORM 0x00
2929
#define RANK_NORM_LOGLENGTH 0x01
30-
#define RANK_NORM_LENGTH 0x02
31-
#define RANK_NORM_EXTDIST 0x04
32-
#define RANK_NORM_UNIQ 0x08
33-
#define RANK_NORM_LOGUNIQ 0x10
34-
#define DEF_NORM_METHOD RANK_NO_NORM
30+
#define RANK_NORM_LENGTH 0x02
31+
#define RANK_NORM_EXTDIST 0x04
32+
#define RANK_NORM_UNIQ 0x08
33+
#define RANK_NORM_LOGUNIQ 0x10
34+
#define RANK_NORM_RDIVRPLUS1 0x20
35+
#define DEF_NORM_METHOD RANK_NO_NORM
3536

3637
static float calc_rank_or(float *w, TSVector t, TSQuery q);
3738
static float calc_rank_and(float *w, TSVector t, TSQuery q);
@@ -348,12 +349,17 @@ calc_rank(float *w, TSVector t, TSQuery q, int4 method)
348349
res /= (float) len;
349350
}
350351

352+
/* RANK_NORM_EXTDIST not applicable */
353+
351354
if ((method & RANK_NORM_UNIQ) && t->size > 0)
352355
res /= (float) (t->size);
353356

354357
if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
355358
res /= log((double) (t->size + 1)) / log(2.0);
356359

360+
if (method & RANK_NORM_RDIVRPLUS1)
361+
res /= (res + 1);
362+
357363
return res;
358364
}
359365

@@ -762,7 +768,7 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
762768
Wdoc /= (double) len;
763769
}
764770

765-
if ((method & RANK_NORM_EXTDIST) && SumDist > 0)
771+
if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
766772
Wdoc /= ((double) NExtent) / SumDist;
767773

768774
if ((method & RANK_NORM_UNIQ) && txt->size > 0)
@@ -771,6 +777,9 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
771777
if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
772778
Wdoc /= log((double) (txt->size + 1)) / log(2.0);
773779

780+
if (method & RANK_NORM_RDIVRPLUS1)
781+
Wdoc /= (Wdoc + 1);
782+
774783
pfree(doc);
775784

776785
pfree( qr.operandexist );

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy