Skip to content

Commit 38c4fe8

Browse files
committed
Significantly improve ranking:
1) rank_cd now use weight of lexemes 2) rank_cd and rank can use any combination of normalization methods: no normalization normalization by log(length of document) -----/------- by length of document -----/------- by number of unique word in document -----/------- by log(number of unique word in document) -----/------- by number of covers (only rank_cd) Improve cover's search. TODO: changes in documentation
1 parent 85fa9f5 commit 38c4fe8

File tree

3 files changed

+161
-92
lines changed

3 files changed

+161
-92
lines changed

contrib/tsearch2/expected/tsearch2.out

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2315,9 +2315,9 @@ An hour of storm to place
23152315
The sculpture of these granite seams,
23162316
Upon a woman s face. E. J. Pratt (1882 1964)
23172317
'), to_tsquery('sea&thousand&years'));
2318-
rank_cd
2319-
---------
2320-
1.2
2318+
rank_cd
2319+
-----------
2320+
0.0555556
23212321
(1 row)
23222322

23232323
select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2329,9 +2329,9 @@ An hour of storm to place
23292329
The sculpture of these granite seams,
23302330
Upon a woman s face. E. J. Pratt (1882 1964)
23312331
'), to_tsquery('granite&sea'));
2332-
rank_cd
2333-
----------
2334-
0.880303
2332+
rank_cd
2333+
-----------
2334+
0.0238095
23352335
(1 row)
23362336

23372337
select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2345,7 +2345,7 @@ Upon a woman s face. E. J. Pratt (1882 1964)
23452345
'), to_tsquery('sea'));
23462346
rank_cd
23472347
---------
2348-
2
2348+
0.2
23492349
(1 row)
23502350

23512351
select get_covers(to_tsvector('Erosion It took the sea a thousand years,

contrib/tsearch2/rank.c

Lines changed: 152 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0};
4141

4242
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
4343

44-
#define DEF_NORM_METHOD 0
44+
#define RANK_NO_NORM 0x00
45+
#define RANK_NORM_LOGLENGTH 0x01
46+
#define RANK_NORM_LENGTH 0x02
47+
#define RANK_NORM_EXTDIST 0x04
48+
#define RANK_NORM_UNIQ 0x08
49+
#define RANK_NORM_LOGUNIQ 0x10
50+
#define DEF_NORM_METHOD RANK_NO_NORM
4551

4652
static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
4753
static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);
@@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
328334
if (res < 0)
329335
res = 1e-20;
330336

331-
switch (method)
332-
{
333-
case 0:
334-
break;
335-
case 1:
336-
res /= log((float) (cnt_length(t) + 1)) / log(2.0);
337-
break;
338-
case 2:
339-
len = cnt_length(t);
340-
if (len > 0)
341-
res /= (float) len;
342-
break;
343-
default:
344-
/* internal error */
345-
elog(ERROR, "unrecognized normalization method: %d", method);
337+
if ( (method & RANK_NORM_LOGLENGTH) && t->size>0 )
338+
res /= log((double) (cnt_length(t) + 1)) / log(2.0);
339+
340+
if ( method & RANK_NORM_LENGTH ) {
341+
len = cnt_length(t);
342+
if ( len>0 )
343+
res /= (float) len;
346344
}
347345

346+
if ( (method & RANK_NORM_UNIQ) && t->size > 0 )
347+
res /= (float)( t->size );
348+
349+
if ( (method & RANK_NORM_LOGUNIQ) && t->size > 0 )
350+
res /= log((double) (t->size + 1)) / log(2.0);
351+
348352
return res;
349353
}
350354

@@ -420,6 +424,7 @@ typedef struct
420424
ITEM **item;
421425
int16 nitem;
422426
bool needfree;
427+
uint8 wclass;
423428
int32 pos;
424429
} DocRepresentation;
425430

@@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query)
452457
}
453458
}
454459

460+
typedef struct {
461+
int pos;
462+
int p;
463+
int q;
464+
DocRepresentation *begin;
465+
DocRepresentation *end;
466+
} Extention;
467+
468+
455469
static bool
456-
Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
470+
Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention *ext)
457471
{
458472
DocRepresentation *ptr;
459-
int lastpos = *pos;
473+
int lastpos = ext->pos;
460474
int i;
461475
bool found = false;
462476

463477
reset_istrue_flag(query);
464478

465-
*p = 0x7fffffff;
466-
*q = 0;
467-
ptr = doc + *pos;
479+
ext->p = 0x7fffffff;
480+
ext->q = 0;
481+
ptr = doc + ext->pos;
468482

469483
/* find upper bound of cover from current position, move up */
470484
while (ptr - doc < len)
@@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
473487
ptr->item[i]->istrue = 1;
474488
if (TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM))
475489
{
476-
if (ptr->pos > *q)
490+
if (ptr->pos > ext->q)
477491
{
478-
*q = ptr->pos;
492+
ext->q = ptr->pos;
493+
ext->end = ptr;
479494
lastpos = ptr - doc;
480495
found = true;
481496
}
@@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
498513
ptr->item[i]->istrue = 1;
499514
if (TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM))
500515
{
501-
if (ptr->pos < *p)
502-
*p = ptr->pos;
516+
if (ptr->pos < ext->p) {
517+
ext->begin = ptr;
518+
ext->p = ptr->pos;
519+
}
503520
break;
504521
}
505522
ptr--;
506523
}
507524

508-
if (*p <= *q)
525+
if (ext->p <= ext->q)
509526
{
510527
/*
511528
* set position for next try to next lexeme after begining of founded
512529
* cover
513530
*/
514-
*pos = (ptr - doc) + 1;
531+
ext->pos = (ptr - doc) + 1;
515532
return true;
516533
}
517534

518-
(*pos)++;
519-
return Cover(doc, len, query, pos, p, q);
535+
ext->pos++;
536+
return Cover(doc, len, query, ext);
520537
}
521538

522539
static DocRepresentation *
@@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
593610
doc[cur].item = doc[cur - 1].item;
594611
}
595612
doc[cur].pos = WEP_GETPOS(post[j]);
613+
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
596614
cur++;
597615
}
598616
}
@@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
610628
return NULL;
611629
}
612630

613-
614-
Datum
615-
rank_cd(PG_FUNCTION_ARGS)
616-
{
617-
int K = PG_GETARG_INT32(0);
618-
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
619-
QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
620-
int method = DEF_NORM_METHOD;
631+
static float4
632+
calc_rank_cd(float4 *arrdata, tsvector *txt, QUERYTYPE *query, int method) {
621633
DocRepresentation *doc;
622-
float res = 0.0;
623-
int p = 0,
624-
q = 0,
625-
len,
626-
cur,
634+
int len,
627635
i,
628636
doclen = 0;
637+
Extention ext;
638+
double Wdoc = 0.0;
639+
double invws[lengthof(weights)];
640+
double SumDist=0.0, PrevExtPos=0.0, CurExtPos=0.0;
641+
int NExtent=0;
629642

630-
doc = get_docrep(txt, query, &doclen);
631-
if (!doc)
643+
for (i = 0; i < lengthof(weights); i++)
632644
{
633-
PG_FREE_IF_COPY(txt, 1);
634-
PG_FREE_IF_COPY(query, 2);
635-
PG_RETURN_FLOAT4(0.0);
645+
invws[i] = ((double)((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
646+
if (invws[i] > 1.0)
647+
ereport(ERROR,
648+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
649+
errmsg("weight out of range")));
650+
invws[i] = 1.0/invws[i];
636651
}
637652

638-
cur = 0;
639-
if (K <= 0)
640-
K = 4;
641-
while (Cover(doc, doclen, query, &cur, &p, &q))
642-
res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;
653+
doc = get_docrep(txt, query, &doclen);
654+
if (!doc)
655+
return 0.0;
643656

644-
if (PG_NARGS() == 4)
645-
method = PG_GETARG_INT32(3);
657+
MemSet( &ext, 0, sizeof(Extention) );
658+
while (Cover(doc, doclen, query, &ext)) {
659+
double Cpos = 0.0;
660+
double InvSum = 0.0;
661+
DocRepresentation *ptr = ext.begin;
646662

647-
switch (method)
648-
{
649-
case 0:
650-
break;
651-
case 1:
652-
res /= log((float) (cnt_length(txt) + 1));
653-
break;
654-
case 2:
655-
len = cnt_length(txt);
656-
if (len > 0)
657-
res /= (float) len;
658-
break;
659-
default:
660-
/* internal error */
661-
elog(ERROR, "unrecognized normalization method: %d", method);
663+
while ( ptr<=ext.end ) {
664+
InvSum += invws[ ptr->wclass ];
665+
ptr++;
666+
}
667+
668+
Cpos = ((double)( ext.end-ext.begin+1 )) / InvSum;
669+
Wdoc += Cpos / ( (double)(( 1 + (ext.q - ext.p) - (ext.end - ext.begin) )) );
670+
671+
CurExtPos = ((double)(ext.q + ext.p))/2.0;
672+
if ( NExtent>0 && CurExtPos > PrevExtPos /* prevent devision by zero in a case of multiple lexize */ )
673+
SumDist += 1.0/( CurExtPos - PrevExtPos );
674+
675+
PrevExtPos = CurExtPos;
676+
NExtent++;
677+
}
678+
679+
if ( (method & RANK_NORM_LOGLENGTH) && txt->size > 0 )
680+
Wdoc /= log((double) (cnt_length(txt) + 1));
681+
682+
if ( method & RANK_NORM_LENGTH ) {
683+
len = cnt_length(txt);
684+
if ( len>0 )
685+
Wdoc /= (double) len;
662686
}
663687

688+
if ( (method & RANK_NORM_EXTDIST) && SumDist > 0 )
689+
Wdoc /= ((double)NExtent) / SumDist;
690+
691+
if ( (method & RANK_NORM_UNIQ) && txt->size > 0 )
692+
Wdoc /= (double)( txt->size );
693+
694+
if ( (method & RANK_NORM_LOGUNIQ) && txt->size > 0 )
695+
Wdoc /= log((double) (txt->size + 1)) / log(2.0);
696+
664697
for (i = 0; i < doclen; i++)
665698
if (doc[i].needfree)
666699
pfree(doc[i].item);
667700
pfree(doc);
701+
702+
return (float4)Wdoc;
703+
}
704+
705+
Datum
706+
rank_cd(PG_FUNCTION_ARGS)
707+
{
708+
ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
709+
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
710+
QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
711+
int method = DEF_NORM_METHOD;
712+
float4 res;
713+
714+
if (ARR_NDIM(win) != 1)
715+
ereport(ERROR,
716+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
717+
errmsg("array of weight must be one-dimensional")));
718+
719+
if (ARRNELEMS(win) < lengthof(weights))
720+
ereport(ERROR,
721+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
722+
errmsg("array of weight is too short")));
723+
724+
if (ARR_HASNULL(win))
725+
ereport(ERROR,
726+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
727+
errmsg("array of weight must not contain nulls")));
728+
729+
if (PG_NARGS() == 4)
730+
method = PG_GETARG_INT32(3);
731+
732+
res = calc_rank_cd( (float4 *) ARR_DATA_PTR(win), txt, query, method);
733+
734+
PG_FREE_IF_COPY(win, 0);
668735
PG_FREE_IF_COPY(txt, 1);
669736
PG_FREE_IF_COPY(query, 2);
670737

@@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS)
675742
Datum
676743
rank_cd_def(PG_FUNCTION_ARGS)
677744
{
678-
PG_RETURN_DATUM(DirectFunctionCall4(
679-
rank_cd,
680-
Int32GetDatum(-1),
681-
PG_GETARG_DATUM(0),
682-
PG_GETARG_DATUM(1),
683-
(PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
684-
));
745+
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
746+
QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
747+
float4 res;
748+
749+
res = calc_rank_cd( weights, txt, query, (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : DEF_NORM_METHOD);
750+
751+
PG_FREE_IF_COPY(txt, 1);
752+
PG_FREE_IF_COPY(query, 2);
753+
754+
PG_RETURN_FLOAT4(res);
685755
}
686756

687757
/**************debug*************/
@@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS)
721791
text *out;
722792
char *cptr;
723793
DocRepresentation *doc;
724-
int pos = 0,
725-
p,
726-
q,
727-
olddwpos = 0;
794+
int olddwpos = 0;
728795
int ncover = 1;
796+
Extention ext;
729797

730798
doc = get_docrep(txt, query, &rlen);
731799

@@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS)
765833
}
766834
qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
767835

768-
while (Cover(doc, rlen, query, &pos, &p, &q))
836+
MemSet( &ext, 0, sizeof(Extention) );
837+
while (Cover(doc, rlen, query, &ext))
769838
{
770839
dwptr = dw + olddwpos;
771-
while (dwptr->pos < p && dwptr - dw < dlen)
840+
while (dwptr->pos < ext.p && dwptr - dw < dlen)
772841
dwptr++;
773842
olddwpos = dwptr - dw;
774843
dwptr->start = ncover;
775-
while (dwptr->pos < q + 1 && dwptr - dw < dlen)
844+
while (dwptr->pos < ext.q + 1 && dwptr - dw < dlen)
776845
dwptr++;
777846
(dwptr - 1)->finish = ncover;
778847
len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy