From 86f185f3c1600d47c8ee8a5738461d561e4c4977 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sat, 7 Oct 2017 23:50:29 +0300 Subject: [PATCH 1/4] tf_idf_source GUC and its validation --- Makefile | 2 +- src/rum.h | 6 +++ src/rumutil.c | 11 +++++ src/tf_idf.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/tf_idf.c diff --git a/Makefile b/Makefile index 0717592f5e..dcfd883319 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ OBJS = src/rumsort.o src/rum_ts_utils.o src/rumtsquery.o \ src/rumbtree.o src/rumbulk.o src/rumdatapage.o \ src/rumentrypage.o src/rumget.o src/ruminsert.o \ src/rumscan.o src/rumutil.o src/rumvacuum.o src/rumvalidate.o \ - src/btree_rum.o $(WIN32RES) + src/btree_rum.o src/tf_idf.o $(WIN32RES) EXTENSION = rum DATA = rum--1.0.sql rum--1.0--1.1.sql rum--1.1.sql diff --git a/src/rum.h b/src/rum.h index 78cb8db439..2a5549e6e0 100644 --- a/src/rum.h +++ b/src/rum.h @@ -19,6 +19,7 @@ #include "access/sdir.h" #include "lib/rbtree.h" #include "storage/bufmgr.h" +#include "utils/guc.h" #include "rumsort.h" @@ -1008,4 +1009,9 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg6, Datum arg7, Datum arg8, Datum arg9, Datum arg10); +/* tf_idf.c */ +extern char *TFIDFSource; +extern bool check_tf_idf_source(char **newval, void **extra, GucSource source); +extern void assign_tf_idf_source(const char *newval, void *extra); + #endif /* __RUM_H__ */ diff --git a/src/rumutil.c b/src/rumutil.c index 25eaaedddd..e67209b578 100644 --- a/src/rumutil.c +++ b/src/rumutil.c @@ -49,6 +49,17 @@ _PG_init(void) PGC_USERSET, 0, NULL, NULL, NULL); + DefineCustomStringVariable("tf_tdf_source", + "Source statistics for TD/IFD calculation.", + "", + &TFIDFSource, + "", + PGC_USERSET, + 0, + check_tf_idf_source, + assign_tf_idf_source, + NULL); + rum_relopt_kind = add_reloption_kind(); add_string_reloption(rum_relopt_kind, "attach", diff --git a/src/tf_idf.c 
b/src/tf_idf.c new file mode 100644 index 0000000000..a283c3e863 --- /dev/null +++ b/src/tf_idf.c @@ -0,0 +1,121 @@ +/*------------------------------------------------------------------------- + * + * tf_idf.c + * Implementation of TF/IDF statistics calculation. + * + * Portions Copyright (c) 2017, Postgres Professional + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/namespace.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/varlena.h" + +#include "rum.h" + +char *TFIDFSource; + +#define EXIT_CHECK_TF_IDF_SOURCE(error) \ + do { \ + GUC_check_errdetail(error); \ + pfree(rawname); \ + list_free(namelist); \ + if (rel) \ + RelationClose(rel); \ + return false; \ + } while (false); + +bool +check_tf_idf_source(char **newval, void **extra, GucSource source) +{ + char *rawname; + char *attname; + List *namelist; + Oid namespaceId; + Oid relId; + Relation rel = NULL; + TupleDesc tupDesc; + AttrNumber attrno; + int i; + + /* Need a modifiable copy of string */ + rawname = pstrdup(*newval); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawname, '.', &namelist)) + { + /* syntax error in name list */ + EXIT_CHECK_TF_IDF_SOURCE("List syntax is invalid."); + } + + switch (list_length(namelist)) + { + case 0: + return true; + case 1: + EXIT_CHECK_TF_IDF_SOURCE("improper column name (there should be at least 2 dotted names)"); + case 2: + relId = RelnameGetRelid(linitial(namelist)); + attname = lsecond(namelist); + break; + case 3: + /* use exact schema given */ + namespaceId = LookupExplicitNamespace(linitial(namelist), true); + if (!OidIsValid(namespaceId)) + relId = InvalidOid; + else + relId = get_relname_relid(lsecond(namelist), namespaceId); + attname = lthird(namelist); + break; + default: + EXIT_CHECK_TF_IDF_SOURCE("improper column name (too many dotted names)"); + } + + if (!OidIsValid(relId)) + 
EXIT_CHECK_TF_IDF_SOURCE("relation not found"); + + rel = RelationIdGetRelation(relId); + tupDesc = rel->rd_att; + if (rel->rd_rel->relkind == RELKIND_INDEX) + { + attrno = pg_atoi(attname, sizeof(attrno), 10); + if (attrno <= 0 || attrno > rel->rd_index->indnatts) + EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number"); + if (rel->rd_index->indkey.values[attrno - 1] != InvalidAttrNumber) + EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified"); + } + else + { + attrno = InvalidAttrNumber; + for (i = 0; i < tupDesc->natts; i++) + { + if (namestrcmp(&(tupDesc->attrs[i]->attname), attname) == 0) + { + attrno = tupDesc->attrs[i]->attnum; + break; + } + } + + if (attrno == InvalidAttrNumber) + EXIT_CHECK_TF_IDF_SOURCE("attribute not found"); + } + + if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); + + pfree(rawname); + list_free(namelist); + RelationClose(rel); + return true; +} + + +void +assign_tf_idf_source(const char *newval, void *extra) +{ + +} \ No newline at end of file From 90b1638d94cb8bcfbb44abe132336af711e64e3b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 8 Oct 2017 00:50:50 +0300 Subject: [PATCH 2/4] Implement estimate_idf(). 
--- src/rum.h | 1 + src/tf_idf.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 200 insertions(+), 11 deletions(-) diff --git a/src/rum.h b/src/rum.h index 2a5549e6e0..3f48ed4fa7 100644 --- a/src/rum.h +++ b/src/rum.h @@ -1013,5 +1013,6 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, extern char *TFIDFSource; extern bool check_tf_idf_source(char **newval, void **extra, GucSource source); extern void assign_tf_idf_source(const char *newval, void *extra); +extern float4 estimate_idf(char *lexeme, int length); #endif /* __RUM_H__ */ diff --git a/src/tf_idf.c b/src/tf_idf.c index a283c3e863..0d7aff5eb1 100644 --- a/src/tf_idf.c +++ b/src/tf_idf.c @@ -10,14 +10,48 @@ #include "postgres.h" #include "catalog/namespace.h" +#include "catalog/pg_statistic.h" #include "catalog/pg_type.h" #include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" #include "utils/varlena.h" #include "rum.h" -char *TFIDFSource; +/* lookup table type for binary searching through MCELEMs */ +typedef struct +{ + text *element; + float4 frequency; +} TextFreq; + +/* type of keys for bsearch'ing through an array of TextFreqs */ +typedef struct +{ + char *lexeme; + int length; +} LexemeKey; + +typedef struct +{ + TextFreq *lookup; + int nmcelem; + float4 minfreq; +} MCelemStats; + +typedef struct +{ + Oid relId; + AttrNumber attrno; +} RelAttrInfo; + +char *TFIDFSource; +static RelAttrInfo TFIDFSourceParsed; +static bool TDIDFLoaded = false; +static MemoryContext TFIDFContext = NULL; +static MCelemStats TDIDFStats; #define EXIT_CHECK_TF_IDF_SOURCE(error) \ do { \ @@ -29,18 +63,24 @@ char *TFIDFSource; return false; \ } while (false); +static void load_tf_idf_source(void); +static void check_load_tf_idf_source(void); +static void forget_tf_idf_stats(void); +static int compare_lexeme_textfreq(const void *e1, const void *e2); + bool check_tf_idf_source(char **newval, void **extra, GucSource source) { - 
char *rawname; - char *attname; - List *namelist; - Oid namespaceId; - Oid relId; - Relation rel = NULL; - TupleDesc tupDesc; - AttrNumber attrno; - int i; + char *rawname; + char *attname; + List *namelist; + Oid namespaceId; + Oid relId; + Relation rel = NULL; + TupleDesc tupDesc; + AttrNumber attrno; + int i; + RelAttrInfo *myextra; /* Need a modifiable copy of string */ rawname = pstrdup(*newval); @@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); + myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo)); + myextra->relId = relId; + myextra->attrno = attrno; + *extra = (void *) myextra; + pfree(rawname); list_free(namelist); RelationClose(rel); @@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) void assign_tf_idf_source(const char *newval, void *extra) { + RelAttrInfo *myextra = (RelAttrInfo *) extra; + + TFIDFSourceParsed = *myextra; + forget_tf_idf_stats(); +} + +static void +load_tf_idf_source(void) +{ + HeapTuple statsTuple; + AttStatsSlot sslot; + MemoryContext oldContext; + int i; + + if (!TFIDFContext) + TFIDFContext = AllocSetContextCreate(TopMemoryContext, + "Memory context for TF/IDF statistics", + ALLOCSET_DEFAULT_SIZES); + + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(TFIDFSourceParsed.relId), + Int16GetDatum(TFIDFSourceParsed.attrno), + BoolGetDatum(true)); + + if (!statsTuple) + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(TFIDFSourceParsed.relId), + Int16GetDatum(TFIDFSourceParsed.attrno), + BoolGetDatum(false)); + + MemoryContextReset(TFIDFContext); + TDIDFLoaded = false; + + oldContext = MemoryContextSwitchTo(TFIDFContext); + + if (!statsTuple + || !get_attstatsslot(&sslot, statsTuple, + STATISTIC_KIND_MCELEM, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS) + || sslot.nnumbers != sslot.nvalues + 2) + { + 
ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("statistics for TD/IDF is not found"), + errhint("consider running ANALYZE"))); + } + + TDIDFStats.nmcelem = sslot.nvalues; + TDIDFStats.minfreq = sslot.numbers[sslot.nnumbers - 2]; + /* + * Transpose the data into a single array so we can use bsearch(). + */ + TDIDFStats.lookup = (TextFreq *) palloc(sizeof(TextFreq) * TDIDFStats.nmcelem); + for (i = 0; i < TDIDFStats.nmcelem; i++) + { + /* + * The text Datums came from an array, so it cannot be compressed or + * stored out-of-line -- it's safe to use VARSIZE_ANY*. + */ + Assert(!VARATT_IS_COMPRESSED(sslot.values[i]) && !VARATT_IS_EXTERNAL(sslot.values[i])); + TDIDFStats.lookup[i].element = (text *) DatumGetPointer(sslot.values[i]); + TDIDFStats.lookup[i].frequency = sslot.numbers[i]; + } -} \ No newline at end of file + MemoryContextSwitchTo(oldContext); + + ReleaseSysCache(statsTuple); +} + +static void +check_load_tf_idf_source(void) +{ + if (!TDIDFLoaded) + load_tf_idf_source(); +} + +static void +forget_tf_idf_stats(void) +{ + MemoryContextReset(TFIDFContext); + TDIDFLoaded = false; +} + +/* + * bsearch() comparator for a lexeme (non-NULL terminated string with length) + * and a TextFreq. Use length, then byte-for-byte comparison, because that's + * how ANALYZE code sorted data before storing it in a statistic tuple. + * See ts_typanalyze.c for details. 
+ */ +static int +compare_lexeme_textfreq(const void *e1, const void *e2) +{ + const LexemeKey *key = (const LexemeKey *) e1; + const TextFreq *t = (const TextFreq *) e2; + int len1, + len2; + + len1 = key->length; + len2 = VARSIZE_ANY_EXHDR(t->element); + + /* Compare lengths first, possibly avoiding a strncmp call */ + if (len1 > len2) + return 1; + else if (len1 < len2) + return -1; + + /* Fall back on byte-for-byte comparison */ + return strncmp(key->lexeme, VARDATA_ANY(t->element), len1); +} + +float4 +estimate_idf(char *lexeme, int length) +{ + TextFreq *searchres; + LexemeKey key; + float4 selec; + + check_load_tf_idf_source(); + + key.lexeme = lexeme; + key.length = length; + + searchres = (TextFreq *) bsearch(&key, TDIDFStats.lookup, TDIDFStats.nmcelem, + sizeof(TextFreq), + compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or + * at least as precise as ANALYZE could find out). + */ + selec = searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + selec = TDIDFStats.minfreq / 2; + } + + return 1.0f / selec; +} From 3bb824402e17d473694b6cbb556ec2c5b7b7241b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 8 Oct 2017 02:18:24 +0300 Subject: [PATCH 3/4] Add IDF to scoring (very basic). 
--- src/rum_ts_utils.c | 49 +++++++++++++++++++++++++++++++++++++++++---- src/tf_idf.c | 50 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index 07faabe42c..5c22f88879 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -101,11 +101,17 @@ typedef struct typedef struct { - bool operandexist; + bool operandexist; WordEntryPos pos; } QueryRepresentationOperand; +typedef struct +{ + float4 idf; + bool idfloaded; +} QueryRepresentationIDF; + typedef struct { TSQuery query; @@ -113,6 +119,7 @@ typedef struct int *map_item_operand; QueryRepresentationOperand *operandData; + QueryRepresentationIDF *operandIdf; int length; } QueryRepresentation; @@ -140,6 +147,7 @@ static WordEntryPosVector POSNULL = { #define RANK_NORM_UNIQ 0x08 #define RANK_NORM_LOGUNIQ 0x10 #define RANK_NORM_RDIVRPLUS1 0x20 +#define RANK_NORM_IDF 0x40 #define DEF_NORM_METHOD RANK_NO_NORM #define QR_GET_OPERAND(q, v) \ @@ -1229,6 +1237,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, { double Cpos = 0.0; double InvSum = 0.0; + double Idf = 0.0; int nNoise; DocRepresentation *ptr = ext.begin; /* Added by SK */ @@ -1278,13 +1287,43 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, /* Compute the number of query terms in the cover */ for (i = 0; i < qr->length; i++) + { if (qr->operandData[i].operandexist) - nitems++; + { + if (method & RANK_NORM_IDF) + { + if (!qr->operandIdf[i].idfloaded) + { + QueryOperand *oper = (QueryOperand *) (GETQUERY(qr->query) + i); + qr->operandIdf[i].idf = + estimate_idf( + GETOPERAND(qr->query) + oper->distance, + oper->length + ); + qr->operandIdf[i].idfloaded = true; + } + + Idf += qr->operandIdf[i].idf; + } + else + { + nitems++; + } + } + } Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; - if (nitems > 0) - Cpos *= nitems; + if (method & RANK_NORM_IDF) + { + if (Idf >= 1.0) + Cpos *= Idf; + } + else + { + 
if (nitems > 0) + Cpos *= nitems; + } /* * if doc are big enough then ext.q may be equal to ext.p due to limit @@ -1369,6 +1408,8 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) qr.query = query; qr.map_item_operand = NULL; qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size); + if (method & RANK_NORM_IDF) + qr.operandIdf = palloc0(sizeof(qr.operandIdf[0]) * query->size); qr.length = query->size; doc = get_docrep(txt, &qr, &doclen); diff --git a/src/tf_idf.c b/src/tf_idf.c index 0d7aff5eb1..995e9de572 100644 --- a/src/tf_idf.c +++ b/src/tf_idf.c @@ -12,6 +12,7 @@ #include "catalog/namespace.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "nodes/nodeFuncs.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -20,6 +21,12 @@ #include "rum.h" +/* + * FIXME: + * * cache IDF + * * handle prefix search + */ + /* lookup table type for binary searching through MCELEMs */ typedef struct { @@ -77,7 +84,6 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) Oid namespaceId; Oid relId; Relation rel = NULL; - TupleDesc tupDesc; AttrNumber attrno; int i; RelAttrInfo *myextra; @@ -119,17 +125,27 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) EXIT_CHECK_TF_IDF_SOURCE("relation not found"); rel = RelationIdGetRelation(relId); - tupDesc = rel->rd_att; if (rel->rd_rel->relkind == RELKIND_INDEX) { + int exprnum = 0; + attrno = pg_atoi(attname, sizeof(attrno), 10); if (attrno <= 0 || attrno > rel->rd_index->indnatts) EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number"); if (rel->rd_index->indkey.values[attrno - 1] != InvalidAttrNumber) EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified"); + for (i = 0; i < attrno - 1; i++) + { + if (rel->rd_index->indkey.values[i] == InvalidAttrNumber) + exprnum++; + } + if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("indexed expression should 
be of tsvector type"); } else { + TupleDesc tupDesc = rel->rd_att; + attrno = InvalidAttrNumber; for (i = 0; i < tupDesc->natts; i++) { @@ -139,13 +155,12 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) break; } } - if (attrno == InvalidAttrNumber) EXIT_CHECK_TF_IDF_SOURCE("attribute not found"); + if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); } - if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) - EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo)); myextra->relId = relId; @@ -164,7 +179,16 @@ assign_tf_idf_source(const char *newval, void *extra) { RelAttrInfo *myextra = (RelAttrInfo *) extra; - TFIDFSourceParsed = *myextra; + if (myextra) + { + TFIDFSourceParsed = *myextra; + } + else + { + TFIDFSourceParsed.relId = InvalidOid; + TFIDFSourceParsed.attrno = InvalidAttrNumber; + } + forget_tf_idf_stats(); } @@ -181,6 +205,15 @@ load_tf_idf_source(void) "Memory context for TF/IDF statistics", ALLOCSET_DEFAULT_SIZES); + if (!OidIsValid(TFIDFSourceParsed.relId) + || TFIDFSourceParsed.attrno == InvalidAttrNumber) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("statistics for TD/IDF is not defined"), + errhint("consider setting tf_idf_source GUC"))); + } + statsTuple = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(TFIDFSourceParsed.relId), Int16GetDatum(TFIDFSourceParsed.attrno), @@ -228,6 +261,8 @@ load_tf_idf_source(void) MemoryContextSwitchTo(oldContext); + TDIDFLoaded = true; + ReleaseSysCache(statsTuple); } @@ -241,7 +276,8 @@ check_load_tf_idf_source(void) static void forget_tf_idf_stats(void) { - MemoryContextReset(TFIDFContext); + if (TFIDFContext) + MemoryContextReset(TFIDFContext); TDIDFLoaded = false; } From 581daf5fbe5b4b1cb56c7adefc17c4dd054ca60b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 8 Oct 2017 23:18:49 +0300 Subject: 
[PATCH 4/4] Better IDF calculation. --- src/rum_ts_utils.c | 60 +++++++++++++--------------------------------- src/tf_idf.c | 5 ++-- 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index 5c22f88879..d9f79423b2 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -96,6 +96,7 @@ typedef struct } key; } data; uint8 wclass; + float4 idf; int32 pos; } DocRepresentation; @@ -106,12 +107,6 @@ typedef struct } QueryRepresentationOperand; -typedef struct -{ - float4 idf; - bool idfloaded; -} QueryRepresentationIDF; - typedef struct { TSQuery query; @@ -119,7 +114,6 @@ typedef struct int *map_item_operand; QueryRepresentationOperand *operandData; - QueryRepresentationIDF *operandIdf; int length; } QueryRepresentation; @@ -1098,7 +1092,7 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) } static DocRepresentation * -get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) +get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen, bool load_idf) { QueryItem *item = GETQUERY(qr->query); WordEntry *entry, @@ -1134,6 +1128,8 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) while (entry - firstentry < nitem) { + float4 idf; + if (entry->haspos) { dimt = POSDATALEN(txt, entry); @@ -1187,12 +1183,18 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) } } + + if (load_idf) + idf = estimate_idf(STRPTR(txt) + entry->pos, entry->len); + else + idf = 1.0f; } else { doc[cur].data.item.nitem = doc[cur - 1].data.item.nitem; doc[cur].data.item.item = doc[cur - 1].data.item.item; } + doc[cur].idf = idf; doc[cur].pos = WEP_GETPOS(post[j]); doc[cur].wclass = WEP_GETWEIGHT(post[j]); cur++; @@ -1256,6 +1258,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, /* For rum_tsquery_distance() */ else new_cover_key += (int)(uintptr_t)ptr->data.key.item_first; + Idf += ptr->idf; ptr++; } @@ -1287,43 +1290,16 @@ calc_score_docr(float4 
*arrdata, DocRepresentation *doc, uint32 doclen, /* Compute the number of query terms in the cover */ for (i = 0; i < qr->length; i++) - { if (qr->operandData[i].operandexist) - { - if (method & RANK_NORM_IDF) - { - if (!qr->operandIdf[i].idfloaded) - { - QueryOperand *oper = (QueryOperand *) (GETQUERY(qr->query) + i); - qr->operandIdf[i].idf = - estimate_idf( - GETOPERAND(qr->query) + oper->distance, - oper->length - ); - qr->operandIdf[i].idfloaded = true; - } - - Idf += qr->operandIdf[i].idf; - } - else - { - nitems++; - } - } - } + nitems++; Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; + if (nitems > 0) + Cpos *= nitems; + if (method & RANK_NORM_IDF) - { - if (Idf >= 1.0) - Cpos *= Idf; - } - else - { - if (nitems > 0) - Cpos *= nitems; - } + Cpos *= Idf; /* * if doc are big enough then ext.q may be equal to ext.p due to limit @@ -1408,11 +1384,9 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) qr.query = query; qr.map_item_operand = NULL; qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size); - if (method & RANK_NORM_IDF) - qr.operandIdf = palloc0(sizeof(qr.operandIdf[0]) * query->size); qr.length = query->size; - doc = get_docrep(txt, &qr, &doclen); + doc = get_docrep(txt, &qr, &doclen, (method & RANK_NORM_IDF) ? true : false); if (!doc) { pfree(qr.operandData); diff --git a/src/tf_idf.c b/src/tf_idf.c index 995e9de572..1c14ef2d04 100644 --- a/src/tf_idf.c +++ b/src/tf_idf.c @@ -23,8 +23,8 @@ /* * FIXME: - * * cache IDF - * * handle prefix search + * * cache IDF for ts_query (non-prefix search?) 
+ * * calculate IDF from RUM index */ /* lookup table type for binary searching through MCELEMs */ @@ -139,6 +139,7 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) if (rel->rd_index->indkey.values[i] == InvalidAttrNumber) exprnum++; } + RelationGetIndexExpressions(rel); if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID) EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type"); } pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy