Skip to content

Commit 90b1638

Browse files
author
Alexander Korotkov
committed
Implement estimate_idf().
1 parent 86f185f commit 90b1638

File tree

2 files changed

+200
-11
lines changed

2 files changed

+200
-11
lines changed

src/rum.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,5 +1013,6 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation,
10131013
extern char *TFIDFSource;
10141014
extern bool check_tf_idf_source(char **newval, void **extra, GucSource source);
10151015
extern void assign_tf_idf_source(const char *newval, void *extra);
1016+
extern float4 estimate_idf(char *lexeme, int length);
10161017

10171018
#endif /* __RUM_H__ */

src/tf_idf.c

Lines changed: 199 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,48 @@
1010
#include "postgres.h"
1111

1212
#include "catalog/namespace.h"
13+
#include "catalog/pg_statistic.h"
1314
#include "catalog/pg_type.h"
1415
#include "utils/builtins.h"
1516
#include "utils/lsyscache.h"
17+
#include "utils/memutils.h"
18+
#include "utils/syscache.h"
1619
#include "utils/varlena.h"
1720

1821
#include "rum.h"
1922

20-
char *TFIDFSource;
23+
/* lookup table type for binary searching through MCELEMs */
24+
typedef struct
25+
{
26+
text *element;
27+
float4 frequency;
28+
} TextFreq;
29+
30+
/*
 * Search key handed to bsearch() when probing an array of TextFreqs.
 * The lexeme is not required to be NUL-terminated; its size is carried
 * separately in "length".
 */
typedef struct
{
	char	   *lexeme;			/* lexeme bytes */
	int			length;			/* number of bytes in lexeme */
} LexemeKey;
36+
37+
typedef struct
38+
{
39+
TextFreq *lookup;
40+
int nmcelem;
41+
float4 minfreq;
42+
} MCelemStats;
43+
44+
typedef struct
45+
{
46+
Oid relId;
47+
AttrNumber attrno;
48+
} RelAttrInfo;
49+
50+
char *TFIDFSource;
51+
static RelAttrInfo TFIDFSourceParsed;
52+
static bool TDIDFLoaded = false;
53+
static MemoryContext TFIDFContext = NULL;
54+
static MCelemStats TDIDFStats;
2155

2256
#define EXIT_CHECK_TF_IDF_SOURCE(error) \
2357
do { \
@@ -29,18 +63,24 @@ char *TFIDFSource;
2963
return false; \
3064
} while (false);
3165

66+
static void load_tf_idf_source(void);
67+
static void check_load_tf_idf_source(void);
68+
static void forget_tf_idf_stats(void);
69+
static int compare_lexeme_textfreq(const void *e1, const void *e2);
70+
3271
bool
3372
check_tf_idf_source(char **newval, void **extra, GucSource source)
3473
{
35-
char *rawname;
36-
char *attname;
37-
List *namelist;
38-
Oid namespaceId;
39-
Oid relId;
40-
Relation rel = NULL;
41-
TupleDesc tupDesc;
42-
AttrNumber attrno;
43-
int i;
74+
char *rawname;
75+
char *attname;
76+
List *namelist;
77+
Oid namespaceId;
78+
Oid relId;
79+
Relation rel = NULL;
80+
TupleDesc tupDesc;
81+
AttrNumber attrno;
82+
int i;
83+
RelAttrInfo *myextra;
4484

4585
/* Need a modifiable copy of string */
4686
rawname = pstrdup(*newval);
@@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
107147
if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID)
108148
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
109149

150+
myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo));
151+
myextra->relId = relId;
152+
myextra->attrno = attrno;
153+
*extra = (void *) myextra;
154+
110155
pfree(rawname);
111156
list_free(namelist);
112157
RelationClose(rel);
@@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
117162
void
118163
assign_tf_idf_source(const char *newval, void *extra)
119164
{
165+
RelAttrInfo *myextra = (RelAttrInfo *) extra;
166+
167+
TFIDFSourceParsed = *myextra;
168+
forget_tf_idf_stats();
169+
}
170+
171+
static void
172+
load_tf_idf_source(void)
173+
{
174+
HeapTuple statsTuple;
175+
AttStatsSlot sslot;
176+
MemoryContext oldContext;
177+
int i;
178+
179+
if (!TFIDFContext)
180+
TFIDFContext = AllocSetContextCreate(TopMemoryContext,
181+
"Memory context for TF/IDF statistics",
182+
ALLOCSET_DEFAULT_SIZES);
183+
184+
statsTuple = SearchSysCache3(STATRELATTINH,
185+
ObjectIdGetDatum(TFIDFSourceParsed.relId),
186+
Int16GetDatum(TFIDFSourceParsed.attrno),
187+
BoolGetDatum(true));
188+
189+
if (!statsTuple)
190+
statsTuple = SearchSysCache3(STATRELATTINH,
191+
ObjectIdGetDatum(TFIDFSourceParsed.relId),
192+
Int16GetDatum(TFIDFSourceParsed.attrno),
193+
BoolGetDatum(false));
194+
195+
MemoryContextReset(TFIDFContext);
196+
TDIDFLoaded = false;
197+
198+
oldContext = MemoryContextSwitchTo(TFIDFContext);
199+
200+
if (!statsTuple
201+
|| !get_attstatsslot(&sslot, statsTuple,
202+
STATISTIC_KIND_MCELEM, InvalidOid,
203+
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)
204+
|| sslot.nnumbers != sslot.nvalues + 2)
205+
{
206+
ereport(ERROR,
207+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
208+
errmsg("statistics for TD/IDF is not found"),
209+
errhint("consider running ANALYZE")));
210+
}
211+
212+
TDIDFStats.nmcelem = sslot.nvalues;
213+
TDIDFStats.minfreq = sslot.numbers[sslot.nnumbers - 2];
214+
/*
215+
* Transpose the data into a single array so we can use bsearch().
216+
*/
217+
TDIDFStats.lookup = (TextFreq *) palloc(sizeof(TextFreq) * TDIDFStats.nmcelem);
218+
for (i = 0; i < TDIDFStats.nmcelem; i++)
219+
{
220+
/*
221+
* The text Datums came from an array, so it cannot be compressed or
222+
* stored out-of-line -- it's safe to use VARSIZE_ANY*.
223+
*/
224+
Assert(!VARATT_IS_COMPRESSED(sslot.values[i]) && !VARATT_IS_EXTERNAL(sslot.values[i]));
225+
TDIDFStats.lookup[i].element = (text *) DatumGetPointer(sslot.values[i]);
226+
TDIDFStats.lookup[i].frequency = sslot.numbers[i];
227+
}
120228

121-
}
229+
MemoryContextSwitchTo(oldContext);
230+
231+
ReleaseSysCache(statsTuple);
232+
}
233+
234+
static void
235+
check_load_tf_idf_source(void)
236+
{
237+
if (!TDIDFLoaded)
238+
load_tf_idf_source();
239+
}
240+
241+
static void
242+
forget_tf_idf_stats(void)
243+
{
244+
MemoryContextReset(TFIDFContext);
245+
TDIDFLoaded = false;
246+
}
247+
248+
/*
249+
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
250+
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
251+
* how ANALYZE code sorted data before storing it in a statistic tuple.
252+
* See ts_typanalyze.c for details.
253+
*/
254+
static int
255+
compare_lexeme_textfreq(const void *e1, const void *e2)
256+
{
257+
const LexemeKey *key = (const LexemeKey *) e1;
258+
const TextFreq *t = (const TextFreq *) e2;
259+
int len1,
260+
len2;
261+
262+
len1 = key->length;
263+
len2 = VARSIZE_ANY_EXHDR(t->element);
264+
265+
/* Compare lengths first, possibly avoiding a strncmp call */
266+
if (len1 > len2)
267+
return 1;
268+
else if (len1 < len2)
269+
return -1;
270+
271+
/* Fall back on byte-for-byte comparison */
272+
return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
273+
}
274+
275+
float4
276+
estimate_idf(char *lexeme, int length)
277+
{
278+
TextFreq *searchres;
279+
LexemeKey key;
280+
float4 selec;
281+
282+
check_load_tf_idf_source();
283+
284+
key.lexeme = lexeme;
285+
key.length = length;
286+
287+
searchres = (TextFreq *) bsearch(&key, TDIDFStats.lookup, TDIDFStats.nmcelem,
288+
sizeof(TextFreq),
289+
compare_lexeme_textfreq);
290+
291+
if (searchres)
292+
{
293+
/*
294+
* The element is in MCELEM. Return precise selectivity (or
295+
* at least as precise as ANALYZE could find out).
296+
*/
297+
selec = searchres->frequency;
298+
}
299+
else
300+
{
301+
/*
302+
* The element is not in MCELEM. Punt, but assume that the
303+
* selectivity cannot be more than minfreq / 2.
304+
*/
305+
selec = TDIDFStats.minfreq / 2;
306+
}
307+
308+
return 1.0f / selec;
309+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy