Skip to content

Commit 4e57668

Browse files
committed
Create a selectivity estimation function for the text search @@ operator.
Jan Urbanski
1 parent e2b7d0c commit 4e57668

File tree

9 files changed

+463
-30
lines changed

9 files changed

+463
-30
lines changed

doc/src/sgml/catalogs.sgml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.174 2008/09/15 18:43:41 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.175 2008/09/19 19:03:40 tgl Exp $ -->
22
<!--
33
Documentation of the system catalogs, directed toward PostgreSQL developers
44
-->
@@ -6664,6 +6664,9 @@
66646664
A list of the frequencies of the most common values or elements,
66656665
i.e., number of occurrences of each divided by total number of rows.
66666666
(NULL when <structfield>most_common_vals</structfield> is.)
6667+
For some datatypes such as <type>tsvector</>, it can also store some
6668+
additional information, making it longer than the
6669+
<structfield>most_common_vals</> array.
66676670
</entry>
66686671
</row>
66696672

src/backend/tsearch/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# Copyright (c) 2006-2008, PostgreSQL Global Development Group
66
#
7-
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.7 2008/07/14 00:51:45 tgl Exp $
7+
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.8 2008/09/19 19:03:40 tgl Exp $
88
#
99
#-------------------------------------------------------------------------
1010
subdir = src/backend/tsearch
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
1919
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
2020
dict_simple.o dict_synonym.o dict_thesaurus.o \
2121
dict_ispell.o regis.o spell.o \
22-
to_tsany.o ts_typanalyze.o ts_utils.o
22+
to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o
2323

2424
include $(top_srcdir)/src/backend/common.mk
2525

src/backend/tsearch/ts_selfuncs.c

Lines changed: 363 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* ts_selfuncs.c
4+
* Selectivity estimation functions for text search operators.
5+
*
6+
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7+
*
8+
*
9+
* IDENTIFICATION
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.1 2008/09/19 19:03:40 tgl Exp $
11+
*
12+
*-------------------------------------------------------------------------
13+
*/
14+
#include "postgres.h"
15+
16+
#include "catalog/pg_statistic.h"
17+
#include "catalog/pg_type.h"
18+
#include "miscadmin.h"
19+
#include "nodes/nodes.h"
20+
#include "tsearch/ts_type.h"
21+
#include "utils/lsyscache.h"
22+
#include "utils/selfuncs.h"
23+
#include "utils/syscache.h"
24+
25+
26+
/*
27+
* The default text search selectivity is chosen to be small enough to
28+
* encourage indexscans for typical table densities. See selfuncs.h and
29+
* DEFAULT_EQ_SEL for details.
30+
*/
31+
#define DEFAULT_TS_MATCH_SEL 0.005
32+
33+
/* lookup table type for binary searching through MCELEMs */
34+
typedef struct
35+
{
36+
text *element;
37+
float4 frequency;
38+
} TextFreq;
39+
40+
/* type of keys for bsearch'ing through an array of TextFreqs */
41+
typedef struct
42+
{
43+
char *lexeme;
44+
int length;
45+
} LexemeKey;
46+
47+
static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
48+
static Selectivity mcelem_tsquery_selec(TSQuery query,
49+
Datum *mcelem, int nmcelem,
50+
float4 *numbers, int nnumbers);
51+
static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
52+
TextFreq *lookup, int length, float4 minfreq);
53+
static int compare_lexeme_textfreq(const void *e1, const void *e2);
54+
55+
56+
/*
57+
* tsmatchsel -- Selectivity of "@@"
58+
*
59+
* restriction selectivity function for tsvector @@ tsquery and
60+
* tsquery @@ tsvector
61+
*/
62+
Datum
63+
tsmatchsel(PG_FUNCTION_ARGS)
64+
{
65+
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
66+
#ifdef NOT_USED
67+
Oid operator = PG_GETARG_OID(1);
68+
#endif
69+
List *args = (List *) PG_GETARG_POINTER(2);
70+
int varRelid = PG_GETARG_INT32(3);
71+
VariableStatData vardata;
72+
Node *other;
73+
bool varonleft;
74+
Selectivity selec;
75+
76+
/*
77+
* If expression is not variable = something or something = variable, then
78+
* punt and return a default estimate.
79+
*/
80+
if (!get_restriction_variable(root, args, varRelid,
81+
&vardata, &other, &varonleft))
82+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
83+
84+
/*
85+
* Can't do anything useful if the something is not a constant, either.
86+
*/
87+
if (!IsA(other, Const))
88+
{
89+
ReleaseVariableStats(vardata);
90+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
91+
}
92+
93+
/*
94+
* The "@@" operator is strict, so we can cope with NULL right away
95+
*/
96+
if (((Const *) other)->constisnull)
97+
{
98+
ReleaseVariableStats(vardata);
99+
PG_RETURN_FLOAT8(0.0);
100+
}
101+
102+
/*
103+
* OK, there's a Var and a Const we're dealing with here. We need the Var
104+
* to be a TSVector (or else we don't have any useful statistic for it).
105+
* We have to check this because the Var might be the TSQuery not the
106+
* TSVector.
107+
*/
108+
if (vardata.vartype == TSVECTOROID)
109+
{
110+
/* tsvector @@ tsquery or the other way around */
111+
Assert(((Const *) other)->consttype == TSQUERYOID);
112+
113+
selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
114+
}
115+
else
116+
{
117+
/* The Var is something we don't have useful statistics for */
118+
selec = DEFAULT_TS_MATCH_SEL;
119+
}
120+
121+
ReleaseVariableStats(vardata);
122+
123+
CLAMP_PROBABILITY(selec);
124+
125+
PG_RETURN_FLOAT8((float8) selec);
126+
}
127+
128+
129+
/*
130+
* tsmatchjoinsel -- join selectivity of "@@"
131+
*
132+
* join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
133+
*/
134+
Datum
135+
tsmatchjoinsel(PG_FUNCTION_ARGS)
136+
{
137+
/* for the moment we just punt */
138+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
139+
}
140+
141+
142+
/*
143+
* @@ selectivity for tsvector var vs tsquery constant
144+
*/
145+
static Selectivity
146+
tsquerysel(VariableStatData *vardata, Datum constval)
147+
{
148+
Selectivity selec;
149+
150+
if (HeapTupleIsValid(vardata->statsTuple))
151+
{
152+
TSQuery query;
153+
Form_pg_statistic stats;
154+
Datum *values;
155+
int nvalues;
156+
float4 *numbers;
157+
int nnumbers;
158+
159+
/* The caller made sure the const is a TSQuery, so get it now */
160+
query = DatumGetTSQuery(constval);
161+
162+
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
163+
164+
/* MCELEM will be an array of TEXT elements for a tsvector column */
165+
if (get_attstatsslot(vardata->statsTuple,
166+
TEXTOID, -1,
167+
STATISTIC_KIND_MCELEM, InvalidOid,
168+
&values, &nvalues,
169+
&numbers, &nnumbers))
170+
{
171+
/*
172+
* There is a most-common-elements slot for the tsvector Var, so
173+
* use that.
174+
*/
175+
selec = mcelem_tsquery_selec(query, values, nvalues,
176+
numbers, nnumbers);
177+
free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
178+
}
179+
else
180+
{
181+
/* No most-common-elements info, so we must punt */
182+
selec = (Selectivity) DEFAULT_TS_MATCH_SEL;
183+
}
184+
}
185+
else
186+
{
187+
/* No stats at all, so we must punt */
188+
selec = (Selectivity) DEFAULT_TS_MATCH_SEL;
189+
}
190+
191+
return selec;
192+
}
193+
194+
/*
195+
* Extract data from the pg_statistic arrays into useful format.
196+
*/
197+
static Selectivity
198+
mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
199+
float4 *numbers, int nnumbers)
200+
{
201+
float4 minfreq;
202+
TextFreq *lookup;
203+
Selectivity selec;
204+
int i;
205+
206+
/*
207+
* There should be two more Numbers than Values, because the last two
208+
* cells are taken for minimal and maximal frequency. Punt if not.
209+
*/
210+
if (nnumbers != nmcelem + 2)
211+
return DEFAULT_TS_MATCH_SEL;
212+
213+
/*
214+
* Transpose the data into a single array so we can use bsearch().
215+
*/
216+
lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
217+
for (i = 0; i < nmcelem; i++)
218+
{
219+
/*
220+
* The text Datums came from an array, so it cannot be compressed
221+
* or stored out-of-line -- it's safe to use VARSIZE_ANY*.
222+
*/
223+
Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
224+
lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
225+
lookup[i].frequency = numbers[i];
226+
}
227+
228+
/*
229+
* Grab the lowest frequency. compute_tsvector_stats() stored it for us in
230+
* the one before the last cell of the Numbers array. See ts_typanalyze.c
231+
*/
232+
minfreq = numbers[nnumbers - 2];
233+
234+
selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
235+
nmcelem, minfreq);
236+
237+
pfree(lookup);
238+
239+
return selec;
240+
}
241+
242+
/*
243+
* Traverse the tsquery in preorder, calculating selectivity as:
244+
*
245+
* selec(left_oper) * selec(right_oper) in AND nodes,
246+
*
247+
* selec(left_oper) + selec(right_oper) -
248+
* selec(left_oper) * selec(right_oper) in OR nodes,
249+
*
250+
* 1 - select(oper) in NOT nodes
251+
*
252+
* freq[val] in VAL nodes, if the value is in MCELEM
253+
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
254+
*
255+
*
256+
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
257+
* binary search for determining freq[MCELEM].
258+
*/
259+
static Selectivity
260+
tsquery_opr_selec(QueryItem *item, char *operand,
261+
TextFreq *lookup, int length, float4 minfreq)
262+
{
263+
LexemeKey key;
264+
TextFreq *searchres;
265+
Selectivity selec, s1, s2;
266+
267+
/* since this function recurses, it could be driven to stack overflow */
268+
check_stack_depth();
269+
270+
if (item->type == QI_VAL)
271+
{
272+
QueryOperand *oper = (QueryOperand *) item;
273+
274+
/*
275+
* Prepare the key for bsearch().
276+
*/
277+
key.lexeme = operand + oper->distance;
278+
key.length = oper->length;
279+
280+
searchres = (TextFreq *) bsearch(&key, lookup, length,
281+
sizeof(TextFreq),
282+
compare_lexeme_textfreq);
283+
284+
if (searchres)
285+
{
286+
/*
287+
* The element is in MCELEM. Return precise selectivity (or at
288+
* least as precise as ANALYZE could find out).
289+
*/
290+
return (Selectivity) searchres->frequency;
291+
}
292+
else
293+
{
294+
/*
295+
* The element is not in MCELEM. Punt, but assert that the
296+
* selectivity cannot be more than minfreq / 2.
297+
*/
298+
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
299+
}
300+
}
301+
302+
/* Current TSQuery node is an operator */
303+
switch (item->operator.oper)
304+
{
305+
case OP_NOT:
306+
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
307+
lookup, length, minfreq);
308+
break;
309+
310+
case OP_AND:
311+
s1 = tsquery_opr_selec(item + 1, operand,
312+
lookup, length, minfreq);
313+
s2 = tsquery_opr_selec(item + item->operator.left, operand,
314+
lookup, length, minfreq);
315+
selec = s1 * s2;
316+
break;
317+
318+
case OP_OR:
319+
s1 = tsquery_opr_selec(item + 1, operand,
320+
lookup, length, minfreq);
321+
s2 = tsquery_opr_selec(item + item->operator.left, operand,
322+
lookup, length, minfreq);
323+
selec = s1 + s2 - s1 * s2;
324+
break;
325+
326+
default:
327+
elog(ERROR, "unrecognized operator: %d", item->operator.oper);
328+
selec = 0; /* keep compiler quiet */
329+
break;
330+
}
331+
332+
/* Clamp intermediate results to stay sane despite roundoff error */
333+
CLAMP_PROBABILITY(selec);
334+
335+
return selec;
336+
}
337+
338+
/*
339+
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
340+
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
341+
* how ANALYZE code sorted data before storing it in a statistic tuple.
342+
* See ts_typanalyze.c for details.
343+
*/
344+
static int
345+
compare_lexeme_textfreq(const void *e1, const void *e2)
346+
{
347+
const LexemeKey *key = (const LexemeKey *) e1;
348+
const TextFreq *t = (const TextFreq *) e2;
349+
int len1,
350+
len2;
351+
352+
len1 = key->length;
353+
len2 = VARSIZE_ANY_EXHDR(t->element);
354+
355+
/* Compare lengths first, possibly avoiding a strncmp call */
356+
if (len1 > len2)
357+
return 1;
358+
else if (len1 < len2)
359+
return -1;
360+
361+
/* Fall back on byte-for-byte comparison */
362+
return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
363+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy