Skip to content

Commit 97532f7

Browse files
committed
Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
bright, but it beats assuming that a prefix match behaves identically to an exact match, which is what the code was doing before :-(. Noted while experimenting with Artur Dobrowski's example.
1 parent d4fe61b commit 97532f7

File tree

1 file changed

+108
-54
lines changed

1 file changed

+108
-54
lines changed

src/backend/tsearch/ts_selfuncs.c

Lines changed: 108 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
257257
*
258258
* 1 - select(oper) in NOT nodes
259259
*
260-
* freq[val] in VAL nodes, if the value is in MCELEM
260+
* histogram-based estimation in prefix VAL nodes
261+
*
262+
* freq[val] in exact VAL nodes, if the value is in MCELEM
261263
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
262264
*
263265
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
264266
* binary search for determining freq[MCELEM].
265267
*
266268
* If we don't have stats for the tsvector, we still use this logic,
267-
* except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case
268-
* is signaled by lookup == NULL.
269+
* except we use default estimates for VAL nodes. This case is signaled
270+
* by lookup == NULL.
269271
*/
270272
static Selectivity
271273
tsquery_opr_selec(QueryItem *item, char *operand,
272274
TextFreq *lookup, int length, float4 minfreq)
273275
{
274-
LexemeKey key;
275-
TextFreq *searchres;
276-
Selectivity selec,
277-
s1,
278-
s2;
276+
Selectivity selec;
279277

280278
/* since this function recurses, it could be driven to stack overflow */
281279
check_stack_depth();
282280

283281
if (item->type == QI_VAL)
284282
{
285283
QueryOperand *oper = (QueryOperand *) item;
286-
287-
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
288-
if (lookup == NULL)
289-
return (Selectivity) DEFAULT_TS_MATCH_SEL;
284+
LexemeKey key;
290285

291286
/*
292287
* Prepare the key for bsearch().
293288
*/
294289
key.lexeme = operand + oper->distance;
295290
key.length = oper->length;
296291

297-
searchres = (TextFreq *) bsearch(&key, lookup, length,
298-
sizeof(TextFreq),
299-
compare_lexeme_textfreq);
300-
301-
if (searchres)
292+
if (oper->prefix)
302293
{
294+
/* Prefix match, ie the query item is lexeme:* */
295+
Selectivity matched,
296+
allmcvs;
297+
int i;
298+
299+
/*
300+
* Our strategy is to scan through the MCV list and add up the
301+
* frequencies of the ones that match the prefix, thereby
302+
* assuming that the MCVs are representative of the whole lexeme
303+
* population in this respect. Compare histogram_selectivity().
304+
*
305+
* This is only a good plan if we have a pretty fair number of
306+
* MCVs available; we set the threshold at 100. If no stats or
307+
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
308+
*/
309+
if (lookup == NULL || length < 100)
310+
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
311+
312+
matched = allmcvs = 0;
313+
for (i = 0; i < length; i++)
314+
{
315+
TextFreq *t = lookup + i;
316+
int tlen = VARSIZE_ANY_EXHDR(t->element);
317+
318+
if (tlen >= key.length &&
319+
strncmp(key.lexeme, VARDATA_ANY(t->element),
320+
key.length) == 0)
321+
matched += t->frequency;
322+
allmcvs += t->frequency;
323+
}
324+
325+
if (allmcvs > 0) /* paranoia about zero divide */
326+
selec = matched / allmcvs;
327+
else
328+
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
329+
303330
/*
304-
* The element is in MCELEM. Return precise selectivity (or at
305-
* least as precise as ANALYZE could find out).
331+
* In any case, never believe that a prefix match has selectivity
332+
* less than DEFAULT_TS_MATCH_SEL.
306333
*/
307-
return (Selectivity) searchres->frequency;
334+
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
308335
}
309336
else
310337
{
311-
/*
312-
* The element is not in MCELEM. Punt, but assume that the
313-
* selectivity cannot be more than minfreq / 2.
314-
*/
315-
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
338+
/* Regular exact lexeme match */
339+
TextFreq *searchres;
340+
341+
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
342+
if (lookup == NULL)
343+
return (Selectivity) DEFAULT_TS_MATCH_SEL;
344+
345+
searchres = (TextFreq *) bsearch(&key, lookup, length,
346+
sizeof(TextFreq),
347+
compare_lexeme_textfreq);
348+
349+
if (searchres)
350+
{
351+
/*
352+
* The element is in MCELEM. Return precise selectivity (or
353+
* at least as precise as ANALYZE could find out).
354+
*/
355+
selec = searchres->frequency;
356+
}
357+
else
358+
{
359+
/*
360+
* The element is not in MCELEM. Punt, but assume that the
361+
* selectivity cannot be more than minfreq / 2.
362+
*/
363+
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
364+
}
316365
}
317366
}
318-
319-
/* Current TSQuery node is an operator */
320-
switch (item->qoperator.oper)
367+
else
321368
{
322-
case OP_NOT:
323-
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
324-
lookup, length, minfreq);
325-
break;
326-
327-
case OP_AND:
328-
s1 = tsquery_opr_selec(item + 1, operand,
329-
lookup, length, minfreq);
330-
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
331-
lookup, length, minfreq);
332-
selec = s1 * s2;
333-
break;
334-
335-
case OP_OR:
336-
s1 = tsquery_opr_selec(item + 1, operand,
337-
lookup, length, minfreq);
338-
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
339-
lookup, length, minfreq);
340-
selec = s1 + s2 - s1 * s2;
341-
break;
342-
343-
default:
344-
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
345-
selec = 0; /* keep compiler quiet */
346-
break;
369+
/* Current TSQuery node is an operator */
370+
Selectivity s1,
371+
s2;
372+
373+
switch (item->qoperator.oper)
374+
{
375+
case OP_NOT:
376+
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
377+
lookup, length, minfreq);
378+
break;
379+
380+
case OP_AND:
381+
s1 = tsquery_opr_selec(item + 1, operand,
382+
lookup, length, minfreq);
383+
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
384+
lookup, length, minfreq);
385+
selec = s1 * s2;
386+
break;
387+
388+
case OP_OR:
389+
s1 = tsquery_opr_selec(item + 1, operand,
390+
lookup, length, minfreq);
391+
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
392+
lookup, length, minfreq);
393+
selec = s1 + s2 - s1 * s2;
394+
break;
395+
396+
default:
397+
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
398+
selec = 0; /* keep compiler quiet */
399+
break;
400+
}
347401
}
348402

349403
/* Clamp intermediate results to stay sane despite roundoff error */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy