Skip to content

Commit c6fbe6d

Browse files
committed
Add selectivity estimation functions for intarray operators.
Uriy Zhuravlev and Alexander Korotkov, reviewed by Jeff Janes, some cleanup by me.
1 parent 4348738 commit c6fbe6d

File tree

7 files changed

+443
-17
lines changed

7 files changed

+443
-17
lines changed

contrib/intarray/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
MODULE_big = _int
44
OBJS = _int_bool.o _int_gist.o _int_op.o _int_tool.o \
5-
_intbig_gist.o _int_gin.o $(WIN32RES)
5+
_intbig_gist.o _int_gin.o _int_selfuncs.o $(WIN32RES)
66

77
EXTENSION = intarray
8-
DATA = intarray--1.0.sql intarray--unpackaged--1.0.sql
8+
DATA = intarray--1.1.sql intarray--1.0--1.1.sql intarray--unpackaged--1.0.sql
99
PGFILEDESC = "intarray - functions and operators for arrays of integers"
1010

1111
REGRESS = _int

contrib/intarray/_int_selfuncs.c

Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* _int_selfuncs.c
4+
* Functions for selectivity estimation of intarray operators
5+
*
6+
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
7+
* Portions Copyright (c) 1994, Regents of the University of California
8+
*
9+
*
10+
* IDENTIFICATION
11+
* contrib/intarray/_int_selfuncs.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
#include "postgres.h"
16+
#include "_int.h"
17+
18+
#include "access/htup_details.h"
19+
#include "catalog/pg_operator.h"
20+
#include "catalog/pg_statistic.h"
21+
#include "catalog/pg_type.h"
22+
#include "utils/selfuncs.h"
23+
#include "utils/syscache.h"
24+
#include "utils/lsyscache.h"
25+
#include "miscadmin.h"
26+
27+
PG_FUNCTION_INFO_V1(_int_overlap_sel);
28+
PG_FUNCTION_INFO_V1(_int_contains_sel);
29+
PG_FUNCTION_INFO_V1(_int_contained_sel);
30+
PG_FUNCTION_INFO_V1(_int_overlap_joinsel);
31+
PG_FUNCTION_INFO_V1(_int_contains_joinsel);
32+
PG_FUNCTION_INFO_V1(_int_contained_joinsel);
33+
PG_FUNCTION_INFO_V1(_int_matchsel);
34+
35+
Datum _int_overlap_sel(PG_FUNCTION_ARGS);
36+
Datum _int_contains_sel(PG_FUNCTION_ARGS);
37+
Datum _int_contained_sel(PG_FUNCTION_ARGS);
38+
Datum _int_overlap_joinsel(PG_FUNCTION_ARGS);
39+
Datum _int_contains_joinsel(PG_FUNCTION_ARGS);
40+
Datum _int_contained_joinsel(PG_FUNCTION_ARGS);
41+
Datum _int_matchsel(PG_FUNCTION_ARGS);
42+
43+
44+
static Selectivity int_query_opr_selec(ITEM *item, Datum *values, float4 *freqs,
45+
int nmncelems, float4 minfreq);
46+
static int compare_val_int4(const void *a, const void *b);
47+
48+
/*
49+
* Wrappers around the default array selectivity estimation functions.
50+
*
51+
* The default array selectivity operators for the @>, && and @< operators
52+
* work fine for integer arrays. However, if we tried to just use arraycontsel
53+
* and arracontjoinsel directly as the cost estimator functions for our
54+
* operators, they would not work as intended, because they look at the
55+
* operator's OID. Our operators behave exactly like the built-in anyarray
56+
* versions, but we must tell the cost estimator functions which built-in
57+
* operators they correspond to. These wrappers just replace the operator
58+
* OID with the corresponding built-in operator's OID, and call the built-in
59+
* function.
60+
*/
61+
62+
Datum
63+
_int_overlap_sel(PG_FUNCTION_ARGS)
64+
{
65+
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
66+
PG_GETARG_DATUM(0),
67+
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
68+
PG_GETARG_DATUM(2),
69+
PG_GETARG_DATUM(3)));
70+
}
71+
72+
Datum
73+
_int_contains_sel(PG_FUNCTION_ARGS)
74+
{
75+
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
76+
PG_GETARG_DATUM(0),
77+
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
78+
PG_GETARG_DATUM(2),
79+
PG_GETARG_DATUM(3)));
80+
}
81+
82+
Datum
83+
_int_contained_sel(PG_FUNCTION_ARGS)
84+
{
85+
PG_RETURN_DATUM(DirectFunctionCall4(arraycontsel,
86+
PG_GETARG_DATUM(0),
87+
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
88+
PG_GETARG_DATUM(2),
89+
PG_GETARG_DATUM(3)));
90+
}
91+
92+
Datum
93+
_int_overlap_joinsel(PG_FUNCTION_ARGS)
94+
{
95+
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
96+
PG_GETARG_DATUM(0),
97+
ObjectIdGetDatum(OID_ARRAY_OVERLAP_OP),
98+
PG_GETARG_DATUM(2),
99+
PG_GETARG_DATUM(3),
100+
PG_GETARG_DATUM(4)));
101+
}
102+
103+
Datum
104+
_int_contains_joinsel(PG_FUNCTION_ARGS)
105+
{
106+
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
107+
PG_GETARG_DATUM(0),
108+
ObjectIdGetDatum(OID_ARRAY_CONTAINS_OP),
109+
PG_GETARG_DATUM(2),
110+
PG_GETARG_DATUM(3),
111+
PG_GETARG_DATUM(4)));
112+
}
113+
114+
Datum
115+
_int_contained_joinsel(PG_FUNCTION_ARGS)
116+
{
117+
PG_RETURN_DATUM(DirectFunctionCall5(arraycontjoinsel,
118+
PG_GETARG_DATUM(0),
119+
ObjectIdGetDatum(OID_ARRAY_CONTAINED_OP),
120+
PG_GETARG_DATUM(2),
121+
PG_GETARG_DATUM(3),
122+
PG_GETARG_DATUM(4)));
123+
}
124+
125+
126+
/*
127+
* _int_matchsel -- restriction selectivity function for intarray @@ query_int
128+
*/
129+
Datum
130+
_int_matchsel(PG_FUNCTION_ARGS)
131+
{
132+
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
133+
134+
List *args = (List *) PG_GETARG_POINTER(2);
135+
int varRelid = PG_GETARG_INT32(3);
136+
VariableStatData vardata;
137+
Node *other;
138+
bool varonleft;
139+
Selectivity selec;
140+
QUERYTYPE *query;
141+
Datum *mcelems = NULL;
142+
float4 *mcefreqs = NULL;
143+
int nmcelems = 0;
144+
float4 minfreq = 0.0;
145+
float4 nullfrac = 0.0;
146+
Form_pg_statistic stats;
147+
Datum *values = NULL;
148+
int nvalues = 0;
149+
float4 *numbers = NULL;
150+
int nnumbers = 0;
151+
152+
/*
153+
* If expression is not "variable @@ something" or "something @@ variable"
154+
* then punt and return a default estimate.
155+
*/
156+
if (!get_restriction_variable(root, args, varRelid,
157+
&vardata, &other, &varonleft))
158+
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
159+
160+
/*
161+
* Variable should be int[]. We don't support cases where variable is
162+
* query_int.
163+
*/
164+
if (vardata.vartype != INT4ARRAYOID)
165+
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
166+
167+
/*
168+
* Can't do anything useful if the something is not a constant, either.
169+
*/
170+
if (!IsA(other, Const))
171+
{
172+
ReleaseVariableStats(vardata);
173+
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
174+
}
175+
176+
/*
177+
* The "@@" operator is strict, so we can cope with NULL right away.
178+
*/
179+
if (((Const *) other)->constisnull)
180+
{
181+
ReleaseVariableStats(vardata);
182+
PG_RETURN_FLOAT8(0.0);
183+
}
184+
185+
/* The caller made sure the const is a query, so get it now */
186+
query = DatumGetQueryTypeP(((Const *) other)->constvalue);
187+
188+
/* Empty query matches nothing */
189+
if (query->size == 0)
190+
{
191+
ReleaseVariableStats(vardata);
192+
return (Selectivity) 0.0;
193+
}
194+
195+
/*
196+
* Get the statistics for the intarray column.
197+
*
198+
* We're interested in the Most-Common-Elements list, and the NULL
199+
* fraction.
200+
*/
201+
if (HeapTupleIsValid(vardata.statsTuple))
202+
{
203+
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
204+
nullfrac = stats->stanullfrac;
205+
206+
/*
207+
* For an int4 array, the default array type analyze function will
208+
* collect a Most Common Elements list, which is an array of int4s.
209+
*/
210+
if (get_attstatsslot(vardata.statsTuple,
211+
INT4OID, -1,
212+
STATISTIC_KIND_MCELEM, InvalidOid,
213+
NULL,
214+
&values, &nvalues,
215+
&numbers, &nnumbers))
216+
{
217+
/*
218+
* There should be three more Numbers than Values, because the
219+
* last three (for intarray) cells are taken for minimal, maximal
220+
* and nulls frequency. Punt if not.
221+
*/
222+
if (nnumbers == nvalues + 3)
223+
{
224+
/* Grab the lowest frequency. */
225+
minfreq = numbers[nnumbers - (nnumbers - nvalues)];
226+
227+
mcelems = values;
228+
mcefreqs = numbers;
229+
nmcelems = nvalues;
230+
}
231+
}
232+
}
233+
234+
/* Process the logical expression in the query, using the stats */
235+
selec = int_query_opr_selec(GETQUERY(query) + query->size - 1,
236+
mcelems, mcefreqs, nmcelems, minfreq);
237+
238+
/* MCE stats count only non-null rows, so adjust for null rows. */
239+
selec *= (1.0 - nullfrac);
240+
241+
free_attstatsslot(INT4OID, values, nvalues, numbers, nnumbers);
242+
ReleaseVariableStats(vardata);
243+
244+
CLAMP_PROBABILITY(selec);
245+
246+
PG_RETURN_FLOAT8((float8) selec);
247+
}
248+
249+
/*
250+
* Estimate selectivity of single intquery operator
251+
*/
252+
static Selectivity
253+
int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
254+
int nmcelems, float4 minfreq)
255+
{
256+
Selectivity selec;
257+
258+
/* since this function recurses, it could be driven to stack overflow */
259+
check_stack_depth();
260+
261+
if (item->type == VAL)
262+
{
263+
Datum *searchres;
264+
265+
if (mcelems == NULL)
266+
return (Selectivity) DEFAULT_EQ_SEL;
267+
268+
searchres = (Datum *) bsearch(&item->val, mcelems, nmcelems,
269+
sizeof(Datum), compare_val_int4);
270+
if (searchres)
271+
{
272+
/*
273+
* The element is in MCELEM. Return precise selectivity (or at
274+
* least as precise as ANALYZE could find out).
275+
*/
276+
selec = mcefreqs[searchres - mcelems];
277+
}
278+
else
279+
{
280+
/*
281+
* The element is not in MCELEM. Punt, but assume that the
282+
* selectivity cannot be more than minfreq / 2.
283+
*/
284+
selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
285+
}
286+
}
287+
else if (item->type == OPR)
288+
{
289+
/* Current query node is an operator */
290+
Selectivity s1,
291+
s2;
292+
293+
s1 = int_query_opr_selec(item - 1, mcelems, mcefreqs, nmcelems,
294+
minfreq);
295+
switch (item->val)
296+
{
297+
case (int32) '!':
298+
selec = 1.0 - s1;
299+
break;
300+
301+
case (int32) '&':
302+
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
303+
nmcelems, minfreq);
304+
selec = s1 * s2;
305+
break;
306+
307+
case (int32) '|':
308+
s2 = int_query_opr_selec(item + item->left, mcelems, mcefreqs,
309+
nmcelems, minfreq);
310+
selec = s1 + s2 - s1 * s2;
311+
break;
312+
313+
default:
314+
elog(ERROR, "unrecognized operator: %d", item->val);
315+
selec = 0; /* keep compiler quiet */
316+
break;
317+
}
318+
}
319+
else
320+
{
321+
elog(ERROR, "unrecognized int query item type: %u", item->type);
322+
selec = 0; /* keep compiler quiet */
323+
}
324+
325+
/* Clamp intermediate results to stay sane despite roundoff error */
326+
CLAMP_PROBABILITY(selec);
327+
328+
return selec;
329+
}
330+
331+
/*
332+
* Comparison function for binary search in mcelem array.
333+
*/
334+
static int
335+
compare_val_int4(const void *a, const void *b)
336+
{
337+
int32 key = *(int32 *) a;
338+
const Datum *t = (const Datum *) b;
339+
340+
return key - DatumGetInt32(*t);
341+
}

contrib/intarray/expected/_int.out

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,7 @@ SELECT '1&(2&(4&(5|!6)))'::query_int;
368368

369369
CREATE TABLE test__int( a int[] );
370370
\copy test__int from 'data/test__int.data'
371+
ANALYZE test__int;
371372
SELECT count(*) from test__int WHERE a && '{23,50}';
372373
count
373374
-------

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy