Skip to content

Commit e80252d

Browse files
committed
Add width_bucket(anyelement, anyarray).
This provides a convenient method of classifying input values into buckets that are not necessarily equal-width. It works on any sortable data type. The choice of function name is a bit debatable, perhaps, but showing that there's a relationship to the SQL standard's width_bucket() function seems more attractive than the other proposals. Petr Jelinek, reviewed by Pavel Stehule
1 parent 220bb39 commit e80252d

File tree

7 files changed

+458
-12
lines changed

7 files changed

+458
-12
lines changed

doc/src/sgml/func.sgml

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -901,25 +901,40 @@
901901
<indexterm>
902902
<primary>width_bucket</primary>
903903
</indexterm>
904-
<literal><function>width_bucket(<parameter>op</parameter> <type>numeric</type>, <parameter>b1</parameter> <type>numeric</type>, <parameter>b2</parameter> <type>numeric</type>, <parameter>count</parameter> <type>int</type>)</function></literal>
905-
</entry>
904+
<literal><function>width_bucket(<parameter>operand</parameter> <type>dp</type>, <parameter>b1</parameter> <type>dp</type>, <parameter>b2</parameter> <type>dp</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
906905
<entry><type>int</type></entry>
907-
<entry>return the bucket to which <parameter>operand</> would
908-
be assigned in an equidepth histogram with <parameter>count</>
909-
buckets, in the range <parameter>b1</> to <parameter>b2</></entry>
906+
<entry>return the bucket number to which <parameter>operand</> would
907+
be assigned in a histogram having <parameter>count</> equal-width
908+
buckets spanning the range <parameter>b1</> to <parameter>b2</>;
909+
returns <literal>0</> or <literal><parameter>count</>+1</literal> for
910+
an input outside the range</entry>
910911
<entry><literal>width_bucket(5.35, 0.024, 10.06, 5)</literal></entry>
911912
<entry><literal>3</literal></entry>
912913
</row>
913914

914915
<row>
915-
<entry><literal><function>width_bucket(<parameter>op</parameter> <type>dp</type>, <parameter>b1</parameter> <type>dp</type>, <parameter>b2</parameter> <type>dp</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
916+
<entry><literal><function>width_bucket(<parameter>operand</parameter> <type>numeric</type>, <parameter>b1</parameter> <type>numeric</type>, <parameter>b2</parameter> <type>numeric</type>, <parameter>count</parameter> <type>int</type>)</function></literal></entry>
916917
<entry><type>int</type></entry>
917-
<entry>return the bucket to which <parameter>operand</> would
918-
be assigned in an equidepth histogram with <parameter>count</>
919-
buckets, in the range <parameter>b1</> to <parameter>b2</></entry>
918+
<entry>return the bucket number to which <parameter>operand</> would
919+
be assigned in a histogram having <parameter>count</> equal-width
920+
buckets spanning the range <parameter>b1</> to <parameter>b2</>;
921+
returns <literal>0</> or <literal><parameter>count</>+1</literal> for
922+
an input outside the range</entry>
920923
<entry><literal>width_bucket(5.35, 0.024, 10.06, 5)</literal></entry>
921924
<entry><literal>3</literal></entry>
922925
</row>
926+
927+
<row>
928+
<entry><literal><function>width_bucket(<parameter>operand</parameter> <type>anyelement</type>, <parameter>thresholds</parameter> <type>anyarray</type>)</function></literal></entry>
929+
<entry><type>int</type></entry>
930+
<entry>return the bucket number to which <parameter>operand</> would
931+
be assigned given an array listing the lower bounds of the buckets;
932+
returns <literal>0</> for an input less than the first lower bound;
933+
the <parameter>thresholds</> array <emphasis>must be sorted</>,
934+
smallest first, or unexpected results will be obtained</entry>
935+
<entry><literal>width_bucket(now(), array['yesterday', 'today', 'tomorrow']::timestamptz[])</literal></entry>
936+
<entry><literal>2</literal></entry>
937+
</row>
923938
</tbody>
924939
</tgroup>
925940
</table>

src/backend/utils/adt/arrayfuncs.c

Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
#include "postgres.h"
1616

1717
#include <ctype.h>
18+
#include <math.h>
1819

1920
#include "access/htup_details.h"
21+
#include "catalog/pg_type.h"
2022
#include "funcapi.h"
2123
#include "libpq/pqformat.h"
2224
#include "utils/array.h"
@@ -130,6 +132,15 @@ static ArrayType *array_replace_internal(ArrayType *array,
130132
Datum replace, bool replace_isnull,
131133
bool remove, Oid collation,
132134
FunctionCallInfo fcinfo);
135+
static int width_bucket_array_float8(Datum operand, ArrayType *thresholds);
136+
static int width_bucket_array_fixed(Datum operand,
137+
ArrayType *thresholds,
138+
Oid collation,
139+
TypeCacheEntry *typentry);
140+
static int width_bucket_array_variable(Datum operand,
141+
ArrayType *thresholds,
142+
Oid collation,
143+
TypeCacheEntry *typentry);
133144

134145

135146
/*
@@ -5502,3 +5513,235 @@ array_replace(PG_FUNCTION_ARGS)
55025513
fcinfo);
55035514
PG_RETURN_ARRAYTYPE_P(array);
55045515
}
5516+
5517+
/*
5518+
* Implements width_bucket(anyelement, anyarray).
5519+
*
5520+
* 'thresholds' is an array containing lower bound values for each bucket;
5521+
* these must be sorted from smallest to largest, or bogus results will be
5522+
* produced. If N thresholds are supplied, the output is from 0 to N:
5523+
* 0 is for inputs < first threshold, N is for inputs >= last threshold.
5524+
*/
5525+
Datum
5526+
width_bucket_array(PG_FUNCTION_ARGS)
5527+
{
5528+
Datum operand = PG_GETARG_DATUM(0);
5529+
ArrayType *thresholds = PG_GETARG_ARRAYTYPE_P(1);
5530+
Oid collation = PG_GET_COLLATION();
5531+
Oid element_type = ARR_ELEMTYPE(thresholds);
5532+
int result;
5533+
5534+
/* Check input */
5535+
if (ARR_NDIM(thresholds) > 1)
5536+
ereport(ERROR,
5537+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
5538+
errmsg("thresholds must be one-dimensional array")));
5539+
5540+
if (array_contains_nulls(thresholds))
5541+
ereport(ERROR,
5542+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5543+
errmsg("thresholds array must not contain NULLs")));
5544+
5545+
/* We have a dedicated implementation for float8 data */
5546+
if (element_type == FLOAT8OID)
5547+
result = width_bucket_array_float8(operand, thresholds);
5548+
else
5549+
{
5550+
TypeCacheEntry *typentry;
5551+
5552+
/* Cache information about the input type */
5553+
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
5554+
if (typentry == NULL ||
5555+
typentry->type_id != element_type)
5556+
{
5557+
typentry = lookup_type_cache(element_type,
5558+
TYPECACHE_CMP_PROC_FINFO);
5559+
if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
5560+
ereport(ERROR,
5561+
(errcode(ERRCODE_UNDEFINED_FUNCTION),
5562+
errmsg("could not identify a comparison function for type %s",
5563+
format_type_be(element_type))));
5564+
fcinfo->flinfo->fn_extra = (void *) typentry;
5565+
}
5566+
5567+
/*
5568+
* We have separate implementation paths for fixed- and variable-width
5569+
* types, since indexing the array is a lot cheaper in the first case.
5570+
*/
5571+
if (typentry->typlen > 0)
5572+
result = width_bucket_array_fixed(operand, thresholds,
5573+
collation, typentry);
5574+
else
5575+
result = width_bucket_array_variable(operand, thresholds,
5576+
collation, typentry);
5577+
}
5578+
5579+
/* Avoid leaking memory when handed toasted input. */
5580+
PG_FREE_IF_COPY(thresholds, 1);
5581+
5582+
PG_RETURN_INT32(result);
5583+
}
5584+
5585+
/*
5586+
* width_bucket_array for float8 data.
5587+
*/
5588+
static int
5589+
width_bucket_array_float8(Datum operand, ArrayType *thresholds)
5590+
{
5591+
float8 op = DatumGetFloat8(operand);
5592+
float8 *thresholds_data;
5593+
int left;
5594+
int right;
5595+
5596+
/*
5597+
* Since we know the array contains no NULLs, we can just index it
5598+
* directly.
5599+
*/
5600+
thresholds_data = (float8 *) ARR_DATA_PTR(thresholds);
5601+
5602+
left = 0;
5603+
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
5604+
5605+
/*
5606+
* If the probe value is a NaN, it's greater than or equal to all possible
5607+
* threshold values (including other NaNs), so we need not search. Note
5608+
* that this would give the same result as searching even if the array
5609+
* contains multiple NaNs (as long as they're correctly sorted), since the
5610+
* loop logic will find the rightmost of multiple equal threshold values.
5611+
*/
5612+
if (isnan(op))
5613+
return right;
5614+
5615+
/* Find the bucket */
5616+
while (left < right)
5617+
{
5618+
int mid = (left + right) / 2;
5619+
5620+
if (isnan(thresholds_data[mid]) || op < thresholds_data[mid])
5621+
right = mid;
5622+
else
5623+
left = mid + 1;
5624+
}
5625+
5626+
return left;
5627+
}
5628+
5629+
/*
5630+
* width_bucket_array for generic fixed-width data types.
5631+
*/
5632+
static int
5633+
width_bucket_array_fixed(Datum operand,
5634+
ArrayType *thresholds,
5635+
Oid collation,
5636+
TypeCacheEntry *typentry)
5637+
{
5638+
char *thresholds_data;
5639+
int typlen = typentry->typlen;
5640+
bool typbyval = typentry->typbyval;
5641+
FunctionCallInfoData locfcinfo;
5642+
int left;
5643+
int right;
5644+
5645+
/*
5646+
* Since we know the array contains no NULLs, we can just index it
5647+
* directly.
5648+
*/
5649+
thresholds_data = (char *) ARR_DATA_PTR(thresholds);
5650+
5651+
InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2,
5652+
collation, NULL, NULL);
5653+
5654+
/* Find the bucket */
5655+
left = 0;
5656+
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
5657+
while (left < right)
5658+
{
5659+
int mid = (left + right) / 2;
5660+
char *ptr;
5661+
int32 cmpresult;
5662+
5663+
ptr = thresholds_data + mid * typlen;
5664+
5665+
locfcinfo.arg[0] = operand;
5666+
locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen);
5667+
locfcinfo.argnull[0] = false;
5668+
locfcinfo.argnull[1] = false;
5669+
locfcinfo.isnull = false;
5670+
5671+
cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo));
5672+
5673+
if (cmpresult < 0)
5674+
right = mid;
5675+
else
5676+
left = mid + 1;
5677+
}
5678+
5679+
return left;
5680+
}
5681+
5682+
/*
5683+
* width_bucket_array for generic variable-width data types.
5684+
*/
5685+
static int
5686+
width_bucket_array_variable(Datum operand,
5687+
ArrayType *thresholds,
5688+
Oid collation,
5689+
TypeCacheEntry *typentry)
5690+
{
5691+
char *thresholds_data;
5692+
int typlen = typentry->typlen;
5693+
bool typbyval = typentry->typbyval;
5694+
char typalign = typentry->typalign;
5695+
FunctionCallInfoData locfcinfo;
5696+
int left;
5697+
int right;
5698+
5699+
thresholds_data = (char *) ARR_DATA_PTR(thresholds);
5700+
5701+
InitFunctionCallInfoData(locfcinfo, &typentry->cmp_proc_finfo, 2,
5702+
collation, NULL, NULL);
5703+
5704+
/* Find the bucket */
5705+
left = 0;
5706+
right = ArrayGetNItems(ARR_NDIM(thresholds), ARR_DIMS(thresholds));
5707+
while (left < right)
5708+
{
5709+
int mid = (left + right) / 2;
5710+
char *ptr;
5711+
int i;
5712+
int32 cmpresult;
5713+
5714+
/* Locate mid'th array element by advancing from left element */
5715+
ptr = thresholds_data;
5716+
for (i = left; i < mid; i++)
5717+
{
5718+
ptr = att_addlength_pointer(ptr, typlen, ptr);
5719+
ptr = (char *) att_align_nominal(ptr, typalign);
5720+
}
5721+
5722+
locfcinfo.arg[0] = operand;
5723+
locfcinfo.arg[1] = fetch_att(ptr, typbyval, typlen);
5724+
locfcinfo.argnull[0] = false;
5725+
locfcinfo.argnull[1] = false;
5726+
locfcinfo.isnull = false;
5727+
5728+
cmpresult = DatumGetInt32(FunctionCallInvoke(&locfcinfo));
5729+
5730+
if (cmpresult < 0)
5731+
right = mid;
5732+
else
5733+
{
5734+
left = mid + 1;
5735+
5736+
/*
5737+
* Move the thresholds pointer to match new "left" index, so we
5738+
* don't have to seek over those elements again. This trick
5739+
* ensures we do only O(N) array indexing work, not O(N^2).
5740+
*/
5741+
ptr = att_addlength_pointer(ptr, typlen, ptr);
5742+
thresholds_data = (char *) att_align_nominal(ptr, typalign);
5743+
}
5744+
}
5745+
5746+
return left;
5747+
}

src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 201408281
56+
#define CATALOG_VERSION_NO 201409091
5757

5858
#endif

src/include/catalog/pg_proc.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -514,7 +514,7 @@ DATA(insert OID = 308 ( float84le PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0
514514
DATA(insert OID = 309 ( float84gt PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84gt _null_ _null_ _null_ ));
515515
DATA(insert OID = 310 ( float84ge PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "701 700" _null_ _null_ _null_ _null_ float84ge _null_ _null_ _null_ ));
516516
DATA(insert OID = 320 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "701 701 701 23" _null_ _null_ _null_ _null_ width_bucket_float8 _null_ _null_ _null_ ));
517-
DESCR("bucket number of operand in equidepth histogram");
517+
DESCR("bucket number of operand in equal-width histogram");
518518

519519
DATA(insert OID = 311 ( float8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 701 "700" _null_ _null_ _null_ _null_ ftod _null_ _null_ _null_ ));
520520
DESCR("convert float4 to float8");
@@ -885,6 +885,8 @@ DATA(insert OID = 2334 ( array_agg_finalfn PGNSP PGUID 12 1 0 0 0 f f f f f f
885885
DESCR("aggregate final function");
886886
DATA(insert OID = 2335 ( array_agg PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
887887
DESCR("concatenate aggregate input into an array");
888+
DATA(insert OID = 3218 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "2283 2277" _null_ _null_ _null_ _null_ width_bucket_array _null_ _null_ _null_ ));
889+
DESCR("bucket number of operand given a sorted array of bucket lower bounds");
888890
DATA(insert OID = 3816 ( array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ ));
889891
DESCR("array typanalyze");
890892
DATA(insert OID = 3817 ( arraycontsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ arraycontsel _null_ _null_ _null_ ));
@@ -2301,7 +2303,7 @@ DESCR("trunc(x/y)");
23012303
DATA(insert OID = 1980 ( numeric_div_trunc PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1700 "1700 1700" _null_ _null_ _null_ _null_ numeric_div_trunc _null_ _null_ _null_ ));
23022304
DESCR("trunc(x/y)");
23032305
DATA(insert OID = 2170 ( width_bucket PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 23 "1700 1700 1700 23" _null_ _null_ _null_ _null_ width_bucket_numeric _null_ _null_ _null_ ));
2304-
DESCR("bucket number of operand in equidepth histogram");
2306+
DESCR("bucket number of operand in equal-width histogram");
23052307

23062308
DATA(insert OID = 1747 ( time_pl_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_pl_interval _null_ _null_ _null_ ));
23072309
DATA(insert OID = 1748 ( time_mi_interval PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 1083 "1083 1186" _null_ _null_ _null_ _null_ time_mi_interval _null_ _null_ _null_ ));

src/include/utils/array.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ extern Datum array_fill_with_lower_bounds(PG_FUNCTION_ARGS);
214214
extern Datum array_unnest(PG_FUNCTION_ARGS);
215215
extern Datum array_remove(PG_FUNCTION_ARGS);
216216
extern Datum array_replace(PG_FUNCTION_ARGS);
217+
extern Datum width_bucket_array(PG_FUNCTION_ARGS);
217218

218219
extern Datum array_ref(ArrayType *array, int nSubscripts, int *indx,
219220
int arraytyplen, int elmlen, bool elmbyval, char elmalign,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy