Skip to content

Commit b525bf7

Browse files
committed
Add KNNGIST support to contrib/pg_trgm.
Teodor Sigaev, with some revision by Tom
1 parent b576757 commit b525bf7

File tree

9 files changed

+213
-42
lines changed

9 files changed

+213
-42
lines changed

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,6 +1187,13 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
11871187
qwertyu0988 | 0.333333
11881188
(1 row)
11891189

1190+
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
1191+
?column? | t
1192+
----------+-------------
1193+
0.411765 | qwertyu0988
1194+
0.5 | qwertyu0987
1195+
(2 rows)
1196+
11901197
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
11911198
set enable_seqscan=off;
11921199
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2315,6 +2322,22 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
23152322
qwertyu0988 | 0.333333
23162323
(1 row)
23172324

2325+
explain (costs off)
2326+
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
2327+
QUERY PLAN
2328+
---------------------------------------------------
2329+
Limit
2330+
-> Index Scan using trgm_idx on test_trgm
2331+
Order By: (t <-> 'q0987wertyu0988'::text)
2332+
(3 rows)
2333+
2334+
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
2335+
?column? | t
2336+
----------+-------------
2337+
0.411765 | qwertyu0988
2338+
0.5 | qwertyu0987
2339+
(2 rows)
2340+
23182341
drop index trgm_idx;
23192342
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
23202343
set enable_seqscan=off;

contrib/pg_trgm/pg_trgm.sql.in

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ LANGUAGE C STRICT IMMUTABLE;
2626
CREATE OR REPLACE FUNCTION similarity_op(text,text)
2727
RETURNS bool
2828
AS 'MODULE_PATHNAME'
29-
LANGUAGE C STRICT STABLE;
29+
LANGUAGE C STRICT STABLE; -- stable because depends on trgm_limit
3030

3131
CREATE OPERATOR % (
3232
LEFTARG = text,
@@ -37,6 +37,18 @@ CREATE OPERATOR % (
3737
JOIN = contjoinsel
3838
);
3939

40+
CREATE OR REPLACE FUNCTION similarity_dist(text,text)
41+
RETURNS float4
42+
AS 'MODULE_PATHNAME'
43+
LANGUAGE C STRICT IMMUTABLE;
44+
45+
CREATE OPERATOR <-> (
46+
LEFTARG = text,
47+
RIGHTARG = text,
48+
PROCEDURE = similarity_dist,
49+
COMMUTATOR = '<->'
50+
);
51+
4052
-- gist key
4153
CREATE OR REPLACE FUNCTION gtrgm_in(cstring)
4254
RETURNS gtrgm
@@ -60,6 +72,11 @@ RETURNS bool
6072
AS 'MODULE_PATHNAME'
6173
LANGUAGE C IMMUTABLE STRICT;
6274

75+
CREATE OR REPLACE FUNCTION gtrgm_distance(internal,text,int,oid)
76+
RETURNS float8
77+
AS 'MODULE_PATHNAME'
78+
LANGUAGE C IMMUTABLE STRICT;
79+
6380
CREATE OR REPLACE FUNCTION gtrgm_compress(internal)
6481
RETURNS internal
6582
AS 'MODULE_PATHNAME'
@@ -95,13 +112,15 @@ CREATE OPERATOR CLASS gist_trgm_ops
95112
FOR TYPE text USING gist
96113
AS
97114
OPERATOR 1 % (text, text),
115+
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
98116
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
99117
FUNCTION 2 gtrgm_union (bytea, internal),
100118
FUNCTION 3 gtrgm_compress (internal),
101119
FUNCTION 4 gtrgm_decompress (internal),
102120
FUNCTION 5 gtrgm_penalty (internal, internal, internal),
103121
FUNCTION 6 gtrgm_picksplit (internal, internal),
104122
FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal),
123+
FUNCTION 8 gtrgm_distance (internal, text, int, oid),
105124
STORAGE gtrgm;
106125

107126
-- support functions for gin

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,17 @@ CREATE TABLE test_trgm(t text);
2626
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
2727
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
2828
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
29+
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
2930
3031
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
3132
set enable_seqscan=off;
3233
3334
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
3435
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
3536
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
37+
explain (costs off)
38+
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
39+
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
3640
3741
drop index trgm_idx;
3842
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);

contrib/pg_trgm/trgm.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,10 @@
44
#ifndef __TRGM_H__
55
#define __TRGM_H__
66

7-
#include "postgres.h"
8-
97
#include "access/gist.h"
108
#include "access/itup.h"
11-
#include "utils/builtins.h"
129
#include "storage/bufpage.h"
10+
#include "utils/builtins.h"
1311

1412
/* options */
1513
#define LPADDING 2
@@ -18,6 +16,10 @@
1816
#define IGNORECASE
1917
#define DIVUNION
2018

19+
/* operator strategy numbers */
20+
#define SimilarityStrategyNumber 1
21+
#define DistanceStrategyNumber 2
22+
2123

2224
typedef char trgm[3];
2325

@@ -89,4 +91,4 @@ extern float4 trgm_limit;
8991
TRGM *generate_trgm(char *str, int slen);
9092
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
9193

92-
#endif
94+
#endif /* __TRGM_H__ */

contrib/pg_trgm/trgm_gin.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
/*
22
* contrib/pg_trgm/trgm_gin.c
33
*/
4+
#include "postgres.h"
5+
46
#include "trgm.h"
57

68
#include "access/gin.h"
@@ -10,6 +12,7 @@
1012
#include "utils/array.h"
1113
#include "utils/builtins.h"
1214

15+
1316
PG_FUNCTION_INFO_V1(gin_extract_trgm);
1417
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
1518

contrib/pg_trgm/trgm_gist.c

Lines changed: 110 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
/*
22
* contrib/pg_trgm/trgm_gist.c
33
*/
4+
#include "postgres.h"
5+
46
#include "trgm.h"
57

68
#include "access/gist.h"
79
#include "access/itup.h"
10+
#include "access/skey.h"
811
#include "access/tuptoaster.h"
912
#include "storage/bufpage.h"
1013
#include "utils/array.h"
1114
#include "utils/builtins.h"
1215

16+
1317
PG_FUNCTION_INFO_V1(gtrgm_in);
1418
Datum gtrgm_in(PG_FUNCTION_ARGS);
1519

@@ -25,6 +29,9 @@ Datum gtrgm_decompress(PG_FUNCTION_ARGS);
2529
PG_FUNCTION_INFO_V1(gtrgm_consistent);
2630
Datum gtrgm_consistent(PG_FUNCTION_ARGS);
2731

32+
PG_FUNCTION_INFO_V1(gtrgm_distance);
33+
Datum gtrgm_distance(PG_FUNCTION_ARGS);
34+
2835
PG_FUNCTION_INFO_V1(gtrgm_union);
2936
Datum gtrgm_union(PG_FUNCTION_ARGS);
3037

@@ -159,18 +166,35 @@ gtrgm_decompress(PG_FUNCTION_ARGS)
159166
}
160167
}
161168

169+
static int4
170+
cnt_sml_sign_common(TRGM *qtrg, BITVECP sign)
171+
{
172+
int4 count = 0;
173+
int4 k,
174+
len = ARRNELEM(qtrg);
175+
trgm *ptr = GETARR(qtrg);
176+
int4 tmp = 0;
177+
178+
for (k = 0; k < len; k++)
179+
{
180+
CPTRGM(((char *) &tmp), ptr + k);
181+
count += GETBIT(sign, HASHVAL(tmp));
182+
}
183+
184+
return count;
185+
}
186+
162187
Datum
163188
gtrgm_consistent(PG_FUNCTION_ARGS)
164189
{
165190
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
166191
text *query = PG_GETARG_TEXT_P(1);
167-
168-
/* StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); */
192+
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
169193
/* Oid subtype = PG_GETARG_OID(3); */
170194
bool *recheck = (bool *) PG_GETARG_POINTER(4);
171195
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
172196
TRGM *qtrg;
173-
bool res = false;
197+
bool res;
174198
char *cache = (char *) fcinfo->flinfo->fn_extra;
175199

176200
/* All cases served by this function are exact */
@@ -193,39 +217,95 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
193217

194218
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
195219

196-
if (GIST_LEAF(entry))
197-
{ /* all leafs contains orig trgm */
198-
float4 tmpsml = cnt_sml(key, qtrg);
220+
switch (strategy)
221+
{
222+
case SimilarityStrategyNumber:
223+
if (GIST_LEAF(entry))
224+
{ /* all leaves contain the original trigrams */
225+
float4 tmpsml = cnt_sml(key, qtrg);
199226

200-
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
201-
res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
227+
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
228+
res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
229+
}
230+
else if (ISALLTRUE(key))
231+
{ /* non-leaf contains signature */
232+
res = true;
233+
}
234+
else
235+
{ /* non-leaf contains signature */
236+
int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
237+
int4 len = ARRNELEM(qtrg);
238+
239+
if (len == 0)
240+
res = false;
241+
else
242+
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
243+
}
244+
break;
245+
default:
246+
elog(ERROR, "unrecognized strategy number: %d", strategy);
247+
res = false; /* keep compiler quiet */
248+
break;
202249
}
203-
else if (ISALLTRUE(key))
204-
{ /* non-leaf contains signature */
205-
res = true;
250+
251+
PG_RETURN_BOOL(res);
252+
}
253+
254+
Datum
255+
gtrgm_distance(PG_FUNCTION_ARGS)
256+
{
257+
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
258+
text *query = PG_GETARG_TEXT_P(1);
259+
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
260+
/* Oid subtype = PG_GETARG_OID(3); */
261+
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
262+
TRGM *qtrg;
263+
float8 res;
264+
char *cache = (char *) fcinfo->flinfo->fn_extra;
265+
266+
if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
267+
{
268+
qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
269+
270+
if (cache)
271+
pfree(cache);
272+
273+
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
274+
MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
275+
cache = (char *) fcinfo->flinfo->fn_extra;
276+
277+
memcpy(cache, query, VARSIZE(query));
278+
memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
206279
}
207-
else
208-
{ /* non-leaf contains signature */
209-
int4 count = 0;
210-
int4 k,
211-
len = ARRNELEM(qtrg);
212-
trgm *ptr = GETARR(qtrg);
213-
BITVECP sign = GETSIGN(key);
214-
int4 tmp = 0;
215280

216-
for (k = 0; k < len; k++)
217-
{
218-
CPTRGM(((char *) &tmp), ptr + k);
219-
count += GETBIT(sign, HASHVAL(tmp));
220-
}
221-
#ifdef DIVUNION
222-
res = (len == count) ? true : ((((((float4) count) / ((float4) (len - count)))) >= trgm_limit) ? true : false);
223-
#else
224-
res = (len == 0) ? false : ((((((float4) count) / ((float4) len))) >= trgm_limit) ? true : false);
225-
#endif
281+
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
282+
283+
switch (strategy)
284+
{
285+
case DistanceStrategyNumber:
286+
if (GIST_LEAF(entry))
287+
{ /* all leaves contain the original trigrams */
288+
res = 1.0 - cnt_sml(key, qtrg);
289+
}
290+
else if (ISALLTRUE(key))
291+
{ /* non-leaf contains all-true signature */
292+
res = 0.0;
293+
}
294+
else
295+
{ /* non-leaf contains signature */
296+
int4 count = cnt_sml_sign_common(qtrg, GETSIGN(key));
297+
int4 len = ARRNELEM(qtrg);
298+
299+
res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
300+
}
301+
break;
302+
default:
303+
elog(ERROR, "unrecognized strategy number: %d", strategy);
304+
res = 0; /* keep compiler quiet */
305+
break;
226306
}
227307

228-
PG_RETURN_BOOL(res);
308+
PG_RETURN_FLOAT8(res);
229309
}
230310

231311
static int4

contrib/pg_trgm/trgm_op.c

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
/*
22
* contrib/pg_trgm/trgm_op.c
33
*/
4-
#include "trgm.h"
4+
#include "postgres.h"
5+
56
#include <ctype.h>
6-
#include "utils/array.h"
7+
8+
#include "trgm.h"
9+
710
#include "catalog/pg_type.h"
811
#include "tsearch/ts_locale.h"
12+
#include "utils/array.h"
13+
914

1015
PG_MODULE_MAGIC;
1116

@@ -359,16 +364,25 @@ similarity(PG_FUNCTION_ARGS)
359364
PG_RETURN_FLOAT4(res);
360365
}
361366

367+
PG_FUNCTION_INFO_V1(similarity_dist);
368+
Datum similarity_dist(PG_FUNCTION_ARGS);
369+
Datum
370+
similarity_dist(PG_FUNCTION_ARGS)
371+
{
372+
float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
373+
PG_GETARG_DATUM(0),
374+
PG_GETARG_DATUM(1)));
375+
PG_RETURN_FLOAT4(1.0 - res);
376+
}
377+
362378
PG_FUNCTION_INFO_V1(similarity_op);
363379
Datum similarity_op(PG_FUNCTION_ARGS);
364380
Datum
365381
similarity_op(PG_FUNCTION_ARGS)
366382
{
367-
float4 res = DatumGetFloat4(DirectFunctionCall2(
368-
similarity,
383+
float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
369384
PG_GETARG_DATUM(0),
370-
PG_GETARG_DATUM(1)
371-
));
385+
PG_GETARG_DATUM(1)));
372386

373387
PG_RETURN_BOOL(res >= trgm_limit);
374388
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy