Skip to content

Commit 6e2f3ae

Browse files
committed
Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.
Unlike Btree-based LIKE optimization, this works for non-left-anchored search patterns. The effectiveness of the search depends on how many trigrams can be extracted from the pattern. (The worst case, with no trigrams, degrades to a full-table scan, so this isn't a panacea. But it can be very useful.) Alexander Korotkov, reviewed by Jan Urbanski
1 parent 6238473 commit 6e2f3ae

File tree

9 files changed

+640
-50
lines changed

9 files changed

+640
-50
lines changed

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3467,3 +3467,93 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34673467
qwertyu0988 | 0.333333
34683468
(1 row)
34693469

3470+
create table test2(t text);
3471+
insert into test2 values ('abcdef');
3472+
insert into test2 values ('quark');
3473+
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
3474+
set enable_seqscan=off;
3475+
explain (costs off)
3476+
select * from test2 where t like '%BCD%';
3477+
QUERY PLAN
3478+
------------------------------------------
3479+
Bitmap Heap Scan on test2
3480+
Recheck Cond: (t ~~ '%BCD%'::text)
3481+
-> Bitmap Index Scan on test2_idx_gin
3482+
Index Cond: (t ~~ '%BCD%'::text)
3483+
(4 rows)
3484+
3485+
explain (costs off)
3486+
select * from test2 where t ilike '%BCD%';
3487+
QUERY PLAN
3488+
-------------------------------------------
3489+
Bitmap Heap Scan on test2
3490+
Recheck Cond: (t ~~* '%BCD%'::text)
3491+
-> Bitmap Index Scan on test2_idx_gin
3492+
Index Cond: (t ~~* '%BCD%'::text)
3493+
(4 rows)
3494+
3495+
select * from test2 where t like '%BCD%';
3496+
t
3497+
---
3498+
(0 rows)
3499+
3500+
select * from test2 where t like '%bcd%';
3501+
t
3502+
--------
3503+
abcdef
3504+
(1 row)
3505+
3506+
select * from test2 where t ilike '%BCD%';
3507+
t
3508+
--------
3509+
abcdef
3510+
(1 row)
3511+
3512+
select * from test2 where t ilike 'qua%';
3513+
t
3514+
-------
3515+
quark
3516+
(1 row)
3517+
3518+
drop index test2_idx_gin;
3519+
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
3520+
set enable_seqscan=off;
3521+
explain (costs off)
3522+
select * from test2 where t like '%BCD%';
3523+
QUERY PLAN
3524+
------------------------------------------
3525+
Index Scan using test2_idx_gist on test2
3526+
Index Cond: (t ~~ '%BCD%'::text)
3527+
(2 rows)
3528+
3529+
explain (costs off)
3530+
select * from test2 where t ilike '%BCD%';
3531+
QUERY PLAN
3532+
------------------------------------------
3533+
Index Scan using test2_idx_gist on test2
3534+
Index Cond: (t ~~* '%BCD%'::text)
3535+
(2 rows)
3536+
3537+
select * from test2 where t like '%BCD%';
3538+
t
3539+
---
3540+
(0 rows)
3541+
3542+
select * from test2 where t like '%bcd%';
3543+
t
3544+
--------
3545+
abcdef
3546+
(1 row)
3547+
3548+
select * from test2 where t ilike '%BCD%';
3549+
t
3550+
--------
3551+
abcdef
3552+
(1 row)
3553+
3554+
select * from test2 where t ilike 'qua%';
3555+
t
3556+
-------
3557+
quark
3558+
(1 row)
3559+

contrib/pg_trgm/pg_trgm.sql.in

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ FOR TYPE text USING gist
113113
AS
114114
OPERATOR 1 % (text, text),
115115
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
116+
OPERATOR 3 pg_catalog.~~ (text, text),
117+
OPERATOR 4 pg_catalog.~~* (text, text),
116118
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
117119
FUNCTION 2 gtrgm_union (bytea, internal),
118120
FUNCTION 3 gtrgm_compress (internal),
@@ -124,17 +126,17 @@ AS
124126
STORAGE gtrgm;
125127

126128
-- support functions for gin
127-
CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal)
129+
CREATE OR REPLACE FUNCTION gin_extract_value_trgm(text, internal)
128130
RETURNS internal
129131
AS 'MODULE_PATHNAME'
130132
LANGUAGE C IMMUTABLE STRICT;
131133

132-
CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
134+
CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
133135
RETURNS internal
134136
AS 'MODULE_PATHNAME'
135137
LANGUAGE C IMMUTABLE STRICT;
136138

137-
CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
139+
CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
138140
RETURNS bool
139141
AS 'MODULE_PATHNAME'
140142
LANGUAGE C IMMUTABLE STRICT;
@@ -144,8 +146,10 @@ CREATE OPERATOR CLASS gin_trgm_ops
144146
FOR TYPE text USING gin
145147
AS
146148
OPERATOR 1 % (text, text),
149+
OPERATOR 3 pg_catalog.~~ (text, text),
150+
OPERATOR 4 pg_catalog.~~* (text, text),
147151
FUNCTION 1 btint4cmp (int4, int4),
148-
FUNCTION 2 gin_extract_trgm (text, internal),
149-
FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
150-
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
152+
FUNCTION 2 gin_extract_value_trgm (text, internal),
153+
FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
154+
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
151155
STORAGE int4;

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,28 @@ set enable_seqscan=off;
4545
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
4646
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
4747
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
48+
49+
create table test2(t text);
50+
insert into test2 values ('abcdef');
51+
insert into test2 values ('quark');
52+
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
53+
set enable_seqscan=off;
54+
explain (costs off)
55+
select * from test2 where t like '%BCD%';
56+
explain (costs off)
57+
select * from test2 where t ilike '%BCD%';
58+
select * from test2 where t like '%BCD%';
59+
select * from test2 where t like '%bcd%';
60+
select * from test2 where t ilike '%BCD%';
61+
select * from test2 where t ilike 'qua%';
62+
drop index test2_idx_gin;
63+
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
64+
set enable_seqscan=off;
65+
explain (costs off)
66+
select * from test2 where t like '%BCD%';
67+
explain (costs off)
68+
select * from test2 where t ilike '%BCD%';
69+
select * from test2 where t like '%BCD%';
70+
select * from test2 where t like '%bcd%';
71+
select * from test2 where t ilike '%BCD%';
72+
select * from test2 where t ilike 'qua%';

contrib/pg_trgm/trgm.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,21 @@
1313
#define LPADDING 2
1414
#define RPADDING 1
1515
#define KEEPONLYALNUM
16+
/*
17+
* Caution: IGNORECASE macro means that trigrams are case-insensitive.
18+
* If this macro is disabled, the ~~* operator must be removed from the
19+
* operator classes, because we can't handle case-insensitive wildcard search
20+
* with case-sensitive trigrams. Failure to do this will result in "cannot
21+
* handle ~~* with case-sensitive trigrams" errors.
22+
*/
1623
#define IGNORECASE
1724
#define DIVUNION
1825

1926
/* operator strategy numbers */
20-
#define SimilarityStrategyNumber 1
21-
#define DistanceStrategyNumber 2
27+
#define SimilarityStrategyNumber 1
28+
#define DistanceStrategyNumber 2
29+
#define LikeStrategyNumber 3
30+
#define ILikeStrategyNumber 4
2231

2332

2433
typedef char trgm[3];
@@ -40,7 +49,10 @@ uint32 trgm2int(trgm *ptr);
4049
#else
4150
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
4251
#endif
43-
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
52+
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
53+
54+
#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
55+
#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */
4456

4557
typedef struct
4658
{
@@ -65,7 +77,7 @@ typedef char *BITVECP;
6577
for(i=0;i<SIGLEN;i++)
6678

6779
#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
68-
#define GETBITBYTE(x,i) ( ((char)(x)) >> i & 0x01 )
80+
#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 )
6981
#define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) )
7082
#define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITBYTE ) )
7183
#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )
@@ -89,6 +101,8 @@ typedef char *BITVECP;
89101
extern float4 trgm_limit;
90102

91103
TRGM *generate_trgm(char *str, int slen);
104+
TRGM *generate_wildcard_trgm(const char *str, int slen);
92105
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
106+
bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
93107

94108
#endif /* __TRGM_H__ */

contrib/pg_trgm/trgm_gin.c

Lines changed: 120 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "access/gin.h"
99
#include "access/itup.h"
10+
#include "access/skey.h"
1011
#include "access/tuptoaster.h"
1112
#include "storage/bufpage.h"
1213
#include "utils/array.h"
@@ -16,14 +17,31 @@
1617
PG_FUNCTION_INFO_V1(gin_extract_trgm);
1718
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
1819

20+
PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
21+
Datum gin_extract_value_trgm(PG_FUNCTION_ARGS);
22+
23+
PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
24+
Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
25+
1926
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
2027
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
2128

2229
/*
23-
* This function is used as both extractValue and extractQuery
30+
* This function can only be called if a pre-9.1 version of the GIN operator
31+
* class definition is present in the catalogs (probably as a consequence
32+
* of upgrade-in-place). Complain.
2433
*/
2534
Datum
2635
gin_extract_trgm(PG_FUNCTION_ARGS)
36+
{
37+
ereport(ERROR,
38+
(errmsg("GIN operator class for pg_trgm is out of date"),
39+
errhint("Please drop and re-create the pg_trgm catalog entries.")));
40+
PG_RETURN_NULL();
41+
}
42+
43+
Datum
44+
gin_extract_value_trgm(PG_FUNCTION_ARGS)
2745
{
2846
text *val = (text *) PG_GETARG_TEXT_P(0);
2947
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
@@ -57,34 +75,124 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
5775
PG_RETURN_POINTER(entries);
5876
}
5977

78+
Datum
79+
gin_extract_query_trgm(PG_FUNCTION_ARGS)
80+
{
81+
text *val = (text *) PG_GETARG_TEXT_P(0);
82+
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
83+
StrategyNumber strategy = PG_GETARG_UINT16(2);
84+
/* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
85+
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
86+
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
87+
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
88+
Datum *entries = NULL;
89+
TRGM *trg;
90+
int32 trglen;
91+
trgm *ptr;
92+
int32 i;
93+
94+
switch (strategy)
95+
{
96+
case SimilarityStrategyNumber:
97+
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
98+
break;
99+
case ILikeStrategyNumber:
100+
#ifndef IGNORECASE
101+
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
102+
#endif
103+
/* FALL THRU */
104+
case LikeStrategyNumber:
105+
/*
106+
* For wildcard search we extract all the trigrams that every
107+
* potentially-matching string must include.
108+
*/
109+
trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
110+
break;
111+
default:
112+
elog(ERROR, "unrecognized strategy number: %d", strategy);
113+
trg = NULL; /* keep compiler quiet */
114+
break;
115+
}
116+
117+
trglen = ARRNELEM(trg);
118+
*nentries = trglen;
119+
120+
if (trglen > 0)
121+
{
122+
entries = (Datum *) palloc(sizeof(Datum) * trglen);
123+
ptr = GETARR(trg);
124+
for (i = 0; i < trglen; i++)
125+
{
126+
int32 item = trgm2int(ptr);
127+
128+
entries[i] = Int32GetDatum(item);
129+
ptr++;
130+
}
131+
}
132+
133+
/*
134+
* If no trigram was extracted then we have to scan all the index.
135+
*/
136+
if (trglen == 0)
137+
*searchMode = GIN_SEARCH_MODE_ALL;
138+
139+
PG_RETURN_POINTER(entries);
140+
}
141+
60142
Datum
61143
gin_trgm_consistent(PG_FUNCTION_ARGS)
62144
{
63145
bool *check = (bool *) PG_GETARG_POINTER(0);
64-
/* StrategyNumber strategy = PG_GETARG_UINT16(1); */
146+
StrategyNumber strategy = PG_GETARG_UINT16(1);
65147
/* text *query = PG_GETARG_TEXT_P(2); */
66148
int32 nkeys = PG_GETARG_INT32(3);
67149
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
68150
bool *recheck = (bool *) PG_GETARG_POINTER(5);
69-
bool res = FALSE;
151+
bool res;
70152
int32 i,
71-
ntrue = 0;
153+
ntrue;
72154

73155
/* All cases served by this function are inexact */
74156
*recheck = true;
75157

76-
/* Count the matches */
77-
for (i = 0; i < nkeys; i++)
158+
switch (strategy)
78159
{
79-
if (check[i])
80-
ntrue++;
81-
}
82-
160+
case SimilarityStrategyNumber:
161+
/* Count the matches */
162+
ntrue = 0;
163+
for (i = 0; i < nkeys; i++)
164+
{
165+
if (check[i])
166+
ntrue++;
167+
}
83168
#ifdef DIVUNION
84-
res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
169+
res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
85170
#else
86-
res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
171+
res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
87172
#endif
173+
break;
174+
case ILikeStrategyNumber:
175+
#ifndef IGNORECASE
176+
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
177+
#endif
178+
/* FALL THRU */
179+
case LikeStrategyNumber:
180+
/* Check if all extracted trigrams are presented. */
181+
res = true;
182+
for (i = 0; i < nkeys; i++)
183+
{
184+
if (!check[i])
185+
{
186+
res = false;
187+
break;
188+
}
189+
}
190+
break;
191+
default:
192+
elog(ERROR, "unrecognized strategy number: %d", strategy);
193+
res = false; /* keep compiler quiet */
194+
break;
195+
}
88196

89197
PG_RETURN_BOOL(res);
90198
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy