Skip to content

Commit f576b17

Browse files
committed
Add word_similarity to pg_trgm contrib module.
This patch introduces the concept of similarity between a string and a single word from another string. The extension version is not changed because 1.2 was already introduced during the 9.6 release cycle, so there has been no public release of that version. Authors: Alexander Korotkov, Artur Zakirov
1 parent 1c4f001 commit f576b17

File tree

10 files changed

+726
-75
lines changed

10 files changed

+726
-75
lines changed

contrib/pg_trgm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ EXTENSION = pg_trgm
77
DATA = pg_trgm--1.2.sql pg_trgm--1.0--1.1.sql pg_trgm--1.1--1.2.sql pg_trgm--unpackaged--1.0.sql
88
PGFILEDESC = "pg_trgm - trigram matching"
99

10-
REGRESS = pg_trgm
10+
REGRESS = pg_trgm pg_word_trgm
1111

1212
ifdef USE_PGXS
1313
PG_CONFIG = pg_config

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ select similarity('---', '####---');
5959
0
6060
(1 row)
6161

62-
CREATE TABLE test_trgm(t text);
62+
CREATE TABLE test_trgm(t text COLLATE "C");
6363
\copy test_trgm from 'data/trgm.data'
6464
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
6565
t | sml
@@ -3467,7 +3467,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34673467
qwertyu0988 | 0.333333
34683468
(1 row)
34693469

3470-
create table test2(t text);
3470+
create table test2(t text COLLATE "C");
34713471
insert into test2 values ('abcdef');
34723472
insert into test2 values ('quark');
34733473
insert into test2 values (' z foo bar');

contrib/pg_trgm/pg_trgm--1.1--1.2.sql

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,72 @@
33
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
44
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.2'" to load this file. \quit
55

6+
CREATE FUNCTION word_similarity(text,text)
7+
RETURNS float4
8+
AS 'MODULE_PATHNAME'
9+
LANGUAGE C STRICT IMMUTABLE;
10+
11+
CREATE FUNCTION word_similarity_op(text,text)
12+
RETURNS bool
13+
AS 'MODULE_PATHNAME'
14+
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
15+
16+
CREATE FUNCTION word_similarity_commutator_op(text,text)
17+
RETURNS bool
18+
AS 'MODULE_PATHNAME'
19+
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
20+
21+
CREATE FUNCTION word_similarity_dist_op(text,text)
22+
RETURNS float4
23+
AS 'MODULE_PATHNAME'
24+
LANGUAGE C STRICT IMMUTABLE;
25+
26+
CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
27+
RETURNS float4
28+
AS 'MODULE_PATHNAME'
29+
LANGUAGE C STRICT IMMUTABLE;
30+
31+
CREATE OPERATOR <% (
32+
LEFTARG = text,
33+
RIGHTARG = text,
34+
PROCEDURE = word_similarity_op,
35+
COMMUTATOR = '%>',
36+
RESTRICT = contsel,
37+
JOIN = contjoinsel
38+
);
39+
40+
CREATE OPERATOR %> (
41+
LEFTARG = text,
42+
RIGHTARG = text,
43+
PROCEDURE = word_similarity_commutator_op,
44+
COMMUTATOR = '<%',
45+
RESTRICT = contsel,
46+
JOIN = contjoinsel
47+
);
48+
49+
CREATE OPERATOR <<-> (
50+
LEFTARG = text,
51+
RIGHTARG = text,
52+
PROCEDURE = word_similarity_dist_op,
53+
COMMUTATOR = '<->>'
54+
);
55+
56+
CREATE OPERATOR <->> (
57+
LEFTARG = text,
58+
RIGHTARG = text,
59+
PROCEDURE = word_similarity_dist_commutator_op,
60+
COMMUTATOR = '<<->'
61+
);
62+
663
CREATE FUNCTION gin_trgm_triconsistent(internal, int2, text, int4, internal, internal, internal)
764
RETURNS "char"
865
AS 'MODULE_PATHNAME'
966
LANGUAGE C IMMUTABLE STRICT;
1067

68+
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
69+
OPERATOR 7 %> (text, text),
70+
OPERATOR 8 <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
71+
1172
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
12-
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
73+
OPERATOR 7 %> (text, text),
74+
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);

contrib/pg_trgm/pg_trgm--1.2.sql

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,39 @@ CREATE OPERATOR % (
3939
JOIN = contjoinsel
4040
);
4141

42+
CREATE FUNCTION word_similarity(text,text)
43+
RETURNS float4
44+
AS 'MODULE_PATHNAME'
45+
LANGUAGE C STRICT IMMUTABLE;
46+
47+
CREATE FUNCTION word_similarity_op(text,text)
48+
RETURNS bool
49+
AS 'MODULE_PATHNAME'
50+
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
51+
52+
CREATE FUNCTION word_similarity_commutator_op(text,text)
53+
RETURNS bool
54+
AS 'MODULE_PATHNAME'
55+
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
56+
57+
CREATE OPERATOR <% (
58+
LEFTARG = text,
59+
RIGHTARG = text,
60+
PROCEDURE = word_similarity_op,
61+
COMMUTATOR = '%>',
62+
RESTRICT = contsel,
63+
JOIN = contjoinsel
64+
);
65+
66+
CREATE OPERATOR %> (
67+
LEFTARG = text,
68+
RIGHTARG = text,
69+
PROCEDURE = word_similarity_commutator_op,
70+
COMMUTATOR = '<%',
71+
RESTRICT = contsel,
72+
JOIN = contjoinsel
73+
);
74+
4275
CREATE FUNCTION similarity_dist(text,text)
4376
RETURNS float4
4477
AS 'MODULE_PATHNAME'
@@ -51,6 +84,30 @@ CREATE OPERATOR <-> (
5184
COMMUTATOR = '<->'
5285
);
5386

87+
CREATE FUNCTION word_similarity_dist_op(text,text)
88+
RETURNS float4
89+
AS 'MODULE_PATHNAME'
90+
LANGUAGE C STRICT IMMUTABLE;
91+
92+
CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
93+
RETURNS float4
94+
AS 'MODULE_PATHNAME'
95+
LANGUAGE C STRICT IMMUTABLE;
96+
97+
CREATE OPERATOR <<-> (
98+
LEFTARG = text,
99+
RIGHTARG = text,
100+
PROCEDURE = word_similarity_dist_op,
101+
COMMUTATOR = '<->>'
102+
);
103+
104+
CREATE OPERATOR <->> (
105+
LEFTARG = text,
106+
RIGHTARG = text,
107+
PROCEDURE = word_similarity_dist_commutator_op,
108+
COMMUTATOR = '<<->'
109+
);
110+
54111
-- gist key
55112
CREATE FUNCTION gtrgm_in(cstring)
56113
RETURNS gtrgm
@@ -140,6 +197,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
140197
OPERATOR 5 pg_catalog.~ (text, text),
141198
OPERATOR 6 pg_catalog.~* (text, text);
142199

200+
-- Add operators that are new in 9.6 (pg_trgm 1.2).
201+
202+
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
203+
OPERATOR 7 %> (text, text),
204+
OPERATOR 8 <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
205+
143206
-- support functions for gin
144207
CREATE FUNCTION gin_extract_value_trgm(text, internal)
145208
RETURNS internal
@@ -187,4 +250,5 @@ AS 'MODULE_PATHNAME'
187250
LANGUAGE C IMMUTABLE STRICT;
188251

189252
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
253+
OPERATOR 7 %> (text, text),
190254
FUNCTION 6 (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ select similarity('wow',' WOW ');
1313

1414
select similarity('---', '####---');
1515

16-
CREATE TABLE test_trgm(t text);
16+
CREATE TABLE test_trgm(t text COLLATE "C");
1717

1818
\copy test_trgm from 'data/trgm.data'
1919

@@ -40,7 +40,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
4040
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
4141
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
4242

43-
create table test2(t text);
43+
create table test2(t text COLLATE "C");
4444
insert into test2 values ('abcdef');
4545
insert into test2 values ('quark');
4646
insert into test2 values (' z foo bar');

contrib/pg_trgm/trgm.h

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@
2626
#define DIVUNION
2727

2828
/* operator strategy numbers */
29-
#define SimilarityStrategyNumber 1
30-
#define DistanceStrategyNumber 2
31-
#define LikeStrategyNumber 3
32-
#define ILikeStrategyNumber 4
33-
#define RegExpStrategyNumber 5
34-
#define RegExpICaseStrategyNumber 6
35-
29+
#define SimilarityStrategyNumber 1
30+
#define DistanceStrategyNumber 2
31+
#define LikeStrategyNumber 3
32+
#define ILikeStrategyNumber 4
33+
#define RegExpStrategyNumber 5
34+
#define RegExpICaseStrategyNumber 6
35+
#define WordSimilarityStrategyNumber 7
36+
#define WordDistanceStrategyNumber 8
3637

3738
typedef char trgm[3];
3839

@@ -103,15 +104,28 @@ typedef char *BITVECP;
103104
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
104105
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
105106

107+
/*
108+
* If DIVUNION is defined then similarity formula is:
109+
* count / (len1 + len2 - count)
110+
* else if DIVUNION is not defined then similarity formula is:
111+
* count / max(len1, len2)
112+
*/
113+
#ifdef DIVUNION
114+
#define CALCSML(count, len1, len2) ((float4) (count)) / ((float4) ((len1) + (len2) - (count)))
115+
#else
116+
#define CALCSML(count, len1, len2) ((float4) (count)) / ((float4) (((len1) > (len2)) ? (len1) : (len2)))
117+
#endif
118+
106119
typedef struct TrgmPackedGraph TrgmPackedGraph;
107120

108121
extern double similarity_threshold;
122+
extern double word_similarity_threshold;
109123

110124
extern uint32 trgm2int(trgm *ptr);
111125
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
112126
extern TRGM *generate_trgm(char *str, int slen);
113127
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
114-
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
128+
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
115129
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
116130
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
117131
extern TRGM *createTrgmNFA(text *text_re, Oid collation,

contrib/pg_trgm/trgm_gin.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
8989
switch (strategy)
9090
{
9191
case SimilarityStrategyNumber:
92+
case WordSimilarityStrategyNumber:
9293
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
9394
break;
9495
case ILikeStrategyNumber:
@@ -176,13 +177,18 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
176177
bool res;
177178
int32 i,
178179
ntrue;
180+
double nlimit;
179181

180182
/* All cases served by this function are inexact */
181183
*recheck = true;
182184

183185
switch (strategy)
184186
{
185187
case SimilarityStrategyNumber:
188+
case WordSimilarityStrategyNumber:
189+
nlimit = (strategy == SimilarityStrategyNumber) ?
190+
similarity_threshold : word_similarity_threshold;
191+
186192
/* Count the matches */
187193
ntrue = 0;
188194
for (i = 0; i < nkeys; i++)
@@ -207,8 +213,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
207213
* So, independently of DIVUNION, the upper bound formula is the same.
208214
*/
209215
res = (nkeys == 0) ? false :
210-
((((((float4) ntrue) / ((float4) nkeys))) >= similarity_threshold)
211-
? true : false);
216+
(((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
212217
break;
213218
case ILikeStrategyNumber:
214219
#ifndef IGNORECASE
@@ -270,10 +275,15 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
270275
int32 i,
271276
ntrue;
272277
bool *boolcheck;
278+
double nlimit;
273279

274280
switch (strategy)
275281
{
276282
case SimilarityStrategyNumber:
283+
case WordSimilarityStrategyNumber:
284+
nlimit = (strategy == SimilarityStrategyNumber) ?
285+
similarity_threshold : word_similarity_threshold;
286+
277287
/* Count the matches */
278288
ntrue = 0;
279289
for (i = 0; i < nkeys; i++)
@@ -285,9 +295,9 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
285295
/*
286296
* See comment in gin_trgm_consistent() about upper bound formula
287297
*/
288-
res = (nkeys == 0) ? GIN_FALSE :
289-
(((((float4) ntrue) / ((float4) nkeys)) >= similarity_threshold)
290-
? GIN_MAYBE : GIN_FALSE);
298+
res = (nkeys == 0)
299+
? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
300+
? GIN_MAYBE : GIN_FALSE);
291301
break;
292302
case ILikeStrategyNumber:
293303
#ifndef IGNORECASE

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy