Skip to content

Commit b87b52b

Browse files
committed
Support of multibyte encoding for pg_trgm
1 parent e4ffd14 commit b87b52b

File tree

3 files changed

+161
-88
lines changed

3 files changed

+161
-88
lines changed

contrib/pg_trgm/trgm.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.9 2008/05/17 01:28:21 adunstan Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.10 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#ifndef __TRGM_H__
55
#define __TRGM_H__
@@ -31,7 +31,14 @@ typedef char trgm[3];
3131
*(((char*)(a))+2) = *(((char*)(b))+2); \
3232
} while(0);
3333

34-
#define TRGMINT(a) ( (*(((char*)(a))+2)<<16)+(*(((char*)(a))+1)<<8)+*(((char*)(a))+0) )
34+
/* Packs the three bytes of a trigram into a uint32; defined in trgm_op.c. */
uint32 trgm2int(trgm *ptr);
35+
36+
/*
 * ISPRINTABLECHAR(a): a points at one byte. With KEEPONLYALNUM defined the
 * byte must be ASCII and either alphanumeric or a space; otherwise it must be
 * ASCII and printable.
 */
#ifdef KEEPONLYALNUM
37+
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
38+
#else
39+
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
40+
#endif
41+
/* ISPRINTABLETRGM(t): true when each of the three bytes at t passes ISPRINTABLECHAR. */
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
3542

3643
typedef struct
3744
{

contrib/pg_trgm/trgm_gin.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.5 2008/07/11 11:56:48 teodor Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.6 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#include "trgm.h"
55

@@ -42,7 +42,7 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
4242
ptr = GETARR(trg);
4343
while (ptr - GETARR(trg) < ARRNELEM(trg))
4444
{
45-
item = TRGMINT(ptr);
45+
item = trgm2int(ptr);
4646
entries[i++] = Int32GetDatum(item);
4747

4848
ptr++;

contrib/pg_trgm/trgm_op.c

Lines changed: 150 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.10 2008/05/17 01:28:21 adunstan Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.11 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#include "trgm.h"
55
#include <ctype.h>
66
#include "utils/array.h"
77
#include "catalog/pg_type.h"
8+
#include "tsearch/ts_locale.h"
89

910
PG_MODULE_MAGIC;
1011

@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
3132
PG_RETURN_FLOAT4(trgm_limit);
3233
}
3334

34-
#define WORDWAIT 0
35-
#define INWORD 1
36-
3735
static int
3836
comp_trgm(const void *a, const void *b)
3937
{
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
6058
return curend + 1 - a;
6159
}
6260

61+
/*
 * iswordchr(c): decides whether the character at pointer c belongs to a word.
 * With KEEPONLYALNUM defined only characters accepted by t_isalpha() or
 * t_isdigit() count; otherwise every character that t_isspace() rejects counts.
 * NOTE(review): the t_is*() helpers presumably come from tsearch/ts_locale.h
 * and are multibyte-aware -- confirm.
 */
#ifdef KEEPONLYALNUM
62+
#define iswordchr(c) (t_isalpha(c) || t_isdigit(c))
63+
#else
64+
#define iswordchr(c) (!t_isspace(c))
65+
#endif
66+
67+
/*
68+
* Finds the first word in str, looking at no more than lenstr bytes, and returns a pointer to its first byte.
69+
* On success *endword points to the byte just after the word and *charlen holds the word length in characters.
* Returns NULL when no word character is found; in that case *endword and *charlen are left untouched.
* A word character is whatever iswordchr() accepts; both scans advance pg_mblen() bytes per character.
70+
*/
71+
static char*
72+
find_word(char *str, int lenstr, char **endword, int *charlen)
73+
{
74+
char *beginword = str;
75+
76+
/* skip everything that is not part of a word */
while( beginword - str < lenstr && !iswordchr(beginword) )
77+
beginword += pg_mblen(beginword);
78+
/* reached the end of the input without seeing a word character */
79+
if (beginword - str >= lenstr)
80+
return NULL;
81+
82+
*endword = beginword;
83+
*charlen = 0;
84+
/* walk to the end of the word, counting characters rather than bytes */
while( *endword - str < lenstr && iswordchr(*endword) )
85+
{
86+
*endword += pg_mblen(*endword);
87+
(*charlen)++;
88+
}
89+
90+
return beginword;
91+
}
92+
93+
/*
 * cnt_trigram: stores in *tptr a three-byte value for the bytelen bytes at str.
 * When bytelen is exactly 3 the bytes are stored unchanged; for any other
 * length a CRC32 of the bytes is computed and three bytes of it are stored
 * instead. Compiled only when USE_WIDE_UPPER_LOWER is defined.
 */
#ifdef USE_WIDE_UPPER_LOWER
94+
static void
95+
cnt_trigram(trgm *tptr, char *str, int bytelen)
96+
{
97+
if ( bytelen == 3 )
98+
{
99+
CPTRGM(tptr, str);
100+
}
101+
else
102+
{
103+
pg_crc32 crc;
104+
105+
INIT_CRC32(crc);
106+
COMP_CRC32(crc, str, bytelen);
107+
FIN_CRC32(crc);
108+
109+
/*
110+
* Keep only three bytes of the CRC as the stored value; hopefully
111+
* that is good enough hashing.
* NOTE(review): CPTRGM is given &crc, so it appears to take the bytes at
* the start of crc in memory; whether those are the "upper" bytes of the
* CRC value depends on platform byte order -- confirm.
112+
*/
113+
CPTRGM(tptr, &crc);
114+
}
115+
}
116+
#endif
117+
118+
/*
119+
* Adds the trigrams of one word (already padded by the caller) to the output array.
* tptr is the next free output slot; str holds the padded word, bytelen bytes and charlen characters long.
* Returns the slot following the last trigram written, or tptr unchanged when charlen is below 3.
120+
*/
121+
static trgm*
122+
make_trigrams( trgm *tptr, char *str, int bytelen, int charlen )
123+
{
124+
char *ptr = str;
125+
126+
if ( charlen < 3 )
127+
return tptr;
128+
129+
#ifdef USE_WIDE_UPPER_LOWER
130+
if (pg_database_encoding_max_length() > 1)
131+
{
/* byte lengths of the three consecutive characters that start at ptr */
132+
int lenfirst = pg_mblen(str),
133+
lenmiddle = pg_mblen(str + lenfirst),
134+
lenlast = pg_mblen(str + lenfirst + lenmiddle);
135+
136+
/* continue while the whole three-character window fits inside bytelen */
while( (ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen )
137+
{
138+
cnt_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
139+
140+
ptr += lenfirst;
141+
tptr++;
142+
/*
 * Slide the window forward by one character.
 * NOTE(review): the pg_mblen() call below looks at the byte right after the
 * previous window, which can be str[bytelen] after the final trigram; the
 * caller's buffer is presumably large enough for that read -- confirm.
 */
143+
lenfirst = lenmiddle;
144+
lenmiddle = lenlast;
145+
lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
146+
}
147+
}
148+
else
149+
#endif
150+
{
/* single-byte path: every character is one byte, so a trigram is three bytes */
151+
Assert( bytelen == charlen );
152+
153+
while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
154+
{
155+
CPTRGM(tptr, ptr);
156+
ptr++;
157+
tptr++;
158+
}
159+
}
160+
161+
return tptr;
162+
}
63163

64164
TRGM *
65165
generate_trgm(char *str, int slen)
66166
{
67167
TRGM *trg;
68-
char *buf,
69-
*sptr,
70-
*bufptr;
168+
char *buf;
71169
trgm *tptr;
72-
int state = WORDWAIT;
73-
int wl,
74-
len;
170+
int len,
171+
charlen,
172+
bytelen;
173+
char *bword, *eword;
75174

76175
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
77176
trg->flag = ARRKEY;
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
83182
tptr = GETARR(trg);
84183

85184
buf = palloc(sizeof(char) * (slen + 4));
86-
sptr = str;
87185

88186
if (LPADDING > 0)
89187
{
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
92190
*(buf + 1) = ' ';
93191
}
94192

95-
bufptr = buf + LPADDING;
96-
while (sptr - str < slen)
193+
eword = str;
194+
while( (bword=find_word(eword, slen - (eword-str), &eword, &charlen)) != NULL )
97195
{
98-
if (state == WORDWAIT)
99-
{
100-
if (
101-
#ifdef KEEPONLYALNUM
102-
isalnum((unsigned char) *sptr)
103-
#else
104-
!isspace((unsigned char) *sptr)
105-
#endif
106-
)
107-
{
108-
*bufptr = *sptr; /* start put word in buffer */
109-
bufptr++;
110-
state = INWORD;
111-
if (sptr - str == slen - 1 /* last char */ )
112-
goto gettrg;
113-
}
114-
}
115-
else
116-
{
117-
if (
118-
#ifdef KEEPONLYALNUM
119-
!isalnum((unsigned char) *sptr)
196+
#ifdef IGNORECASE
197+
bword = lowerstr_with_len(bword, eword - bword);
198+
bytelen = strlen(bword);
120199
#else
121-
isspace((unsigned char) *sptr)
200+
bytelen = eword - bword;
122201
#endif
123-
)
124-
{
125-
gettrg:
126-
/* word in buffer, so count trigrams */
127-
*bufptr = ' ';
128-
*(bufptr + 1) = ' ';
129-
wl = bufptr - (buf + LPADDING) - 2 + LPADDING + RPADDING;
130-
if (wl <= 0)
131-
{
132-
bufptr = buf + LPADDING;
133-
state = WORDWAIT;
134-
sptr++;
135-
continue;
136-
}
202+
203+
memcpy(buf + LPADDING, bword, bytelen);
137204

138205
#ifdef IGNORECASE
139-
do
140-
{ /* lower word */
141-
int wwl = bufptr - buf;
142-
143-
bufptr = buf + LPADDING;
144-
while (bufptr - buf < wwl)
145-
{
146-
*bufptr = tolower((unsigned char) *bufptr);
147-
bufptr++;
148-
}
149-
} while (0);
206+
pfree(bword);
150207
#endif
151-
bufptr = buf;
152-
/* set trigrams */
153-
while (bufptr - buf < wl)
154-
{
155-
CPTRGM(tptr, bufptr);
156-
bufptr++;
157-
tptr++;
158-
}
159-
bufptr = buf + LPADDING;
160-
state = WORDWAIT;
161-
}
162-
else
163-
{
164-
*bufptr = *sptr; /* put in buffer */
165-
bufptr++;
166-
if (sptr - str == slen - 1)
167-
goto gettrg;
168-
}
169-
}
170-
sptr++;
208+
buf[LPADDING+bytelen] = ' ';
209+
buf[LPADDING+bytelen+1] = ' ';
210+
211+
/*
212+
* count trigrams
213+
*/
214+
tptr = make_trigrams( tptr, buf, bytelen + LPADDING + RPADDING,
215+
charlen + LPADDING + RPADDING );
171216
}
172217

173218
pfree(buf);
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
186231
return trg;
187232
}
188233

234+
/*
 * trgm2int: packs the three bytes of a trigram into a uint32.
 * Byte 0 lands in bits 16-23, byte 1 in bits 8-15 and byte 2 in bits 0-7;
 * each byte is read as unsigned char, so the result never exceeds 0xFFFFFF.
 * The TRGMINT macro removed from trgm.h in this change placed byte 2 in the
 * highest position and read the bytes as plain char.
 */
uint32
235+
trgm2int(trgm *ptr)
236+
{
237+
uint32 val = 0;
238+
239+
val |= *( ((unsigned char*)ptr) );
240+
val <<= 8;
241+
val |= *( ((unsigned char*)ptr) + 1 );
242+
val <<= 8;
243+
val |= *( ((unsigned char*)ptr) + 2 );
244+
245+
return val;
246+
}
189247

190248
PG_FUNCTION_INFO_V1(show_trgm);
191249
Datum show_trgm(PG_FUNCTION_ARGS);
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
204262

205263
for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
206264
{
207-
text *item = (text *) palloc(VARHDRSZ + 3);
265+
text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length()*3) );
208266

209-
SET_VARSIZE(item, VARHDRSZ + 3);
210-
CPTRGM(VARDATA(item), ptr);
267+
if ( pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr) )
268+
{
269+
snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
270+
SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
271+
}
272+
else
273+
{
274+
SET_VARSIZE(item, VARHDRSZ + 3);
275+
CPTRGM(VARDATA(item), ptr);
276+
}
211277
d[i] = PointerGetDatum(item);
212278
}
213279

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy