Skip to content

Commit 604ab08

Browse files
committed
Add levenshtein_less_equal, optimized version for small distances.
Alexander Korotkov, heavily revised by me.
1 parent 262c1a4 commit 604ab08

File tree

5 files changed

+459
-213
lines changed

5 files changed

+459
-213
lines changed

contrib/fuzzystrmatch/fuzzystrmatch.c

Lines changed: 31 additions & 213 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,6 @@
99
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010
* ALL RIGHTS RESERVED;
1111
*
12-
* levenshtein()
13-
* -------------
14-
* Written based on a description of the algorithm by Michael Gilleland
15-
* found at http://www.merriampark.com/ld.htm
16-
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
17-
* inspiration.
18-
* Configurable penalty costs extension is introduced by Volkan
19-
* YAZICI <volkan.yazici@gmail.com>.
20-
*
2112
* metaphone()
2213
* -----------
2314
* Modified for PostgreSQL by Joe Conway.
@@ -61,6 +52,8 @@ PG_MODULE_MAGIC;
6152
*/
6253
extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS);
6354
extern Datum levenshtein(PG_FUNCTION_ARGS);
55+
extern Datum levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS);
56+
extern Datum levenshtein_less_equal(PG_FUNCTION_ARGS);
6457
extern Datum metaphone(PG_FUNCTION_ARGS);
6558
extern Datum soundex(PG_FUNCTION_ARGS);
6659
extern Datum difference(PG_FUNCTION_ARGS);
@@ -85,16 +78,6 @@ soundex_code(char letter)
8578
return letter;
8679
}
8780

88-
89-
/*
90-
* Levenshtein
91-
*/
92-
#define MAX_LEVENSHTEIN_STRLEN 255
93-
94-
static int levenshtein_internal(text *s, text *t,
95-
int ins_c, int del_c, int sub_c);
96-
97-
9881
/*
9982
* Metaphone
10083
*/
@@ -197,224 +180,59 @@ rest_of_char_same(const char *s1, const char *s2, int len)
197180
return true;
198181
}
199182

200-
/*
201-
* levenshtein_internal - Calculates Levenshtein distance metric
202-
* between supplied strings. Generally
203-
* (1, 1, 1) penalty costs suffices common
204-
* cases, but your mileage may vary.
205-
*/
206-
static int
207-
levenshtein_internal(text *s, text *t,
208-
int ins_c, int del_c, int sub_c)
209-
{
210-
int m,
211-
n,
212-
s_bytes,
213-
t_bytes;
214-
int *prev;
215-
int *curr;
216-
int *s_char_len = NULL;
217-
int i,
218-
j;
219-
const char *s_data;
220-
const char *t_data;
221-
const char *y;
222-
223-
/* Extract a pointer to the actual character data. */
224-
s_data = VARDATA_ANY(s);
225-
t_data = VARDATA_ANY(t);
226-
227-
/* Determine length of each string in bytes and characters. */
228-
s_bytes = VARSIZE_ANY_EXHDR(s);
229-
t_bytes = VARSIZE_ANY_EXHDR(t);
230-
m = pg_mbstrlen_with_len(s_data, s_bytes);
231-
n = pg_mbstrlen_with_len(t_data, t_bytes);
232-
233-
/*
234-
* We can transform an empty s into t with n insertions, or a non-empty t
235-
* into an empty s with m deletions.
236-
*/
237-
if (!m)
238-
return n * ins_c;
239-
if (!n)
240-
return m * del_c;
241-
242-
/*
243-
* For security concerns, restrict excessive CPU+RAM usage. (This
244-
* implementation uses O(m) memory and has O(mn) complexity.)
245-
*/
246-
if (m > MAX_LEVENSHTEIN_STRLEN ||
247-
n > MAX_LEVENSHTEIN_STRLEN)
248-
ereport(ERROR,
249-
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
250-
errmsg("argument exceeds the maximum length of %d bytes",
251-
MAX_LEVENSHTEIN_STRLEN)));
252-
253-
/*
254-
* In order to avoid calling pg_mblen() repeatedly on each character in s,
255-
* we cache all the lengths before starting the main loop -- but if all the
256-
* characters in both strings are single byte, then we skip this and use
257-
* a fast-path in the main loop. If only one string contains multi-byte
258-
* characters, we still build the array, so that the fast-path needn't
259-
* deal with the case where the array hasn't been initialized.
260-
*/
261-
if (m != s_bytes || n != t_bytes)
262-
{
263-
int i;
264-
const char *cp = s_data;
265-
266-
s_char_len = (int *) palloc((m + 1) * sizeof(int));
267-
for (i = 0; i < m; ++i)
268-
{
269-
s_char_len[i] = pg_mblen(cp);
270-
cp += s_char_len[i];
271-
}
272-
s_char_len[i] = 0;
273-
}
274-
275-
/* One more cell for initialization column and row. */
276-
++m;
277-
++n;
278-
279-
/*
280-
* One way to compute Levenshtein distance is to incrementally construct
281-
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
282-
* of operations required to transform the first i characters of s into
283-
* the first j characters of t. The last column of the final row is the
284-
* answer.
285-
*
286-
* We use that algorithm here with some modification. In lieu of holding
287-
* the entire array in memory at once, we'll just use two arrays of size
288-
* m+1 for storing accumulated values. At each step one array represents
289-
* the "previous" row and one is the "current" row of the notional large
290-
* array.
291-
*/
292-
prev = (int *) palloc(2 * m * sizeof(int));
293-
curr = prev + m;
294-
295-
/*
296-
* To transform the first i characters of s into the first 0 characters
297-
* of t, we must perform i deletions.
298-
*/
299-
for (i = 0; i < m; i++)
300-
prev[i] = i * del_c;
301-
302-
/* Loop through rows of the notional array */
303-
for (y = t_data, j = 1; j < n; j++)
304-
{
305-
int *temp;
306-
const char *x = s_data;
307-
int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
308-
309-
/*
310-
* To transform the first 0 characters of s into the first j
311-
* characters of t, we must perform j insertions.
312-
*/
313-
curr[0] = j * ins_c;
314-
315-
/*
316-
* This inner loop is critical to performance, so we include a
317-
* fast-path to handle the (fairly common) case where no multibyte
318-
* characters are in the mix. The fast-path is entitled to assume
319-
* that if s_char_len is not initialized then BOTH strings contain
320-
* only single-byte characters.
321-
*/
322-
if (s_char_len != NULL)
323-
{
324-
for (i = 1; i < m; i++)
325-
{
326-
int ins;
327-
int del;
328-
int sub;
329-
int x_char_len = s_char_len[i - 1];
330-
331-
/*
332-
* Calculate costs for insertion, deletion, and substitution.
333-
*
334-
* When calculating cost for substitution, we compare the last
335-
* character of each possibly-multibyte character first,
336-
* because that's enough to rule out most mis-matches. If we
337-
* get past that test, then we compare the lengths and the
338-
* remaining bytes.
339-
*/
340-
ins = prev[i] + ins_c;
341-
del = curr[i - 1] + del_c;
342-
if (x[x_char_len-1] == y[y_char_len-1]
343-
&& x_char_len == y_char_len &&
344-
(x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
345-
sub = prev[i - 1];
346-
else
347-
sub = prev[i - 1] + sub_c;
348-
349-
/* Take the one with minimum cost. */
350-
curr[i] = Min(ins, del);
351-
curr[i] = Min(curr[i], sub);
352-
353-
/* Point to next character. */
354-
x += x_char_len;
355-
}
356-
}
357-
else
358-
{
359-
for (i = 1; i < m; i++)
360-
{
361-
int ins;
362-
int del;
363-
int sub;
183+
#include "levenshtein.c"
184+
#define LEVENSHTEIN_LESS_EQUAL
185+
#include "levenshtein.c"
364186

365-
/* Calculate costs for insertion, deletion, and substitution. */
366-
ins = prev[i] + ins_c;
367-
del = curr[i - 1] + del_c;
368-
sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
369-
370-
/* Take the one with minimum cost. */
371-
curr[i] = Min(ins, del);
372-
curr[i] = Min(curr[i], sub);
187+
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
188+
Datum
189+
levenshtein_with_costs(PG_FUNCTION_ARGS)
190+
{
191+
text *src = PG_GETARG_TEXT_PP(0);
192+
text *dst = PG_GETARG_TEXT_PP(1);
193+
int ins_c = PG_GETARG_INT32(2);
194+
int del_c = PG_GETARG_INT32(3);
195+
int sub_c = PG_GETARG_INT32(4);
373196

374-
/* Point to next character. */
375-
x++;
376-
}
377-
}
197+
PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
198+
}
378199

379-
/* Swap current row with previous row. */
380-
temp = curr;
381-
curr = prev;
382-
prev = temp;
383200

384-
/* Point to next character. */
385-
y += y_char_len;
386-
}
201+
PG_FUNCTION_INFO_V1(levenshtein);
202+
Datum
203+
levenshtein(PG_FUNCTION_ARGS)
204+
{
205+
text *src = PG_GETARG_TEXT_PP(0);
206+
text *dst = PG_GETARG_TEXT_PP(1);
387207

388-
/*
389-
* Because the final value was swapped from the previous row to the
390-
* current row, that's where we'll find it.
391-
*/
392-
return prev[m - 1];
208+
PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
393209
}
394210

395211

396-
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
212+
PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
397213
Datum
398-
levenshtein_with_costs(PG_FUNCTION_ARGS)
214+
levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
399215
{
400216
text *src = PG_GETARG_TEXT_PP(0);
401217
text *dst = PG_GETARG_TEXT_PP(1);
402218
int ins_c = PG_GETARG_INT32(2);
403219
int del_c = PG_GETARG_INT32(3);
404220
int sub_c = PG_GETARG_INT32(4);
221+
int max_d = PG_GETARG_INT32(5);
405222

406-
PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
223+
PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
407224
}
408225

409226

410-
PG_FUNCTION_INFO_V1(levenshtein);
227+
PG_FUNCTION_INFO_V1(levenshtein_less_equal);
411228
Datum
412-
levenshtein(PG_FUNCTION_ARGS)
229+
levenshtein_less_equal(PG_FUNCTION_ARGS)
413230
{
414231
text *src = PG_GETARG_TEXT_PP(0);
415232
text *dst = PG_GETARG_TEXT_PP(1);
233+
int max_d = PG_GETARG_INT32(2);
416234

417-
PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
235+
PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
418236
}
419237

420238

contrib/fuzzystrmatch/fuzzystrmatch.sql.in

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ CREATE OR REPLACE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
1111
AS 'MODULE_PATHNAME','levenshtein_with_costs'
1212
LANGUAGE C IMMUTABLE STRICT;
1313

14+
CREATE OR REPLACE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
15+
AS 'MODULE_PATHNAME','levenshtein_less_equal'
16+
LANGUAGE C IMMUTABLE STRICT;
17+
18+
CREATE OR REPLACE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
19+
AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
20+
LANGUAGE C IMMUTABLE STRICT;
21+
1422
CREATE OR REPLACE FUNCTION metaphone (text,int) RETURNS text
1523
AS 'MODULE_PATHNAME','metaphone'
1624
LANGUAGE C IMMUTABLE STRICT;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy