Skip to content

Commit 57d9aef

Browse files
committed
Teach levenshtein() about multi-byte characters.
Based on a patch by, and further ideas from, Alexander Korotkov.
1 parent ad17ff9 commit 57d9aef

File tree

2 files changed

+122
-22
lines changed

2 files changed

+122
-22
lines changed

contrib/fuzzystrmatch/fuzzystrmatch.c

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
*
66
* Joe Conway <mail@joeconway.com>
77
*
8-
* $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/29 20:11:48 rhaas Exp $
8+
* $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23:20:23 rhaas Exp $
99
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010
* ALL RIGHTS RESERVED;
1111
*
@@ -50,6 +50,7 @@
5050
#include <ctype.h>
5151

5252
#include "fmgr.h"
53+
#include "mb/pg_wchar.h"
5354
#include "utils/builtins.h"
5455

5556
PG_MODULE_MAGIC;
@@ -183,6 +184,18 @@ getcode(char c)
183184
/* These prevent GH from becoming F */
184185
#define NOGHTOF(c) (getcode(c) & 16) /* BDH */
185186

187+
/* Faster than memcmp(), for this use case. */
188+
static bool inline
189+
rest_of_char_same(const char *s1, const char *s2, int len)
190+
{
191+
while (len > 0)
192+
{
193+
len--;
194+
if (s1[len] != s2[len])
195+
return false;
196+
}
197+
return true;
198+
}
186199

187200
/*
188201
* levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
195208
int ins_c, int del_c, int sub_c)
196209
{
197210
int m,
198-
n;
211+
n,
212+
s_bytes,
213+
t_bytes;
199214
int *prev;
200215
int *curr;
216+
int *s_char_len = NULL;
201217
int i,
202218
j;
203-
const char *x;
219+
const char *s_data;
220+
const char *t_data;
204221
const char *y;
205222

206-
m = VARSIZE_ANY_EXHDR(s);
207-
n = VARSIZE_ANY_EXHDR(t);
223+
/* Extract a pointer to the actual character data. */
224+
s_data = VARDATA_ANY(s);
225+
t_data = VARDATA_ANY(t);
226+
227+
/* Determine length of each string in bytes and characters. */
228+
s_bytes = VARSIZE_ANY_EXHDR(s);
229+
t_bytes = VARSIZE_ANY_EXHDR(t);
230+
m = pg_mbstrlen_with_len(s_data, s_bytes);
231+
n = pg_mbstrlen_with_len(t_data, t_bytes);
208232

209233
/*
210234
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
226250
errmsg("argument exceeds the maximum length of %d bytes",
227251
MAX_LEVENSHTEIN_STRLEN)));
228252

253+
/*
254+
* In order to avoid calling pg_mblen() repeatedly on each character in s,
255+
* we cache all the lengths before starting the main loop -- but if all the
256+
* characters in both strings are single byte, then we skip this and use
257+
* a fast-path in the main loop. If only one string contains multi-byte
258+
* characters, we still build the array, so that the fast-path needn't
259+
* deal with the case where the array hasn't been initialized.
260+
*/
261+
if (m != s_bytes || n != t_bytes)
262+
{
263+
int i;
264+
const char *cp = s_data;
265+
266+
s_char_len = (int *) palloc((m + 1) * sizeof(int));
267+
for (i = 0; i < m; ++i)
268+
{
269+
s_char_len[i] = pg_mblen(cp);
270+
cp += s_char_len[i];
271+
}
272+
s_char_len[i] = 0;
273+
}
274+
229275
/* One more cell for initialization column and row. */
230276
++m;
231277
++n;
@@ -244,36 +290,89 @@ levenshtein_internal(text *s, text *t,
244290
prev[i] = i * del_c;
245291

246292
/* Loop through rows of the notional array */
247-
for (y = VARDATA_ANY(t), j = 1; j < n; y++, j++)
293+
for (y = t_data, j = 1; j < n; j++)
248294
{
249295
int *temp;
296+
const char *x = s_data;
297+
int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
250298

251299
/*
252300
* First cell must increment sequentially, as we're on the j'th row of
253301
* the (m+1)x(n+1) array.
254302
*/
255303
curr[0] = j * ins_c;
256304

257-
for (x = VARDATA_ANY(s), i = 1; i < m; x++, i++)
305+
/*
306+
* This inner loop is critical to performance, so we include a
307+
* fast-path to handle the (fairly common) case where no multibyte
308+
* characters are in the mix. The fast-path is entitled to assume
309+
* that if s_char_len is not initialized then BOTH strings contain
310+
* only single-byte characters.
311+
*/
312+
if (s_char_len != NULL)
258313
{
259-
int ins;
260-
int del;
261-
int sub;
262-
263-
/* Calculate costs for probable operations. */
264-
ins = prev[i] + ins_c; /* Insertion */
265-
del = curr[i - 1] + del_c; /* Deletion */
266-
sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); /* Substitution */
267-
268-
/* Take the one with minimum cost. */
269-
curr[i] = Min(ins, del);
270-
curr[i] = Min(curr[i], sub);
314+
for (i = 1; i < m; i++)
315+
{
316+
int ins;
317+
int del;
318+
int sub;
319+
int x_char_len = s_char_len[i - 1];
320+
321+
/*
322+
* Calculate costs for insertion, deletion, and substitution.
323+
*
324+
* When calculating cost for substitution, we compare the last
325+
* character of each possibly-multibyte character first,
326+
* because that's enough to rule out most mis-matches. If we
327+
* get past that test, then we compare the lengths and the
328+
* remaining bytes.
329+
*/
330+
ins = prev[i] + ins_c;
331+
del = curr[i - 1] + del_c;
332+
if (x[x_char_len-1] == y[y_char_len-1]
333+
&& x_char_len == y_char_len &&
334+
(x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
335+
sub = prev[i - 1];
336+
else
337+
sub = prev[i - 1] + sub_c;
338+
339+
/* Take the one with minimum cost. */
340+
curr[i] = Min(ins, del);
341+
curr[i] = Min(curr[i], sub);
342+
343+
/* Point to next character. */
344+
x += x_char_len;
345+
}
346+
}
347+
else
348+
{
349+
for (i = 1; i < m; i++)
350+
{
351+
int ins;
352+
int del;
353+
int sub;
354+
355+
/* Calculate costs for insertion, deletion, and substitution. */
356+
ins = prev[i] + ins_c;
357+
del = curr[i - 1] + del_c;
358+
sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
359+
360+
/* Take the one with minimum cost. */
361+
curr[i] = Min(ins, del);
362+
curr[i] = Min(curr[i], sub);
363+
364+
/* Point to next character. */
365+
x++;
366+
}
271367
}
272368

273369
/* Swap current row with previous row. */
274370
temp = curr;
275371
curr = prev;
276372
prev = temp;
373+
374+
/* Point to next character. */
375+
y += y_char_len;
277376
}
278377

279378
/*

doc/src/sgml/fuzzystrmatch.sgml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.6 2010/07/29 19:34:40 petere Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.7 2010/08/02 23:20:23 rhaas Exp $ -->
22

33
<sect1 id="fuzzystrmatch">
44
<title>fuzzystrmatch</title>
@@ -14,8 +14,9 @@
1414

1515
<caution>
1616
<para>
17-
At present, <filename>fuzzystrmatch</> does not work well with
18-
multi-byte encodings (such as UTF-8).
17+
At present, the <function>soundex</>, <function>metaphone</>,
18+
<function>dmetaphone</>, and <function>dmetaphone_alt</> functions do
19+
not work well with multi-byte encodings (such as UTF-8).
1920
</para>
2021
</caution>
2122

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy