Skip to content

Commit 1b24887

Browse files
committed
Allow multi-character source strings in contrib/unaccent.
This could be useful in languages where diacritic signs are represented as separate characters; more generally it supports using unaccent dictionaries for substring substitutions beyond narrowly conceived "diacritic removal". In any case, since the rule-file parser doesn't complain about multi-character source strings, it behooves us to do something unsurprising with them.
1 parent 97c40ce commit 1b24887

File tree

2 files changed

+67
-32
lines changed

2 files changed

+67
-32
lines changed

contrib/unaccent/unaccent.c

Lines changed: 59 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,16 @@
2323
PG_MODULE_MAGIC;
2424

2525
/*
26-
* Unaccent dictionary uses a trie to find a character to replace. Each node of
27-
* the trie is an array of 256 TrieChar structs (n-th element of array
28-
* corresponds to byte)
26+
* An unaccent dictionary uses a trie to find a string to replace. Each node
27+
* of the trie is an array of 256 TrieChar structs; the N-th element of the
28+
* array corresponds to next byte value N. That element can contain both a
29+
* replacement string (to be used if the source string ends with this byte)
30+
* and a link to another trie node (to be followed if there are more bytes).
31+
*
32+
* Note that the trie search logic pays no attention to multibyte character
33+
* boundaries. This is OK as long as both the data entered into the trie and
34+
* the data we're trying to look up are validly encoded; no partial-character
35+
* matches will occur.
2936
*/
3037
typedef struct TrieChar
3138
{
@@ -36,34 +43,38 @@ typedef struct TrieChar
3643

3744
/*
3845
* placeChar - put str into trie's structure, byte by byte.
46+
*
47+
* If node is NULL, we need to make a new node, which will be returned;
48+
* otherwise the return value is the same as node.
3949
*/
4050
static TrieChar *
41-
placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
51+
placeChar(TrieChar *node, const unsigned char *str, int lenstr,
52+
const char *replaceTo, int replacelen)
4253
{
4354
TrieChar *curnode;
4455

4556
if (!node)
46-
{
47-
node = palloc(sizeof(TrieChar) * 256);
48-
memset(node, 0, sizeof(TrieChar) * 256);
49-
}
57+
node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
58+
59+
Assert(lenstr > 0); /* else str[0] doesn't exist */
5060

5161
curnode = node + *str;
5262

53-
if (lenstr == 1)
63+
if (lenstr <= 1)
5464
{
5565
if (curnode->replaceTo)
56-
elog(WARNING, "duplicate TO argument, use first one");
66+
elog(WARNING, "duplicate source strings, first one will be used");
5767
else
5868
{
5969
curnode->replacelen = replacelen;
60-
curnode->replaceTo = palloc(replacelen);
70+
curnode->replaceTo = (char *) palloc(replacelen);
6171
memcpy(curnode->replaceTo, replaceTo, replacelen);
6272
}
6373
}
6474
else
6575
{
66-
curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
76+
curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
77+
replaceTo, replacelen);
6778
}
6879

6980
return node;
@@ -213,23 +224,35 @@ initTrie(char *filename)
213224
}
214225

215226
/*
216-
* findReplaceTo - find multibyte character in trie
227+
* findReplaceTo - find longest possible match in trie
228+
*
229+
* On success, returns pointer to ending subnode, plus length of matched
230+
* source string in *p_matchlen. On failure, returns NULL.
217231
*/
218232
static TrieChar *
219-
findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
233+
findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
234+
int *p_matchlen)
220235
{
221-
while (node)
236+
TrieChar *result = NULL;
237+
int matchlen = 0;
238+
239+
*p_matchlen = 0; /* prevent uninitialized-variable warnings */
240+
241+
while (node && matchlen < srclen)
222242
{
223-
node = node + *src;
224-
if (srclen == 1)
225-
return node;
243+
node = node + src[matchlen];
244+
matchlen++;
245+
246+
if (node->replaceTo)
247+
{
248+
result = node;
249+
*p_matchlen = matchlen;
250+
}
226251

227-
src++;
228-
srclen--;
229252
node = node->nextChar;
230253
}
231254

232-
return NULL;
255+
return result;
233256
}
234257

235258
PG_FUNCTION_INFO_V1(unaccent_init);
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
280303
TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
281304
char *srcchar = (char *) PG_GETARG_POINTER(1);
282305
int32 len = PG_GETARG_INT32(2);
283-
char *srcstart,
306+
char *srcstart = srcchar,
284307
*trgchar = NULL;
285-
int charlen;
286308
TSLexeme *res = NULL;
287-
TrieChar *node;
288309

289-
srcstart = srcchar;
290-
while (srcchar - srcstart < len)
310+
while (len > 0)
291311
{
292-
charlen = pg_mblen(srcchar);
312+
TrieChar *node;
313+
int matchlen;
293314

294-
node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
315+
node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
316+
&matchlen);
295317
if (node && node->replaceTo)
296318
{
297319
if (!res)
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
309331
memcpy(trgchar, node->replaceTo, node->replacelen);
310332
trgchar += node->replacelen;
311333
}
312-
else if (res)
334+
else
313335
{
314-
memcpy(trgchar, srcchar, charlen);
315-
trgchar += charlen;
336+
matchlen = pg_mblen(srcchar);
337+
if (res)
338+
{
339+
memcpy(trgchar, srcchar, matchlen);
340+
trgchar += matchlen;
341+
}
316342
}
317343

318-
srcchar += charlen;
344+
srcchar += matchlen;
345+
len -= matchlen;
319346
}
320347

321348
if (res)

doc/src/sgml/unaccent.sgml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@
7070
</para>
7171
</listitem>
7272

73+
<listitem>
74+
<para>
75+
More generally, each <quote>character</> can be any string not containing
76+
whitespace, so <filename>unaccent</> dictionaries can also be used for
77+
substring substitutions other than diacritic removal.
78+
</para>
79+
</listitem>
80+
7381
<listitem>
7482
<para>
7583
As with other <productname>PostgreSQL</> text search configuration files,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy