Skip to content

Commit 324300b

Browse files
committed
improve support of agglutinative languages (query with compound words).
regression=# select to_tsquery( '\'fotballklubber\''); to_tsquery ------------------------------------------------ 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb' (1 row) So, the interface to dictionaries has changed: the lexize method of a dictionary should return a pointer to an array of TSLexeme structs instead of char**. The last element should have TSLexeme->lexeme == NULL. typedef struct { /* number of the variant of a split word, for example: the word 'fotballklubber' (Norwegian) has two variants of splitting: ( fotball, klubb ) and ( fot, ball, klubb ). So, the dictionary should return: nvariant lexeme 1 fotball 1 klubb 2 fot 2 ball 2 klubb */ uint16 nvariant; /* currently unused */ uint16 flags; /* C-string */ char *lexeme; } TSLexeme;
1 parent d314616 commit 324300b

File tree

12 files changed

+146
-85
lines changed

12 files changed

+146
-85
lines changed

contrib/tsearch2/dict.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -183,15 +183,15 @@ lexize(PG_FUNCTION_ARGS)
183183
{
184184
text *in = PG_GETARG_TEXT_P(1);
185185
DictInfo *dict;
186-
char **res,
187-
**ptr;
186+
TSLexeme *res,
187+
*ptr;
188188
Datum *da;
189189
ArrayType *a;
190190

191191
SET_FUNCOID();
192192
dict = finddict(PG_GETARG_OID(0));
193193

194-
ptr = res = (char **) DatumGetPointer(
194+
ptr = res = (TSLexeme *) DatumGetPointer(
195195
FunctionCall3(&(dict->lexize_info),
196196
PointerGetDatum(dict->dictionary),
197197
PointerGetDatum(VARDATA(in)),
@@ -207,13 +207,13 @@ lexize(PG_FUNCTION_ARGS)
207207
PG_RETURN_NULL();
208208
}
209209

210-
while (*ptr)
210+
while (ptr->lexeme)
211211
ptr++;
212212
da = (Datum *) palloc(sizeof(Datum) * (ptr - res + 1));
213213
ptr = res;
214-
while (*ptr)
214+
while (ptr->lexeme)
215215
{
216-
da[ptr - res] = PointerGetDatum(char2text(*ptr));
216+
da[ptr - res] = PointerGetDatum(char2text(ptr->lexeme));
217217
ptr++;
218218
}
219219

@@ -227,10 +227,10 @@ lexize(PG_FUNCTION_ARGS)
227227
);
228228

229229
ptr = res;
230-
while (*ptr)
230+
while (ptr->lexeme)
231231
{
232232
pfree(DatumGetPointer(da[ptr - res]));
233-
pfree(*ptr);
233+
pfree(ptr->lexeme);
234234
ptr++;
235235
}
236236
pfree(res);

contrib/tsearch2/dict.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,27 @@ typedef struct
3838

3939
void parse_cfgdict(text *in, Map ** m);
4040

41+
/* return struct for any lexize function */
42+
typedef struct {
43+
/* number of variant of split word , for example
44+
Word 'fotballklubber' (Norwegian) has two variants of splitting:
45+
( fotball, klubb ) and ( fot, ball, klubb ). So, dictionary
46+
should return:
47+
nvariant lexeme
48+
1 fotball
49+
1 klubb
50+
2 fot
51+
2 ball
52+
2 klubb
53+
54+
*/
55+
uint16 nvariant;
56+
57+
/* currently unused */
58+
uint16 flags;
59+
60+
/* C-string */
61+
char *lexeme;
62+
} TSLexeme;
63+
4164
#endif

contrib/tsearch2/dict_ex.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,16 @@ dex_lexize(PG_FUNCTION_ARGS)
5454
DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
5555
char *in = (char *) PG_GETARG_POINTER(1);
5656
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
57-
char **res = palloc(sizeof(char *) * 2);
57+
TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
58+
59+
memset(res,0,sizeof(TSLexeme) * 2);
5860

5961
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
6062
{
6163
pfree(txt);
62-
res[0] = NULL;
6364
}
6465
else
65-
res[0] = txt;
66-
res[1] = NULL;
66+
res[0].lexeme = txt;
6767

6868
PG_RETURN_POINTER(res);
6969
}

contrib/tsearch2/dict_ispell.c

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -159,14 +159,13 @@ spell_lexize(PG_FUNCTION_ARGS)
159159
DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
160160
char *in = (char *) PG_GETARG_POINTER(1);
161161
char *txt;
162-
char **res;
163-
char **ptr,
164-
**cptr;
162+
TSLexeme *res;
163+
TSLexeme *ptr,
164+
*cptr;
165165

166166
if (!PG_GETARG_INT32(2))
167167
PG_RETURN_POINTER(NULL);
168168

169-
res = palloc(sizeof(char *) * 2);
170169
txt = pnstrdup(in, PG_GETARG_INT32(2));
171170
res = NINormalizeWord(&(d->obj), txt);
172171
pfree(txt);
@@ -175,22 +174,22 @@ spell_lexize(PG_FUNCTION_ARGS)
175174
PG_RETURN_POINTER(NULL);
176175

177176
ptr = cptr = res;
178-
while (*ptr)
177+
while (ptr->lexeme)
179178
{
180-
if (searchstoplist(&(d->stoplist), *ptr))
179+
if (searchstoplist(&(d->stoplist), ptr->lexeme))
181180
{
182-
pfree(*ptr);
183-
*ptr = NULL;
181+
pfree(ptr->lexeme);
182+
ptr->lexeme = NULL;
184183
ptr++;
185184
}
186185
else
187186
{
188-
*cptr = *ptr;
187+
memcpy(cptr, ptr, sizeof(TSLexeme));
189188
cptr++;
190189
ptr++;
191190
}
192191
}
193-
*cptr = NULL;
192+
cptr->lexeme = NULL;
194193

195194
PG_RETURN_POINTER(res);
196195
}

contrib/tsearch2/dict_snowball.c

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -105,12 +105,12 @@ snb_lexize(PG_FUNCTION_ARGS)
105105
DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
106106
char *in = (char *) PG_GETARG_POINTER(1);
107107
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
108-
char **res = palloc(sizeof(char *) * 2);
108+
TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
109109

110+
memset(res, 0, sizeof(TSLexeme) * 2);
110111
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
111112
{
112113
pfree(txt);
113-
res[0] = NULL;
114114
}
115115
else
116116
{
@@ -122,10 +122,8 @@ snb_lexize(PG_FUNCTION_ARGS)
122122
memcpy(txt, d->z->p, d->z->l);
123123
txt[d->z->l] = '\0';
124124
}
125-
res[0] = txt;
125+
res->lexeme = txt;
126126
}
127-
res[1] = NULL;
128-
129127

130128
PG_RETURN_POINTER(res);
131129
}

contrib/tsearch2/dict_syn.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ syn_lexize(PG_FUNCTION_ARGS)
162162
char *in = (char *) PG_GETARG_POINTER(1);
163163
Syn key,
164164
*found;
165-
char **res = NULL;
165+
TSLexeme *res = NULL;
166166

167167
if (!PG_GETARG_INT32(2))
168168
PG_RETURN_POINTER(NULL);
@@ -176,10 +176,9 @@ syn_lexize(PG_FUNCTION_ARGS)
176176
if (!found)
177177
PG_RETURN_POINTER(NULL);
178178

179-
res = palloc(sizeof(char *) * 2);
180-
181-
res[0] = pstrdup(found->out);
182-
res[1] = NULL;
179+
res = palloc(sizeof(TSLexeme) * 2);
180+
memset(res,0,sizeof(TSLexeme) * 2);
181+
res[0].lexeme = pstrdup(found->out);
183182

184183
PG_RETURN_POINTER(res);
185184
}

contrib/tsearch2/gendict/dict_tmpl.c.IN

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,15 @@ dlexize_CFG_MODNAME(PG_FUNCTION_ARGS) {
5252
HASINIT DictExample *d = (DictExample*)PG_GETARG_POINTER(0);
5353
char *in = (char*)PG_GETARG_POINTER(1);
5454
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
55-
char **res=palloc(sizeof(char*)*2);
55+
TSLexeme *res=palloc(sizeof(TSLexeme*)*2);
5656

57-
/* Your INIT dictionary code */
57+
/* Your LEXIZE dictionary code */
5858
HASINIT if ( *txt=='\0' || searchstoplist(&(d->stoplist),txt) ) {
5959
HASINIT pfree(txt);
60-
HASINIT res[0]=NULL;
60+
HASINIT res[0].lexeme=NULL;
6161
HASINIT } else
62-
res[0]=txt;
63-
res[1]=NULL;
62+
res[0].lexeme=txt;
63+
res[1].lexeme=NULL;
6464

6565
PG_RETURN_POINTER(res);
6666
}

contrib/tsearch2/ispell/spell.c

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,17 +1119,32 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
11191119
return var;
11201120
}
11211121

1122-
char **
1122+
TSLexeme *
11231123
NINormalizeWord(IspellDict * Conf, char *word)
11241124
{
11251125
char **res = NormalizeSubWord(Conf, word, 0);
1126+
TSLexeme *lcur=NULL, *lres=NULL;
1127+
u_int16_t NVariant=1;
1128+
1129+
if (res) {
1130+
char **ptr = res;
1131+
lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
1132+
while(*ptr) {
1133+
lcur->lexeme=*ptr;
1134+
lcur->flags=0;
1135+
lcur->nvariant = NVariant++;
1136+
lcur++;
1137+
ptr++;
1138+
}
1139+
lcur->lexeme=NULL;
1140+
pfree(res);
1141+
}
11261142

11271143
if (Conf->compoundcontrol != '\t')
11281144
{
11291145
int wordlen = strlen(word);
11301146
SplitVar *ptr,
11311147
*var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
1132-
char **cur = res;
11331148
int i;
11341149

11351150
while (var)
@@ -1140,30 +1155,31 @@ NINormalizeWord(IspellDict * Conf, char *word)
11401155

11411156
if (subres)
11421157
{
1143-
char **ptr = subres;
1158+
char **subptr = subres;
1159+
1160+
if ( !lcur )
1161+
lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
1162+
1163+
while(*subptr) {
1164+
for(i=0;i<var->nstem-1;i++) {
1165+
lcur->lexeme=(subptr==subres) ? var->stem[ i ] : pstrdup(var->stem[ i ]);
1166+
lcur->flags=0;
1167+
lcur->nvariant = NVariant;
1168+
lcur++;
1169+
}
11441170

1145-
if (cur)
1146-
{
1147-
while (*cur)
1148-
cur++;
1149-
}
1150-
else
1151-
res = cur = (char **) palloc(MAX_NORM * sizeof(char *));
1171+
lcur->lexeme=*subptr;
1172+
lcur->flags=0;
1173+
lcur->nvariant = NVariant;
1174+
lcur++;
1175+
subptr++;
1176+
NVariant++;
1177+
}
11521178

1153-
for (i = 0; i < var->nstem - 1; i++)
1154-
{
1155-
*cur = var->stem[i];
1156-
cur++;
1157-
}
1158-
while (*ptr)
1159-
{
1160-
*cur = *ptr;
1161-
cur++;
1162-
ptr++;
1163-
}
1164-
*cur = NULL;
1179+
lcur->lexeme=NULL;
11651180
pfree(subres);
11661181
var->stem[0] = NULL;
1182+
pfree( var->stem[ var->nstem-1 ] );
11671183
}
11681184
}
11691185

@@ -1175,7 +1191,7 @@ NINormalizeWord(IspellDict * Conf, char *word)
11751191
var = ptr;
11761192
}
11771193
}
1178-
return res;
1194+
return lres;
11791195
}
11801196

11811197

contrib/tsearch2/ispell/spell.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33

44
#include <sys/types.h>
55
#include "regex/regex.h"
6-
#include "regis.h"
76
#include "c.h"
87

9-
8+
#include "regis.h"
9+
#include "dict.h"
10+
1011
struct SPNode;
1112

1213

@@ -116,7 +117,7 @@ typedef struct
116117

117118
} IspellDict;
118119

119-
char **NINormalizeWord(IspellDict * Conf, char *word);
120+
TSLexeme *NINormalizeWord(IspellDict * Conf, char *word);
120121
int NIImportAffixes(IspellDict * Conf, const char *filename);
121122
int NIImportDictionary(IspellDict * Conf, const char *filename);
122123

contrib/tsearch2/query.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
265265
{
266266
int4 count = 0;
267267
PRSTEXT prs;
268+
uint32 variant, pos, cntvar=0, cntpos=0, cnt=0;
268269

269270
prs.lenwords = 32;
270271
prs.curwords = 0;
@@ -273,17 +274,39 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
273274

274275
parsetext_v2(findcfg(state->cfg_id), &prs, strval, lenval);
275276

276-
for (count = 0; count < prs.curwords; count++)
277-
{
278-
pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
279-
pfree(prs.words[count].word);
280-
if (count)
281-
pushquery(state, OPR, (int4) '&', 0, 0, 0);
282-
}
283-
pfree(prs.words);
277+
if ( prs.curwords>0 ) {
278+
279+
while (count < prs.curwords) {
280+
pos = prs.words[count].pos.pos;
281+
cntvar=0;
282+
while(count < prs.curwords && pos==prs.words[count].pos.pos) {
283+
variant = prs.words[count].nvariant;
284+
285+
cnt=0;
286+
while(count < prs.curwords && pos==prs.words[count].pos.pos && variant==prs.words[count].nvariant) {
287+
288+
pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
289+
pfree(prs.words[count].word);
290+
if ( cnt )
291+
pushquery(state, OPR, (int4) '&', 0, 0, 0);
292+
cnt++;
293+
count++;
294+
}
295+
296+
if ( cntvar )
297+
pushquery(state, OPR, (int4) '|', 0, 0, 0);
298+
cntvar++;
299+
}
300+
301+
if (cntpos)
302+
pushquery(state, OPR, (int4) '&', 0, 0, 0);
303+
304+
cntpos++;
305+
}
306+
307+
pfree(prs.words);
284308

285-
/* XXX */
286-
if (prs.curwords == 0)
309+
} else
287310
pushval_asis(state, VALSTOP, NULL, 0, 0);
288311
}
289312

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy