Skip to content

Commit 22505f4

Browse files
committed
Add thesaurus dictionary which can replace N>0 lexemes by M>0 lexemes.
It required some changes in lexize algorithm, but interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
1 parent 3b7ed9b commit 22505f4

File tree

13 files changed

+1257
-129
lines changed

13 files changed

+1257
-129
lines changed

contrib/tsearch2/Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
1+
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $
22

33
MODULE_big = tsearch2
44
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
5-
dict_snowball.o dict_ispell.o dict_syn.o \
5+
dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
66
wparser.o wparser_def.o \
77
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
88
tsvector_op.o rank.o ts_stat.o \
99
query_util.o query_support.o query_rewrite.o query_gist.o \
10-
ts_locale.o ginidx.o
10+
ts_locale.o ts_lexize.o ginidx.o
1111

1212
SUBDIRS := snowball ispell wordparser
1313
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
@@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS)
1616

1717
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
1818

19-
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
19+
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
2020
DATA_built = tsearch2.sql untsearch2.sql
2121
DOCS = README.tsearch2
2222
REGRESS = tsearch2

contrib/tsearch2/common.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "catalog/pg_proc.h"
66
#include "catalog/pg_namespace.h"
77
#include "utils/syscache.h"
8+
#include "miscadmin.h"
89

910
#include "ts_cfg.h"
1011
#include "dict.h"
@@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)
163164

164165
return nspoid;
165166
}
167+
168+
/* if path is relative, take it as relative to share dir */
169+
char *
170+
to_absfilename(char *filename) {
171+
if (!is_absolute_path(filename)) {
172+
char sharepath[MAXPGPATH];
173+
char *absfn;
174+
#ifdef WIN32
175+
char delim = '\\';
176+
#else
177+
char delim = '/';
178+
#endif
179+
get_share_path(my_exec_path, sharepath);
180+
absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
181+
sprintf(absfn, "%s%c%s", sharepath, delim, filename);
182+
filename = absfn;
183+
}
184+
185+
return filename;
186+
}

contrib/tsearch2/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ text *mtextdup(text *in);
1616

1717
int text_cmp(text *a, text *b);
1818

19+
char * to_absfilename(char *filename);
20+
1921
#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
2022
#define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
2123

contrib/tsearch2/dict.c

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */
22

33
/*
44
* interface functions to dictionary
@@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
5050
Datum opt;
5151
Oid oid = InvalidOid;
5252

53+
/* setup dictlexize method */
54+
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
55+
if (isnull || oid == InvalidOid)
56+
ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
57+
fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
58+
59+
/* set up and, optionally, call the dictinit method */
5360
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
5461
if (!(isnull || oid == InvalidOid))
5562
{
5663
opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
5764
dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
5865
}
59-
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
60-
if (isnull || oid == InvalidOid)
61-
ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
62-
fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
6366
dict->dict_id = id;
6467
}
6568
else
@@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
98101
return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
99102
}
100103

104+
static void
105+
insertdict(Oid id) {
106+
DictInfo newdict;
107+
108+
if (DList.len == DList.reallen)
109+
{
110+
DictInfo *tmp;
111+
int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
112+
113+
tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
114+
if (!tmp)
115+
ts_error(ERROR, "No memory");
116+
DList.reallen = reallen;
117+
DList.list = tmp;
118+
}
119+
init_dict(id, &newdict);
120+
121+
DList.list[DList.len] = newdict;
122+
DList.len++;
123+
124+
qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
125+
}
126+
101127
DictInfo *
102128
finddict(Oid id)
103129
{
@@ -117,23 +143,8 @@ finddict(Oid id)
117143
return DList.last_dict;
118144
}
119145

120-
/* last chance */
121-
if (DList.len == DList.reallen)
122-
{
123-
DictInfo *tmp;
124-
int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
125-
126-
tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
127-
if (!tmp)
128-
ts_error(ERROR, "No memory");
129-
DList.reallen = reallen;
130-
DList.list = tmp;
131-
}
132-
DList.last_dict = &(DList.list[DList.len]);
133-
init_dict(id, DList.last_dict);
134-
135-
DList.len++;
136-
qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
146+
/* insert new dictionary */
147+
insertdict(id);
137148
return finddict(id); /* qsort changed order!! */ ;
138149
}
139150

@@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
190201
*ptr;
191202
Datum *da;
192203
ArrayType *a;
204+
DictSubState dstate = { false, false, NULL };
193205

194206
SET_FUNCOID();
195207
dict = finddict(PG_GETARG_OID(0));
196208

197209
ptr = res = (TSLexeme *) DatumGetPointer(
198-
FunctionCall3(&(dict->lexize_info),
210+
FunctionCall4(&(dict->lexize_info),
211+
PointerGetDatum(dict->dictionary),
212+
PointerGetDatum(VARDATA(in)),
213+
Int32GetDatum(VARSIZE(in) - VARHDRSZ),
214+
PointerGetDatum(&dstate)
215+
)
216+
);
217+
218+
if (dstate.getnext) {
219+
dstate.isend = true;
220+
ptr = res = (TSLexeme *) DatumGetPointer(
221+
FunctionCall4(&(dict->lexize_info),
199222
PointerGetDatum(dict->dictionary),
200223
PointerGetDatum(VARDATA(in)),
201-
Int32GetDatum(VARSIZE(in) - VARHDRSZ)
224+
Int32GetDatum(VARSIZE(in) - VARHDRSZ),
225+
PointerGetDatum(&dstate)
202226
)
203227
);
228+
}
229+
204230
PG_FREE_IF_COPY(in, 1);
205231
if (!res)
206232
{

contrib/tsearch2/dict.h

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */
22

33
#ifndef __DICT_H__
44
#define __DICT_H__
55
#include "postgres.h"
66
#include "fmgr.h"
7+
#include "ts_cfg.h"
78

89
typedef struct
910
{
@@ -29,6 +30,11 @@ DictInfo *finddict(Oid id);
2930
Oid name2id_dict(text *name);
3031
void reset_dict(void);
3132

33+
typedef struct {
34+
bool isend; /* in: tells lexize_info that the end of the text has been reached */
35+
bool getnext; /* out: dict wants next lexeme */
36+
void *private; /* internal dict state between calls with getnext == true */
37+
} DictSubState;
3238

3339
/* simple parser of cfg string */
3440
typedef struct
@@ -45,17 +51,61 @@ typedef struct
4551
/*
4652
* number of the split variant of a word; for example, the word 'fotballklubber'
4753
* (Norwegian) has two variants of splitting: ( fotball, klubb ) and ( fot,
48-
* ball, klubb ). So, dictionary should return: nvariant lexeme 1
49-
* fotball 1 klubb 2 fot 2 ball 2 klubb
50-
*
54+
* ball, klubb ). So, dictionary should return:
55+
* nvariant lexeme
56+
* 1 fotball
57+
* 1 klubb
58+
* 2 fot
59+
* 2 ball
60+
* 2 klubb
5161
*/
5262
uint16 nvariant;
5363

54-
/* currently unused */
5564
uint16 flags;
5665

5766
/* C-string */
5867
char *lexeme;
5968
} TSLexeme;
6069

70+
#define TSL_ADDPOS 0x01
71+
72+
73+
/*
74+
* Lexize subsystem
75+
*/
76+
77+
typedef struct ParsedLex {
78+
int type;
79+
char *lemm;
80+
int lenlemm;
81+
bool resfollow;
82+
struct ParsedLex *next;
83+
} ParsedLex;
84+
85+
typedef struct ListParsedLex {
86+
ParsedLex *head;
87+
ParsedLex *tail;
88+
} ListParsedLex;
89+
90+
typedef struct {
91+
TSCfgInfo *cfg;
92+
Oid curDictId;
93+
int posDict;
94+
DictSubState dictState;
95+
ParsedLex *curSub;
96+
ListParsedLex towork; /* current list to work */
97+
ListParsedLex waste; /* list of lexemes that are already lexized */
98+
99+
/* fields to store last variant to lexize (basically, thesaurus
100+
or similar, which wants several lexemes) */
101+
102+
ParsedLex *lastRes;
103+
TSLexeme *tmpRes;
104+
} LexizeData;
105+
106+
107+
void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
108+
void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
109+
TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);
110+
61111
#endif

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy