Skip to content

Commit 92bcb5a

Browse files
committed
Allow marking words in a substitution so that they are not lexized.
Docs will be submitted later; for now they are at http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
1 parent 63e464a commit 92bcb5a

File tree

2 files changed

+69
-30
lines changed

2 files changed

+69
-30
lines changed

contrib/tsearch2/dict_thesaurus.c

Lines changed: 60 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
22

33
/*
44
* thesaurus
@@ -13,6 +13,11 @@
1313
#include "common.h"
1414
#include "ts_locale.h"
1515

16+
/*
17+
* Temporarily we use TSLexeme.flags for internal use...
18+
*/
19+
#define DT_USEASIS 0x1000
20+
1621
typedef struct LexemeInfo {
1722
uint16 idsubst; /* entry's number in DictThesaurus->subst */
1823
uint16 posinsubst; /* pos info in entry */
@@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
9499
}
95100

96101
static void
97-
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
102+
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis ) {
98103
static int nres=0;
99104
static int ntres = 0;
100105
TheSubstitute *ptr;
@@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
138143
ptr->res[ nres ].lexeme[e-b] = '\0';
139144

140145
ptr->res[ nres ].nvariant = nwrd;
141-
ptr->res[ nres ].flags = TSL_ADDPOS;
146+
if ( useasis )
147+
ptr->res[ nres ].flags = DT_USEASIS;
148+
else
149+
ptr->res[ nres ].flags = 0;
142150

143151
ptr->res[ ++nres ].lexeme = NULL;
144152
}
@@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
154162
char str[BUFSIZ];
155163
int lineno=0;
156164
uint16 idsubst = 0;
165+
bool useasis=false;
157166

158167
fh = fopen(to_absfilename(filename), "r");
159168
if (!fh)
@@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
196205
state = TR_WAITLEX;
197206
}
198207
} else if ( state == TR_WAITSUBS ) {
199-
if ( !t_isspace(ptr) ) {
208+
if ( t_iseq(ptr, '*') ) {
209+
useasis = true;
210+
state = TR_INSUBS;
211+
beginwrd = ptr + pg_mblen(ptr);
212+
} else if ( t_iseq(ptr, '\\') ) {
213+
useasis = false;
214+
state = TR_INSUBS;
215+
beginwrd = ptr + pg_mblen(ptr);
216+
} else if ( !t_isspace(ptr) ) {
217+
useasis = false;
200218
beginwrd = ptr;
201219
state = TR_INSUBS;
202220
}
203221
} else if ( state == TR_INSUBS ) {
204222
if ( t_isspace(ptr) ) {
205-
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
223+
if ( ptr == beginwrd )
224+
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
225+
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
206226
state = TR_WAITSUBS;
207227
}
208228
} else
@@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
211231
ptr += pg_mblen(ptr);
212232
}
213233

214-
if ( state == TR_INSUBS )
215-
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
234+
if ( state == TR_INSUBS ) {
235+
if ( ptr == beginwrd )
236+
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
237+
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
238+
}
216239

217240
idsubst++;
218241

@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
319342
elog(ERROR,"Out of memory");
320343

321344
for(i=0;i<d->nwrds;i++) {
322-
TSLexeme *ptr = (TSLexeme*) DatumGetPointer(
345+
TSLexeme *ptr;
346+
347+
ptr = (TSLexeme*) DatumGetPointer(
323348
FunctionCall4(
324349
&(d->subdict.lexize_info),
325350
PointerGetDatum(d->subdict.dictionary),
@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
331356

332357
if ( !(ptr && ptr->lexeme) ) {
333358
if ( !ptr )
334-
elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary", d->wrds[i].lexeme);
359+
elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
360+
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1 );
335361
else
336-
elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word", d->wrds[i].lexeme);
362+
elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
363+
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1);
337364

338365
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
339366
} else {
@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
413440
inptr = rem;
414441

415442
while( inptr && inptr->lexeme ) {
416-
TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer(
417-
FunctionCall4(
418-
&(d->subdict.lexize_info),
419-
PointerGetDatum(d->subdict.dictionary),
420-
PointerGetDatum(inptr->lexeme),
421-
Int32GetDatum(strlen(inptr->lexeme)),
422-
PointerGetDatum(NULL)
423-
)
424-
);
443+
TSLexeme *lexized, tmplex[2];
444+
445+
if ( inptr->flags & DT_USEASIS ) { /* do not lexize */
446+
tmplex[0] = *inptr;
447+
tmplex[0].flags = 0;
448+
tmplex[1].lexeme = NULL;
449+
lexized = tmplex;
450+
} else {
451+
lexized = (TSLexeme*) DatumGetPointer(
452+
FunctionCall4(
453+
&(d->subdict.lexize_info),
454+
PointerGetDatum(d->subdict.dictionary),
455+
PointerGetDatum(inptr->lexeme),
456+
Int32GetDatum(strlen(inptr->lexeme)),
457+
PointerGetDatum(NULL)
458+
)
459+
);
460+
}
425461

426-
reml = lexized;
427462
if ( lexized && lexized->lexeme ) {
428463
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
429464

@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
447482

448483
if ( toset > 0)
449484
d->subst[i].res[toset].flags |= TSL_ADDPOS;
485+
} else if ( lexized ) {
486+
elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i+1);
450487
} else {
451-
elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, ignored", inptr->lexeme);
488+
elog(ERROR,"Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i+1);
452489
}
453490

454491
if ( inptr->lexeme )
@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
457494
}
458495

459496
if ( outptr == d->subst[i].res )
460-
elog(ERROR,"Thesaurus: all words in subsitution aren't recognized by subdictionary");
497+
elog(ERROR,"Thesaurus: all words in subsitution are stop word (rule %d)", i+1);
461498

462499
d->subst[i].reslen = outptr - d->subst[i].res;
463500

@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)
717754

718755
infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
719756
for(i=0;i<nlex;i++)
720-
if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
757+
if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
721758
break;
722759

723760
if ( i<nlex ) {

contrib/tsearch2/thesaurus

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
#
22
# Thesaurus config file. Character ':' splits
3-
# string to part:
4-
# to be substituted string
5-
# substituting string
3+
# string into parts, example:
4+
# sample-words : substitute-words
65
#
6+
# Any substitute-word can be marked with a preceding '*' character,
7+
# which means that this word will not be lexized
8+
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
79

8-
#one two three : 123
9-
#one two : 12
10-
#one : 1
11-
#two : 2
10+
#one two three : *123
11+
#one two : *12
12+
#one : *1
13+
#two : *2
1214

1315
#foo bar : blah blah
1416
#f bar : fbar

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy