Skip to content

Commit 0766880

Browse files
committed
Cope with more than 64K phrases in a thesaurus dictionary.
dict_thesaurus stored phrase IDs in uint16 fields, so it would get confused and even crash if there were more than 64K entries in the configuration file. It turns out to be basically free to widen the phrase IDs to uint32, so let's just do so.

This was complained of some time ago by David Boutin (in bug #7793); he later submitted an informal patch but it was never acted on. We now have another complaint (bug #11901 from Luc Ouellette) so it's time to make something happen.

This is basically Boutin's patch, but for future-proofing I also added a defense against too many words per phrase. Note that we don't need any explicit defense against overflow of the uint32 counters, since before that happens we'd hit array allocation sizes that repalloc rejects.

Back-patch to all supported branches because of the crash risk.
1 parent 4e74680 commit 0766880

File tree

1 file changed

+17
-8
lines changed

1 file changed

+17
-8
lines changed

src/backend/tsearch/dict_thesaurus.c

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
typedef struct LexemeInfo
3131
{
32-
uint16 idsubst; /* entry's number in DictThesaurus->subst */
32+
uint32 idsubst; /* entry's number in DictThesaurus->subst */
3333
uint16 posinsubst; /* pos info in entry */
3434
uint16 tnvariant; /* total num lexemes in one variant */
3535
struct LexemeInfo *nextentry;
@@ -69,7 +69,7 @@ typedef struct
6969

7070

7171
static void
72-
newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
72+
newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
7373
{
7474
TheLexeme *ptr;
7575

@@ -103,7 +103,7 @@ newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
103103
}
104104

105105
static void
106-
addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
106+
addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
107107
{
108108
static int nres = 0;
109109
static int ntres = 0;
@@ -144,7 +144,6 @@ addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 p
144144
ntres *= 2;
145145
ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
146146
}
147-
148147
}
149148

150149
ptr->res[nres].lexeme = palloc(e - b + 1);
@@ -169,7 +168,7 @@ static void
169168
thesaurusRead(char *filename, DictThesaurus *d)
170169
{
171170
tsearch_readline_state trst;
172-
uint16 idsubst = 0;
171+
uint32 idsubst = 0;
173172
bool useasis = false;
174173
char *line;
175174

@@ -185,8 +184,8 @@ thesaurusRead(char *filename, DictThesaurus *d)
185184
char *ptr;
186185
int state = TR_WAITLEX;
187186
char *beginwrd = NULL;
188-
uint16 posinsubst = 0;
189-
uint16 nwrd = 0;
187+
uint32 posinsubst = 0;
188+
uint32 nwrd = 0;
190189

191190
ptr = line;
192191

@@ -287,6 +286,16 @@ thesaurusRead(char *filename, DictThesaurus *d)
287286
(errcode(ERRCODE_CONFIG_FILE_ERROR),
288287
errmsg("unexpected end of line")));
289288

289+
/*
290+
* Note: currently, tsearch_readline can't return lines exceeding 4KB,
291+
* so overflow of the word counts is impossible. But that may not
292+
* always be true, so let's check.
293+
*/
294+
if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
295+
ereport(ERROR,
296+
(errcode(ERRCODE_CONFIG_FILE_ERROR),
297+
errmsg("too many lexemes in thesaurus entry")));
298+
290299
pfree(line);
291300
}
292301

@@ -671,7 +680,7 @@ findTheLexeme(DictThesaurus *d, char *lexeme)
671680
}
672681

673682
static bool
674-
matchIdSubst(LexemeInfo *stored, uint16 idsubst)
683+
matchIdSubst(LexemeInfo *stored, uint32 idsubst)
675684
{
676685
bool res = true;
677686

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy