Skip to content

Commit c0e1500

Browse files
author
Maksim Milyutin
committed
Make stable version of index searching on hash values of lexemes in
tsvector and tsquery
1 parent 27f78d4 commit c0e1500

File tree

6 files changed

+38
-121
lines changed

6 files changed

+38
-121
lines changed

expected/rum.out

Lines changed: 0 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -304,49 +304,3 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
304304
FROM test_rum
305305
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
306306
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
307-
?column? | t | a
308-
----------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------
309-
8.22467 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2
310-
8.22467 | wine, but wouldn't you divide with your neighbors! The columns in the | 'column':11 'divid':6 'neighbor':9 'wine':1 'wouldn':3
311-
8.22467 | not say, but you wrote as if you knew it by sight as well as by heart. | 'heart':17 'knew':9 'say':2 'sight':12 'well':14 'wrote':5
312-
16.4493 | little series of pictures. Have you ever been here, I wonder? You did | 'ever':7 'littl':1 'pictur':4 'seri':2 'wonder':11
313-
16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12
314-
16.4493 | _berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it | 'berg':1 'jettenhuhl':3 'konigestuhl':9 'look':10 'spur':6 'wood':5
315-
16.4493 | thickness of the walls, twenty-one feet, and the solid masonry, held it | 'feet':8 'held':13 'masonri':12 'one':7 'solid':11 'thick':1 'twenti':6 'twenty-on':5 'wall':4
316-
16.4493 | ornamental building, and I wish you could see it, if you have not seen | 'build':2 'could':7 'ornament':1 'see':8 'seen':14 'wish':5
317-
16.4493 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14
318-
16.4493 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6
319-
16.4493 | As a reward for your reformation I write to you on this precious sheet. | 'precious':13 'reform':6 'reward':3 'sheet':14 'write':8
320-
16.4493 | entrance of the Black Forest, among picturesque, thickly-wooded hills, | 'among':6 'black':4 'entranc':1 'forest':5 'hill':11 'picturesqu':7 'thick':9 'thickly-wood':8 'wood':10
321-
16.4493 | You see I have come to be wonderfully attached to Heidelberg, the | 'attach':9 'come':5 'heidelberg':11 'see':2 'wonder':8
322-
16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9
323-
(14 rows)
324-
325-
SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), *
326-
FROM test_rum
327-
WHERE a @@ to_tsquery('pg_catalog.english', 'b:*')
328-
ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*');
329-
?column? | t | a
330-
----------+--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------
331-
8.22467 | been trying my best to get all those "passes" into my brain. Now, thanks | 'best':4 'brain':12 'get':6 'pass':9 'thank':14 'tri':2
332-
8.22467 | All the above information, I beg you to believe, I do not intend you | 'beg':6 'believ':9 'inform':4 'intend':13
333-
8.22467 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6
334-
8.22467 | oaks, limes and maples, bordered with flower-beds and shrubberies, and | 'bed':9 'border':5 'flower':8 'flower-b':7 'lime':2 'mapl':4 'oak':1 'shrubberi':11
335-
13.1595 | foo bar foo the over foo qq bar | 'bar':2,8 'foo':1,3,6 'qq':7
336-
16.4493 | ornamental building, and I wish you could see it, if you have not seen | 'build':2 'could':7 'ornament':1 'see':8 'seen':14 'wish':5
337-
16.4493 | the--nearest guide-book! | 'book':5 'guid':4 'guide-book':3 'nearest':2
338-
16.4493 | to your letter, I have them all in the handiest kind of a bunch. Ariel | 'ariel':15 'bunch':14 'handiest':10 'kind':11 'letter':3
339-
16.4493 | beautiful, the quaint, the historically poetic, learned and picturesque | 'beauti':1 'histor':5 'learn':7 'picturesqu':9 'poetic':6 'quaint':3
340-
16.4493 | there are dreadful reports of floods and roads caved in and bridges | 'bridg':12 'cave':9 'dread':3 'flood':6 'report':4 'road':8
341-
16.4493 | the Conversationhaus, the bazaar, mingling with the throng, listening to | 'bazaar':4 'conversationhaus':2 'listen':9 'mingl':5 'throng':8
342-
16.4493 | the band, and comparing what it is with what it was. It was a gay and | 'band':2 'compar':4 'gay':15
343-
16.4493 | look. The situation is most beautiful. It lies, you know, at the | 'beauti':6 'know':10 'lie':8 'look':1 'situat':3
344-
16.4493 | entrance of the Black Forest, among picturesque, thickly-wooded hills, | 'among':6 'black':4 'entranc':1 'forest':5 'hill':11 'picturesqu':7 'thick':9 'thickly-wood':8 'wood':10
345-
16.4493 | town with angry, headlong speed. There is an avenue along its bank of | 'along':10 'angri':3 'avenu':9 'bank':12 'headlong':4 'speed':5 'town':1
346-
16.4493 | like, "I'll do my bidding gently," and as surely, if I get there. But | 'bid':6 'gentl':7 'get':13 'like':1 'll':3 'sure':10
347-
16.4493 | _berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it | 'berg':1 'jettenhuhl':3 'konigestuhl':9 'look':10 'spur':6 'wood':5
348-
16.4493 | Gesprente Thurm is the one that was blown up by the French. The | 'blown':8 'french':12 'gesprent':1 'one':5 'thurm':2
349-
16.4493 | portico that shows in the Schlosshof are the four brought from | 'brought':10 'four':9 'portico':1 'schlosshof':6 'show':3
350-
16.4493 | the few that escaped destruction in 1693. It is a beautiful, highly | '1693':7 'beauti':11 'destruct':5 'escap':4 'high':12
351-
(20 rows)
352-

rum--1.0.sql

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,17 +80,12 @@ RETURNS bytea
8080
AS 'MODULE_PATHNAME'
8181
LANGUAGE C IMMUTABLE STRICT;
8282

83-
CREATE FUNCTION rum_cmp_tslexeme(bytea, bytea)
84-
RETURNS integer
85-
AS 'MODULE_PATHNAME'
86-
LANGUAGE C IMMUTABLE STRICT;
87-
8883
CREATE OPERATOR CLASS rum_tsvector_ops
8984
FOR TYPE tsvector USING rum
9085
AS
9186
OPERATOR 1 @@ (tsvector, tsquery),
9287
OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops,
93-
FUNCTION 1 rum_cmp_tslexeme(bytea, bytea),
88+
FUNCTION 1 btint4cmp(integer, integer),
9489
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
9590
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
9691
FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
@@ -99,7 +94,7 @@ AS
9994
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
10095
FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal),
10196
FUNCTION 10 rum_ts_join_pos(internal, internal),
102-
STORAGE bytea;
97+
STORAGE integer;
10398
-- timestamp ops
10499

105100
CREATE FUNCTION timestamp_distance(timestamp, timestamp)
@@ -205,13 +200,13 @@ FOR TYPE tsvector USING rum
205200
AS
206201
OPERATOR 1 @@ (tsvector, tsquery),
207202
--support function
208-
FUNCTION 1 gin_cmp_tslexeme(text, text),
203+
FUNCTION 1 btint4cmp(integer, integer),
209204
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
210205
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
211206
FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
212207
FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal),
213208
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
214-
STORAGE text;
209+
STORAGE integer;
215210

216211
-- timestamptz ops
217212

@@ -281,13 +276,13 @@ FOR TYPE tsvector USING rum
281276
AS
282277
OPERATOR 1 @@ (tsvector, tsquery),
283278
--support function
284-
FUNCTION 1 gin_cmp_tslexeme(text, text),
279+
FUNCTION 1 btint4cmp(integer, integer),
285280
FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal),
286281
FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal),
287282
FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
288283
FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal),
289284
FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
290-
STORAGE text;
285+
STORAGE integer;
291286

292287
-- inversed
293288

@@ -315,10 +310,10 @@ CREATE OPERATOR CLASS rum_tsquery_ops
315310
DEFAULT FOR TYPE tsquery USING rum
316311
AS
317312
OPERATOR 1 @@ (tsquery, tsvector),
318-
FUNCTION 1 gin_cmp_tslexeme(text, text),
313+
FUNCTION 1 btint4cmp(integer, integer),
319314
FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal),
320315
FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal),
321316
FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal),
322317
FUNCTION 6 ruminv_tsquery_config(internal),
323-
STORAGE text;
318+
STORAGE integer;
324319

rum.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ typedef signed char RumNullCategory;
243243
*/
244244
#define RumGetDownlink(itup) RumItemPointerGetBlockNumber(&(itup)->t_tid)
245245
#define RumSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber)
246-
CREATE INDEX rumidx ON test_rum USING rum (a rum_tsvector_ops);
246+
247247

248248
/*
249249
* Data (posting tree) pages

rum_ts_utils.c

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727

2828
#include <math.h>
2929

30-
PG_FUNCTION_INFO_V1(rum_cmp_tslexeme);
3130
PG_FUNCTION_INFO_V1(rum_extract_tsvector);
3231
PG_FUNCTION_INFO_V1(rum_extract_tsquery);
3332
PG_FUNCTION_INFO_V1(rum_tsvector_config);
@@ -505,16 +504,10 @@ rum_extract_tsvector(PG_FUNCTION_ARGS)
505504

506505
for (i = 0; i < vector->size; i++)
507506
{
508-
text *txt;
509-
bytea *hash_value;
510507
bytea *posData;
511508
int posDataSize;
512509

513-
txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
514-
hash_value = (bytea *) palloc(VARHDRSZ + sizeof(int32));
515-
SET_VARSIZE(hash_value, VARHDRSZ + sizeof(int32));
516-
*VARDATA(hash_value) = DirectFunctionCall1(hashtext, PointerGetDatum(txt));
517-
entries[i] = PointerGetDatum(hash_value);
510+
entries[i] = hash_any((const unsigned char *) (STRPTR(vector) + we->pos), we->len);
518511

519512
if (we->haspos)
520513
{
@@ -592,15 +585,9 @@ rum_extract_tsquery(PG_FUNCTION_ARGS)
592585

593586
for (i = 0; i < (*nentries); i++)
594587
{
595-
text *txt;
596-
bytea *hash_value;
597-
598-
txt = cstring_to_text_with_len(GETOPERAND(query) + operands[i]->distance,
599-
operands[i]->length);
600-
hash_value = (bytea *) palloc(VARHDRSZ + sizeof(int32));
601-
SET_VARSIZE(hash_value, VARHDRSZ + sizeof(int32));
602-
*VARDATA(hash_value) = DirectFunctionCall1(hashtext, PointerGetDatum(txt));
603-
entries[i] = PointerGetDatum(hash_value);
588+
entries[i] = hash_any(
589+
(const unsigned char *) (GETOPERAND(query) + operands[i]->distance),
590+
operands[i]->length);
604591
partialmatch[i] = operands[i]->prefix;
605592
(*extra_data)[i] = (Pointer) map_item_operand;
606593
}
@@ -1400,17 +1387,3 @@ rum_ts_join_pos(PG_FUNCTION_ARGS)
14001387

14011388
PG_RETURN_BYTEA_P(result);
14021389
}
1403-
1404-
Datum
1405-
rum_cmp_tslexeme(PG_FUNCTION_ARGS)
1406-
{
1407-
bytea *arg1 = PG_GETARG_BYTEA_P(0);
1408-
bytea *arg2 = PG_GETARG_BYTEA_P(1);
1409-
int32 a = *VARDATA(arg1);
1410-
int32 b = *VARDATA(arg2);
1411-
int cmp;
1412-
1413-
cmp = (a > b) ? 1 : ((a == b) ? 0 : -1);
1414-
1415-
PG_RETURN_INT32(cmp);
1416-
}

rumtsquery.c

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "postgres.h"
1313

14+
#include "access/hash.h"
1415
#include "catalog/pg_type.h"
1516
#include "tsearch/ts_type.h"
1617
#include "tsearch/ts_utils.h"
@@ -274,11 +275,14 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level)
274275

275276
for (index = 0; index < context->index; index++)
276277
{
277-
text *entry;
278-
279-
entry = DatumGetByteaP(context->entries[index]);
280-
if (VARSIZE_ANY_EXHDR(entry) == wrap->length &&
281-
!memcmp(context->operand + wrap->distance, VARDATA_ANY(entry), wrap->length))
278+
int32 entry;
279+
int32 operand_hash;
280+
281+
entry = DatumGetInt32(context->entries[index]);
282+
operand_hash = hash_any(
283+
(const unsigned char *) (context->operand + wrap->distance),
284+
wrap->length);
285+
if (entry == operand_hash)
282286
break;
283287
}
284288

@@ -287,7 +291,9 @@ extract_wraps(QueryItemWrap * wrap, ExtractContext * context, int level)
287291
index = context->index;
288292
addinfo = (bytea *) palloc(VARHDRSZ + 2 * Max(level, 1) * MAX_ENCODED_LEN);
289293
ptr = (unsigned char *) VARDATA(addinfo);
290-
context->entries[index] = PointerGetDatum(cstring_to_text_with_len(context->operand + wrap->distance, wrap->length));
294+
context->entries[index] = hash_any(
295+
(const unsigned char *) (context->operand + wrap->distance),
296+
wrap->length);
291297
context->addInfo[index] = PointerGetDatum(addinfo);
292298
context->addInfoIsNull[index] = false;
293299
context->index++;
@@ -419,12 +425,6 @@ ruminv_extract_tsquery(PG_FUNCTION_ARGS)
419425
}
420426
*nentries = count;
421427

422-
/* elog(NOTICE, "%d", *nentries);
423-
for (i = 0; i < *nentries; i++)
424-
{
425-
elog(NOTICE, "%s", text_to_cstring(DatumGetPointer((entries)[i])));
426-
}*/
427-
428428
PG_FREE_IF_COPY(query, 0);
429429
PG_RETURN_POINTER(entries);
430430
}
@@ -460,10 +460,9 @@ ruminv_extract_tsvector(PG_FUNCTION_ARGS)
460460

461461
for (i = 0; i < vector->size; i++)
462462
{
463-
text *txt;
464-
465-
txt = cstring_to_text_with_len(STRPTR(vector) + we[i].pos, we[i].len);
466-
entries[i] = PointerGetDatum(txt);
463+
entries[i] = hash_any(
464+
(const unsigned char *) (STRPTR(vector) + we[i].pos),
465+
we[i].len);
467466
(*nullFlags)[i] = false;
468467
}
469468
(*nullFlags)[*nentries - 1] = true;

sql/rum.sql

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,13 @@ DELETE FROM tst WHERE i = 5;
100100
VACUUM tst;
101101
INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(14001,15000) i;
102102

103-
-- set enable_bitmapscan=off;
104-
-- explain (costs off)
105-
-- SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
106-
-- FROM test_rum
107-
-- WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
108-
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
109-
-- SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
110-
-- FROM test_rum
111-
-- WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
112-
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
113-
-- SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), *
114-
-- FROM test_rum
115-
-- WHERE a @@ to_tsquery('pg_catalog.english', 'b:*')
116-
-- ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*');
103+
set enable_bitmapscan=off;
104+
explain (costs off)
105+
SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
106+
FROM test_rum
107+
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
108+
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');
109+
SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), *
110+
FROM test_rum
111+
WHERE a @@ to_tsquery('pg_catalog.english', 'w:*')
112+
ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*');

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy