
Commit 92ee252

Tweak processing of multiple-index-scan plans to reduce overhead when handling many-way scans: instead of re-evaluating all prior indexscan quals to see if a tuple has been fetched more than once, use a hash table indexed by tuple CTID. But fall back to the old way if the hash table grows to exceed SortMem.
1 parent 38e2bf6 commit 92ee252
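
Before the per-file diffs, here is a minimal standalone sketch of the approach the commit message describes: remember the TIDs already returned in a capped hash table, and fall back to rechecking the earlier scans' quals once the cap is hit. Every name in it (Tid, SeenTable, should_emit, the matched_by_earlier_scan stub) is invented for illustration only; the actual patch, shown in the diffs below, uses PostgreSQL's dynahash (hash_create/hash_search) keyed on the tuple's t_ctid, with the cap derived from SortMem.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for ItemPointerData and the per-TID hash entry. */
typedef struct { unsigned block, offset; } Tid;
typedef struct { Tid tid; int scannum; bool used; } SeenEntry;
typedef struct { SeenEntry *slots; long nslots, nentries, maxentries; } SeenTable;

/* Hypothetical stand-in for re-evaluating the quals of scans 0..scannum-1. */
static bool matched_by_earlier_scan(Tid tid, int scannum)
{
    (void) tid; (void) scannum;
    return false;
}

static SeenTable *seen_create(long maxentries)
{
    SeenTable *t = malloc(sizeof(SeenTable));

    t->nslots = 4 * maxentries + 1;     /* keep the open-addressed table sparse */
    t->slots = calloc((size_t) t->nslots, sizeof(SeenEntry));
    t->nentries = 0;
    t->maxentries = maxentries;         /* soft limit, playing the role of iss_MaxHash */
    return t;
}

static void seen_destroy(SeenTable *t)
{
    free(t->slots);
    free(t);
}

/*
 * Should scan 'scannum' emit the tuple at 'tid'?  If the table would grow
 * past its budget, destroy it and set *tablep to NULL; from then on every
 * call takes the slow recheck-the-quals path.
 */
static bool should_emit(SeenTable **tablep, Tid tid, int scannum)
{
    SeenTable *t = *tablep;

    if (t != NULL)
    {
        long i = (tid.block * 2654435761u + tid.offset) % (unsigned long) t->nslots;

        while (t->slots[i].used &&
               (t->slots[i].tid.block != tid.block ||
                t->slots[i].tid.offset != tid.offset))
            i = (i + 1) % t->nslots;

        if (t->slots[i].used)                       /* already returned once */
            return t->slots[i].scannum == scannum;  /* re-emit only when backing up in the same scan */

        if (t->nentries < t->maxentries)
        {
            t->slots[i].used = true;                /* remember this TID and its scan */
            t->slots[i].tid = tid;
            t->slots[i].scannum = scannum;
            t->nentries++;
            return true;
        }

        /* budget exceeded: drop the table and use the slow path below */
        seen_destroy(t);
        *tablep = NULL;
    }

    /* slow path: emit only if no earlier scan also matches this tuple */
    return !matched_by_earlier_scan(tid, scannum);
}

int main(void)
{
    SeenTable *seen = seen_create(1000);
    Tid tid = {42, 7};

    printf("%d\n", should_emit(&seen, tid, 0));     /* 1: first time this TID is fetched */
    printf("%d\n", should_emit(&seen, tid, 1));     /* 0: duplicate reported by a later scan */
    seen_destroy(seen);
    return 0;
}

As in the patch, a TID seen before is re-emitted only by the scan that first returned it (the backing-up case), and once the table has been dropped every later tuple goes through the slower recheck of the prior scans' quals.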

2 files changed: 153 additions & 31 deletions

src/backend/executor/nodeIndexscan.c

Lines changed: 148 additions & 30 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.82 2003/08/04 02:39:59 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.83 2003/08/22 20:26:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,19 +28,51 @@
 #include "access/heapam.h"
 #include "executor/execdebug.h"
 #include "executor/nodeIndexscan.h"
+#include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
 #include "parser/parsetree.h"
 
-/* ----------------
- *        Misc stuff to move to executor.h soon -cim 6/5/90
- * ----------------
- */
+
 #define NO_OP       0
 #define LEFT_OP     1
 #define RIGHT_OP    2
 
+/*
+ * In a multiple-index plan, we must take care to return any given tuple
+ * only once, even if it matches conditions of several index scans.  Our
+ * preferred way to do this is to record already-returned tuples in a hash
+ * table (using the TID as unique identifier).  However, in a very large
+ * scan this could conceivably run out of memory.  We limit the hash table
+ * to no more than SortMem KB; if it grows past that, we fall back to the
+ * pre-7.4 technique: evaluate the prior-scan index quals again for each
+ * tuple (which is space-efficient, but slow).
+ *
+ * When scanning backwards, we use scannum to determine when to emit the
+ * tuple --- we have to re-emit a tuple in the same scan as it was first
+ * encountered.
+ *
+ * Note: this code would break if the planner were ever to create a multiple
+ * index plan with overall backwards direction, because the hashtable code
+ * will emit a tuple the first time it is encountered (which would be the
+ * highest scan in which it matches the index), but the evaluate-the-quals
+ * code will emit a tuple in the lowest-numbered scan in which it's valid.
+ * This could be fixed at need by making the evaluate-the-quals case more
+ * complex.  Currently the planner will never create such a plan (since it
+ * considers multi-index plans unordered anyway), so there's no need for
+ * more complexity.
+ */
+typedef struct
+{
+    /* tid is the hash key and so must be first! */
+    ItemPointerData tid;        /* TID of a tuple we've returned */
+    int         scannum;       /* number of scan we returned it in */
+} DupHashTabEntry;
+
+
 static TupleTableSlot *IndexNext(IndexScanState *node);
+static void create_duphash(IndexScanState *node);
+
 
 /* ----------------------------------------------------------------
  *        IndexNext
@@ -163,7 +195,7 @@ IndexNext(IndexScanState *node)
     while ((tuple = index_getnext(scandesc, direction)) != NULL)
     {
         /*
-         * store the scanned tuple in the scan tuple slot of the scan
+         * Store the scanned tuple in the scan tuple slot of the scan
          * state.  Note: we pass 'false' because tuples returned by
          * amgetnext are pointers onto disk pages and must not be
         * pfree()'d.
@@ -174,36 +206,80 @@ IndexNext(IndexScanState *node)
                        false);  /* don't pfree */
 
         /*
-         * We must check to see if the current tuple was already
-         * matched by an earlier index, so we don't double-report it.
-         * We do this by passing the tuple through ExecQual and
-         * checking for failure with all previous qualifications.
+         * If it's a multiple-index scan, make sure not to double-report
+         * a tuple matched by more than one index.  (See notes above.)
          */
-        if (node->iss_IndexPtr > 0)
+        if (numIndices > 1)
         {
-            bool        prev_matches = false;
-            int         prev_index;
-            List       *qual;
-
-            econtext->ecxt_scantuple = slot;
-            ResetExprContext(econtext);
-            qual = node->indxqualorig;
-            for (prev_index = 0;
-                 prev_index < node->iss_IndexPtr;
-                 prev_index++)
+            /* First try the hash table */
+            if (node->iss_DupHash)
             {
-                if (ExecQual((List *) lfirst(qual), econtext, false))
+                DupHashTabEntry *entry;
+                bool        found;
+
+                entry = (DupHashTabEntry *)
+                    hash_search(node->iss_DupHash,
+                                &tuple->t_data->t_ctid,
+                                HASH_ENTER,
+                                &found);
+                if (entry == NULL ||
+                    node->iss_DupHash->hctl->nentries > node->iss_MaxHash)
+                {
+                    /* out of memory (either hard or soft limit) */
+                    /* release hash table and fall thru to old code */
+                    hash_destroy(node->iss_DupHash);
+                    node->iss_DupHash = NULL;
+                }
+                else if (found)
                 {
-                    prev_matches = true;
-                    break;
+                    /* pre-existing entry */
+
+                    /*
+                     * It's duplicate if first emitted in a different
+                     * scan.  If same scan, we must be backing up, so
+                     * okay to emit again.
+                     */
+                    if (entry->scannum != node->iss_IndexPtr)
+                    {
+                        /* Dup, so drop it and loop back for another */
+                        ExecClearTuple(slot);
+                        continue;
+                    }
+                }
+                else
+                {
+                    /* new entry, finish filling it in */
+                    entry->scannum = node->iss_IndexPtr;
                 }
-                qual = lnext(qual);
             }
-            if (prev_matches)
+            /* If hash table has overflowed, do it the hard way */
+            if (node->iss_DupHash == NULL &&
+                node->iss_IndexPtr > 0)
             {
-                /* Duplicate, so drop it and loop back for another */
-                ExecClearTuple(slot);
-                continue;
+                bool        prev_matches = false;
+                int         prev_index;
+                List       *qual;
+
+                econtext->ecxt_scantuple = slot;
+                ResetExprContext(econtext);
+                qual = node->indxqualorig;
+                for (prev_index = 0;
+                     prev_index < node->iss_IndexPtr;
+                     prev_index++)
+                {
+                    if (ExecQual((List *) lfirst(qual), econtext, false))
+                    {
+                        prev_matches = true;
+                        break;
+                    }
+                    qual = lnext(qual);
+                }
+                if (prev_matches)
+                {
+                    /* Dup, so drop it and loop back for another */
+                    ExecClearTuple(slot);
+                    continue;
+                }
             }
         }
 
@@ -383,6 +459,14 @@ ExecIndexReScan(IndexScanState *node, ExprContext *exprCtxt)
         return;
     }
 
+    /* reset hash table */
+    if (numIndices > 1)
+    {
+        if (node->iss_DupHash)
+            hash_destroy(node->iss_DupHash);
+        create_duphash(node);
+    }
+
     /* reset index scans */
     if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indxorderdir))
         node->iss_IndexPtr = numIndices;
@@ -432,6 +516,10 @@ ExecEndIndexScan(IndexScanState *node)
     ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
     ExecClearTuple(node->ss.ss_ScanTupleSlot);
 
+    /* drop hash table */
+    if (node->iss_DupHash)
+        hash_destroy(node->iss_DupHash);
+
     /*
      * close the index relations
      */
@@ -507,7 +595,7 @@ ExecIndexRestrPos(IndexScanState *node)
 
 /* ----------------------------------------------------------------
  *        ExecInitIndexScan
- *
+ *
  *        Initializes the index scan's state information, creates
  *        scan keys, and opens the base and index relations.
  *
@@ -919,12 +1007,42 @@ ExecInitIndexScan(IndexScan *node, EState *estate)
     ExecAssignResultTypeFromTL(&indexstate->ss.ps);
     ExecAssignScanProjectionInfo(&indexstate->ss);
 
+    /*
+     * Initialize hash table if needed.
+     */
+    if (numIndices > 1)
+        create_duphash(indexstate);
+    else
+        indexstate->iss_DupHash = NULL;
+
     /*
      * all done.
      */
     return indexstate;
 }
 
+static void
+create_duphash(IndexScanState *node)
+{
+    HASHCTL     hash_ctl;
+
+    MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+    hash_ctl.keysize = SizeOfIptrData;
+    hash_ctl.entrysize = sizeof(DupHashTabEntry);
+    hash_ctl.hash = tag_hash;
+    hash_ctl.hcxt = CurrentMemoryContext;
+    node->iss_DupHash = hash_create("DupHashTable",
+                                    (long) ceil(node->ss.ps.plan->plan_rows),
+                                    &hash_ctl,
+                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+    if (node->iss_DupHash == NULL)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+    node->iss_MaxHash = (SortMem * 1024L) /
+        (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(DupHashTabEntry)));
+}
+
 int
 ExecCountSlotsIndexScan(IndexScan *node)
 {
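
For a rough sense of the soft limit computed at the end of create_duphash: iss_MaxHash = (SortMem * 1024) / (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(DupHashTabEntry))). Assuming, purely as an illustration, 8-byte MAXALIGN and roughly 16 bytes for each of the two structures after padding, a remembered TID costs about 32 bytes, so the default SortMem of 1024 KB allows roughly 1024 * 1024 / 32 = 32768 entries before IndexNext destroys the hash table and falls back to re-evaluating the prior scans' index quals. The true per-entry cost depends on platform alignment and dynahash's element header, so these numbers are examples, not measurements.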

src/include/nodes/execnodes.h

Lines changed: 5 additions & 1 deletion
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: execnodes.h,v 1.104 2003/08/19 01:13:41 tgl Exp $
+ * $Id: execnodes.h,v 1.105 2003/08/22 20:26:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -768,6 +768,8 @@ typedef ScanState SeqScanState;
  *       RuntimeKeysReady  true if runtime Skeys have been computed
  *       RelationDescs     ptr to array of relation descriptors
  *       ScanDescs         ptr to array of scan descriptors
+ *       DupHash           hashtable for recognizing dups in multiple scan
+ *       MaxHash           max # entries we will allow in hashtable
  * ----------------
  */
 typedef struct IndexScanState
@@ -785,6 +787,8 @@ typedef struct IndexScanState
     bool        iss_RuntimeKeysReady;
     RelationPtr iss_RelationDescs;
     IndexScanDescPtr iss_ScanDescs;
+    HTAB       *iss_DupHash;
+    long        iss_MaxHash;
 } IndexScanState;
 
 /* ----------------
