Skip to content

Commit 849074f

Browse files
committed
Revise hash join code so that we can increase the number of batches
on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though).
1 parent 31b6d84 commit 849074f

File tree

9 files changed

+632
-428
lines changed

9 files changed

+632
-428
lines changed

src/backend/executor/nodeHash.c

Lines changed: 313 additions & 198 deletions
Large diffs are not rendered by default.

src/backend/executor/nodeHashjoin.c

Lines changed: 228 additions & 165 deletions
Large diffs are not rendered by default.

src/backend/optimizer/path/costsize.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
* Portions Copyright (c) 1994, Regents of the University of California
5050
*
5151
* IDENTIFICATION
52-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.137 2004/12/31 22:00:04 pgsql Exp $
52+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.138 2005/03/06 22:15:04 tgl Exp $
5353
*
5454
*-------------------------------------------------------------------------
5555
*/
@@ -1074,9 +1074,9 @@ cost_hashjoin(HashPath *path, Query *root)
10741074
double innerbytes = relation_byte_size(inner_path_rows,
10751075
inner_path->parent->width);
10761076
int num_hashclauses = list_length(hashclauses);
1077-
int virtualbuckets;
1078-
int physicalbuckets;
1077+
int numbuckets;
10791078
int numbatches;
1079+
double virtualbuckets;
10801080
Selectivity innerbucketsize;
10811081
Selectivity joininfactor;
10821082
ListCell *hcl;
@@ -1123,9 +1123,9 @@ cost_hashjoin(HashPath *path, Query *root)
11231123
/* Get hash table size that executor would use for inner relation */
11241124
ExecChooseHashTableSize(inner_path_rows,
11251125
inner_path->parent->width,
1126-
&virtualbuckets,
1127-
&physicalbuckets,
1126+
&numbuckets,
11281127
&numbatches);
1128+
virtualbuckets = (double) numbuckets * (double) numbatches;
11291129

11301130
/*
11311131
* Determine bucketsize fraction for inner relation. We use the
@@ -1196,13 +1196,13 @@ cost_hashjoin(HashPath *path, Query *root)
11961196
}
11971197

11981198
/*
1199-
* if inner relation is too big then we will need to "batch" the join,
1199+
* If inner relation is too big then we will need to "batch" the join,
12001200
* which implies writing and reading most of the tuples to disk an
12011201
* extra time. Charge one cost unit per page of I/O (correct since it
12021202
* should be nice and sequential...). Writing the inner rel counts as
12031203
* startup cost, all the rest as run cost.
12041204
*/
1205-
if (numbatches)
1205+
if (numbatches > 1)
12061206
{
12071207
double outerpages = page_size(outer_path_rows,
12081208
outer_path->parent->width);
@@ -1228,7 +1228,9 @@ cost_hashjoin(HashPath *path, Query *root)
12281228
* The number of tuple comparisons needed is the number of outer
12291229
* tuples times the typical number of tuples in a hash bucket, which
12301230
* is the inner relation size times its bucketsize fraction. At each
1231-
* one, we need to evaluate the hashjoin quals.
1231+
* one, we need to evaluate the hashjoin quals. (Note: charging the
1232+
* full qual eval cost at each tuple is pessimistic, since we don't
1233+
* evaluate the quals unless the hash values match exactly.)
12321234
*/
12331235
startup_cost += hash_qual_cost.startup;
12341236
run_cost += hash_qual_cost.per_tuple *

src/backend/utils/adt/selfuncs.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.171 2005/02/01 23:07:58 tgl Exp $
18+
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.172 2005/03/06 22:15:04 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -2154,7 +2154,7 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
21542154
* inner rel is well-dispersed (or the alternatives seem much worse).
21552155
*/
21562156
Selectivity
2157-
estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets)
2157+
estimate_hash_bucketsize(Query *root, Node *hashkey, double nbuckets)
21582158
{
21592159
VariableStatData vardata;
21602160
double estfract,
@@ -2212,8 +2212,8 @@ estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets)
22122212
* the number of buckets is less than the expected number of distinct
22132213
* values; otherwise it is 1/ndistinct.
22142214
*/
2215-
if (ndistinct > (double) nbuckets)
2216-
estfract = 1.0 / (double) nbuckets;
2215+
if (ndistinct > nbuckets)
2216+
estfract = 1.0 / nbuckets;
22172217
else
22182218
estfract = 1.0 / ndistinct;
22192219

src/include/executor/hashjoin.h

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.34 2004/12/31 22:03:29 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.35 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -20,11 +20,12 @@
2020
/* ----------------------------------------------------------------
2121
* hash-join hash table structures
2222
*
23-
* Each active hashjoin has a HashJoinTable control block which is
23+
* Each active hashjoin has a HashJoinTable control block, which is
2424
* palloc'd in the executor's per-query context. All other storage needed
2525
* for the hashjoin is kept in private memory contexts, two for each hashjoin.
2626
* This makes it easy and fast to release the storage when we don't need it
27-
* anymore.
27+
* anymore. (Exception: data associated with the temp files lives in the
28+
* per-query context too, since we always call buffile.c in that context.)
2829
*
2930
* The hashtable contexts are made children of the per-query context, ensuring
3031
* that they will be discarded at end of statement even if the join is
@@ -35,40 +36,64 @@
3536
* "hashCxt", while storage that is only wanted for the current batch is
3637
* allocated in the "batchCxt". By resetting the batchCxt at the end of
3738
* each batch, we free all the per-batch storage reliably and without tedium.
39+
*
40+
* During first scan of inner relation, we get its tuples from executor.
41+
* If nbatch > 1 then tuples that don't belong in first batch get saved
42+
* into inner-batch temp files. The same statements apply for the
43+
* first scan of the outer relation, except we write tuples to outer-batch
44+
* temp files. After finishing the first scan, we do the following for
45+
* each remaining batch:
46+
* 1. Read tuples from inner batch file, load into hash buckets.
47+
* 2. Read tuples from outer batch file, match to hash buckets and output.
48+
*
49+
* It is possible to increase nbatch on the fly if the in-memory hash table
50+
* gets too big. The hash-value-to-batch computation is arranged so that this
51+
* can only cause a tuple to go into a later batch than previously thought,
52+
* never into an earlier batch. When we increase nbatch, we rescan the hash
53+
* table and dump out any tuples that are now of a later batch to the correct
54+
* inner batch file. Subsequently, while reading either inner or outer batch
55+
* files, we might find tuples that no longer belong to the current batch;
56+
* if so, we just dump them out to the correct batch file.
3857
* ----------------------------------------------------------------
3958
*/
4059

60+
/* these are in nodes/execnodes.h: */
61+
/* typedef struct HashJoinTupleData *HashJoinTuple; */
62+
/* typedef struct HashJoinTableData *HashJoinTable; */
63+
4164
typedef struct HashJoinTupleData
4265
{
43-
struct HashJoinTupleData *next; /* link to next tuple in same
44-
* bucket */
66+
struct HashJoinTupleData *next; /* link to next tuple in same bucket */
67+
uint32 hashvalue; /* tuple's hash code */
4568
HeapTupleData htup; /* tuple header */
4669
} HashJoinTupleData;
4770

48-
typedef HashJoinTupleData *HashJoinTuple;
49-
5071
typedef struct HashJoinTableData
5172
{
52-
int nbuckets; /* buckets in use during this batch */
53-
int totalbuckets; /* total number of (virtual) buckets */
54-
HashJoinTuple *buckets; /* buckets[i] is head of list of tuples */
73+
int nbuckets; /* # buckets in the in-memory hash table */
74+
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
75+
struct HashJoinTupleData **buckets;
5576
/* buckets array is per-batch storage, as are all the tuples */
5677

57-
int nbatch; /* number of batches; 0 means 1-pass join */
58-
int curbatch; /* current batch #, or 0 during 1st pass */
78+
int nbatch; /* number of batches */
79+
int curbatch; /* current batch #; 0 during 1st pass */
80+
81+
int nbatch_original; /* nbatch when we started inner scan */
82+
int nbatch_outstart; /* nbatch when we started outer scan */
83+
84+
bool growEnabled; /* flag to shut off nbatch increases */
5985

6086
bool hashNonEmpty; /* did inner plan produce any rows? */
6187

6288
/*
63-
* all these arrays are allocated for the life of the hash join, but
64-
* only if nbatch > 0:
89+
* These arrays are allocated for the life of the hash join, but
90+
* only if nbatch > 1. A file is opened only when we first write
91+
* a tuple into it (otherwise its pointer remains NULL). Note that
92+
* the zero'th array elements never get used, since we will process
93+
* rather than dump out any tuples of batch zero.
6594
*/
6695
BufFile **innerBatchFile; /* buffered virtual temp file per batch */
6796
BufFile **outerBatchFile; /* buffered virtual temp file per batch */
68-
long *outerBatchSize; /* count of tuples in each outer batch
69-
* file */
70-
long *innerBatchSize; /* count of tuples in each inner batch
71-
* file */
7297

7398
/*
7499
* Info about the datatype-specific hash functions for the datatypes
@@ -79,21 +104,11 @@ typedef struct HashJoinTableData
79104
*/
80105
FmgrInfo *hashfunctions; /* lookup data for hash functions */
81106

82-
/*
83-
* During 1st scan of inner relation, we get tuples from executor. If
84-
* nbatch > 0 then tuples that don't belong in first nbuckets logical
85-
* buckets get dumped into inner-batch temp files. The same statements
86-
* apply for the 1st scan of the outer relation, except we write
87-
* tuples to outer-batch temp files. If nbatch > 0 then we do the
88-
* following for each batch: 1. Read tuples from inner batch file,
89-
* load into hash buckets. 2. Read tuples from outer batch file, match
90-
* to hash buckets and output.
91-
*/
107+
Size spaceUsed; /* memory space currently used by tuples */
108+
Size spaceAllowed; /* upper limit for space used */
92109

93110
MemoryContext hashCxt; /* context for whole-hash-join storage */
94111
MemoryContext batchCxt; /* context for this-batch-only storage */
95112
} HashJoinTableData;
96113

97-
typedef HashJoinTableData *HashJoinTable;
98-
99114
#endif /* HASHJOIN_H */

src/include/executor/nodeHash.h

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.35 2004/12/31 22:03:29 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.36 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -25,18 +25,20 @@ extern void ExecReScanHash(HashState *node, ExprContext *exprCtxt);
2525
extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators);
2626
extern void ExecHashTableDestroy(HashJoinTable hashtable);
2727
extern void ExecHashTableInsert(HashJoinTable hashtable,
28-
ExprContext *econtext,
29-
List *hashkeys);
30-
extern int ExecHashGetBucket(HashJoinTable hashtable,
31-
ExprContext *econtext,
32-
List *hashkeys);
33-
extern int ExecHashGetBatch(int bucketno, HashJoinTable hashtable);
34-
extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, List *hjclauses,
35-
ExprContext *econtext);
36-
extern void ExecHashTableReset(HashJoinTable hashtable, long ntuples);
28+
HeapTuple tuple,
29+
uint32 hashvalue);
30+
extern uint32 ExecHashGetHashValue(HashJoinTable hashtable,
31+
ExprContext *econtext,
32+
List *hashkeys);
33+
extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
34+
uint32 hashvalue,
35+
int *bucketno,
36+
int *batchno);
37+
extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate,
38+
ExprContext *econtext);
39+
extern void ExecHashTableReset(HashJoinTable hashtable);
3740
extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
38-
int *virtualbuckets,
39-
int *physicalbuckets,
41+
int *numbuckets,
4042
int *numbatches);
4143

4244
#endif /* NODEHASH_H */

src/include/executor/nodeHashjoin.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
11
/*-------------------------------------------------------------------------
22
*
33
* nodeHashjoin.h
4-
*
4+
* prototypes for nodeHashjoin.c
55
*
66
*
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.28 2004/12/31 22:03:29 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.29 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
1414
#ifndef NODEHASHJOIN_H
1515
#define NODEHASHJOIN_H
1616

1717
#include "nodes/execnodes.h"
18+
#include "storage/buffile.h"
1819

1920
extern int ExecCountSlotsHashJoin(HashJoin *node);
2021
extern HashJoinState *ExecInitHashJoin(HashJoin *node, EState *estate);
2122
extern TupleTableSlot *ExecHashJoin(HashJoinState *node);
2223
extern void ExecEndHashJoin(HashJoinState *node);
2324
extern void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt);
2425

25-
extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, BufFile *file);
26+
extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
27+
BufFile **fileptr);
2628

2729
#endif /* NODEHASHJOIN_H */

src/include/nodes/execnodes.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.122 2004/12/31 22:03:34 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.123 2005/03/06 22:15:05 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
1414
#ifndef EXECNODES_H
1515
#define EXECNODES_H
1616

1717
#include "access/relscan.h"
18-
#include "executor/hashjoin.h"
1918
#include "executor/tuptable.h"
2019
#include "fmgr.h"
2120
#include "nodes/bitmapset.h"
@@ -985,11 +984,13 @@ typedef struct MergeJoinState
985984
* HashJoinState information
986985
*
987986
* hj_HashTable hash table for the hashjoin
987+
* (NULL if table not built yet)
988+
* hj_CurHashValue hash value for current outer tuple
988989
* hj_CurBucketNo bucket# for current outer tuple
989990
* hj_CurTuple last inner tuple matched to current outer
990991
* tuple, or NULL if starting search
991-
* (CurBucketNo and CurTuple are meaningless
992-
* unless OuterTupleSlot is nonempty!)
992+
* (CurHashValue, CurBucketNo and CurTuple are
993+
* undefined if OuterTupleSlot is empty!)
993994
* hj_OuterHashKeys the outer hash keys in the hashjoin condition
994995
* hj_InnerHashKeys the inner hash keys in the hashjoin condition
995996
* hj_HashOperators the join operators in the hashjoin condition
@@ -998,14 +999,19 @@ typedef struct MergeJoinState
998999
* hj_NullInnerTupleSlot prepared null tuple for left outer joins
9991000
* hj_NeedNewOuter true if need new outer tuple on next call
10001001
* hj_MatchedOuter true if found a join match for current outer
1001-
* hj_hashdone true if hash-table-build phase is done
10021002
* ----------------
10031003
*/
1004+
1005+
/* these structs are defined in executor/hashjoin.h: */
1006+
typedef struct HashJoinTupleData *HashJoinTuple;
1007+
typedef struct HashJoinTableData *HashJoinTable;
1008+
10041009
typedef struct HashJoinState
10051010
{
10061011
JoinState js; /* its first field is NodeTag */
10071012
List *hashclauses; /* list of ExprState nodes */
10081013
HashJoinTable hj_HashTable;
1014+
uint32 hj_CurHashValue;
10091015
int hj_CurBucketNo;
10101016
HashJoinTuple hj_CurTuple;
10111017
List *hj_OuterHashKeys; /* list of ExprState nodes */
@@ -1016,7 +1022,6 @@ typedef struct HashJoinState
10161022
TupleTableSlot *hj_NullInnerTupleSlot;
10171023
bool hj_NeedNewOuter;
10181024
bool hj_MatchedOuter;
1019-
bool hj_hashdone;
10201025
} HashJoinState;
10211026

10221027

src/include/utils/selfuncs.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
99
* Portions Copyright (c) 1994, Regents of the University of California
1010
*
11-
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.21 2004/12/31 22:03:46 pgsql Exp $
11+
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.22 2005/03/06 22:15:05 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -121,7 +121,7 @@ extern double estimate_num_groups(Query *root, List *groupExprs,
121121
double input_rows);
122122

123123
extern Selectivity estimate_hash_bucketsize(Query *root, Node *hashkey,
124-
int nbuckets);
124+
double nbuckets);
125125

126126
extern Datum btcostestimate(PG_FUNCTION_ARGS);
127127
extern Datum rtcostestimate(PG_FUNCTION_ARGS);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy