Skip to content

Commit ab2324f

Browse files
committed
Improve comments about partitioned hash table freelists.
While I couldn't find any live bugs in commit 44ca402, the comments seemed pretty far from adequate; in particular it was not made plain that "borrowing" entries from other freelists is critical for correctness. Try to improve the commentary. A couple of very minor code style tweaks, as well. Discussion: https://postgr.es/m/10593.1500670709@sss.pgh.pa.us
1 parent 991c8b0 commit ab2324f

File tree

1 file changed

+70
-35
lines changed

1 file changed

+70
-35
lines changed

src/backend/utils/hash/dynahash.c

Lines changed: 70 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
* to hash_create. This prevents any attempt to split buckets on-the-fly.
1616
* Therefore, each hash bucket chain operates independently, and no fields
1717
* of the hash header change after init except nentries and freeList.
18-
* A partitioned table uses spinlocks to guard changes of those fields.
18+
* (A partitioned table uses multiple copies of those fields, guarded by
19+
* spinlocks, for additional concurrency.)
1920
* This lets any subset of the hash buckets be treated as a separately
2021
* lockable partition. We expect callers to use the low-order bits of a
2122
* lookup key's hash value as a partition number --- this will work because
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
121122
typedef HASHBUCKET *HASHSEGMENT;
122123

123124
/*
124-
* Using array of FreeListData instead of separate arrays of mutexes, nentries
125-
* and freeLists prevents, at least partially, sharing one cache line between
126-
* different mutexes (see below).
125+
* Per-freelist data.
126+
*
127+
* In a partitioned hash table, each freelist is associated with a specific
128+
* set of hashcodes, as determined by the FREELIST_IDX() macro below.
129+
* nentries tracks the number of live hashtable entries having those hashcodes
130+
* (NOT the number of entries in the freelist, as you might expect).
131+
*
132+
* The coverage of a freelist might be more or less than one partition, so it
133+
* needs its own lock rather than relying on caller locking. Relying on that
134+
* wouldn't work even if the coverage was the same, because of the occasional
135+
* need to "borrow" entries from another freelist; see get_hash_entry().
136+
*
137+
* Using an array of FreeListData instead of separate arrays of mutexes,
138+
* nentries and freeLists helps to reduce sharing of cache lines between
139+
* different mutexes.
127140
*/
128141
typedef struct
129142
{
130-
slock_t mutex; /* spinlock */
131-
long nentries; /* number of entries */
132-
HASHELEMENT *freeList; /* list of free elements */
143+
slock_t mutex; /* spinlock for this freelist */
144+
long nentries; /* number of entries in associated buckets */
145+
HASHELEMENT *freeList; /* chain of free elements */
133146
} FreeListData;
134147

135148
/*
@@ -143,12 +156,14 @@ typedef struct
143156
struct HASHHDR
144157
{
145158
/*
146-
* The freelist can become a point of contention on high-concurrency hash
147-
* tables, so we use an array of freelist, each with its own mutex and
148-
* nentries count, instead of just a single one.
159+
* The freelist can become a point of contention in high-concurrency hash
160+
* tables, so we use an array of freelists, each with its own mutex and
161+
* nentries count, instead of just a single one. Although the freelists
162+
* normally operate independently, we will scavenge entries from freelists
163+
* other than a hashcode's default freelist when necessary.
149164
*
150-
* If hash table is not partitioned only freeList[0] is used and spinlocks
151-
* are not used at all.
165+
* If the hash table is not partitioned, only freeList[0] is used and its
166+
* spinlock is not used at all; callers' locking is assumed sufficient.
152167
*/
153168
FreeListData freeList[NUM_FREELISTS];
154169

@@ -184,7 +199,7 @@ struct HASHHDR
184199
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
185200

186201
#define FREELIST_IDX(hctl, hashcode) \
187-
(IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
202+
(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
188203

189204
/*
190205
* Top control structure for a hashtable --- in a shared table, each backend
@@ -506,19 +521,22 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
506521
nelem_alloc_first;
507522

508523
/*
509-
* If hash table is partitioned all freeLists have equal number of
510-
* elements. Otherwise only freeList[0] is used.
524+
* If hash table is partitioned, give each freelist an equal share of
525+
* the initial allocation. Otherwise only freeList[0] is used.
511526
*/
512527
if (IS_PARTITIONED(hashp->hctl))
513528
freelist_partitions = NUM_FREELISTS;
514529
else
515530
freelist_partitions = 1;
516531

517532
nelem_alloc = nelem / freelist_partitions;
518-
if (nelem_alloc == 0)
533+
if (nelem_alloc <= 0)
519534
nelem_alloc = 1;
520535

521-
/* Make sure all memory will be used */
536+
/*
537+
* Make sure we'll allocate all the requested elements; freeList[0]
538+
* gets the excess if the request isn't divisible by NUM_FREELISTS.
539+
*/
522540
if (nelem_alloc * freelist_partitions < nelem)
523541
nelem_alloc_first =
524542
nelem - nelem_alloc * (freelist_partitions - 1);
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
620638
int i;
621639

622640
/*
623-
* initialize mutex if it's a partitioned table
641+
* initialize mutexes if it's a partitioned table
624642
*/
625643
if (IS_PARTITIONED(hctl))
626644
for (i = 0; i < NUM_FREELISTS; i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
902920
bool *foundPtr)
903921
{
904922
HASHHDR *hctl = hashp->hctl;
923+
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
905924
Size keysize;
906925
uint32 bucket;
907926
long segment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
910929
HASHBUCKET currBucket;
911930
HASHBUCKET *prevBucketPtr;
912931
HashCompareFunc match;
913-
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
914932

915933
#if HASH_STATISTICS
916934
hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
9931011
if (IS_PARTITIONED(hctl))
9941012
SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
9951013

1014+
/* delete the record from the appropriate nentries counter. */
9961015
Assert(hctl->freeList[freelist_idx].nentries > 0);
9971016
hctl->freeList[freelist_idx].nentries--;
9981017

9991018
/* remove record from hash bucket's chain. */
10001019
*prevBucketPtr = currBucket->link;
10011020

1002-
/* add the record to the freelist for this table. */
1021+
/* add the record to the appropriate freelist. */
10031022
currBucket->link = hctl->freeList[freelist_idx].freeList;
10041023
hctl->freeList[freelist_idx].freeList = currBucket;
10051024

@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
12201239
}
12211240

12221241
/*
1223-
* create a new entry if possible
1242+
* Allocate a new hashtable entry if possible; return NULL if out of memory.
1243+
* (Or, if the underlying space allocator throws error for out-of-memory,
1244+
* we won't return at all.)
12241245
*/
12251246
static HASHBUCKET
12261247
get_hash_entry(HTAB *hashp, int freelist_idx)
12271248
{
12281249
HASHHDR *hctl = hashp->hctl;
12291250
HASHBUCKET newElement;
1230-
int borrow_from_idx;
12311251

12321252
for (;;)
12331253
{
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12441264
if (IS_PARTITIONED(hctl))
12451265
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
12461266

1247-
/* no free elements. allocate another chunk of buckets */
1267+
/*
1268+
* No free elements in this freelist. In a partitioned table, there
1269+
* might be entries in other freelists, but to reduce contention we
1270+
* prefer to first try to get another chunk of buckets from the main
1271+
* shmem allocator. If that fails, though, we *MUST* root through all
1272+
* the other freelists before giving up. There are multiple callers
1273+
* that assume that they can allocate every element in the initially
1274+
* requested table size, or that deleting an element guarantees they
1275+
* can insert a new element, even if shared memory is entirely full.
1276+
* Failing because the needed element is in a different freelist is
1277+
* not acceptable.
1278+
*/
12481279
if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
12491280
{
1281+
int borrow_from_idx;
1282+
12501283
if (!IS_PARTITIONED(hctl))
12511284
return NULL; /* out of memory */
12521285

1253-
/* try to borrow element from another partition */
1286+
/* try to borrow element from another freelist */
12541287
borrow_from_idx = freelist_idx;
12551288
for (;;)
12561289
{
12571290
borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
12581291
if (borrow_from_idx == freelist_idx)
1259-
break;
1292+
break; /* examined all freelists, fail */
12601293

12611294
SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
12621295
newElement = hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12661299
hctl->freeList[borrow_from_idx].freeList = newElement->link;
12671300
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
12681301

1302+
/* careful: count the new element in its proper freelist */
12691303
SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
12701304
hctl->freeList[freelist_idx].nentries++;
12711305
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
12721306

1273-
break;
1307+
return newElement;
12741308
}
12751309

12761310
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
12771311
}
12781312

1279-
return newElement;
1313+
/* no elements available to borrow either, so out of memory */
1314+
return NULL;
12801315
}
12811316
}
12821317

@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
13001335
long sum = hashp->hctl->freeList[0].nentries;
13011336

13021337
/*
1303-
* We currently don't bother with the mutex; it's only sensible to call
1304-
* this function if you've got lock on all partitions of the table.
1338+
* We currently don't bother with acquiring the mutexes; it's only
1339+
* sensible to call this function if you've got lock on all partitions of
1340+
* the table.
13051341
*/
1306-
1307-
if (!IS_PARTITIONED(hashp->hctl))
1308-
return sum;
1309-
1310-
for (i = 1; i < NUM_FREELISTS; i++)
1311-
sum += hashp->hctl->freeList[i].nentries;
1342+
if (IS_PARTITIONED(hashp->hctl))
1343+
{
1344+
for (i = 1; i < NUM_FREELISTS; i++)
1345+
sum += hashp->hctl->freeList[i].nentries;
1346+
}
13121347

13131348
return sum;
13141349
}

0 commit comments

Comments (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy