
Commit 37b7fc0

robertmhaas authored and Alexander Korotkov committed
Move buffer I/O and content LWLocks out of the main tranche.
Move the content lock directly into the BufferDesc, so that locking and pinning a buffer touches only one cache line rather than two. Adjust the definition of BufferDesc slightly so that this doesn't make the BufferDesc any larger than one cache line (at least on platforms where a spinlock is only 1 or 2 bytes).

We can't fit the I/O locks into the BufferDesc and stay within one cache line, so move those to a completely separate tranche. This leaves a relatively limited number of LWLocks in the main tranche, so increase the padding of those remaining locks to a full cache line, rather than allowing adjacent locks to share a cache line, hopefully reducing false sharing.

Performance testing shows that these changes make little difference on laptop-class machines, but help significantly on larger servers, especially those with more than 2 sockets.

Andres Freund, originally based on an earlier patch by Simon Riggs. Review and cosmetic adjustments (including heavy rewriting of the comments) by me.
1 parent f39aecb commit 37b7fc0
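
The commit's core trick is a padded-union layout: each buffer descriptor is forced onto its own cache line, so pinning or locking one buffer never dirties a neighboring descriptor's line, and embedding the content lock keeps the lock on that same line. Below is a minimal, self-contained C sketch of the idea; the field set and names are simplified stand-ins, not the real BufferDesc/BufferDescPadded definitions from buf_internals.h.

    #include <stdint.h>

    /* Hypothetical, stripped-down descriptor; the real BufferDesc carries
     * more fields, but the layout discipline is the same. */
    typedef struct Desc
    {
        uint32_t tag;           /* identity of the cached page */
        uint16_t flags;         /* state bits */
        uint8_t  usage_count;   /* clock-sweep counter */
        uint8_t  spinlock;      /* 1-byte header spinlock */
        uint32_t refcount;      /* pin count */
        uint32_t lock_state;    /* stand-in for the embedded content lock */
    } Desc;

    /* Pad each descriptor to a full cache line so two descriptors never
     * share one; sharing would cause false sharing between backends. */
    #define CACHE_LINE_SIZE 64

    typedef union DescPadded
    {
        Desc desc;
        char pad[CACHE_LINE_SIZE];
    } DescPadded;

    /* The union trick only helps if the payload actually fits. */
    _Static_assert(sizeof(Desc) <= CACHE_LINE_SIZE,
                   "Desc larger than a cache line");
    _Static_assert(sizeof(DescPadded) == CACHE_LINE_SIZE,
                   "padding miscalculated");

An array of DescPadded then has every element starting on a 64-byte boundary, provided the array itself is cache-line aligned, which is exactly what the buf_init.c changes below arrange.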

File tree: 6 files changed (+151, -67)

src/backend/storage/buffer/buf_init.c

Lines changed: 51 additions & 10 deletions
@@ -20,6 +20,9 @@
 
 BufferDescPadded *BufferDescriptors;
 char       *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+LWLockTranche BufferIOLWLockTranche;
+LWLockTranche BufferContentLWLockTranche;
 
 
 /*
@@ -65,22 +68,45 @@ void
 InitBufferPool(void)
 {
     bool        foundBufs,
-                foundDescs;
+                foundDescs,
+                foundIOLocks;
 
     /* Align descriptors to a cacheline boundary. */
-    BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
-                    ShmemInitStruct("Buffer Descriptors",
-                    NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
-                                    &foundDescs));
+    BufferDescriptors = (BufferDescPadded *)
+        CACHELINEALIGN(
+                       ShmemInitStruct("Buffer Descriptors",
+                                       NBuffers * sizeof(BufferDescPadded)
+                                       + PG_CACHE_LINE_SIZE,
+                                       &foundDescs));
 
     BufferBlocks = (char *)
         ShmemInitStruct("Buffer Blocks",
                         NBuffers * (Size) BLCKSZ, &foundBufs);
 
-    if (foundDescs || foundBufs)
+    /* Align lwlocks to cacheline boundary */
+    BufferIOLWLockArray = (LWLockMinimallyPadded *)
+        CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
+                                       NBuffers * (Size) sizeof(LWLockMinimallyPadded)
+                                       + PG_CACHE_LINE_SIZE,
+                                       &foundIOLocks));
+
+    BufferIOLWLockTranche.name = "Buffer IO Locks";
+    BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+    BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
+                          &BufferIOLWLockTranche);
+
+    BufferContentLWLockTranche.name = "Buffer Content Locks";
+    BufferContentLWLockTranche.array_base =
+        ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+    BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
+                          &BufferContentLWLockTranche);
+
+    if (foundDescs || foundBufs || foundIOLocks)
     {
-        /* both should be present or neither */
-        Assert(foundDescs && foundBufs);
+        /* should find all of these, or none of them */
+        Assert(foundDescs && foundBufs && foundIOLocks);
         /* note: this path is only taken in EXEC_BACKEND case */
     }
     else
@@ -110,8 +136,11 @@ InitBufferPool(void)
         */
        buf->freeNext = i + 1;
 
-       buf->io_in_progress_lock = LWLockAssign();
-       buf->content_lock = LWLockAssign();
+       LWLockInitialize(BufferDescriptorGetContentLock(buf),
+                        LWTRANCHE_BUFFER_CONTENT);
+
+       LWLockInitialize(BufferDescriptorGetIOLock(buf),
+                        LWTRANCHE_BUFFER_IO_IN_PROGRESS);
    }
 
    /* Correct last entry of linked list */
@@ -144,5 +173,17 @@ BufferShmemSize(void)
    /* size of stuff controlled by freelist.c */
    size = add_size(size, StrategyShmemSize());
 
+   /*
+    * It would be nice to include the I/O locks in the BufferDesc, but that
+    * would increase the size of a BufferDesc to more than one cache line,
+    * and benchmarking has shown that keeping every BufferDesc aligned on a
+    * cache line boundary is important for performance.  So, instead, the
+    * array of I/O locks is allocated in a separate tranche.  Because those
+    * locks are not highly contended, we lay out the array with minimal
+    * padding.
+    */
+   size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+   /* to allow aligning the above */
+   size = add_size(size, PG_CACHE_LINE_SIZE);
+
    return size;
 }
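
A note on the CACHELINEALIGN() pattern used above: ShmemInitStruct() makes no cache-line alignment promise, so each request allocates PG_CACHE_LINE_SIZE extra bytes and rounds the returned pointer up to the next boundary. A sketch of that rounding, assuming a 64-byte line (hypothetical helper, not the PostgreSQL macro, which is built on TYPEALIGN):

    #include <stdint.h>

    #define CACHE_LINE_SIZE 64  /* assumed; PostgreSQL uses PG_CACHE_LINE_SIZE */

    /* Round a pointer up to the next cache-line boundary.  Because the
     * allocation requested size + CACHE_LINE_SIZE bytes, the aligned
     * region is still large enough for the whole array. */
    static inline void *
    cacheline_align(void *ptr)
    {
        uintptr_t p = (uintptr_t) ptr;
        return (void *) ((p + CACHE_LINE_SIZE - 1) &
                         ~((uintptr_t) (CACHE_LINE_SIZE - 1)));
    }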

src/backend/storage/buffer/bufmgr.c

Lines changed: 30 additions & 27 deletions
@@ -677,7 +677,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     if (!isLocalBuf)
     {
         if (mode == RBM_ZERO_AND_LOCK)
-            LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                          LW_EXCLUSIVE);
         else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
             LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
     }
@@ -818,7 +819,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
         !isLocalBuf)
     {
-        LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
     }
 
     if (isLocalBuf)
@@ -984,7 +985,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
          * happens to be trying to split the page the first one got from
          * StrategyGetBuffer.)
          */
-        if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+        if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                     LW_SHARED))
         {
             /*
              * If using a nondefault strategy, and writing the buffer
@@ -1006,7 +1008,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 StrategyRejectBuffer(strategy, buf))
             {
                 /* Drop lock/pin and loop around for another buffer */
-                LWLockRelease(buf->content_lock);
+                LWLockRelease(BufferDescriptorGetContentLock(buf));
                 UnpinBuffer(buf, true);
                 continue;
             }
@@ -1019,7 +1021,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                           smgr->smgr_rnode.node.relNode);
 
             FlushBuffer(buf, NULL);
-            LWLockRelease(buf->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(buf));
 
             TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
                                                      smgr->smgr_rnode.node.spcNode,
@@ -1334,7 +1336,7 @@ MarkBufferDirty(Buffer buffer)
 
     Assert(BufferIsPinned(buffer));
     /* unfortunately we can't check if the lock is held exclusively */
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
     LockBufHdr(bufHdr);
 
@@ -1534,8 +1536,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
     if (ref->refcount == 0)
     {
         /* I'd better not still hold any locks on the buffer */
-        Assert(!LWLockHeldByMe(buf->content_lock));
-        Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+        Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+        Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
         LockBufHdr(buf);
 
@@ -2055,11 +2057,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
      * buffer is clean by the time we've locked it.)
      */
    PinBuffer_Locked(bufHdr);
-   LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+   LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
    FlushBuffer(bufHdr, NULL);
 
-   LWLockRelease(bufHdr->content_lock);
+   LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    UnpinBuffer(bufHdr, true);
 
    return result | BUF_WRITTEN;
@@ -2865,9 +2867,9 @@ FlushRelationBuffers(Relation rel)
             (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
-            LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
             FlushBuffer(bufHdr, rel->rd_smgr);
-            LWLockRelease(bufHdr->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
         else
@@ -2917,9 +2919,9 @@ FlushDatabaseBuffers(Oid dbid)
             (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
-            LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
             FlushBuffer(bufHdr, NULL);
-            LWLockRelease(bufHdr->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
         else
@@ -2943,7 +2945,7 @@ FlushOneBuffer(Buffer buffer)
 
     bufHdr = GetBufferDescriptor(buffer - 1);
 
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
     FlushBuffer(bufHdr, NULL);
 }
@@ -3040,7 +3042,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
     Assert(GetPrivateRefCount(buffer) > 0);
     /* here, either share or exclusive lock is OK */
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
     /*
      * This routine might get called many times on the same page, if we are
@@ -3193,11 +3195,11 @@ LockBuffer(Buffer buffer, int mode)
     buf = GetBufferDescriptor(buffer - 1);
 
     if (mode == BUFFER_LOCK_UNLOCK)
-        LWLockRelease(buf->content_lock);
+        LWLockRelease(BufferDescriptorGetContentLock(buf));
     else if (mode == BUFFER_LOCK_SHARE)
-        LWLockAcquire(buf->content_lock, LW_SHARED);
+        LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
     else if (mode == BUFFER_LOCK_EXCLUSIVE)
-        LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
     else
         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3218,7 +3220,8 @@ ConditionalLockBuffer(Buffer buffer)
 
     buf = GetBufferDescriptor(buffer - 1);
 
-    return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+    return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                    LW_EXCLUSIVE);
 }
 
 /*
@@ -3428,8 +3431,8 @@ WaitIO(volatile BufferDesc *buf)
         UnlockBufHdr(buf);
         if (!(sv_flags & BM_IO_IN_PROGRESS))
             break;
-        LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
     }
 }
 
@@ -3462,7 +3465,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
      * Grab the io_in_progress lock so that other processes can wait for
      * me to finish the I/O.
      */
-    LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+    LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
     LockBufHdr(buf);
 
@@ -3476,7 +3479,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
          * him to get unwedged.
          */
         UnlockBufHdr(buf);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
         WaitIO(buf);
     }
 
@@ -3486,7 +3489,7 @@ StartBufferIO(volatile BufferDesc *buf, bool forInput)
     {
         /* someone else already did the I/O */
         UnlockBufHdr(buf);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
         return false;
     }
 
@@ -3535,7 +3538,7 @@ TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
 
     InProgressBuf = NULL;
 
-    LWLockRelease(buf->io_in_progress_lock);
+    LWLockRelease(BufferDescriptorGetIOLock(buf));
 }
 
 /*
@@ -3560,7 +3563,7 @@ AbortBufferIO(void)
      * we can use TerminateBufferIO.  Anyone who's executing WaitIO on the
      * buffer will be in a busy spin until we succeed in doing this.
      */
-    LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+    LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
     LockBufHdr(buf);
     Assert(buf->flags & BM_IO_IN_PROGRESS);
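
The bufmgr.c changes are deliberately mechanical: every direct buf->content_lock or buf->io_in_progress_lock dereference becomes an accessor-macro call. That indirection is what lets the I/O locks leave the struct while the content lock moves into it: call sites no longer encode where a lock lives. A hypothetical illustration of the pattern (toy types, not the PostgreSQL definitions):

    #include <stdio.h>

    typedef struct Lock { int state; } Lock;

    typedef struct BufDesc
    {
        int  buf_id;
        Lock content_lock;   /* embedded: shares a cache line with the header */
    } BufDesc;

    static Lock io_lock_array[16];  /* external: separate, tightly packed array */

    /* Accessors hide each lock's location, so call sites read identically
     * whether the lock is embedded or external. */
    #define GetContentLock(bdesc) (&(bdesc)->content_lock)
    #define GetIOLock(bdesc)      (&io_lock_array[(bdesc)->buf_id])

    int
    main(void)
    {
        BufDesc d = { .buf_id = 3, .content_lock = { 0 } };
        printf("content lock at %p, io lock at %p\n",
               (void *) GetContentLock(&d), (void *) GetIOLock(&d));
        return 0;
    }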

src/backend/storage/lmgr/lwlock.c

Lines changed: 8 additions & 7 deletions
@@ -344,18 +344,15 @@ NumLWLocks(void)
     int         numLocks;
 
     /*
-     * Possibly this logic should be spread out among the affected modules,
-     * the same way that shmem space estimation is done.  But for now, there
-     * are few enough users of LWLocks that we can get away with just keeping
-     * the knowledge here.
+     * Many users of LWLocks no longer reserve space in the main array here,
+     * but instead allocate separate tranches.  The latter approach has the
+     * advantage of allowing LWLOCK_STATS and LOCK_DEBUG output to produce
+     * more useful output.
      */
 
     /* Predefined LWLocks */
     numLocks = NUM_FIXED_LWLOCKS;
 
-    /* bufmgr.c needs two for each shared buffer */
-    numLocks += 2 * NBuffers;
-
     /* proc.c needs one for each backend or auxiliary process */
     numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
 
@@ -423,6 +420,10 @@ CreateLWLocks(void)
     StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
                      "MAX_BACKENDS too big for lwlock.c");
 
+    StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
+                     sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+                     "Miscalculated LWLock padding");
+
     if (!IsUnderPostmaster)
     {
         int         numLocks = NumLWLocks();
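
The new static assertion guards two padding flavors: fully padded locks (one per cache line) for the main array, and minimally padded locks for dense arrays such as the buffer I/O locks. The lwlock.h side of this commit isn't shown on this page, so the sketch below is only an assumed shape of those unions, with a stand-in LWLock struct:

    #include <stdint.h>

    /* Stand-in for the real LWLock; the actual fields differ. */
    typedef struct LWLock
    {
        uint16_t tranche;       /* tranche ID of this lock */
        uint32_t state;         /* lock state, updated atomically */
        void    *waiters;       /* stand-in for the wait queue */
    } LWLock;

    /* Assumed sizes: a full cache line for main-array locks, a smaller
     * power of two where contention (and thus false sharing) is low. */
    #define LWLOCK_PADDED_SIZE   64
    #define LWLOCK_MINIMAL_SIZE  32

    typedef union LWLockPadded
    {
        LWLock lock;
        char   pad[LWLOCK_PADDED_SIZE];
    } LWLockPadded;

    typedef union LWLockMinimallyPadded
    {
        LWLock lock;
        char   pad[LWLOCK_MINIMAL_SIZE];
    } LWLockMinimallyPadded;

    /* The CreateLWLocks() assertion reduces to this compile-time check. */
    _Static_assert(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
                   sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
                   "Miscalculated LWLock padding");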

src/include/storage/buf_internals.h

Lines changed: 16 additions & 7 deletions
@@ -115,8 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLocks can take
- * care of themselves.  The buf_hdr_lock is *not* used to control access to
+ * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
+ * take care of itself.  The buf_hdr_lock is *not* used to control access to
  * the data in the buffer!
  *
  * An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
  *
  * We use this same struct for local buffer headers, but the lock fields
  * are not used and not all of the flag bits are useful either.
+ *
+ * Be careful to avoid increasing the size of the struct when adding or
+ * reordering members.  Keeping it below 64 bytes (the most common CPU
+ * cache line size) is fairly important for performance.
  */
 typedef struct BufferDesc
 {
     BufferTag   tag;            /* ID of page contained in buffer */
     BufFlags    flags;          /* see bit definitions above */
-    uint16      usage_count;    /* usage counter for clock sweep code */
+    uint8       usage_count;    /* usage counter for clock sweep code */
+    slock_t     buf_hdr_lock;   /* protects a subset of fields, see above */
     unsigned    refcount;       /* # of backends holding pins on buffer */
     int         wait_backend_pid;       /* backend PID of pin-count waiter */
 
-    slock_t     buf_hdr_lock;   /* protects the above fields */
-
     int         buf_id;         /* buffer's index number (from 0) */
     int         freeNext;       /* link in freelist chain */
 
-    LWLock     *io_in_progress_lock;    /* to wait for I/O to complete */
-    LWLock     *content_lock;   /* to lock access to buffer contents */
+    LWLock      content_lock;   /* to lock access to buffer contents */
 } BufferDesc;
 
 /*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
+#define BufferDescriptorGetIOLock(bdesc) \
+    (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+    ((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
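
One subtlety worth spelling out: after this commit the content locks are no longer a contiguous LWLock array; each one is a field inside a BufferDescPadded. The tranche registration in buf_init.c handles that by setting array_base to descriptor 0's content_lock and array_stride to sizeof(BufferDescPadded), so a lock pointer can still be mapped back to a buffer index for debugging and statistics output. A hypothetical helper showing the arithmetic (not a PostgreSQL function):

    #include <stddef.h>
    #include <stdint.h>

    /* Recover a lock's index in a tranche laid out with a fixed stride.
     * For the content-lock tranche, base = &descriptors[0].content_lock
     * and stride = sizeof(BufferDescPadded), so the result is the buffer
     * id even though the locks themselves are not contiguous. */
    static inline int
    tranche_lock_index(const void *lock, const void *base, size_t stride)
    {
        return (int) (((uintptr_t) lock - (uintptr_t) base) / stride);
    }

This stride-based addressing is what keeps LWLOCK_STATS and LOCK_DEBUG reporting meaningful for locks embedded in another struct, per the comment in NumLWLocks() above.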
