

Commit e6eed40

Avoid BufferGetLSNAtomic() calls during nbtree scans.
Delay calling BufferGetLSNAtomic() until we finish reading a page that actually contains items that btgettuple will return to the executor. This reduces the number of calls during plain index scans (we'll only call BufferGetLSNAtomic() when _bt_readpage returns true), and totally eliminates calls during index-only scans, bitmap index scans, and plain index scans of an unlogged relation.

Currently, when checksums (or wal_log_hints) are enabled, acquiring a page's LSN in BufferGetLSNAtomic() involves locking the buffer header (which involves the use of spinlocks). Testing has shown that enabling page-level checksums causes large regressions with certain workloads, especially on larger multi-socket systems.

The regression isn't tied to any Postgres 18 commit. However, Postgres 18 commit 04bec89 made initdb use checksums by default, so it seems prudent to address the problem now.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/941f0190-e3c6-4622-9ac7-c04e936e5fdb@vondra.me
Discussion: https://postgr.es/m/CAH2-Wzk-Dg5XWs_jDuiHt4_7ryrSY+n=vxmHY51EVqPDFsKXmg@mail.gmail.com
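For context on where the spinlock enters the picture: when checksums or wal_log_hints are enabled, BufferGetLSNAtomic() cannot simply read the page LSN, because a concurrent hint-bit write might be setting it at the same time. Below is a simplified sketch of its logic, paraphrased from src/backend/storage/buffer/bufmgr.c — it is not part of this commit's diff, and the actual sources are authoritative:

```c
XLogRecPtr
BufferGetLSNAtomic(Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	BufferDesc *bufHdr;
	XLogRecPtr	lsn;
	uint32		buf_state;

	/* Fast path: without checksums/wal_log_hints a plain read suffices */
	if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
		return PageGetLSN(page);

	/* Otherwise take the buffer header spinlock -- the contended hot spot */
	bufHdr = GetBufferDescriptor(buffer - 1);
	buf_state = LockBufHdr(bufHdr);
	lsn = PageGetLSN(page);
	UnlockBufHdr(bufHdr, buf_state);

	return lsn;
}
```

With checksums now the initdb default, every call that reaches the slow path funnels through LockBufHdr(), which is why trimming the number of call sites matters on large multi-socket systems.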
Parent: 016e407

File tree: 4 files changed, +102 -71 lines

src/backend/access/nbtree/nbtree.c

Lines changed: 30 additions & 0 deletions

```diff
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		res;
 
+	Assert(scan->heapRelation != NULL);
+
 	/* btree indexes are never lossy */
 	scan->xs_recheck = false;
 
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	int64		ntids = 0;
 	ItemPointer heapTid;
 
+	Assert(scan->heapRelation == NULL);
+
 	/* Each loop iteration performs another primitive index scan */
 	do
 	{
@@ -393,6 +397,32 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		BTScanPosInvalidate(so->currPos);
 	}
 
+	/*
+	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
+	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+	 *
+	 * We cannot safely drop leaf page pins during index-only scans due to a
+	 * race condition involving VACUUM setting pages all-visible in the VM.
+	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+	 *
+	 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+	 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+	 * The scheme used to detect unsafe TID recycling won't work when scanning
+	 * unlogged relations (since it involves saving an affected page's LSN).
+	 * Opt out of eager pin dropping during unlogged relation scans for now
+	 * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+	 *
+	 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+	 * Pins cannot be held for more than an instant during bitmap scans either
+	 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+	 *
+	 * See nbtree/README section on making concurrent TID recycling safe.
+	 */
+	so->dropPin = (!scan->xs_want_itup &&
+				   IsMVCCSnapshot(scan->xs_snapshot) &&
+				   RelationNeedsWAL(scan->indexRelation) &&
+				   scan->heapRelation != NULL);
+
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
 	so->scanBehind = false;
```
src/backend/access/nbtree/nbtsearch.c

Lines changed: 37 additions & 33 deletions

```diff
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 /*
  * _bt_drop_lock_and_maybe_pin()
  *
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
  */
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 {
-	_bt_unlockbuf(scan->indexRelation, sp->buf);
-
-	if (IsMVCCSnapshot(scan->xs_snapshot) &&
-		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+	if (!so->dropPin)
 	{
-		ReleaseBuffer(sp->buf);
-		sp->buf = InvalidBuffer;
+		/* Just drop the lock (not the pin) */
+		_bt_unlockbuf(rel, so->currPos.buf);
+		return;
 	}
+
+	/*
+	 * Drop both the lock and the pin.
+	 *
+	 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+	 * when concurrent heap TID recycling by VACUUM might have taken place.
+	 */
+	Assert(RelationNeedsWAL(rel));
+	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+	_bt_relbuf(rel, so->currPos.buf);
+	so->currPos.buf = InvalidBuffer;
 }
 
 /*
@@ -866,8 +871,8 @@ _bt_compare(Relation rel,
  * if backwards scan, the last item) in the tree that satisfies the
  * qualifications in the scan key.  On success exit, data about the
  * matching tuple(s) on the page has been loaded into so->currPos.  We'll
- * drop all locks and hold onto a pin on page's buffer, except when
- * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
+ * drop all locks and hold onto a pin on page's buffer, except during
+ * so->dropPin scans, when we drop both the lock and the pin.
  * _bt_returnitem sets the next item to return to scan on success exit.
  *
  * If there are no matching items in the index, we return false, with no
@@ -1610,7 +1615,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
 	so->currPos.prevPage = opaque->btpo_prev;
 	so->currPos.nextPage = opaque->btpo_next;
+	/* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
+	so->currPos.dir = dir;
+	so->currPos.nextTupleOffset = 0;
 
+	/* either moreRight or moreLeft should be set now (may be unset later) */
+	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
+		   so->currPos.moreLeft);
 	Assert(!P_IGNORE(opaque));
 	Assert(BTScanPosIsPinned(so->currPos));
 	Assert(!so->needPrimScan);
@@ -1626,14 +1637,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 						 so->currPos.currPage);
 	}
 
-	/* initialize remaining currPos fields related to current page */
-	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
-	so->currPos.dir = dir;
-	so->currPos.nextTupleOffset = 0;
-	/* either moreLeft or moreRight should be set now (may be unset later) */
-	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
-		   so->currPos.moreLeft);
-
 	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
 
 	/* initialize local variables */
@@ -2107,10 +2110,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, so->currPos must be valid.  Its buffer will be pinned, though
+ * never locked.  (Actually, when so->dropPin there won't even be a pin held,
+ * though so->currPos.currPage must still be set to a valid block number.)
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2251,12 +2253,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
+		Relation	rel = scan->indexRelation;
+
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(rel, so);
 		return true;
 	}
 
@@ -2294,8 +2298,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  *
  * On success exit, so->currPos is updated to contain data from the next
  * interesting page, and we return true.  We hold a pin on the buffer on
- * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
- * to eagerly drop the pin (to avoid blocking VACUUM).
+ * success exit (except during so->dropPin index scans, when we drop the pin
+ * eagerly to avoid blocking VACUUM).
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, invalidate so->currPos, and return false.
@@ -2413,7 +2417,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 		 */
 		Assert(so->currPos.currPage == blkno);
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(rel, so);
 
 		return true;
 	}
```
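To make the handshake between _bt_drop_lock_and_maybe_pin and _bt_killitems concrete, here is a hypothetical, self-contained model (plain C with invented types; not PostgreSQL code) of the save-LSN-then-recheck protocol that so->dropPin scans rely on:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;

typedef struct FakePage
{
	XLogRecPtr	lsn;			/* bumped by every WAL-logged change */
} FakePage;

typedef struct ScanPos
{
	FakePage   *page;			/* stands in for the buffer */
	XLogRecPtr	lsn;			/* LSN saved when the pin was dropped */
	bool		pinned;
} ScanPos;

/* Analogue of _bt_drop_lock_and_maybe_pin: only dropPin scans read the LSN */
static void
drop_lock_and_maybe_pin(ScanPos *pos, bool dropPin)
{
	if (!dropPin)
		return;					/* keep the pin; no LSN read needed */
	pos->lsn = pos->page->lsn;	/* the lone BufferGetLSNAtomic()-style read */
	pos->pinned = false;
}

/* Analogue of _bt_killitems' safety check before setting LP_DEAD bits */
static bool
safe_to_set_hints(const ScanPos *pos, bool dropPin)
{
	if (!dropPin)
		return true;			/* the pin blocked VACUUM the whole time */
	return pos->page->lsn == pos->lsn;	/* page unchanged since we read it */
}

int
main(void)
{
	FakePage	page = {.lsn = 100};
	ScanPos		pos = {.page = &page, .pinned = true};

	drop_lock_and_maybe_pin(&pos, true);
	page.lsn = 101;				/* concurrent change (say, by VACUUM) */
	printf("hinting safe: %s\n", safe_to_set_hints(&pos, true) ? "yes" : "no");
	return 0;					/* prints "no": _bt_killitems would give up */
}
```

The real code additionally asserts that the saved LSN can only move forward (so->currPos.lsn <= latestlsn), since a WAL-logged page never goes back to an older LSN.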

src/backend/access/nbtree/nbtutils.c

Lines changed: 32 additions & 36 deletions

```diff
@@ -3335,75 +3335,71 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  *
  * Note that if we hold a pin on the target page continuously from initially
  * reading the items until applying this function, VACUUM cannot have deleted
- * any items from the page, and so there is no need to search left from the
- * recorded offset.  (This observation also guarantees that the item is still
- * the right one to delete, which might otherwise be questionable since heap
- * TIDs can get recycled.)  This holds true even if the page has been modified
- * by inserts and page splits, so there is no need to consult the LSN.
- *
- * If the pin was released after reading the page, then we re-read it.  If it
- * has been modified since we read it (as determined by the LSN), we dare not
- * flag any entries because it is possible that the old entry was vacuumed
- * away and the TID was re-used by a completely different heap tuple.
+ * any items on the page, so the page's TIDs can't have been recycled by now.
+ * There's no risk that we'll confuse a new index tuple that happens to use a
+ * recycled TID with a now-removed tuple with the same TID (that used to be on
+ * this same page).  We can't rely on that during scans that drop pins eagerly
+ * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * the page LSN having not changed since back when _bt_readpage saw the page.
  */
 void
 _bt_killitems(IndexScanDesc scan)
 {
+	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	Page		page;
 	BTPageOpaque opaque;
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
-	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
-	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
+	Assert(numKilled > 0);
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
-	/*
-	 * Always reset the scan state, so we don't look for same items on other
-	 * pages.
-	 */
+	/* Always invalidate so->killedItems[] before leaving so->currPos */
 	so->numKilled = 0;
 
-	if (BTScanPosIsPinned(so->currPos))
+	if (!so->dropPin)
 	{
 		/*
 		 * We have held the pin on this page since we read the index tuples,
 		 * so all we need to do is lock it.  The pin will have prevented
-		 * re-use of any TID on the page, so there is no need to check the
-		 * LSN.
+		 * concurrent VACUUMs from recycling any of the TIDs on the page.
 		 */
-		droppedpin = false;
-		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
-
-		page = BufferGetPage(so->currPos.buf);
+		Assert(BTScanPosIsPinned(so->currPos));
+		_bt_lockbuf(rel, so->currPos.buf, BT_READ);
 	}
 	else
 	{
 		Buffer		buf;
+		XLogRecPtr	latestlsn;
 
-		droppedpin = true;
-		/* Attempt to re-read the buffer, getting pin and lock. */
-		buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+		Assert(!BTScanPosIsPinned(so->currPos));
+		Assert(RelationNeedsWAL(rel));
+		buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
-		page = BufferGetPage(buf);
-		if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
-			so->currPos.buf = buf;
-		else
+		latestlsn = BufferGetLSNAtomic(buf);
+		Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+		Assert(so->currPos.lsn <= latestlsn);
+		if (so->currPos.lsn != latestlsn)
 		{
-			/* Modified while not pinned means hinting is not safe. */
-			_bt_relbuf(scan->indexRelation, buf);
+			/* Modified, give up on hinting */
+			_bt_relbuf(rel, buf);
 			return;
 		}
+
+		/* Unmodified, hinting is safe */
+		so->currPos.buf = buf;
 	}
 
+	page = BufferGetPage(so->currPos.buf);
 	opaque = BTPageGetOpaque(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	for (i = 0; i < numKilled; i++)
+	for (int i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
 		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
@@ -3435,7 +3431,7 @@ _bt_killitems(IndexScanDesc scan)
 			 * correctness.
 			 *
 			 * Note that the page may have been modified in almost any way
-			 * since we first read it (in the !droppedpin case), so it's
+			 * since we first read it (in the !so->dropPin case), so it's
 			 * possible that this posting list tuple wasn't a posting list
 			 * tuple when we first encountered its heap TIDs.
 			 */
@@ -3451,7 +3447,7 @@
 				 * though only in the common case where the page can't
 				 * have been concurrently modified
 				 */
-				Assert(kitem->indexOffset == offnum || !droppedpin);
+				Assert(kitem->indexOffset == offnum || !so->dropPin);
 
 				/*
 				 * Read-ahead to later kitems here.
@@ -3518,7 +3514,7 @@ _bt_killitems(IndexScanDesc scan)
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
-	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	_bt_unlockbuf(rel, so->currPos.buf);
 }
 
 
```
src/include/access/nbtree.h

Lines changed: 3 additions & 2 deletions

```diff
@@ -939,7 +939,7 @@ typedef BTVacuumPostingData *BTVacuumPosting;
 * processing.  This approach minimizes lock/unlock traffic.  We must always
 * drop the lock to make it okay for caller to process the returned items.
 * Whether or not we can also release the pin during this window will vary.
- * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM
+ * We drop the pin (when so->dropPin) to avoid blocking progress by VACUUM
 * (see nbtree/README section about making concurrent TID recycling safe).
 * We'll always release both the lock and the pin on the current page before
 * moving on to its sibling page.
@@ -967,7 +967,7 @@ typedef struct BTScanPosData
 	BlockNumber currPage;		/* page referenced by items array */
 	BlockNumber prevPage;		/* currPage's left link */
 	BlockNumber nextPage;		/* currPage's right link */
-	XLogRecPtr	lsn;			/* currPage's LSN */
+	XLogRecPtr	lsn;			/* currPage's LSN (when so->dropPin) */
 
 	/* scan direction for the saved position's call to _bt_readpage */
 	ScanDirection dir;
@@ -1070,6 +1070,7 @@ typedef struct BTScanOpaqueData
 	/* info about killed items if any (killedItems is NULL if never used) */
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
+	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
```
Source: https://github.com/postgres/postgres/commit/e6eed40e44419e3268d01fe0d2daec08a7df68f7