Skip to content

Commit d526575

Browse files
committed
Make large sequential scans and VACUUMs work in a limited-size "ring" of
buffers, rather than blowing out the whole shared-buffer arena. Aside from avoiding cache spoliation, this fixes the problem that VACUUM formerly tended to cause a WAL flush for every page it modified, because we had it hacked to use only a single buffer. Those flushes will now occur only once per ring-ful. The exact ring size, and the threshold for seqscans to switch into the ring usage pattern, remain under debate; but the infrastructure seems done. The key bit of infrastructure is a new optional BufferAccessStrategy object that can be passed to ReadBuffer operations; this replaces the former StrategyHintVacuum API. This patch also changes the buffer usage-count methodology a bit: we now advance usage_count when first pinning a buffer, rather than when last unpinning it. To preserve the behavior that a buffer's lifetime starts to decrease when it's released, the clock sweep code is modified to not decrement usage_count of pinned buffers. Work not done in this commit: teach GiST and GIN indexes to use the vacuum BufferAccessStrategy for vacuum-driven fetches. Original patch by Simon, reworked by Heikki and again by Tom.
1 parent 0a6f2ee commit d526575

File tree

24 files changed

+723
-263
lines changed

24 files changed

+723
-263
lines changed

src/backend/access/hash/hash.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.94 2007/05/03 16:45:58 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.95 2007/05/30 20:11:51 tgl Exp $
1212
*
1313
* NOTES
1414
* This file contains only the public interface routines.
@@ -547,8 +547,9 @@ hashbulkdelete(PG_FUNCTION_ARGS)
547547

548548
vacuum_delay_point();
549549

550-
buf = _hash_getbuf(rel, blkno, HASH_WRITE,
551-
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
550+
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
551+
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
552+
info->strategy);
552553
page = BufferGetPage(buf);
553554
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
554555
Assert(opaque->hasho_bucket == cur_bucket);
@@ -596,7 +597,8 @@ hashbulkdelete(PG_FUNCTION_ARGS)
596597

597598
/* If we deleted anything, try to compact free space */
598599
if (bucket_dirty)
599-
_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
600+
_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
601+
info->strategy);
600602

601603
/* Release bucket lock */
602604
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

src/backend/access/hash/hashovfl.c

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.57 2007/05/03 16:45:58 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.58 2007/05/30 20:11:51 tgl Exp $
1212
*
1313
* NOTES
1414
* Overflow pages look like ordinary relation pages.
@@ -362,6 +362,9 @@ _hash_firstfreebit(uint32 map)
362362
* Remove this overflow page from its bucket's chain, and mark the page as
363363
* free. On entry, ovflbuf is write-locked; it is released before exiting.
364364
*
365+
* Since this function is invoked in VACUUM, we provide an access strategy
366+
* parameter that controls fetches of the bucket pages.
367+
*
365368
* Returns the block number of the page that followed the given page
366369
* in the bucket, or InvalidBlockNumber if no following page.
367370
*
@@ -370,7 +373,8 @@ _hash_firstfreebit(uint32 map)
370373
* on the bucket, too.
371374
*/
372375
BlockNumber
373-
_hash_freeovflpage(Relation rel, Buffer ovflbuf)
376+
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
377+
BufferAccessStrategy bstrategy)
374378
{
375379
HashMetaPage metap;
376380
Buffer metabuf;
@@ -413,8 +417,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
413417
*/
414418
if (BlockNumberIsValid(prevblkno))
415419
{
416-
Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE,
417-
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
420+
Buffer prevbuf = _hash_getbuf_with_strategy(rel,
421+
prevblkno,
422+
HASH_WRITE,
423+
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
424+
bstrategy);
418425
Page prevpage = BufferGetPage(prevbuf);
419426
HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);
420427

@@ -424,8 +431,11 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
424431
}
425432
if (BlockNumberIsValid(nextblkno))
426433
{
427-
Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE,
428-
LH_OVERFLOW_PAGE);
434+
Buffer nextbuf = _hash_getbuf_with_strategy(rel,
435+
nextblkno,
436+
HASH_WRITE,
437+
LH_OVERFLOW_PAGE,
438+
bstrategy);
429439
Page nextpage = BufferGetPage(nextbuf);
430440
HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);
431441

@@ -434,6 +444,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
434444
_hash_wrtbuf(rel, nextbuf);
435445
}
436446

447+
/* Note: bstrategy is intentionally not used for metapage and bitmap */
448+
437449
/* Read the metapage so we can determine which bitmap page to use */
438450
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
439451
metap = (HashMetaPage) BufferGetPage(metabuf);
@@ -558,11 +570,15 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
558570
*
559571
* Caller must hold exclusive lock on the target bucket. This allows
560572
* us to safely lock multiple pages in the bucket.
573+
*
574+
* Since this function is invoked in VACUUM, we provide an access strategy
575+
* parameter that controls fetches of the bucket pages.
561576
*/
562577
void
563578
_hash_squeezebucket(Relation rel,
564579
Bucket bucket,
565-
BlockNumber bucket_blkno)
580+
BlockNumber bucket_blkno,
581+
BufferAccessStrategy bstrategy)
566582
{
567583
Buffer wbuf;
568584
Buffer rbuf = 0;
@@ -581,7 +597,11 @@ _hash_squeezebucket(Relation rel,
581597
* start squeezing into the base bucket page.
582598
*/
583599
wblkno = bucket_blkno;
584-
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE);
600+
wbuf = _hash_getbuf_with_strategy(rel,
601+
wblkno,
602+
HASH_WRITE,
603+
LH_BUCKET_PAGE,
604+
bstrategy);
585605
wpage = BufferGetPage(wbuf);
586606
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
587607

@@ -595,16 +615,22 @@ _hash_squeezebucket(Relation rel,
595615
}
596616

597617
/*
598-
* find the last page in the bucket chain by starting at the base bucket
599-
* page and working forward.
618+
* Find the last page in the bucket chain by starting at the base bucket
619+
* page and working forward. Note: we assume that a hash bucket chain is
620+
* usually smaller than the buffer ring being used by VACUUM, else using
621+
* the access strategy here would be counterproductive.
600622
*/
601623
ropaque = wopaque;
602624
do
603625
{
604626
rblkno = ropaque->hasho_nextblkno;
605627
if (ropaque != wopaque)
606628
_hash_relbuf(rel, rbuf);
607-
rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
629+
rbuf = _hash_getbuf_with_strategy(rel,
630+
rblkno,
631+
HASH_WRITE,
632+
LH_OVERFLOW_PAGE,
633+
bstrategy);
608634
rpage = BufferGetPage(rbuf);
609635
ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
610636
Assert(ropaque->hasho_bucket == bucket);
@@ -644,7 +670,11 @@ _hash_squeezebucket(Relation rel,
644670
return;
645671
}
646672

647-
wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
673+
wbuf = _hash_getbuf_with_strategy(rel,
674+
wblkno,
675+
HASH_WRITE,
676+
LH_OVERFLOW_PAGE,
677+
bstrategy);
648678
wpage = BufferGetPage(wbuf);
649679
wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
650680
Assert(wopaque->hasho_bucket == bucket);
@@ -688,15 +718,19 @@ _hash_squeezebucket(Relation rel,
688718
/* yes, so release wbuf lock first */
689719
_hash_wrtbuf(rel, wbuf);
690720
/* free this overflow page (releases rbuf) */
691-
_hash_freeovflpage(rel, rbuf);
721+
_hash_freeovflpage(rel, rbuf, bstrategy);
692722
/* done */
693723
return;
694724
}
695725

696726
/* free this overflow page, then get the previous one */
697-
_hash_freeovflpage(rel, rbuf);
727+
_hash_freeovflpage(rel, rbuf, bstrategy);
698728

699-
rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
729+
rbuf = _hash_getbuf_with_strategy(rel,
730+
rblkno,
731+
HASH_WRITE,
732+
LH_OVERFLOW_PAGE,
733+
bstrategy);
700734
rpage = BufferGetPage(rbuf);
701735
ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
702736
Assert(ropaque->hasho_bucket == bucket);

src/backend/access/hash/hashpage.c

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.67 2007/05/03 16:45:58 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $
1212
*
1313
* NOTES
1414
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -214,6 +214,34 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno)
214214
return buf;
215215
}
216216

Added function (src/backend/access/hash/hashpage.c):

    /*
     * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
     *
     * This is identical to _hash_getbuf() but also allows a buffer access
     * strategy to be specified.  We use this for VACUUM operations.
     */
    Buffer
    _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
                               int access, int flags,
                               BufferAccessStrategy bstrategy)
    {
        Buffer      buf;

        if (blkno == P_NEW)
            elog(ERROR, "hash AM does not use P_NEW");

        buf = ReadBufferWithStrategy(rel, blkno, bstrategy);

        if (access != HASH_NOLOCK)
            LockBuffer(buf, access);

        /* ref count and lock type are correct */

        _hash_checkpage(rel, buf, flags);

        return buf;
    }
244+
217245
/*
218246
* _hash_relbuf() -- release a locked buffer.
219247
*
@@ -840,5 +868,5 @@ _hash_splitbucket(Relation rel,
840868
_hash_wrtbuf(rel, obuf);
841869
_hash_wrtbuf(rel, nbuf);
842870

843-
_hash_squeezebucket(rel, obucket, start_oblkno);
871+
_hash_squeezebucket(rel, obucket, start_oblkno, NULL);
844872
}

src/backend/access/heap/heapam.c

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.233 2007/05/27 03:50:38 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.234 2007/05/30 20:11:53 tgl Exp $
1212
*
1313
*
1414
* INTERFACE ROUTINES
@@ -83,6 +83,24 @@ initscan(HeapScanDesc scan, ScanKey key)
8383
*/
8484
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
8585

Added to initscan() (src/backend/access/heap/heapam.c):

    /*
     * If the table is large relative to NBuffers, use a bulk-read access
     * strategy, else use the default random-access strategy.  During a
     * rescan, don't make a new strategy object if we don't have to.
     */
    if (scan->rs_nblocks > NBuffers / 4 &&
        !scan->rs_rd->rd_istemp)
    {
        if (scan->rs_strategy == NULL)
            scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
    }
    else
    {
        if (scan->rs_strategy != NULL)
            FreeAccessStrategy(scan->rs_strategy);
        scan->rs_strategy = NULL;
    }
86104
scan->rs_inited = false;
87105
scan->rs_ctup.t_data = NULL;
88106
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
@@ -123,9 +141,17 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
123141

124142
Assert(page < scan->rs_nblocks);
125143

Changed in heapgetpage() (src/backend/access/heap/heapam.c):

    -   scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
    -                                        scan->rs_rd,
    -                                        page);
    +   /* release previous scan buffer, if any */
    +   if (BufferIsValid(scan->rs_cbuf))
    +   {
    +       ReleaseBuffer(scan->rs_cbuf);
    +       scan->rs_cbuf = InvalidBuffer;
    +   }
    +
    +   /* read page using selected strategy */
    +   scan->rs_cbuf = ReadBufferWithStrategy(scan->rs_rd,
    +                                          page,
    +                                          scan->rs_strategy);
129155
scan->rs_cblock = page;
130156

131157
if (!scan->rs_pageatatime)
@@ -938,6 +964,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
938964
scan->rs_rd = relation;
939965
scan->rs_snapshot = snapshot;
940966
scan->rs_nkeys = nkeys;
967+
scan->rs_strategy = NULL; /* set in initscan */
941968

942969
/*
943970
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
@@ -1007,6 +1034,9 @@ heap_endscan(HeapScanDesc scan)
10071034
if (scan->rs_key)
10081035
pfree(scan->rs_key);
10091036

1037+
if (scan->rs_strategy != NULL)
1038+
FreeAccessStrategy(scan->rs_strategy);
1039+
10101040
pfree(scan);
10111041
}
10121042

src/backend/access/nbtree/nbtree.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
1414
* IDENTIFICATION
15-
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.154 2007/01/05 22:19:23 momjian Exp $
15+
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.155 2007/05/30 20:11:53 tgl Exp $
1616
*
1717
*-------------------------------------------------------------------------
1818
*/
@@ -786,9 +786,10 @@ btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
786786
/*
787787
* We can't use _bt_getbuf() here because it always applies
788788
* _bt_checkpage(), which will barf on an all-zero page. We want to
789-
* recycle all-zero pages, not fail.
789+
* recycle all-zero pages, not fail. Also, we want to use a nondefault
790+
* buffer access strategy.
790791
*/
791-
buf = ReadBuffer(rel, blkno);
792+
buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
792793
LockBuffer(buf, BT_READ);
793794
page = BufferGetPage(buf);
794795
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

src/backend/access/transam/xlog.c

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.269 2007/05/20 21:08:19 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.270 2007/05/30 20:11:55 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -1799,6 +1799,36 @@ XLogFlush(XLogRecPtr record)
17991799
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
18001800
}
18011801

Added function (src/backend/access/transam/xlog.c):

    /*
     * Test whether XLOG data has been flushed up to (at least) the given
     * position.
     *
     * Returns true if a flush is still needed.  (It may be that someone else
     * is already in process of flushing that far, however.)
     */
    bool
    XLogNeedsFlush(XLogRecPtr record)
    {
        /* Quick exit if already known flushed */
        if (XLByteLE(record, LogwrtResult.Flush))
            return false;

        /* read LogwrtResult and update local state */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            LogwrtResult = xlogctl->LogwrtResult;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /* check again */
        if (XLByteLE(record, LogwrtResult.Flush))
            return false;

        return true;
    }
18021832
/*
18031833
* Create a new XLOG file segment, or open a pre-existing one.
18041834
*

src/backend/catalog/index.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.283 2007/05/16 17:28:20 alvherre Exp $
11+
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $
1212
*
1313
*
1414
* INTERFACE ROUTINES
@@ -1658,6 +1658,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
16581658
ivinfo.vacuum_full = false;
16591659
ivinfo.message_level = DEBUG2;
16601660
ivinfo.num_heap_tuples = -1;
1661+
ivinfo.strategy = NULL;
16611662

16621663
state.tuplesort = tuplesort_begin_datum(TIDOID,
16631664
TIDLessOperator, false,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy