Skip to content

Commit bbace56

Browse files
committed
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
1 parent 81045e1 commit bbace56

File tree

11 files changed

+120
-28
lines changed

11 files changed

+120
-28
lines changed

src/backend/access/transam/multixact.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30753075
* crash/basebackup, even though the state of the data directory would
30763076
* require it.
30773077
*/
3078-
Assert(!MyProc->delayChkpt);
3079-
MyProc->delayChkpt = true;
3078+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3079+
MyProc->delayChkpt |= DELAY_CHKPT_START;
30803080

30813081
/* WAL log truncation */
30823082
WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
31023102
/* Then offsets */
31033103
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
31043104

3105-
MyProc->delayChkpt = false;
3105+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
31063106

31073107
END_CRIT_SECTION();
31083108
LWLockRelease(MultiXactTruncationLock);

src/backend/access/transam/twophase.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
474474
}
475475
proc->xid = xid;
476476
Assert(proc->xmin == InvalidTransactionId);
477-
proc->delayChkpt = false;
477+
proc->delayChkpt = 0;
478478
proc->statusFlags = 0;
479479
proc->pid = 0;
480480
proc->databaseId = databaseid;
@@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact)
11651165

11661166
START_CRIT_SECTION();
11671167

1168-
MyProc->delayChkpt = true;
1168+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
1169+
MyProc->delayChkpt |= DELAY_CHKPT_START;
11691170

11701171
XLogBeginInsert();
11711172
for (record = records.head; record != NULL; record = record->next)
@@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
12081209
* checkpoint starting after this will certainly see the gxact as a
12091210
* candidate for fsyncing.
12101211
*/
1211-
MyProc->delayChkpt = false;
1212+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
12121213

12131214
/*
12141215
* Remember that we have this GlobalTransaction entry locked for us. If
@@ -2275,7 +2276,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
22752276
START_CRIT_SECTION();
22762277

22772278
/* See notes in RecordTransactionCommit */
2278-
MyProc->delayChkpt = true;
2279+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
2280+
MyProc->delayChkpt |= DELAY_CHKPT_START;
22792281

22802282
/*
22812283
* Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2323,7 +2325,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
23232325
TransactionIdCommitTree(xid, nchildren, children);
23242326

23252327
/* Checkpoint can proceed now */
2326-
MyProc->delayChkpt = false;
2328+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
23272329

23282330
END_CRIT_SECTION();
23292331

src/backend/access/transam/xact.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1335,8 +1335,9 @@ RecordTransactionCommit(void)
13351335
* This makes checkpoint's determination of which xacts are delayChkpt
13361336
* a bit fuzzy, but it doesn't matter.
13371337
*/
1338+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
13381339
START_CRIT_SECTION();
1339-
MyProc->delayChkpt = true;
1340+
MyProc->delayChkpt |= DELAY_CHKPT_START;
13401341

13411342
SetCurrentTransactionStopTimestamp();
13421343

@@ -1437,7 +1438,7 @@ RecordTransactionCommit(void)
14371438
*/
14381439
if (markXidCommitted)
14391440
{
1440-
MyProc->delayChkpt = false;
1441+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
14411442
END_CRIT_SECTION();
14421443
}
14431444

src/backend/access/transam/xlog.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9228,18 +9228,30 @@ CreateCheckPoint(int flags)
92289228
* and we will correctly flush the update below. So we cannot miss any
92299229
* xacts we need to wait for.
92309230
*/
9231-
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
9231+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
92329232
if (nvxids > 0)
92339233
{
92349234
do
92359235
{
92369236
pg_usleep(10000L); /* wait for 10 msec */
9237-
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9237+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
9238+
DELAY_CHKPT_START));
92389239
}
92399240
pfree(vxids);
92409241

92419242
CheckPointGuts(checkPoint.redo, flags);
92429243

9244+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
9245+
if (nvxids > 0)
9246+
{
9247+
do
9248+
{
9249+
pg_usleep(10000L); /* wait for 10 msec */
9250+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
9251+
DELAY_CHKPT_COMPLETE));
9252+
}
9253+
pfree(vxids);
9254+
92439255
/*
92449256
* Take a snapshot of running transactions and write this to WAL. This
92459257
* allows us to reconstruct the state of running transactions during

src/backend/access/transam/xloginsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
925925
/*
926926
* Ensure no checkpoint can change our view of RedoRecPtr.
927927
*/
928-
Assert(MyProc->delayChkpt);
928+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
929929

930930
/*
931931
* Update RedoRecPtr so that we can make the right decision

src/backend/catalog/storage.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
325325

326326
RelationPreTruncate(rel);
327327

328+
/*
329+
* Make sure that a concurrent checkpoint can't complete while truncation
330+
* is in progress.
331+
*
332+
* The truncation operation might drop buffers that the checkpoint
333+
* otherwise would have flushed. If it does, then it's essential that
334+
* the files actually get truncated on disk before the checkpoint record
335+
* is written. Otherwise, if replay begins from that checkpoint, the
336+
* to-be-truncated blocks might still exist on disk but have older
337+
* contents than expected, which can cause replay to fail. It's OK for
338+
* the blocks to not exist on disk at all, but not for them to have the
339+
* wrong contents.
340+
*/
341+
Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
342+
MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
343+
328344
/*
329345
* We WAL-log the truncation before actually truncating, which means
330346
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
363379
XLogFlush(lsn);
364380
}
365381

366-
/* Do the real work to truncate relation forks */
382+
/*
383+
* This will first remove any buffers from the buffer pool that should no
384+
* longer exist after truncation is complete, and then truncate the
385+
* corresponding files on disk.
386+
*/
367387
smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
368388

389+
/* We've done all the critical work, so checkpoints are OK now. */
390+
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
391+
369392
/*
370393
* Update upper-level FSM pages to account for the truncation. This is
371394
* important because the just-truncated pages were likely marked as
372395
* all-free, and would be preferentially selected.
396+
*
397+
* NB: There's no point in delaying checkpoints until this is done.
398+
* Because the FSM is not WAL-logged, we have to be prepared for the
399+
* possibility of corruption after a crash anyway.
373400
*/
374401
if (need_fsm_vacuum)
375402
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);

src/backend/storage/buffer/bufmgr.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3946,7 +3946,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
39463946
* essential that CreateCheckpoint waits for virtual transactions
39473947
* rather than full transactionids.
39483948
*/
3949-
MyProc->delayChkpt = delayChkpt = true;
3949+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3950+
MyProc->delayChkpt |= DELAY_CHKPT_START;
3951+
delayChkpt = true;
39503952
lsn = XLogSaveBufferForHint(buffer, buffer_std);
39513953
}
39523954

@@ -3979,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
39793981
UnlockBufHdr(bufHdr, buf_state);
39803982

39813983
if (delayChkpt)
3982-
MyProc->delayChkpt = false;
3984+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
39833985

39843986
if (dirtied)
39853987
{

src/backend/storage/ipc/procarray.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
689689

690690
proc->lxid = InvalidLocalTransactionId;
691691
proc->xmin = InvalidTransactionId;
692-
proc->delayChkpt = false; /* be sure this is cleared in abort */
692+
693+
/* be sure this is cleared in abort */
694+
proc->delayChkpt = 0;
695+
693696
proc->recoveryConflictPending = false;
694697

695698
/* must be cleared with xid/xmin: */
@@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
728731
proc->xid = InvalidTransactionId;
729732
proc->lxid = InvalidLocalTransactionId;
730733
proc->xmin = InvalidTransactionId;
731-
proc->delayChkpt = false; /* be sure this is cleared in abort */
734+
735+
/* be sure this is cleared in abort */
736+
proc->delayChkpt = 0;
737+
732738
proc->recoveryConflictPending = false;
733739

734740
/* must be cleared with xid/xmin: */
@@ -3043,7 +3049,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
30433049
* delaying checkpoint because they have critical actions in progress.
30443050
*
30453051
* Constructs an array of VXIDs of transactions that are currently in commit
3046-
* critical sections, as shown by having delayChkpt set in their PGPROC.
3052+
* critical sections, as shown by having specified delayChkpt bits set in their
3053+
* PGPROC.
30473054
*
30483055
* Returns a palloc'd array that should be freed by the caller.
30493056
* *nvxids is the number of valid entries.
@@ -3057,13 +3064,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
30573064
* for clearing of delayChkpt to propagate is unimportant for correctness.
30583065
*/
30593066
VirtualTransactionId *
3060-
GetVirtualXIDsDelayingChkpt(int *nvxids)
3067+
GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
30613068
{
30623069
VirtualTransactionId *vxids;
30633070
ProcArrayStruct *arrayP = procArray;
30643071
int count = 0;
30653072
int index;
30663073

3074+
Assert(type != 0);
3075+
30673076
/* allocate what's certainly enough result space */
30683077
vxids = (VirtualTransactionId *)
30693078
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -3075,7 +3084,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
30753084
int pgprocno = arrayP->pgprocnos[index];
30763085
PGPROC *proc = &allProcs[pgprocno];
30773086

3078-
if (proc->delayChkpt)
3087+
if ((proc->delayChkpt & type) != 0)
30793088
{
30803089
VirtualTransactionId vxid;
30813090

@@ -3101,12 +3110,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
31013110
* those numbers should be small enough for it not to be a problem.
31023111
*/
31033112
bool
3104-
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
3113+
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
31053114
{
31063115
bool result = false;
31073116
ProcArrayStruct *arrayP = procArray;
31083117
int index;
31093118

3119+
Assert(type != 0);
3120+
31103121
LWLockAcquire(ProcArrayLock, LW_SHARED);
31113122

31123123
for (index = 0; index < arrayP->numProcs; index++)
@@ -3117,7 +3128,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
31173128

31183129
GET_VXID_FROM_PGPROC(vxid, *proc);
31193130

3120-
if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
3131+
if ((proc->delayChkpt & type) != 0 &&
3132+
VirtualTransactionIdIsValid(vxid))
31213133
{
31223134
int i;
31233135

src/backend/storage/lmgr/proc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ InitProcess(void)
394394
MyProc->roleId = InvalidOid;
395395
MyProc->tempNamespaceId = InvalidOid;
396396
MyProc->isBackgroundWorker = IsBackgroundWorker;
397-
MyProc->delayChkpt = false;
397+
MyProc->delayChkpt = 0;
398398
MyProc->statusFlags = 0;
399399
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
400400
if (IsAutoVacuumWorkerProcess())
@@ -579,7 +579,7 @@ InitAuxiliaryProcess(void)
579579
MyProc->roleId = InvalidOid;
580580
MyProc->tempNamespaceId = InvalidOid;
581581
MyProc->isBackgroundWorker = IsBackgroundWorker;
582-
MyProc->delayChkpt = false;
582+
MyProc->delayChkpt = 0;
583583
MyProc->statusFlags = 0;
584584
MyProc->lwWaiting = false;
585585
MyProc->lwWaitMode = 0;

src/include/storage/proc.h

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,41 @@ struct XidCache
8686
*/
8787
#define INVALID_PGPROCNO PG_INT32_MAX
8888

89+
/*
90+
* Flags for PGPROC.delayChkpt
91+
*
92+
* These flags can be used to delay the start or completion of a checkpoint
93+
* for short periods. A flag is in effect if the corresponding bit is set in
94+
* the PGPROC of any backend.
95+
*
96+
* For our purposes here, a checkpoint has three phases: (1) determine the
97+
* location to which the redo pointer will be moved, (2) write all the
98+
* data durably to disk, and (3) WAL-log the checkpoint.
99+
*
100+
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
101+
* to phase 2. This is useful when we are performing a WAL-logged modification
102+
* of data that will be flushed to disk in phase 2. By setting this flag
103+
* before writing WAL and clearing it after we've both written WAL and
104+
* performed the corresponding modification, we ensure that if the WAL record
105+
* is inserted prior to the new redo point, the corresponding data changes will
106+
* also be flushed to disk before the checkpoint can complete. (In the
107+
* extremely common case where the data being modified is in shared buffers
108+
* and we acquire an exclusive content lock on the relevant buffers before
109+
* writing WAL, this mechanism is not needed, because phase 2 will block
110+
* until we release the content lock and then flush the modified data to
111+
* disk.)
112+
*
113+
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
114+
* to phase 3. This is useful if we are performing a WAL-logged operation that
115+
* might invalidate buffers, such as relation truncation. In this case, we need
116+
* to ensure that any buffers which were invalidated and thus not flushed by
117+
* the checkpoint are actually destroyed on disk. Replay can cope with a file
118+
* or block that doesn't exist, but not with a block that has the wrong
119+
* contents.
120+
*/
121+
#define DELAY_CHKPT_START (1<<0)
122+
#define DELAY_CHKPT_COMPLETE (1<<1)
123+
89124
typedef enum
90125
{
91126
PROC_WAIT_STATUS_OK,
@@ -191,7 +226,7 @@ struct PGPROC
191226
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
192227
* started */
193228

194-
bool delayChkpt; /* true if this proc delays checkpoint start */
229+
int delayChkpt; /* for DELAY_CHKPT_* flags */
195230

196231
uint8 statusFlags; /* this backend's status flags, see PROC_*
197232
* above. mirrored in

src/include/storage/procarray.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void);
5959
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
6060
extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
6161

62-
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
63-
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
62+
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
63+
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
64+
int nvxids, int type);
6465

6566
extern PGPROC *BackendPidGetProc(int pid);
6667
extern PGPROC *BackendPidGetProcWithLock(int pid);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy