Skip to content

Commit ca307d5

Browse files
committed
Keep WAL segments by slot's last saved restart LSN
The patch fixes an issue where old WAL segments were unexpectedly removed after a checkpoint that was followed by an immediate restart. The issue occurs when a slot is advanced after the start of the checkpoint but before old WAL segments are removed at the end of the checkpoint.

The patch introduces a new in-memory state for slots, last_saved_restart_lsn, which is used to calculate the oldest LSN for removing WAL segments. This state is set to the current restart_lsn each time the slot is saved to disk.

This fix changes the shared memory layout. It is applied to HEAD only, because we don't have to preserve ABI compatibility during the beta stage. Another fix that doesn't affect the ABI is committed to the back branches.

Discussion: https://postgr.es/m/1d12d2-67235980-35-19a406a0%4063439497
Author: Vitaly Davydov <v.davydov@postgrespro.ru>
Author: Alexander Korotkov <aekorotkov@gmail.com>
Reviewed-by: Amit Kapila <amit.kapila16@gmail.com>
1 parent c45a1db commit ca307d5

File tree

2 files changed

+65
-0
lines changed

2 files changed

+65
-0
lines changed

src/backend/replication/slot.c

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ ReplicationSlotCreate(const char *name, bool db_specific,
424424
slot->candidate_restart_valid = InvalidXLogRecPtr;
425425
slot->candidate_restart_lsn = InvalidXLogRecPtr;
426426
slot->last_saved_confirmed_flush = InvalidXLogRecPtr;
427+
slot->last_saved_restart_lsn = InvalidXLogRecPtr;
427428
slot->inactive_since = 0;
428429

429430
/*
@@ -1165,20 +1166,41 @@ ReplicationSlotsComputeRequiredLSN(void)
11651166
{
11661167
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
11671168
XLogRecPtr restart_lsn;
1169+
XLogRecPtr last_saved_restart_lsn;
11681170
bool invalidated;
1171+
ReplicationSlotPersistency persistency;
11691172

11701173
if (!s->in_use)
11711174
continue;
11721175

11731176
SpinLockAcquire(&s->mutex);
1177+
persistency = s->data.persistency;
11741178
restart_lsn = s->data.restart_lsn;
11751179
invalidated = s->data.invalidated != RS_INVAL_NONE;
1180+
last_saved_restart_lsn = s->last_saved_restart_lsn;
11761181
SpinLockRelease(&s->mutex);
11771182

11781183
/* invalidated slots need not apply */
11791184
if (invalidated)
11801185
continue;
11811186

1187+
/*
1188+
* For persistent slot use last_saved_restart_lsn to compute the
1189+
* oldest LSN for removal of WAL segments. The segments between
1190+
* last_saved_restart_lsn and restart_lsn might be needed by a
1191+
* persistent slot in the case of database crash. Non-persistent
1192+
* slots can't survive the database crash, so we don't care about
1193+
* last_saved_restart_lsn for them.
1194+
*/
1195+
if (persistency == RS_PERSISTENT)
1196+
{
1197+
if (last_saved_restart_lsn != InvalidXLogRecPtr &&
1198+
restart_lsn > last_saved_restart_lsn)
1199+
{
1200+
restart_lsn = last_saved_restart_lsn;
1201+
}
1202+
}
1203+
11821204
if (restart_lsn != InvalidXLogRecPtr &&
11831205
(min_required == InvalidXLogRecPtr ||
11841206
restart_lsn < min_required))
@@ -1216,7 +1238,9 @@ ReplicationSlotsComputeLogicalRestartLSN(void)
12161238
{
12171239
ReplicationSlot *s;
12181240
XLogRecPtr restart_lsn;
1241+
XLogRecPtr last_saved_restart_lsn;
12191242
bool invalidated;
1243+
ReplicationSlotPersistency persistency;
12201244

12211245
s = &ReplicationSlotCtl->replication_slots[i];
12221246

@@ -1230,14 +1254,33 @@ ReplicationSlotsComputeLogicalRestartLSN(void)
12301254

12311255
/* read once, it's ok if it increases while we're checking */
12321256
SpinLockAcquire(&s->mutex);
1257+
persistency = s->data.persistency;
12331258
restart_lsn = s->data.restart_lsn;
12341259
invalidated = s->data.invalidated != RS_INVAL_NONE;
1260+
last_saved_restart_lsn = s->last_saved_restart_lsn;
12351261
SpinLockRelease(&s->mutex);
12361262

12371263
/* invalidated slots need not apply */
12381264
if (invalidated)
12391265
continue;
12401266

1267+
/*
1268+
* For persistent slot use last_saved_restart_lsn to compute the
1269+
* oldest LSN for removal of WAL segments. The segments between
1270+
* last_saved_restart_lsn and restart_lsn might be needed by a
1271+
* persistent slot in the case of database crash. Non-persistent
1272+
* slots can't survive the database crash, so we don't care about
1273+
* last_saved_restart_lsn for them.
1274+
*/
1275+
if (persistency == RS_PERSISTENT)
1276+
{
1277+
if (last_saved_restart_lsn != InvalidXLogRecPtr &&
1278+
restart_lsn > last_saved_restart_lsn)
1279+
{
1280+
restart_lsn = last_saved_restart_lsn;
1281+
}
1282+
}
1283+
12411284
if (restart_lsn == InvalidXLogRecPtr)
12421285
continue;
12431286

@@ -1455,6 +1498,7 @@ ReplicationSlotReserveWal(void)
14551498

14561499
Assert(slot != NULL);
14571500
Assert(slot->data.restart_lsn == InvalidXLogRecPtr);
1501+
Assert(slot->last_saved_restart_lsn == InvalidXLogRecPtr);
14581502

14591503
/*
14601504
* The replication slot mechanism is used to prevent removal of required
@@ -1766,6 +1810,8 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
17661810
*/
17671811
SpinLockAcquire(&s->mutex);
17681812

1813+
Assert(s->data.restart_lsn >= s->last_saved_restart_lsn);
1814+
17691815
restart_lsn = s->data.restart_lsn;
17701816

17711817
/* we do nothing if the slot is already invalid */
@@ -1835,7 +1881,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
18351881
* just rely on .invalidated.
18361882
*/
18371883
if (invalidation_cause == RS_INVAL_WAL_REMOVED)
1884+
{
18381885
s->data.restart_lsn = InvalidXLogRecPtr;
1886+
s->last_saved_restart_lsn = InvalidXLogRecPtr;
1887+
}
18391888

18401889
/* Let caller know */
18411890
*invalidated = true;
@@ -2079,6 +2128,12 @@ CheckPointReplicationSlots(bool is_shutdown)
20792128
SaveSlotToPath(s, path, LOG);
20802129
}
20812130
LWLockRelease(ReplicationSlotAllocationLock);
2131+
2132+
/*
2133+
* Recompute the required LSN as SaveSlotToPath() updated
2134+
* last_saved_restart_lsn for slots.
2135+
*/
2136+
ReplicationSlotsComputeRequiredLSN();
20822137
}
20832138

20842139
/*
@@ -2354,6 +2409,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
23542409
if (!slot->just_dirtied)
23552410
slot->dirty = false;
23562411
slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush;
2412+
slot->last_saved_restart_lsn = cp.slotdata.restart_lsn;
23572413
SpinLockRelease(&slot->mutex);
23582414

23592415
LWLockRelease(&slot->io_in_progress_lock);
@@ -2569,6 +2625,7 @@ RestoreSlotFromDisk(const char *name)
25692625
slot->effective_xmin = cp.slotdata.xmin;
25702626
slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;
25712627
slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush;
2628+
slot->last_saved_restart_lsn = cp.slotdata.restart_lsn;
25722629

25732630
slot->candidate_catalog_xmin = InvalidTransactionId;
25742631
slot->candidate_xmin_lsn = InvalidXLogRecPtr;

src/include/replication/slot.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,14 @@ typedef struct ReplicationSlot
215215
* recently stopped.
216216
*/
217217
TimestampTz inactive_since;
218+
219+
/*
220+
* Latest restart_lsn that has been flushed to disk. For persistent slots
221+
* the flushed LSN should be taken into account when calculating the
222+
* oldest LSN for WAL segments removal.
223+
*/
224+
XLogRecPtr last_saved_restart_lsn;
225+
218226
} ReplicationSlot;
219227

220228
#define SlotIsPhysical(slot) ((slot)->data.database == InvalidOid)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy