Skip to content

Commit fd4ced5

Browse files
Fast promote mode skips checkpoint at end of recovery.
pg_ctl promote -m fast skips the checkpoint at the end of recovery so that very fast failover is possible when the apply delay is low. A new WAL record, XLOG_END_OF_RECOVERY, is written so that downstream log readers can switch timelines correctly. When the synchronous end-of-recovery checkpoint is skipped, a normal spread checkpoint is requested instead so that the window of re-recovery stays small. Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao. Review by Heikki Linnakangas.
1 parent ee22c55 commit fd4ced5

File tree

5 files changed

+195
-32
lines changed

5 files changed

+195
-32
lines changed

src/backend/access/rmgrdesc/xlogdesc.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "access/xlog_internal.h"
1919
#include "catalog/pg_control.h"
2020
#include "utils/guc.h"
21+
#include "utils/timestamp.h"
2122

2223
/*
2324
* GUC support
@@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
119120
memcpy(&fpw, rec, sizeof(bool));
120121
appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
121122
}
123+
else if (info == XLOG_END_OF_RECOVERY)
124+
{
125+
xl_end_of_recovery xlrec;
126+
127+
memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
128+
appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
129+
xlrec.ThisTimeLineID,
130+
timestamptz_to_str(xlrec.end_time));
131+
}
122132
else
123133
appendStringInfo(buf, "UNKNOWN");
124134
}

src/backend/access/transam/xlog.c

Lines changed: 163 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
#define RECOVERY_COMMAND_FILE "recovery.conf"
6767
#define RECOVERY_COMMAND_DONE "recovery.done"
6868
#define PROMOTE_SIGNAL_FILE "promote"
69+
#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
6970

7071

7172
/* User-settable parameters */
@@ -210,6 +211,9 @@ bool StandbyMode = false;
210211
static char *PrimaryConnInfo = NULL;
211212
static char *TriggerFile = NULL;
212213

214+
/* whether request for fast promotion has been made yet */
215+
static bool fast_promote = false;
216+
213217
/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
214218
static TransactionId recoveryStopXid;
215219
static TimestampTz recoveryStopTime;
@@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
611615
static void XLogReportParameters(void);
612616
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
613617
static void LocalSetXLogInsertAllowed(void);
618+
static void CreateEndOfRecoveryRecord(void);
614619
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
615620
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
616621

@@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
642647
int emode, bool fetching_ckpt);
643648
static void CheckRecoveryConsistency(void);
644649
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
645-
XLogRecPtr RecPtr, int whichChkpt);
650+
XLogRecPtr RecPtr, int whichChkpti, bool report);
646651
static bool rescanLatestTimeLine(void);
647652
static void WriteControlFile(void);
648653
static void ReadControlFile(void);
@@ -4848,7 +4853,7 @@ StartupXLOG(void)
48484853
* When a backup_label file is present, we want to roll forward from
48494854
* the checkpoint it identifies, rather than using pg_control.
48504855
*/
4851-
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
4856+
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
48524857
if (record != NULL)
48534858
{
48544859
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
@@ -4890,7 +4895,7 @@ StartupXLOG(void)
48904895
*/
48914896
checkPointLoc = ControlFile->checkPoint;
48924897
RedoStartLSN = ControlFile->checkPointCopy.redo;
4893-
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
4898+
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
48944899
if (record != NULL)
48954900
{
48964901
ereport(DEBUG1,
@@ -4909,7 +4914,7 @@ StartupXLOG(void)
49094914
else
49104915
{
49114916
checkPointLoc = ControlFile->prevCheckPoint;
4912-
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
4917+
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
49134918
if (record != NULL)
49144919
{
49154920
ereport(LOG,
@@ -5393,22 +5398,33 @@ StartupXLOG(void)
53935398
}
53945399

53955400
/*
5396-
* Before replaying this record, check if it is a shutdown
5397-
* checkpoint record that causes the current timeline to
5398-
* change. The checkpoint record is already considered to be
5399-
* part of the new timeline, so we update ThisTimeLineID
5400-
* before replaying it. That's important so that replayEndTLI,
5401-
* which is recorded as the minimum recovery point's TLI if
5401+
* Before replaying this record, check if this record
5402+
* causes the current timeline to change. The record is
5403+
* already considered to be part of the new timeline,
5404+
* so we update ThisTimeLineID before replaying it.
5405+
* That's important so that replayEndTLI, which is
5406+
* recorded as the minimum recovery point's TLI if
54025407
* recovery stops after this record, is set correctly.
54035408
*/
5404-
if (record->xl_rmid == RM_XLOG_ID &&
5405-
(record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
5409+
if (record->xl_rmid == RM_XLOG_ID)
54065410
{
5407-
CheckPoint checkPoint;
5408-
TimeLineID newTLI;
5411+
TimeLineID newTLI = ThisTimeLineID;
5412+
uint8 info = record->xl_info & ~XLR_INFO_MASK;
5413+
5414+
if (info == XLOG_CHECKPOINT_SHUTDOWN)
5415+
{
5416+
CheckPoint checkPoint;
5417+
5418+
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5419+
newTLI = checkPoint.ThisTimeLineID;
5420+
}
5421+
else if (info == XLOG_END_OF_RECOVERY)
5422+
{
5423+
xl_end_of_recovery xlrec;
54095424

5410-
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5411-
newTLI = checkPoint.ThisTimeLineID;
5425+
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
5426+
newTLI = xlrec.ThisTimeLineID;
5427+
}
54125428

54135429
if (newTLI != ThisTimeLineID)
54145430
{
@@ -5729,9 +5745,36 @@ StartupXLOG(void)
57295745
* allows some extra error checking in xlog_redo.
57305746
*/
57315747
if (bgwriterLaunched)
5732-
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5733-
CHECKPOINT_IMMEDIATE |
5734-
CHECKPOINT_WAIT);
5748+
{
5749+
bool checkpoint_wait = true;
5750+
5751+
/*
5752+
* If we've been explicitly promoted with fast option,
5753+
* end recovery without a checkpoint if possible.
5754+
*/
5755+
if (fast_promote)
5756+
{
5757+
checkPointLoc = ControlFile->prevCheckPoint;
5758+
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
5759+
if (record != NULL)
5760+
{
5761+
checkpoint_wait = false;
5762+
CreateEndOfRecoveryRecord();
5763+
}
5764+
}
5765+
5766+
/*
5767+
* In most cases we will wait for a full checkpoint to complete.
5768+
*
5769+
* If not, issue a normal, non-immediate checkpoint but don't wait.
5770+
*/
5771+
if (checkpoint_wait)
5772+
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
5773+
CHECKPOINT_IMMEDIATE |
5774+
CHECKPOINT_WAIT);
5775+
else
5776+
RequestCheckpoint(0); /* No flags */
5777+
}
57355778
else
57365779
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
57375780

@@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
60606103
*/
60616104
static XLogRecord *
60626105
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
6063-
int whichChkpt)
6106+
int whichChkpt, bool report)
60646107
{
60656108
XLogRecord *record;
60666109

60676110
if (!XRecOffIsValid(RecPtr))
60686111
{
6112+
if (!report)
6113+
return NULL;
6114+
60696115
switch (whichChkpt)
60706116
{
60716117
case 1:
@@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
60886134

60896135
if (record == NULL)
60906136
{
6137+
if (!report)
6138+
return NULL;
6139+
60916140
switch (whichChkpt)
60926141
{
60936142
case 1:
@@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
68826931
LWLockRelease(CheckpointLock);
68836932
}
68846933

6934+
/*
6935+
* Mark the end of recovery in WAL though without running a full checkpoint.
6936+
* We can expect that a restartpoint is likely to be in progress as we
6937+
* do this, though we are unwilling to wait for it to complete. So be
6938+
* careful to avoid taking the CheckpointLock anywhere here.
6939+
*
6940+
* CreateRestartPoint() allows for the case where recovery may end before
6941+
* the restartpoint completes so there is no concern of concurrent behaviour.
6942+
*/
6943+
void
6944+
CreateEndOfRecoveryRecord(void)
6945+
{
6946+
xl_end_of_recovery xlrec;
6947+
XLogRecData rdata;
6948+
6949+
/* sanity check */
6950+
if (!RecoveryInProgress())
6951+
elog(ERROR, "can only be used to end recovery");
6952+
6953+
xlrec.end_time = time(NULL);
6954+
xlrec.ThisTimeLineID = ThisTimeLineID;
6955+
6956+
LocalSetXLogInsertAllowed();
6957+
6958+
START_CRIT_SECTION();
6959+
6960+
rdata.data = (char *) &xlrec;
6961+
rdata.len = sizeof(xl_end_of_recovery);
6962+
rdata.buffer = InvalidBuffer;
6963+
rdata.next = NULL;
6964+
6965+
(void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
6966+
6967+
END_CRIT_SECTION();
6968+
6969+
LocalXLogInsertAllowed = -1; /* return to "check" state */
6970+
}
6971+
68856972
/*
68866973
* Flush all data in shared memory to disk, and fsync
68876974
*
@@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
76137700

76147701
RecoveryRestartPoint(&checkPoint);
76157702
}
7703+
else if (info == XLOG_END_OF_RECOVERY)
7704+
{
7705+
xl_end_of_recovery xlrec;
7706+
7707+
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
7708+
7709+
/*
7710+
* For Hot Standby, we could treat this like a Shutdown Checkpoint,
7711+
* but this case is rarer and harder to test, so the benefit doesn't
7712+
* outweigh the potential extra cost of maintenance.
7713+
*/
7714+
7715+
/*
7716+
* We should've already switched to the new TLI before replaying this
7717+
* record.
7718+
*/
7719+
if (xlrec.ThisTimeLineID != ThisTimeLineID)
7720+
ereport(PANIC,
7721+
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
7722+
xlrec.ThisTimeLineID, ThisTimeLineID)));
7723+
}
76167724
else if (info == XLOG_NOOP)
76177725
{
76187726
/* nothing to do here */
@@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)
94059513

94069514
if (IsPromoteTriggered())
94079515
{
9408-
ereport(LOG,
9516+
/*
9517+
* In 9.1 and 9.2 the postmaster unlinked the promote file
9518+
* inside the signal handler. We now leave the file in place
9519+
* and let the Startup process do the unlink. This allows
9520+
* Startup to know whether we're doing fast or normal
9521+
* promotion. Fast promotion takes precedence.
9522+
*/
9523+
if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9524+
{
9525+
unlink(FAST_PROMOTE_SIGNAL_FILE);
9526+
unlink(PROMOTE_SIGNAL_FILE);
9527+
fast_promote = true;
9528+
}
9529+
else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9530+
{
9531+
unlink(PROMOTE_SIGNAL_FILE);
9532+
fast_promote = false;
9533+
}
9534+
9535+
/*
9536+
* We only look for fast promote via the pg_ctl promote option.
9537+
* It would be possible to extend trigger file support for the
9538+
* fast promotion option but that wouldn't be backwards compatible
9539+
* anyway and we're looking to focus further work on the promote
9540+
* option as the right way to signal end of recovery.
9541+
*/
9542+
if (fast_promote)
9543+
ereport(LOG,
9544+
(errmsg("received fast promote request")));
9545+
else
9546+
ereport(LOG,
94099547
(errmsg("received promote request")));
9548+
94109549
ResetPromoteTriggered();
94119550
triggered = true;
94129551
return true;
@@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
94359574
{
94369575
struct stat stat_buf;
94379576

9438-
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
9439-
{
9440-
/*
9441-
* Since we are in a signal handler, it's not safe to elog. We
9442-
* silently ignore any error from unlink.
9443-
*/
9444-
unlink(PROMOTE_SIGNAL_FILE);
9577+
if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
9578+
stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
94459579
return true;
9446-
}
9580+
94479581
return false;
94489582
}
94499583

src/bin/pg_ctl/pg_ctl.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,6 +1136,15 @@ do_promote(void)
11361136
exit(1);
11371137
}
11381138

1139+
/*
1140+
* Use two different kinds of promotion file so we can understand
1141+
* the difference between smart and fast promotion.
1142+
*/
1143+
if (shutdown_mode >= FAST_MODE)
1144+
snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
1145+
else
1146+
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
1147+
11391148
if ((prmfile = fopen(promote_file, "w")) == NULL)
11401149
{
11411150
write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
@@ -1799,7 +1808,7 @@ do_help(void)
17991808
" [-o \"OPTIONS\"]\n"), progname);
18001809
printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
18011810
printf(_(" %s status [-D DATADIR]\n"), progname);
1802-
printf(_(" %s promote [-D DATADIR] [-s]\n"), progname);
1811+
printf(_(" %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
18031812
printf(_(" %s kill SIGNALNAME PID\n"), progname);
18041813
#if defined(WIN32) || defined(__CYGWIN__)
18051814
printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
@@ -1828,14 +1837,18 @@ do_help(void)
18281837
printf(_(" -o OPTIONS command line options to pass to postgres\n"
18291838
" (PostgreSQL server executable) or initdb\n"));
18301839
printf(_(" -p PATH-TO-POSTGRES normally not necessary\n"));
1831-
printf(_("\nOptions for stop or restart:\n"));
1840+
printf(_("\nOptions for stop, restart or promote:\n"));
18321841
printf(_(" -m, --mode=MODE MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
18331842

18341843
printf(_("\nShutdown modes are:\n"));
18351844
printf(_(" smart quit after all clients have disconnected\n"));
18361845
printf(_(" fast quit directly, with proper shutdown\n"));
18371846
printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n"));
18381847

1848+
printf(_("\nPromotion modes are:\n"));
1849+
printf(_(" smart promote after performing a checkpoint\n"));
1850+
printf(_(" fast promote quickly without waiting for checkpoint completion\n"));
1851+
18391852
printf(_("\nAllowed signal names for kill:\n"));
18401853
printf(" ABRT HUP INT QUIT TERM USR1 USR2\n");
18411854

@@ -2271,7 +2284,6 @@ main(int argc, char **argv)
22712284
snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
22722285
snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
22732286
snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
2274-
snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
22752287
}
22762288

22772289
switch (ctl_command)

src/include/access/xlog_internal.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@ typedef struct xl_restore_point
217217
char rp_name[MAXFNAMELEN];
218218
} xl_restore_point;
219219

220+
/*
 * End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint.
 *
 * This is the payload of an XLOG_END_OF_RECOVERY WAL record.  During replay
 * the ThisTimeLineID field is read to decide whether the current timeline
 * changes at this record.
 */
221+
typedef struct xl_end_of_recovery
222+
{
223+
TimestampTz end_time;	/* time at which the record was created */
224+
TimeLineID ThisTimeLineID;	/* timeline ID in effect when the record was written */
225+
} xl_end_of_recovery;
220226

221227
/*
222228
* XLogRecord is defined in xlog.h, but we avoid #including that to keep

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy