Skip to content

Commit 0d01c5b

Browse files
committed
Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
1 parent cf0c446 commit 0d01c5b

File tree

1 file changed

+37
-14
lines changed

1 file changed

+37
-14
lines changed

src/backend/postmaster/postmaster.c

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
249249
PgStatPID = 0,
250250
SysLoggerPID = 0;
251251

252+
/* Startup process's status */
253+
typedef enum
254+
{
255+
STARTUP_NOT_RUNNING,
256+
STARTUP_RUNNING,
257+
STARTUP_SIGNALED, /* we sent it a SIGQUIT or SIGKILL */
258+
STARTUP_CRASHED
259+
} StartupStatusEnum;
260+
261+
static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
262+
252263
/* Startup/shutdown state */
253264
#define NoShutdown 0
254265
#define SmartShutdown 1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
258269
static int Shutdown = NoShutdown;
259270

260271
static bool FatalError = false; /* T if recovering from backend crash */
261-
static bool RecoveryError = false; /* T if WAL recovery failed */
262272

263273
/*
264274
* We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false; /* T if WAL recovery failed */
301311
* states, nor in PM_SHUTDOWN states (because we don't enter those states
302312
* when trying to recover from a crash). It can be true in PM_STARTUP state,
303313
* because we don't clear it until we've successfully started WAL redo.
304-
* Similarly, RecoveryError means that we have crashed during recovery, and
305-
* should not try to restart.
306314
*/
307315
typedef enum
308316
{
@@ -1238,6 +1246,7 @@ PostmasterMain(int argc, char *argv[])
12381246
*/
12391247
StartupPID = StartupDataBase();
12401248
Assert(StartupPID != 0);
1249+
StartupStatus = STARTUP_RUNNING;
12411250
pmState = PM_STARTUP;
12421251

12431252
/* Some workers may be scheduled to start now */
@@ -2583,6 +2592,7 @@ reaper(SIGNAL_ARGS)
25832592
if (Shutdown > NoShutdown &&
25842593
(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
25852594
{
2595+
StartupStatus = STARTUP_NOT_RUNNING;
25862596
pmState = PM_WAIT_BACKENDS;
25872597
/* PostmasterStateMachine logic does the rest */
25882598
continue;
@@ -2605,16 +2615,18 @@ reaper(SIGNAL_ARGS)
26052615
/*
26062616
* After PM_STARTUP, any unexpected exit (including FATAL exit) of
26072617
* the startup process is catastrophic, so kill other children,
2608-
* and set RecoveryError so we don't try to reinitialize after
2609-
* they're gone. Exception: if FatalError is already set, that
2610-
* implies we previously sent the startup process a SIGQUIT, so
2618+
* and set StartupStatus so we don't try to reinitialize after
2619+
* they're gone. Exception: if StartupStatus is STARTUP_SIGNALED,
2620+
* then we previously sent the startup process a SIGQUIT or SIGKILL; so
26112621
* that's probably the reason it died, and we do want to try to
26122622
* restart in that case.
26132623
*/
26142624
if (!EXIT_STATUS_0(exitstatus))
26152625
{
2616-
if (!FatalError)
2617-
RecoveryError = true;
2626+
if (StartupStatus == STARTUP_SIGNALED)
2627+
StartupStatus = STARTUP_NOT_RUNNING;
2628+
else
2629+
StartupStatus = STARTUP_CRASHED;
26182630
HandleChildCrash(pid, exitstatus,
26192631
_("startup process"));
26202632
continue;
@@ -2623,6 +2635,7 @@ reaper(SIGNAL_ARGS)
26232635
/*
26242636
* Startup succeeded, commence normal operations
26252637
*/
2638+
StartupStatus = STARTUP_NOT_RUNNING;
26262639
FatalError = false;
26272640
Assert(AbortStartTime == 0);
26282641
ReachedNormalRunning = true;
@@ -3170,14 +3183,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
31703183

31713184
/* Take care of the startup process too */
31723185
if (pid == StartupPID)
3186+
{
31733187
StartupPID = 0;
3188+
StartupStatus = STARTUP_CRASHED;
3189+
}
31743190
else if (StartupPID != 0 && take_action)
31753191
{
31763192
ereport(DEBUG2,
31773193
(errmsg_internal("sending %s to process %d",
31783194
(SendStop ? "SIGSTOP" : "SIGQUIT"),
31793195
(int) StartupPID)));
31803196
signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
3197+
StartupStatus = STARTUP_SIGNALED;
31813198
}
31823199

31833200
/* Take care of the bgwriter too */
@@ -3569,13 +3586,14 @@ PostmasterStateMachine(void)
35693586
}
35703587

35713588
/*
3572-
* If recovery failed, or the user does not want an automatic restart
3573-
* after backend crashes, wait for all non-syslogger children to exit, and
3574-
* then exit postmaster. We don't try to reinitialize when recovery fails,
3575-
* because more than likely it will just fail again and we will keep
3576-
* trying forever.
3589+
* If the startup process failed, or the user does not want an automatic
3590+
* restart after backend crashes, wait for all non-syslogger children to
3591+
* exit, and then exit postmaster. We don't try to reinitialize when the
3592+
* startup process fails, because more than likely it will just fail again
3593+
* and we will keep trying forever.
35773594
*/
3578-
if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
3595+
if (pmState == PM_NO_CHILDREN &&
3596+
(StartupStatus == STARTUP_CRASHED || !restart_after_crash))
35793597
ExitPostmaster(1);
35803598

35813599
/*
@@ -3595,6 +3613,7 @@ PostmasterStateMachine(void)
35953613

35963614
StartupPID = StartupDataBase();
35973615
Assert(StartupPID != 0);
3616+
StartupStatus = STARTUP_RUNNING;
35983617
pmState = PM_STARTUP;
35993618
/* crash recovery started, reset SIGKILL flag */
36003619
AbortStartTime = 0;
@@ -3726,7 +3745,11 @@ TerminateChildren(int signal)
37263745
{
37273746
SignalChildren(signal);
37283747
if (StartupPID != 0)
3748+
{
37293749
signal_child(StartupPID, signal);
3750+
if (signal == SIGQUIT || signal == SIGKILL)
3751+
StartupStatus = STARTUP_SIGNALED;
3752+
}
37303753
if (BgWriterPID != 0)
37313754
signal_child(BgWriterPID, signal);
37323755
if (CheckpointerPID != 0)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy