Skip to content

Commit 895df54

Browse files
authored
fix: separate signals for passive, active, and forced shutdown (#12358)
* fix: separate signals for passive, active, and forced shutdown `SIGTERM`: Passive shutdown stopping provisioner daemons from accepting new jobs but waiting for existing jobs to successfully complete. `SIGINT` (old existing behavior): Notify provisioner daemons to cancel in-flight jobs, wait 5s for jobs to be exited, then force quit. `SIGKILL`: Untouched from before, will force-quit. * Revert dramatic signal changes * Rename * Fix shutdown behavior for provisioner daemons * Add test for graceful shutdown
1 parent 2c947c1 commit 895df54

18 files changed

+136
-35
lines changed

cli/agent.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
125125
args := append(os.Args, "--no-reap")
126126
err := reaper.ForkReap(
127127
reaper.WithExecArgs(args...),
128-
reaper.WithCatchSignals(InterruptSignals...),
128+
reaper.WithCatchSignals(StopSignals...),
129129
)
130130
if err != nil {
131131
logger.Error(ctx, "agent process reaper unable to fork", slog.Error(err))
@@ -144,7 +144,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
144144
// Note that we don't want to handle these signals in the
145145
// process that runs as PID 1, that's why we do this after
146146
// the reaper forked.
147-
ctx, stopNotify := inv.SignalNotifyContext(ctx, InterruptSignals...)
147+
ctx, stopNotify := inv.SignalNotifyContext(ctx, StopSignals...)
148148
defer stopNotify()
149149

150150
// DumpHandler does signal handling, so we call it after the

cli/exp_scaletest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
890890
Handler: func(inv *clibase.Invocation) (err error) {
891891
ctx := inv.Context()
892892

893-
notifyCtx, stop := signal.NotifyContext(ctx, InterruptSignals...) // Checked later.
893+
notifyCtx, stop := signal.NotifyContext(ctx, StopSignals...) // Checked later.
894894
defer stop()
895895
ctx = notifyCtx
896896

cli/externalauth.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ fi
6565
Handler: func(inv *clibase.Invocation) error {
6666
ctx := inv.Context()
6767

68-
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
68+
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
6969
defer stop()
7070

7171
client, err := r.createAgentClient()

cli/gitaskpass.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ func (r *RootCmd) gitAskpass() *clibase.Cmd {
2525
Handler: func(inv *clibase.Invocation) error {
2626
ctx := inv.Context()
2727

28-
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
28+
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
2929
defer stop()
3030

3131
user, host, err := gitauth.ParseAskpass(inv.Args[0])

cli/gitssh.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func (r *RootCmd) gitssh() *clibase.Cmd {
2929

3030
// Catch interrupt signals to ensure the temporary private
3131
// key file is cleaned up on most cases.
32-
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
32+
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
3333
defer stop()
3434

3535
// Early check so errors are reported immediately.

cli/server.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
337337

338338
// Register signals early on so that graceful shutdown can't
339339
// be interrupted by additional signals. Note that we avoid
340-
// shadowing cancel() (from above) here because notifyStop()
340+
// shadowing cancel() (from above) here because stopCancel()
341341
// restores default behavior for the signals. This protects
342342
// the shutdown sequence from abruptly terminating things
343343
// like: database migrations, provisioner work, workspace
344344
// cleanup in dev-mode, etc.
345345
//
346346
// To get out of a graceful shutdown, the user can send
347347
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
348-
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
349-
defer notifyStop()
348+
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignalsNoInterrupt...)
349+
defer stopCancel()
350+
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
351+
defer interruptCancel()
350352

351353
cacheDir := vals.CacheDir.String()
352354
err = os.MkdirAll(cacheDir, 0o700)
@@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10281030
hangDetector.Start()
10291031
defer hangDetector.Close()
10301032

1033+
waitForProvisionerJobs := false
10311034
// Currently there is no way to ask the server to shut
10321035
// itself down, so any exit signal will result in a non-zero
10331036
// exit of the server.
10341037
var exitErr error
10351038
select {
1036-
case <-notifyCtx.Done():
1037-
exitErr = notifyCtx.Err()
1039+
case <-stopCtx.Done():
1040+
exitErr = stopCtx.Err()
1041+
waitForProvisionerJobs = true
1042+
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
1043+
case <-interruptCtx.Done():
1044+
exitErr = interruptCtx.Err()
10381045
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
10391046
case <-tunnelDone:
10401047
exitErr = xerrors.New("dev tunnel closed unexpectedly")
@@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10821089
defer wg.Done()
10831090

10841091
r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
1085-
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
1092+
timeout := 5 * time.Second
1093+
if waitForProvisionerJobs {
1094+
// It can last for a long time...
1095+
timeout = 30 * time.Minute
1096+
}
1097+
1098+
err := shutdownWithTimeout(func(ctx context.Context) error {
1099+
// We only want to cancel active jobs if we aren't exiting gracefully.
1100+
return provisionerDaemon.Shutdown(ctx, !waitForProvisionerJobs)
1101+
}, timeout)
10861102
if err != nil {
10871103
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
10881104
return
@@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {
25122528

25132529
return v, nil
25142530
}
2531+
2532+
func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
2533+
// On Windows, some of our signal functions lack support.
2534+
// If we pass in no signals, we should just return the context as-is.
2535+
if len(sig) == 0 {
2536+
return context.WithCancel(ctx)
2537+
}
2538+
return inv.SignalNotifyContext(ctx, sig...)
2539+
}

cli/server_createadminuser.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func (r *RootCmd) newCreateAdminUserCommand() *clibase.Cmd {
4747
logger = logger.Leveled(slog.LevelDebug)
4848
}
4949

50-
ctx, cancel := inv.SignalNotifyContext(ctx, InterruptSignals...)
50+
ctx, cancel := inv.SignalNotifyContext(ctx, StopSignals...)
5151
defer cancel()
5252

5353
if newUserDBURL == "" {

cli/server_test.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"net/url"
2222
"os"
2323
"path/filepath"
24+
"reflect"
2425
"runtime"
2526
"strconv"
2627
"strings"
@@ -1605,7 +1606,7 @@ func TestServer_Production(t *testing.T) {
16051606
}
16061607

16071608
//nolint:tparallel,paralleltest // This test cannot be run in parallel due to signal handling.
1608-
func TestServer_Shutdown(t *testing.T) {
1609+
func TestServer_InterruptShutdown(t *testing.T) {
16091610
t.Skip("This test issues an interrupt signal which will propagate to the test runner.")
16101611

16111612
if runtime.GOOS == "windows" {
@@ -1638,6 +1639,46 @@ func TestServer_Shutdown(t *testing.T) {
16381639
require.NoError(t, err)
16391640
}
16401641

1642+
func TestServer_GracefulShutdown(t *testing.T) {
1643+
t.Parallel()
1644+
if runtime.GOOS == "windows" {
1645+
// Sending interrupt signal isn't supported on Windows!
1646+
t.SkipNow()
1647+
}
1648+
ctx, cancelFunc := context.WithCancel(context.Background())
1649+
defer cancelFunc()
1650+
1651+
root, cfg := clitest.New(t,
1652+
"server",
1653+
"--in-memory",
1654+
"--http-address", ":0",
1655+
"--access-url", "http://example.com",
1656+
"--provisioner-daemons", "1",
1657+
"--cache-dir", t.TempDir(),
1658+
)
1659+
var stopFunc context.CancelFunc
1660+
root = root.WithTestSignalNotifyContext(t, func(parent context.Context, signals ...os.Signal) (context.Context, context.CancelFunc) {
1661+
if !reflect.DeepEqual(cli.StopSignalsNoInterrupt, signals) {
1662+
return context.WithCancel(ctx)
1663+
}
1664+
var ctx context.Context
1665+
ctx, stopFunc = context.WithCancel(parent)
1666+
return ctx, stopFunc
1667+
})
1668+
serverErr := make(chan error, 1)
1669+
pty := ptytest.New(t).Attach(root)
1670+
go func() {
1671+
serverErr <- root.WithContext(ctx).Run()
1672+
}()
1673+
_ = waitAccessURL(t, cfg)
1674+
// It's fair to assume `stopFunc` isn't nil here, because the server
1675+
// has started and access URL is propagated.
1676+
stopFunc()
1677+
pty.ExpectMatch("waiting for provisioner jobs to complete")
1678+
err := <-serverErr
1679+
require.NoError(t, err)
1680+
}
1681+
16411682
func BenchmarkServerHelp(b *testing.B) {
16421683
// server --help is a good proxy for measuring the
16431684
// constant overhead of each command.

cli/signal_unix.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,23 @@ import (
77
"syscall"
88
)
99

10-
var InterruptSignals = []os.Signal{
10+
// StopSignals is the list of signals that are used for handling
11+
// shutdown behavior.
12+
var StopSignals = []os.Signal{
1113
os.Interrupt,
1214
syscall.SIGTERM,
1315
syscall.SIGHUP,
1416
}
17+
18+
// StopSignals is the list of signals that are used for handling
19+
// graceful shutdown behavior.
20+
var StopSignalsNoInterrupt = []os.Signal{
21+
syscall.SIGTERM,
22+
syscall.SIGHUP,
23+
}
24+
25+
// InterruptSignals is the list of signals that are used for handling
26+
// immediate shutdown behavior.
27+
var InterruptSignals = []os.Signal{
28+
os.Interrupt,
29+
}

cli/signal_windows.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,12 @@ import (
66
"os"
77
)
88

9-
var InterruptSignals = []os.Signal{os.Interrupt}
9+
var StopSignals = []os.Signal{
10+
os.Interrupt,
11+
}
12+
13+
var StopSignalsNoInterrupt = []os.Signal{}
14+
15+
var InterruptSignals = []os.Signal{
16+
os.Interrupt,
17+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy