Skip to content

Commit 22e3ff9

Browse files
mafredri and mtojek authored
feat(agent): Add shutdown lifecycle states and shutdown_script support (#6139)
* feat(api): Add agent shutdown lifecycle states
* feat(agent): Add shutdown_script support
* feat(agent): Add shutdown_script timeout
* feat(site): Support new agent lifecycle states

---

Co-authored-by: Marcin Tojek <marcin@coder.com>
1 parent 02100c6 commit 22e3ff9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed: +1439 additions and -635 deletions (lines changed)

agent/agent.go

Lines changed: 108 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ func New(options Options) io.Closer {
121121
logDir: options.LogDir,
122122
tempDir: options.TempDir,
123123
lifecycleUpdate: make(chan struct{}, 1),
124+
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
124125
connStatsChan: make(chan *agentsdk.Stats, 1),
125126
}
126127
a.init(ctx)
@@ -149,9 +150,10 @@ type agent struct {
149150
sessionToken atomic.Pointer[string]
150151
sshServer *ssh.Server
151152

152-
lifecycleUpdate chan struct{}
153-
lifecycleMu sync.Mutex // Protects following.
154-
lifecycleState codersdk.WorkspaceAgentLifecycle
153+
lifecycleUpdate chan struct{}
154+
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
155+
lifecycleMu sync.RWMutex // Protects following.
156+
lifecycleState codersdk.WorkspaceAgentLifecycle
155157

156158
network *tailnet.Conn
157159
connStatsChan chan *agentsdk.Stats
@@ -207,9 +209,9 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
207209
}
208210

209211
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
210-
a.lifecycleMu.Lock()
212+
a.lifecycleMu.RLock()
211213
state := a.lifecycleState
212-
a.lifecycleMu.Unlock()
214+
a.lifecycleMu.RUnlock()
213215

214216
if state == lastReported {
215217
break
@@ -222,6 +224,11 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
222224
})
223225
if err == nil {
224226
lastReported = state
227+
select {
228+
case a.lifecycleReported <- state:
229+
case <-a.lifecycleReported:
230+
a.lifecycleReported <- state
231+
}
225232
break
226233
}
227234
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
@@ -233,13 +240,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
233240
}
234241
}
235242

243+
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
244+
// The state is only updated if it's a valid state transition.
236245
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
237246
a.lifecycleMu.Lock()
238-
defer a.lifecycleMu.Unlock()
239-
240-
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("previous", a.lifecycleState))
241-
247+
lastState := a.lifecycleState
248+
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
249+
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
250+
a.lifecycleMu.Unlock()
251+
return
252+
}
242253
a.lifecycleState = state
254+
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
255+
a.lifecycleMu.Unlock()
256+
243257
select {
244258
case a.lifecycleUpdate <- struct{}{}:
245259
default:
@@ -299,9 +313,10 @@ func (a *agent) run(ctx context.Context) error {
299313
}
300314
}
301315

316+
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
302317
scriptDone := make(chan error, 1)
303318
scriptStart := time.Now()
304-
err := a.trackConnGoroutine(func() {
319+
err = a.trackConnGoroutine(func() {
305320
defer close(scriptDone)
306321
scriptDone <- a.runStartupScript(ctx, metadata.StartupScript)
307322
})
@@ -329,16 +344,17 @@ func (a *agent) run(ctx context.Context) error {
329344
if errors.Is(err, context.Canceled) {
330345
return
331346
}
332-
execTime := time.Since(scriptStart)
333-
lifecycleStatus := codersdk.WorkspaceAgentLifecycleReady
334-
if err != nil {
335-
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
336-
lifecycleStatus = codersdk.WorkspaceAgentLifecycleStartError
337-
} else {
338-
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
347+
// Only log if there was a startup script.
348+
if metadata.StartupScript != "" {
349+
execTime := time.Since(scriptStart)
350+
if err != nil {
351+
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
352+
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
353+
} else {
354+
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
355+
}
339356
}
340-
341-
a.setLifecycle(ctx, lifecycleStatus)
357+
a.setLifecycle(ctx, lifecycleState)
342358
}()
343359
}
344360

@@ -606,14 +622,22 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error
606622
}
607623

608624
func (a *agent) runStartupScript(ctx context.Context, script string) error {
625+
return a.runScript(ctx, "startup", script)
626+
}
627+
628+
func (a *agent) runShutdownScript(ctx context.Context, script string) error {
629+
return a.runScript(ctx, "shutdown", script)
630+
}
631+
632+
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
609633
if script == "" {
610634
return nil
611635
}
612636

613-
a.logger.Info(ctx, "running startup script", slog.F("script", script))
614-
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, "coder-startup-script.log"), os.O_CREATE|os.O_RDWR, 0o600)
637+
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
638+
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
615639
if err != nil {
616-
return xerrors.Errorf("open startup script log file: %w", err)
640+
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
617641
}
618642
defer func() {
619643
_ = writer.Close()
@@ -774,7 +798,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri
774798

775799
rawMetadata := a.metadata.Load()
776800
if rawMetadata == nil {
777-
return nil, xerrors.Errorf("no metadata was provided: %w", err)
801+
return nil, xerrors.Errorf("no metadata was provided")
778802
}
779803
metadata, valid := rawMetadata.(agentsdk.Metadata)
780804
if !valid {
@@ -1290,13 +1314,73 @@ func (a *agent) Close() error {
12901314
if a.isClosed() {
12911315
return nil
12921316
}
1317+
1318+
ctx := context.Background()
1319+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShuttingDown)
1320+
1321+
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
1322+
if metadata, ok := a.metadata.Load().(agentsdk.Metadata); ok && metadata.ShutdownScript != "" {
1323+
scriptDone := make(chan error, 1)
1324+
scriptStart := time.Now()
1325+
go func() {
1326+
defer close(scriptDone)
1327+
scriptDone <- a.runShutdownScript(ctx, metadata.ShutdownScript)
1328+
}()
1329+
1330+
var timeout <-chan time.Time
1331+
// If timeout is zero, an older version of the coder
1332+
// provider was used. Otherwise a timeout is always > 0.
1333+
if metadata.ShutdownScriptTimeout > 0 {
1334+
t := time.NewTimer(metadata.ShutdownScriptTimeout)
1335+
defer t.Stop()
1336+
timeout = t.C
1337+
}
1338+
1339+
var err error
1340+
select {
1341+
case err = <-scriptDone:
1342+
case <-timeout:
1343+
a.logger.Warn(ctx, "shutdown script timed out")
1344+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
1345+
err = <-scriptDone // The script can still complete after a timeout.
1346+
}
1347+
execTime := time.Since(scriptStart)
1348+
if err != nil {
1349+
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
1350+
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
1351+
} else {
1352+
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
1353+
}
1354+
}
1355+
1356+
// Set final state and wait for it to be reported because context
1357+
// cancellation will stop the report loop.
1358+
a.setLifecycle(ctx, lifecycleState)
1359+
1360+
// Wait for the lifecycle to be reported, but don't wait forever so
1361+
// that we don't break user expectations.
1362+
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
1363+
defer cancel()
1364+
lifecycleWaitLoop:
1365+
for {
1366+
select {
1367+
case <-ctx.Done():
1368+
break lifecycleWaitLoop
1369+
case s := <-a.lifecycleReported:
1370+
if s == lifecycleState {
1371+
break lifecycleWaitLoop
1372+
}
1373+
}
1374+
}
1375+
12931376
close(a.closed)
12941377
a.closeCancel()
1378+
_ = a.sshServer.Close()
12951379
if a.network != nil {
12961380
_ = a.network.Close()
12971381
}
1298-
_ = a.sshServer.Close()
12991382
a.connCloseWait.Wait()
1383+
13001384
return nil
13011385
}
13021386

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy