Skip to content

Commit 1b0124e

Browse files
authored
feat: automatically stop workspaces based on failure_ttl (#7989)
1 parent d434181 commit 1b0124e

File tree

17 files changed

+419
-159
lines changed

17 files changed

+419
-159
lines changed

cli/server.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ import (
6262
"github.com/coder/coder/cli/cliui"
6363
"github.com/coder/coder/cli/config"
6464
"github.com/coder/coder/coderd"
65-
"github.com/coder/coder/coderd/autobuild/executor"
65+
"github.com/coder/coder/coderd/autobuild"
6666
"github.com/coder/coder/coderd/database"
6767
"github.com/coder/coder/coderd/database/dbfake"
6868
"github.com/coder/coder/coderd/database/dbmetrics"
@@ -900,7 +900,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
900900

901901
autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
902902
defer autobuildPoller.Stop()
903-
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
903+
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
904904
autobuildExecutor.Run()
905905

906906
// Currently there is no way to ask the server to shut

coderd/autobuild/doc.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Package autobuild contains logic for scheduling workspace
2+
// builds in the background.
3+
package autobuild

coderd/autobuild/executor/lifecycle_executor.go renamed to coderd/autobuild/lifecycle_executor.go

Lines changed: 89 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package executor
1+
package autobuild
22

33
import (
44
"context"
@@ -13,9 +13,11 @@ import (
1313

1414
"cdr.dev/slog"
1515
"github.com/coder/coder/coderd/database"
16+
"github.com/coder/coder/coderd/database/db2sdk"
1617
"github.com/coder/coder/coderd/database/dbauthz"
1718
"github.com/coder/coder/coderd/schedule"
1819
"github.com/coder/coder/coderd/wsbuilder"
20+
"github.com/coder/coder/codersdk"
1921
)
2022

2123
// Executor automatically starts or stops workspaces.
@@ -35,8 +37,8 @@ type Stats struct {
3537
Error error
3638
}
3739

38-
// New returns a new autobuild executor.
39-
func New(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
40+
// NewExecutor returns a new autobuild executor.
41+
func NewExecutor(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
4042
le := &Executor{
4143
//nolint:gocritic // Autostart has a limited set of permissions.
4244
ctx: dbauthz.AsAutostart(ctx),
@@ -108,7 +110,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
108110
// NOTE: If a workspace build is created with a given TTL and then the user either
109111
// changes or unsets the TTL, the deadline for the workspace build will not
110112
// have changed. This behavior is as expected per #2229.
111-
workspaces, err := e.db.GetWorkspacesEligibleForAutoStartStop(e.ctx, t)
113+
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, t)
112114
if err != nil {
113115
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
114116
return stats
@@ -125,77 +127,56 @@ func (e *Executor) runOnce(t time.Time) Stats {
125127
log := e.log.With(slog.F("workspace_id", wsID))
126128

127129
eg.Go(func() error {
128-
err := e.db.InTx(func(db database.Store) error {
130+
err := e.db.InTx(func(tx database.Store) error {
129131
// Re-check eligibility since the first check was outside the
130132
// transaction and the workspace settings may have changed.
131-
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
133+
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
132134
if err != nil {
133135
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
134136
return nil
135137
}
136138

137139
// Determine the workspace state based on its latest build.
138-
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
140+
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
139141
if err != nil {
140142
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
141143
return nil
142144
}
143-
144-
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, db, ws.TemplateID)
145+
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, tx, ws.TemplateID)
145146
if err != nil {
146147
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
147148
return nil
148149
}
149150

150-
if !isEligibleForAutoStartStop(ws, priorHistory, templateSchedule) {
151-
return nil
152-
}
153-
154-
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
151+
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
155152
if err != nil {
156153
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
157154
return nil
158155
}
159156

160-
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
157+
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
161158
if err != nil {
162159
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
163160
return nil
164161
}
165162

166-
if currentTick.Before(nextTransition) {
167-
log.Debug(e.ctx, "skipping workspace: too early",
168-
slog.F("next_transition_at", nextTransition),
169-
slog.F("transition", validTransition),
170-
slog.F("current_tick", currentTick),
171-
)
172-
return nil
173-
}
174-
builder := wsbuilder.New(ws, validTransition).
175-
SetLastWorkspaceBuildInTx(&priorHistory).
176-
SetLastWorkspaceBuildJobInTx(&priorJob)
177-
178-
switch validTransition {
179-
case database.WorkspaceTransitionStart:
180-
builder = builder.Reason(database.BuildReasonAutostart)
181-
case database.WorkspaceTransitionStop:
182-
builder = builder.Reason(database.BuildReasonAutostop)
183-
default:
184-
log.Error(e.ctx, "unsupported transition", slog.F("transition", validTransition))
185-
return nil
186-
}
187-
if _, _, err := builder.Build(e.ctx, db, nil); err != nil {
163+
builder := wsbuilder.New(ws, nextTransition).
164+
SetLastWorkspaceBuildInTx(&latestBuild).
165+
SetLastWorkspaceBuildJobInTx(&latestJob).
166+
Reason(reason)
167+
168+
if _, _, err := builder.Build(e.ctx, tx, nil); err != nil {
188169
log.Error(e.ctx, "unable to transition workspace",
189-
slog.F("transition", validTransition),
170+
slog.F("transition", nextTransition),
190171
slog.Error(err),
191172
)
192173
return nil
193174
}
194175
statsMu.Lock()
195-
stats.Transitions[ws.ID] = validTransition
176+
stats.Transitions[ws.ID] = nextTransition
196177
statsMu.Unlock()
197178

198-
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
179+
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", nextTransition))
199180

200181
return nil
201182

@@ -218,53 +199,81 @@ func (e *Executor) runOnce(t time.Time) Stats {
218199
return stats
219200
}
220201

221-
func isEligibleForAutoStartStop(ws database.Workspace, priorHistory database.WorkspaceBuild, templateSchedule schedule.TemplateScheduleOptions) bool {
222-
if ws.Deleted {
202+
func getNextTransition(
203+
ws database.Workspace,
204+
latestBuild database.WorkspaceBuild,
205+
latestJob database.ProvisionerJob,
206+
templateSchedule schedule.TemplateScheduleOptions,
207+
currentTick time.Time,
208+
) (
209+
database.WorkspaceTransition,
210+
database.BuildReason,
211+
error,
212+
) {
213+
switch {
214+
case isEligibleForAutostop(latestBuild, latestJob, currentTick):
215+
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
216+
case isEligibleForAutostart(ws, latestBuild, latestJob, templateSchedule, currentTick):
217+
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
218+
case isEligibleForFailedStop(latestBuild, latestJob, templateSchedule):
219+
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
220+
default:
221+
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
222+
}
223+
}
224+
225+
// isEligibleForAutostart returns true if the workspace should be autostarted.
226+
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
227+
// Don't attempt to autostart failed workspaces.
228+
if !job.CompletedAt.Valid || job.Error.String != "" {
223229
return false
224230
}
225-
if templateSchedule.UserAutostartEnabled && ws.AutostartSchedule.Valid && ws.AutostartSchedule.String != "" {
226-
return true
231+
232+
// If the last transition for the workspace was not 'stop' then the workspace
233+
// cannot be started.
234+
if build.Transition != database.WorkspaceTransitionStop {
235+
return false
227236
}
228-
// Don't check the template schedule to see whether it allows autostop, this
229-
// is done during the build when determining the deadline.
230-
if priorHistory.Transition == database.WorkspaceTransitionStart && !priorHistory.Deadline.IsZero() {
231-
return true
237+
238+
// If autostart isn't enabled, or the schedule isn't valid/populated we can't
239+
// autostart the workspace.
240+
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
241+
return false
232242
}
233243

234-
return false
244+
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
245+
if err != nil {
246+
return false
247+
}
248+
// Round down to the nearest minute, as this is the finest granularity cron supports.
249+
// Truncate is probably not necessary here, but doing it anyway to be sure.
250+
nextTransition := sched.Next(build.CreatedAt).Truncate(time.Minute)
251+
252+
return !currentTick.Before(nextTransition)
235253
}
236254

237-
func getNextTransition(
238-
ws database.Workspace,
239-
priorHistory database.WorkspaceBuild,
240-
priorJob database.ProvisionerJob,
241-
) (
242-
validTransition database.WorkspaceTransition,
243-
nextTransition time.Time,
244-
err error,
245-
) {
246-
if !priorJob.CompletedAt.Valid || priorJob.Error.String != "" {
247-
return "", time.Time{}, xerrors.Errorf("last workspace build did not complete successfully")
255+
// isEligibleForAutostop returns true if the workspace should be autostopped.
256+
func isEligibleForAutostop(build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
257+
// Don't attempt to autostop failed workspaces.
258+
if !job.CompletedAt.Valid || job.Error.String != "" {
259+
return false
248260
}
249261

250-
switch priorHistory.Transition {
251-
case database.WorkspaceTransitionStart:
252-
if priorHistory.Deadline.IsZero() {
253-
return "", time.Time{}, xerrors.Errorf("latest workspace build has zero deadline")
254-
}
255-
// For stopping, do not truncate. This is inconsistent with autostart, but
256-
// it ensures we will not stop too early.
257-
return database.WorkspaceTransitionStop, priorHistory.Deadline, nil
258-
case database.WorkspaceTransitionStop:
259-
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
260-
if err != nil {
261-
return "", time.Time{}, xerrors.Errorf("workspace has invalid autostart schedule: %w", err)
262-
}
263-
// Round down to the nearest minute, as this is the finest granularity cron supports.
264-
// Truncate is probably not necessary here, but doing it anyway to be sure.
265-
nextTransition = sched.Next(priorHistory.CreatedAt).Truncate(time.Minute)
266-
return database.WorkspaceTransitionStart, nextTransition, nil
267-
default:
268-
return "", time.Time{}, xerrors.Errorf("last transition not valid for autostart or autostop")
269-
}
262+
// A workspace must be started in order for it to be auto-stopped.
263+
return build.Transition == database.WorkspaceTransitionStart &&
264+
!build.Deadline.IsZero() &&
265+
// We do not want to stop a workspace prior to it breaching its deadline.
266+
!currentTick.Before(build.Deadline)
267+
}
268+
269+
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
270+
// due to a failed build.
271+
func isEligibleForFailedStop(build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions) bool {
272+
// If the template has specified a failure TTL.
273+
return templateSchedule.FailureTTL > 0 &&
274+
// And the job resulted in failure.
275+
db2sdk.ProvisionerJobStatus(job) == codersdk.ProvisionerJobFailed &&
276+
build.Transition == database.WorkspaceTransitionStart &&
277+
// And sufficient time has elapsed since the job has completed.
278+
job.CompletedAt.Valid && database.Now().Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL
270279
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy