Skip to content

Commit 179b76f

Browse files
committed
feat: add support for stopping failed workspaces
1 parent 4f809bd commit 179b76f

File tree

8 files changed

+76
-49
lines changed

8 files changed

+76
-49
lines changed

coderd/database/dbauthz/system.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,8 +310,8 @@ func (q *querier) GetDeploymentWorkspaceStats(ctx context.Context) (database.Get
310310
return q.db.GetDeploymentWorkspaceStats(ctx)
311311
}
312312

313-
func (q *querier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
314-
return q.db.GetWorkspacesEligibleForAutoStartStop(ctx, now)
313+
func (q *querier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
314+
return q.db.GetWorkspacesEligibleForTransition(ctx, now)
315315
}
316316

317317
// TODO: We need to create a ProvisionerJob resource type

coderd/database/dbfake/dbfake.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3357,7 +3357,7 @@ func (q *fakeQuerier) GetWorkspaces(ctx context.Context, arg database.GetWorkspa
33573357
return workspaceRows, err
33583358
}
33593359

3360-
func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
3360+
func (q *fakeQuerier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
33613361
q.mutex.RLock()
33623362
defer q.mutex.RUnlock()
33633363

coderd/database/dbmetrics/dbmetrics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -903,9 +903,9 @@ func (m metricsStore) GetWorkspaces(ctx context.Context, arg database.GetWorkspa
903903
return workspaces, err
904904
}
905905

906-
func (m metricsStore) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
906+
func (m metricsStore) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
907907
start := time.Now()
908-
workspaces, err := m.s.GetWorkspacesEligibleForAutoStartStop(ctx, now)
908+
workspaces, err := m.s.GetWorkspacesEligibleForTransition(ctx, now)
909909
m.queryLatencies.WithLabelValues("GetWorkspacesEligibleForAutoStartStop").Observe(time.Since(start).Seconds())
910910
return workspaces, err
911911
}

coderd/database/dbmock/store.go

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 13 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -405,13 +405,15 @@ SELECT
405405
stopped_workspaces.count AS stopped_workspaces
406406
FROM pending_workspaces, building_workspaces, running_workspaces, failed_workspaces, stopped_workspaces;
407407

408-
-- name: GetWorkspacesEligibleForAutoStartStop :many
408+
-- name: GetWorkspacesEligibleForTransition :many
409409
SELECT
410410
workspaces.*
411411
FROM
412412
workspaces
413413
LEFT JOIN
414414
workspace_builds ON workspace_builds.workspace_id = workspaces.id
415+
INNER JOIN
416+
provisioner_jobs ON workspace_builds.job_id = provisioner_jobs.id
415417
WHERE
416418
workspace_builds.build_number = (
417419
SELECT
@@ -441,5 +443,12 @@ WHERE
441443
(
442444
workspace_builds.transition = 'stop'::workspace_transition AND
443445
workspaces.autostart_schedule IS NOT NULL
446+
) OR
447+
448+
-- If the workspace's most recent job resulted in an error
449+
-- it may be eligible for failed stop.
450+
(
451+
provisioner_jobs.error IS NOT NULL AND
452+
provisioner_jobs.error != ''
444453
)
445-
);
454+
) AND workspaces.deleted = 'false';

coderd/wsactions/lifecycle_executor.go

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ import (
1313

1414
"cdr.dev/slog"
1515
"github.com/coder/coder/coderd/database"
16+
"github.com/coder/coder/coderd/database/db2sdk"
1617
"github.com/coder/coder/coderd/database/dbauthz"
1718
"github.com/coder/coder/coderd/schedule"
1819
"github.com/coder/coder/coderd/wsbuilder"
20+
"github.com/coder/coder/codersdk"
1921
)
2022

2123
// Executor automatically starts or stops workspaces.
@@ -108,7 +110,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
108110
// NOTE: If a workspace build is created with a given TTL and then the user either
109111
// changes or unsets the TTL, the deadline for the workspace build will not
110112
// have changed. This behavior is as expected per #2229.
111-
workspaces, err := e.db.GetWorkspacesEligibleForAutoStartStop(e.ctx, t)
113+
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, t)
112114
if err != nil {
113115
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
114116
return stats
@@ -198,24 +200,6 @@ func (e *Executor) runOnce(t time.Time) Stats {
198200
return stats
199201
}
200202

201-
// isEligibleForTransition returns true if the workspace meets basic criteria
202-
// for transitioning to a new state.
203-
func isEligibleForTransition(ws database.Workspace, latestBuild database.WorkspaceBuild, templateSchedule schedule.TemplateScheduleOptions) bool {
204-
if ws.Deleted {
205-
return false
206-
}
207-
if templateSchedule.UserAutostartEnabled && ws.AutostartSchedule.Valid && ws.AutostartSchedule.String != "" {
208-
return true
209-
}
210-
// Don't check the template schedule to see whether it allows autostop, this
211-
// is done during the build when determining the deadline.
212-
if latestBuild.Transition == database.WorkspaceTransitionStart && !latestBuild.Deadline.IsZero() {
213-
return true
214-
}
215-
216-
return false
217-
}
218-
219203
func getNextTransition(
220204
ws database.Workspace,
221205
latestBuild database.WorkspaceBuild,
@@ -227,32 +211,37 @@ func getNextTransition(
227211
database.BuildReason,
228212
error,
229213
) {
230-
if !isEligibleForTransition(ws, latestBuild, templateSchedule) {
231-
return "", "", xerrors.Errorf("workspace ineligible for transition")
232-
}
233-
234-
if !latestJob.CompletedAt.Valid || latestJob.Error.String != "" {
235-
return "", "", xerrors.Errorf("last workspace build did not complete successfully")
236-
}
237-
238214
switch {
239-
case isEligibleForAutostop(latestBuild, currentTick):
215+
case isEligibleForAutostop(latestBuild, latestJob, currentTick):
240216
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
241-
case isEligibleForAutostart(ws, latestBuild, currentTick):
217+
case isEligibleForAutostart(ws, latestBuild, latestJob, templateSchedule, currentTick):
242218
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
219+
case isEligibleForFailedStop(latestJob, templateSchedule):
220+
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
243221
default:
244222
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
245223
}
246224
}
247225

248226
// isEligibleForAutostart returns true if the workspace should be autostarted.
249-
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, currentTick time.Time) bool {
227+
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
228+
// Don't attempt to autostart workspaces whose latest job has not completed or has failed.
229+
if !job.CompletedAt.Valid || job.Error.String != "" {
230+
return false
231+
}
232+
250233
// If the last transition for the workspace was not 'stop' then the workspace
251234
// cannot be started.
252235
if build.Transition != database.WorkspaceTransitionStop {
253236
return false
254237
}
255238

239+
// If autostart isn't enabled, or the schedule isn't valid/populated we can't
240+
// autostart the workspace.
241+
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
242+
return false
243+
}
244+
256245
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
257246
if err != nil {
258247
return false
@@ -265,10 +254,30 @@ func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild
265254
}
266255

267256
// isEligibleForAutostop returns true if the workspace should be autostopped.
268-
func isEligibleForAutostop(build database.WorkspaceBuild, currentTick time.Time) bool {
257+
func isEligibleForAutostop(build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
258+
// Don't attempt to autostop workspaces whose latest job has not completed or has failed.
259+
if !job.CompletedAt.Valid || job.Error.String != "" {
260+
return false
261+
}
262+
269263
// A workspace must be started in order for it to be auto-stopped.
270264
return build.Transition == database.WorkspaceTransitionStart &&
271265
!build.Deadline.IsZero() &&
272266
// We do not want to stop a workspace prior to it breaching its deadline.
273267
!currentTick.Before(build.Deadline)
274268
}
269+
270+
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
271+
// due to a failed build.
272+
func isEligibleForFailedStop(job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions) bool {
273+
// If the template has specified a failure TTL.
274+
return templateSchedule.FailureTTL > 0 &&
275+
// And the job resulted in failure.
276+
db2sdk.ProvisionerJobStatus(job) == codersdk.ProvisionerJobFailed &&
277+
// And sufficient time has elapsed since the job has completed.
278+
(job.CompletedAt.Valid && database.Now().Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL ||
279+
// Or sufficient time has elapsed since the job was canceled.
280+
job.CanceledAt.Valid && database.Now().Sub(job.CanceledAt.Time) > templateSchedule.FailureTTL ||
281+
// Or the job is stuck/abandoned (not updated in the last 30 seconds; this check does not consult FailureTTL).
282+
database.Now().Sub(job.UpdatedAt) > 30*time.Second)
283+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy