Skip to content

Commit 8abc973

Browse files
committed
Guarding against overzealous prebuild creations/deletions
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent c78374f commit 8abc973

File tree

7 files changed

+178
-97
lines changed

7 files changed

+178
-97
lines changed

coderd/database/queries.sql.go

Lines changed: 30 additions & 29 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/prebuilds.sql

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ FROM templates t
3636
WHERE (t.id = sqlc.narg('template_id')::uuid OR sqlc.narg('template_id') IS NULL);
3737

3838
-- name: GetPrebuildsInProgress :many
39-
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition) AS count
39+
SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count
4040
FROM workspace_latest_build wlb
4141
INNER JOIN provisioner_jobs pj ON wlb.job_id = pj.id
4242
INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id
@@ -46,36 +46,37 @@ GROUP BY t.id, wpb.template_version_id, wpb.transition;
4646

4747
-- name: GetPresetsBackoff :many
4848
WITH filtered_builds AS (
49-
-- Only select builds which are for prebuild creations
50-
SELECT wlb.*, tvp.id AS preset_id, pj.job_status
51-
FROM template_version_presets tvp
52-
JOIN workspace_latest_build wlb ON wlb.template_version_preset_id = tvp.id
53-
JOIN provisioner_jobs pj ON wlb.job_id = pj.id
54-
JOIN template_versions tv ON wlb.template_version_id = tv.id
55-
JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id
56-
JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
57-
WHERE wlb.transition = 'start'::workspace_transition),
58-
latest_builds AS (
59-
-- Select only the latest build per template_version AND preset
60-
SELECT fb.*,
61-
ROW_NUMBER() OVER (PARTITION BY fb.template_version_preset_id ORDER BY fb.created_at DESC) as rn
62-
FROM filtered_builds fb),
63-
failed_count AS (
64-
-- Count failed builds per template version/preset in the given period
65-
SELECT preset_id, COUNT(*) AS num_failed
66-
FROM filtered_builds
67-
WHERE job_status = 'failed'::provisioner_job_status
68-
AND created_at >= @lookback::timestamptz
69-
GROUP BY preset_id)
49+
-- Only select builds which are for prebuild creations
50+
SELECT wlb.*, tvp.id AS preset_id, pj.job_status, tvpp.desired_instances
51+
FROM template_version_presets tvp
52+
JOIN workspace_latest_build wlb ON wlb.template_version_preset_id = tvp.id
53+
JOIN provisioner_jobs pj ON wlb.job_id = pj.id
54+
JOIN template_versions tv ON wlb.template_version_id = tv.id
55+
JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id
56+
JOIN template_version_preset_prebuilds tvpp ON tvpp.preset_id = tvp.id
57+
WHERE wlb.transition = 'start'::workspace_transition),
58+
latest_builds AS (
59+
-- Select only the latest build per template_version AND preset
60+
SELECT fb.*,
61+
ROW_NUMBER() OVER (PARTITION BY fb.template_version_preset_id ORDER BY fb.created_at DESC) as rn
62+
FROM filtered_builds fb),
63+
failed_count AS (
64+
-- Count failed builds per template version/preset in the given period
65+
SELECT preset_id, COUNT(*) AS num_failed
66+
FROM filtered_builds
67+
WHERE job_status = 'failed'::provisioner_job_status
68+
AND created_at >= @lookback::timestamptz
69+
GROUP BY preset_id)
7070
SELECT lb.template_version_id,
71-
lb.preset_id,
72-
lb.job_status AS latest_build_status,
73-
COALESCE(fc.num_failed, 0)::int AS num_failed,
74-
lb.created_at AS last_build_at
71+
lb.preset_id,
72+
MAX(lb.job_status)::provisioner_job_status AS latest_build_status,
73+
MAX(COALESCE(fc.num_failed, 0))::int AS num_failed,
74+
MAX(lb.created_at)::timestamptz AS last_build_at
7575
FROM latest_builds lb
76-
LEFT JOIN failed_count fc ON fc.preset_id = lb.preset_id
77-
WHERE lb.rn = 1
78-
AND lb.job_status = 'failed'::provisioner_job_status;
76+
LEFT JOIN failed_count fc ON fc.preset_id = lb.preset_id
77+
WHERE lb.rn <= lb.desired_instances -- Fetch the last N builds, where N is the number of desired instances; if any fail, we backoff
78+
AND lb.job_status = 'failed'::provisioner_job_status
79+
GROUP BY lb.template_version_id, lb.preset_id, lb.job_status;
7980

8081
-- name: ClaimPrebuild :one
8182
-- TODO: rewrite to use named CTE instead?

coderd/prebuilds/reconcile.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type ReconciliationActions struct {
3434
Outdated int32 // Prebuilds which no longer match the active template version.
3535
Extraneous int32 // Extra running prebuilds for active version (somehow).
3636
Starting, Stopping, Deleting int32 // Prebuilds currently being provisioned up or down.
37+
Failed int32 // Number of prebuilds which have failed in the past CODER_WORKSPACE_PREBUILDS_RECONCILIATION_BACKOFF_LOOKBACK_PERIOD.
3738
Create int32 // The number of prebuilds required to be created to reconcile required state.
3839
DeleteIDs []uuid.UUID // IDs of running prebuilds required to be deleted to reconcile required state.
3940
BackoffUntil time.Time // The time to wait until before trying to provision a new prebuild.

coderd/prebuilds/state.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.D
4848
// In-progress builds are common across all presets belonging to a given template.
4949
// In other words: these values will be identical across all presets belonging to this template.
5050
for _, progress := range p.InProgress {
51+
num := progress.Count
5152
switch progress.Transition {
5253
case database.WorkspaceTransitionStart:
53-
starting++
54+
starting+=num
5455
case database.WorkspaceTransitionStop:
55-
stopping++
56+
stopping+=num
5657
case database.WorkspaceTransitionDelete:
57-
deleting++
58+
deleting+=num
5859
}
5960
}
6061

@@ -90,6 +91,8 @@ func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.D
9091
// We backoff when the last build failed, to give the operator some time to investigate the issue and to not provision
9192
// a tonne of prebuilds (_n_ on each reconciliation iteration).
9293
if p.Backoff != nil && p.Backoff.NumFailed > 0 {
94+
actions.Failed = p.Backoff.NumFailed
95+
9396
backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval)
9497

9598
if clock.Now().Before(backoffUntil) {
@@ -104,7 +107,8 @@ func (p PresetState) CalculateActions(clock quartz.Clock, backoffInterval time.D
104107

105108
// It's possible that an operator could stop/start prebuilds which interfere with the reconciliation loop, so
106109
// we check if there are somehow more prebuilds than we expect, and then pick random victims to be deleted.
107-
if extraneous > 0 {
110+
// There must be no active deletions in order for extraneous prebuilds to be deleted.
111+
if extraneous > 0 && deleting == 0 {
108112
// Sort running IDs by creation time so we always delete the oldest prebuilds.
109113
// In general, we want fresher prebuilds (imagine a mono-repo is cloned; newer is better).
110114
slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuildsRow) int {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy