From 0f51f357f76adc7bc48582dae0a32023469b0a64 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 1 Apr 2025 16:06:20 +0000 Subject: [PATCH 01/24] added queries for fetching NotStartedProvisionerJobs --- coderd/database/dbauthz/dbauthz.go | 4 ++ coderd/database/dbmem/dbmem.go | 17 +++++++ coderd/database/dbmetrics/querymetrics.go | 7 +++ coderd/database/dbmock/dbmock.go | 15 ++++++ coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 54 +++++++++++++++++++++ coderd/database/queries/provisionerjobs.sql | 10 ++++ 7 files changed, 108 insertions(+) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 732739b2f09a5..5be38ce940ccb 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1992,6 +1992,10 @@ func (q *querier) GetLogoURL(ctx context.Context) (string, error) { return q.db.GetLogoURL(ctx) } +func (q *querier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { + return q.db.GetNotStartedProvisionerJobs(ctx, notStartedSince) +} + func (q *querier) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceNotificationMessage); err != nil { return nil, err diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 2760c0c929c58..01eb2d66ed5de 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -3897,6 +3897,23 @@ func (q *FakeQuerier) GetLogoURL(_ context.Context) (string, error) { return q.logoURL, nil } +func (q *FakeQuerier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + notStartedJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(notStartedSince) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + notStartedJobs = append(notStartedJobs, provisionerJob) + } + } + return notStartedJobs, nil +} + func (q *FakeQuerier) GetNotificationMessagesByStatus(_ context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { err := validateDatabaseType(arg) if err != nil { diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 128e741da1d76..1761c0c6b9960 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -949,6 +949,13 @@ func (m queryMetricsStore) GetLogoURL(ctx context.Context) (string, error) { return url, err } +func (m queryMetricsStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetNotStartedProvisionerJobs(ctx, updatedAt) + m.queryLatencies.WithLabelValues("GetNotStartedProvisionerJobs").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { start := time.Now() r0, r1 := m.s.GetNotificationMessagesByStatus(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 17b263dfb2e07..1e3a6cf15cfa8 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1923,6 +1923,21 @@ func (mr *MockStoreMockRecorder) GetLogoURL(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetLogoURL", reflect.TypeOf((*MockStore)(nil).GetLogoURL), ctx) } +// GetNotStartedProvisionerJobs mocks base method. +func (m *MockStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetNotStartedProvisionerJobs", ctx, updatedAt) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetNotStartedProvisionerJobs indicates an expected call of GetNotStartedProvisionerJobs. +func (mr *MockStoreMockRecorder) GetNotStartedProvisionerJobs(ctx, updatedAt any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNotStartedProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetNotStartedProvisionerJobs), ctx, updatedAt) +} + // GetNotificationMessagesByStatus mocks base method. func (m *MockStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index d0f74ee609724..86647a834d20a 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -213,6 +213,7 @@ type sqlcQuerier interface { GetLicenseByID(ctx context.Context, id int32) (License, error) GetLicenses(ctx context.Context) ([]License, error) GetLogoURL(ctx context.Context) (string, error) + GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetNotificationMessagesByStatus(ctx context.Context, arg GetNotificationMessagesByStatusParams) ([]NotificationMessage, error) // Fetch the notification report generator log indicating recent activity. GetNotificationReportGeneratorLogByTemplate(ctx context.Context, templateID uuid.UUID) (NotificationReportGeneratorLog, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index e2e38c36fe10a..c525f7895b410 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7430,6 +7430,60 @@ func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time. return items, nil } +const getNotStartedProvisionerJobs = `-- name: GetNotStartedProvisionerJobs :many +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL +` + +func (q *sqlQuerier) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getNotStartedProvisionerJobs, updatedAt) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ProvisionerJob + for rows.Next() { + var i ProvisionerJob + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 2d544aedb9bd8..d125b195b88b0 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -266,6 +266,16 @@ WHERE AND started_at IS NOT NULL AND completed_at IS NULL; +-- name: GetNotStartedProvisionerJobs :many +SELECT + * +FROM + provisioner_jobs +WHERE + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL; + -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) SELECT From 2f3d6065141f87dab893e8ae5c2d550752070cb7 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 9 May 2025 13:12:15 +0000 Subject: [PATCH 02/24] added detector handling of not started jobs --- coderd/unhanger/detector.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/coderd/unhanger/detector.go b/coderd/unhanger/detector.go index 14383b1839363..ce3b1bb49800f 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/unhanger/detector.go @@ -179,6 +179,14 @@ func (d *Detector) run(t time.Time) Stats { stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) return stats } + // Find all provisioner jobs that are currently running but have not + // received an update in the last 5 minutes. + if err != nil { + stats.Error = xerrors.Errorf("get not started provisioner jobs: %w", err) + return stats + } + jobsUnstarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-HungJobDuration)) + jobs = append(jobs, jobsUnstarted...) // Limit the number of jobs we'll unhang in a single run to avoid // timing out. @@ -229,14 +237,6 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return xerrors.Errorf("get provisioner job: %w", err) } - // Check if we should still unhang it. - if !job.StartedAt.Valid { - // This shouldn't be possible to hit because the query only selects - // started and not completed jobs, and a job can't be "un-started". - return jobIneligibleError{ - Err: xerrors.New("job is not started"), - } - } if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), From 4b252eb0137c773e127866dc56a4698d20cb5919 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 9 May 2025 13:56:08 +0000 Subject: [PATCH 03/24] filling out started_at when unhanging not started jobs --- coderd/database/queries.sql.go | 5 ++++- coderd/database/queries/provisionerjobs.sql | 3 ++- coderd/unhanger/detector.go | 12 +++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index c525f7895b410..b46e62554a404 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8135,7 +8135,8 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5 + error_code = $5, + started_at = $6 WHERE id = $1 ` @@ -8146,6 +8147,7 @@ type UpdateProvisionerJobWithCompleteByIDParams struct { CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` Error sql.NullString `db:"error" json:"error"` ErrorCode sql.NullString `db:"error_code" json:"error_code"` + StartedAt sql.NullTime `db:"started_at" json:"started_at"` } func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error { @@ -8155,6 +8157,7 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a arg.CompletedAt, arg.Error, arg.ErrorCode, + arg.StartedAt, ) return err } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index d125b195b88b0..9e9273bb05352 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -252,7 +252,8 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5 + error_code = $5, + started_at = $6 WHERE id = $1; diff --git a/coderd/unhanger/detector.go b/coderd/unhanger/detector.go index ce3b1bb49800f..d00b8a1de50bf 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/unhanger/detector.go @@ -295,11 +295,21 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } lowestLogID = newLogs[0].ID - // Mark the job as failed. now = dbtime.Now() + // If we are unhanging a job that was never picked up by the + // provisioner, we need to set the started_at time to the current + // time so that the build duration is correct. + if !job.StartedAt.Valid { + job.StartedAt = sql.NullTime{ + Time: now, + Valid: true, + } + } + // Mark the job as failed. err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ ID: job.ID, UpdatedAt: now, + StartedAt: job.StartedAt, CompletedAt: sql.NullTime{ Time: now, Valid: true, From ca4951995b44bb6beac3f0b0ac4574d01f90a804 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 17:00:40 +0000 Subject: [PATCH 04/24] WIP --- coderd/unhanger/detector.go | 54 ++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/coderd/unhanger/detector.go b/coderd/unhanger/detector.go index d00b8a1de50bf..2e0c410aa0bee 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/unhanger/detector.go @@ -25,6 +25,10 @@ const ( // before it is considered hung. HungJobDuration = 5 * time.Minute + // NotStartedTimeElapsed is the duration of time since the last update to a job + // before it is considered hung. + NotStartedTimeElapsed = 30 * time.Minute + // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to // a job. @@ -38,6 +42,13 @@ const ( MaxJobsPerRun = 10 ) +type jobType string + +const ( + hungJobType jobType = "hung" + notStartedJobType jobType = "not started" +) + // HungJobLogMessages are written to provisioner job logs when a job is hung and // terminated. var HungJobLogMessages = []string{ @@ -176,17 +187,17 @@ func (d *Detector) run(t time.Time) Stats { // received an update in the last 5 minutes. jobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) if err != nil { - stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", hungJobType, err) return stats } - // Find all provisioner jobs that are currently running but have not - // received an update in the last 5 minutes. + // Find all provisioner jobs that have not been started yet and have not + // received an update in the last 30 minutes. + jobsNotStarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-NotStartedTimeElapsed)) if err != nil { - stats.Error = xerrors.Errorf("get not started provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", notStartedJobType, err) return stats } - jobsUnstarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-HungJobDuration)) - jobs = append(jobs, jobsUnstarted...) + jobs = append(jobs, jobsNotStarted...) // Limit the number of jobs we'll unhang in a single run to avoid // timing out. @@ -198,7 +209,7 @@ func (d *Detector) run(t time.Time) Stats { jobs = jobs[:MaxJobsPerRun] } - // Send a message into the build log for each hung job saying that it + // Send a message into the build log for each hung or not startedjob saying that it // has been detected and will be terminated, then mark the job as // failed. for _, job := range jobs { @@ -206,8 +217,12 @@ func (d *Detector) run(t time.Time) Stats { err := unhangJob(ctx, log, d.db, d.pubsub, job.ID) if err != nil { + jobType := notStartedJobType + if job.StartedAt.Valid { + jobType = hungJobType + } if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err)) + log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err)) } continue } @@ -222,7 +237,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("hang-detector:%s", jobID))) + locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("unhanger:%s", jobID))) if err != nil { return xerrors.Errorf("acquire lock: %w", err) } @@ -237,6 +252,14 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return xerrors.Errorf("get provisioner job: %w", err) } + jobType := hungJobType + threshold := HungJobDuration.Minutes() + + if !job.StartedAt.Valid { + jobType = notStartedJobType + threshold = NotStartedTimeElapsed.Minutes() + } + if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), @@ -249,8 +272,8 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } log.Warn( - ctx, "detected hung provisioner job, forcefully terminating", - "threshold", HungJobDuration, + ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobType), + "threshold", threshold, ) // First, get the latest logs from the build so we can make sure @@ -260,7 +283,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for hung job: %w", err) + return xerrors.Errorf("get logs for %s job: %w", jobType, err) } logStage := "" if len(logs) != 0 { @@ -291,12 +314,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams) if err != nil { - return xerrors.Errorf("insert logs for hung job: %w", err) + return xerrors.Errorf("insert logs for %s job: %w", jobType, err) } lowestLogID = newLogs[0].ID now = dbtime.Now() - // If we are unhanging a job that was never picked up by the + + // If we are failing a job that was never picked up by the // provisioner, we need to set the started_at time to the current // time so that the build duration is correct. if !job.StartedAt.Valid { @@ -315,7 +339,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs Valid: true, }, Error: sql.NullString{ - String: "Coder: Build has been detected as hung for 5 minutes and has been terminated by hang detector.", + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobType, threshold), Valid: true, }, ErrorCode: sql.NullString{ From af994c2fc1bc1f25e6609fd096626fdcdf6d9986 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:19:38 +0000 Subject: [PATCH 05/24] refactored to reaper & added tests --- cli/server.go | 4 +- coderd/coderdtest/coderdtest.go | 4 +- coderd/database/dbauthz/dbauthz.go | 4 +- coderd/{unhanger => reaper}/detector.go | 57 ++-- coderd/{unhanger => reaper}/detector_test.go | 283 +++++++++++++++++-- provisioner/terraform/serve.go | 8 +- 6 files changed, 294 insertions(+), 66 deletions(-) rename coderd/{unhanger => reaper}/detector.go (91%) rename coderd/{unhanger => reaper}/detector_test.go (72%) diff --git a/cli/server.go b/cli/server.go index d32ed51c06007..e924439ec26b2 100644 --- a/cli/server.go +++ b/cli/server.go @@ -92,10 +92,10 @@ import ( "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/prometheusmetrics/insights" "github.com/coder/coder/v2/coderd/promoauth" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" "github.com/coder/coder/v2/coderd/tracing" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/coderd/updatecheck" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/util/slice" @@ -1129,7 +1129,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) + hangDetector := reaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) hangDetector.Start() defer hangDetector.Close() diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index e843d0d748578..ff418f3abaff2 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -72,10 +72,10 @@ import ( "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/coderd/updatecheck" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/webpush" @@ -367,7 +367,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, options.Logger.Named("unhanger.detector"), hangDetectorTicker.C) + hangDetector := reaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) hangDetector.Start() t.Cleanup(hangDetector.Close) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 5be38ce940ccb..4bcb7076a5a69 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -219,7 +219,7 @@ var ( Scope: rbac.ScopeAll, }.WithCachedASTValue() - // See unhanger package. + // See reaper package. subjectHangDetector = rbac.Subject{ Type: rbac.SubjectTypeHangDetector, FriendlyName: "Hang Detector", @@ -408,7 +408,7 @@ func AsAutostart(ctx context.Context) context.Context { } // AsHangDetector returns a context with an actor that has permissions required -// for unhanger.Detector to function. +// for reaper.Detector to function. func AsHangDetector(ctx context.Context) context.Context { return As(ctx, subjectHangDetector) } diff --git a/coderd/unhanger/detector.go b/coderd/reaper/detector.go similarity index 91% rename from coderd/unhanger/detector.go rename to coderd/reaper/detector.go index 2e0c410aa0bee..a7736f2a2a070 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/reaper/detector.go @@ -1,4 +1,4 @@ -package unhanger +package reaper import ( "context" @@ -49,14 +49,15 @@ const ( notStartedJobType jobType = "not started" ) -// HungJobLogMessages are written to provisioner job logs when a job is hung and -// terminated. -var HungJobLogMessages = []string{ - "", - "====================", - "Coder: Build has been detected as hung for 5 minutes and will be terminated.", - "====================", - "", +// jobLogMessages are written to provisioner job logs when a job is reaped +func jobLogMessages(jobType jobType, threshold float64) []string { + return []string{ + "", + "====================", + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), + "====================", + "", + } } // acquireLockError is returned when the detector fails to acquire a lock and @@ -209,18 +210,14 @@ func (d *Detector) run(t time.Time) Stats { jobs = jobs[:MaxJobsPerRun] } - // Send a message into the build log for each hung or not startedjob saying that it - // has been detected and will be terminated, then mark the job as - // failed. + // Send a message into the build log for each hung or not started job saying that it + // has been detected and will be terminated, then mark the job as failed. for _, job := range jobs { log := d.log.With(slog.F("job_id", job.ID)) - err := unhangJob(ctx, log, d.db, d.pubsub, job.ID) + err := reapJob(ctx, log, d.db, d.pubsub, job.ID) if err != nil { - jobType := notStartedJobType - if job.StartedAt.Valid { - jobType = hungJobType - } + jobType, _ := reapParamsFromJob(job) if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err)) } @@ -233,11 +230,11 @@ func (d *Detector) run(t time.Time) Stats { return stats } -func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { +func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("unhanger:%s", jobID))) + locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobID))) if err != nil { return xerrors.Errorf("acquire lock: %w", err) } @@ -252,20 +249,14 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return xerrors.Errorf("get provisioner job: %w", err) } - jobType := hungJobType - threshold := HungJobDuration.Minutes() - - if !job.StartedAt.Valid { - jobType = notStartedJobType - threshold = NotStartedTimeElapsed.Minutes() - } + jobType, threshold := reapParamsFromJob(job) if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-HungJobDuration)) { + if job.UpdatedAt.After(time.Now().Add(-time.Duration(threshold) * time.Minute)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } @@ -303,7 +294,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs Output: nil, } now := dbtime.Now() - for i, msg := range HungJobLogMessages { + for i, msg := range jobLogMessages(jobType, threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -405,3 +396,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return nil } + +func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { + jobType := hungJobType + threshold := HungJobDuration.Minutes() + if !job.StartedAt.Valid { + jobType = notStartedJobType + threshold = NotStartedTimeElapsed.Minutes() + } + return jobType, threshold +} diff --git a/coderd/unhanger/detector_test.go b/coderd/reaper/detector_test.go similarity index 72% rename from coderd/unhanger/detector_test.go rename to coderd/reaper/detector_test.go index 43eb62bfa884b..e57e0bdccd1be 100644 --- a/coderd/unhanger/detector_test.go +++ b/coderd/reaper/detector_test.go @@ -1,4 +1,4 @@ -package unhanger_test +package reaper_test import ( "context" @@ -22,11 +22,41 @@ import ( "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" - "github.com/coder/coder/v2/coderd/unhanger" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/provisionersdk" "github.com/coder/coder/v2/testutil" ) +// jobType represents the type of job being reaped +type jobType string + +const ( + hungJobType jobType = "hung" + notStartedJobType jobType = "not started" +) + +// jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped +func jobLogMessages(jobType jobType, threshold float64) []string { + return []string{ + "", + "====================", + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), + "====================", + "", + } +} + +// reapParamsFromJob determines the type and threshold for a job being reaped +func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { + jobType := hungJobType + threshold := reaper.HungJobDuration.Minutes() + if !job.StartedAt.Valid { + jobType = notStartedJobType + threshold = reaper.NotStartedTimeElapsed.Minutes() + } + return jobType, threshold +} + func TestMain(m *testing.M) { goleak.VerifyTestMain(m, testutil.GoleakOptions...) } @@ -39,10 +69,10 @@ func TestDetectorNoJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- time.Now() @@ -62,7 +92,7 @@ func TestDetectorNoHungJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) // Insert some jobs that are running and haven't been updated in a while, @@ -89,7 +119,7 @@ func TestDetectorNoHungJobs(t *testing.T) { }) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -109,7 +139,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -195,7 +225,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -231,7 +261,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -318,7 +348,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -354,7 +384,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -411,7 +441,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -439,6 +469,98 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T detector.Wait() } +func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { + t.Parallel() + + var ( + ctx = testutil.Context(t, testutil.WaitLong) + db, pubsub = dbtestutil.NewDB(t) + log = testutil.Logger(t) + tickCh = make(chan time.Time) + statsCh = make(chan reaper.Stats) + ) + + var ( + now = time.Now() + thirtyFiveMinAgo = now.Add(-time.Minute * 35) + org = dbgen.Organization(t, db, database.Organization{}) + user = dbgen.User(t, db, database.User{}) + file = dbgen.File(t, db, database.File{}) + template = dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + templateVersion = dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + TemplateID: uuid.NullUUID{ + UUID: template.ID, + Valid: true, + }, + CreatedBy: user.ID, + }) + workspace = dbgen.Workspace(t, db, database.WorkspaceTable{ + OwnerID: user.ID, + OrganizationID: org.ID, + TemplateID: template.ID, + }) + + // First build. + expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`) + currentWorkspaceBuildJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeWorkspaceBuild, + Input: []byte("{}"), + }) + currentWorkspaceBuild = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{ + WorkspaceID: workspace.ID, + TemplateVersionID: templateVersion.ID, + BuildNumber: 1, + JobID: currentWorkspaceBuildJob.ID, + // Should not be overridden. + ProvisionerState: expectedWorkspaceBuildState, + }) + ) + + t.Log("current job ID: ", currentWorkspaceBuildJob.ID) + + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector.Start() + tickCh <- now + + stats := <-statsCh + require.NoError(t, stats.Error) + require.Len(t, stats.TerminatedJobIDs, 1) + require.Equal(t, currentWorkspaceBuildJob.ID, stats.TerminatedJobIDs[0]) + + // Check that the current provisioner job was updated. + job, err := db.GetProvisionerJobByID(ctx, currentWorkspaceBuildJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as not started") + require.False(t, job.ErrorCode.Valid) + + // Check that the provisioner state was NOT updated. + build, err := db.GetWorkspaceBuildByID(ctx, currentWorkspaceBuild.ID) + require.NoError(t, err) + require.Equal(t, expectedWorkspaceBuildState, build.ProvisionerState) + + detector.Close() + detector.Wait() +} + func TestDetectorHungOtherJobTypes(t *testing.T) { t.Parallel() @@ -447,7 +569,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -509,7 +631,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -543,6 +665,109 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { detector.Wait() } +func TestDetectorNotStartedOtherJobTypes(t *testing.T) { + t.Parallel() + + var ( + ctx = testutil.Context(t, testutil.WaitLong) + db, pubsub = dbtestutil.NewDB(t) + log = testutil.Logger(t) + tickCh = make(chan time.Time) + statsCh = make(chan reaper.Stats) + ) + + var ( + now = time.Now() + thirtyFiveMinAgo = now.Add(-time.Minute * 35) + org = dbgen.Organization(t, db, database.Organization{}) + user = dbgen.User(t, db, database.User{}) + file = dbgen.File(t, db, database.File{}) + + // Template import job. + templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeTemplateVersionImport, + Input: []byte("{}"), + }) + _ = dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + JobID: templateImportJob.ID, + CreatedBy: user.ID, + }) + ) + + // Template dry-run job. + dryRunVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + input, err := json.Marshal(provisionerdserver.TemplateVersionDryRunJob{ + TemplateVersionID: dryRunVersion.ID, + }) + require.NoError(t, err) + templateDryRunJob := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeTemplateVersionDryRun, + Input: input, + }) + + t.Log("template import job ID: ", templateImportJob.ID) + t.Log("template dry-run job ID: ", templateDryRunJob.ID) + + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector.Start() + tickCh <- now + + stats := <-statsCh + require.NoError(t, stats.Error) + require.Len(t, stats.TerminatedJobIDs, 2) + require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID) + require.Contains(t, stats.TerminatedJobIDs, templateDryRunJob.ID) + + // Check that the template import job was updated. + job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as not started") + require.False(t, job.ErrorCode.Valid) + + // Check that the template dry-run job was updated. + job, err = db.GetProvisionerJobByID(ctx, templateDryRunJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as not started") + require.False(t, job.ErrorCode.Valid) + + detector.Close() + detector.Wait() +} + func TestDetectorHungCanceledJob(t *testing.T) { t.Parallel() @@ -551,7 +776,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -591,7 +816,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -653,7 +878,7 @@ func TestDetectorPushesLogs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -706,7 +931,7 @@ func TestDetectorPushesLogs(t *testing.T) { require.Len(t, logs, 10) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() // Create pubsub subscription to listen for new log events. @@ -741,12 +966,14 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: after, }) require.NoError(t, err) - require.Len(t, logs, len(unhanger.HungJobLogMessages)) + jobType, threshold := reapParamsFromJob(templateImportJob) + expectedLogs := jobLogMessages(jobType, threshold) + require.Len(t, logs, len(expectedLogs)) for i, log := range logs { assert.Equal(t, database.LogLevelError, log.Level) assert.Equal(t, c.expectStage, log.Stage) assert.Equal(t, database.LogSourceProvisionerDaemon, log.Source) - assert.Equal(t, unhanger.HungJobLogMessages[i], log.Output) + assert.Equal(t, expectedLogs[i], log.Output) } // Double check the full log count. @@ -755,7 +982,7 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: 0, }) require.NoError(t, err) - require.Len(t, logs, c.preLogCount+len(unhanger.HungJobLogMessages)) + require.Len(t, logs, c.preLogCount+len(expectedLogs)) detector.Close() detector.Wait() @@ -771,15 +998,15 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) org = dbgen.Organization(t, db, database.Organization{}) user = dbgen.User(t, db, database.User{}) file = dbgen.File(t, db, database.File{}) ) - // Create unhanger.MaxJobsPerRun + 1 hung jobs. + // Create MaxJobsPerRun + 1 hung jobs. now := time.Now() - for i := 0; i < unhanger.MaxJobsPerRun+1; i++ { + for i := 0; i < reaper.MaxJobsPerRun+1; i++ { pj := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ CreatedAt: now.Add(-time.Hour), UpdatedAt: now.Add(-time.Hour), @@ -802,14 +1029,14 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { }) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now - // Make sure that only unhanger.MaxJobsPerRun jobs are terminated. + // Make sure that only MaxJobsPerRun jobs are terminated. stats := <-statsCh require.NoError(t, stats.Error) - require.Len(t, stats.TerminatedJobIDs, unhanger.MaxJobsPerRun) + require.Len(t, stats.TerminatedJobIDs, reaper.MaxJobsPerRun) // Run the detector again and make sure that only the remaining job is // terminated. @@ -823,7 +1050,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { } // wrapDBAuthz adds our Authorization/RBAC around the given database store, to -// ensure the unhanger has the right permissions to do its work. +// ensure the reaper has the right permissions to do its work. func wrapDBAuthz(db database.Store, logger slog.Logger) database.Store { return dbauthz.New( db, diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 562946d8ef92e..01d63b3b90086 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/unhanger" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/provisionersdk" ) @@ -39,9 +39,9 @@ type ServeOptions struct { // // This is a no-op on Windows where the process can't be interrupted. // - // Default value: 3 minutes (unhanger.HungJobExitTimeout). This value should + // Default value: 3 minutes (reaper.HungJobExitTimeout). This value should // be kept less than the value that Coder uses to mark hung jobs as failed, - // which is 5 minutes (see unhanger package). + // which is 5 minutes (see reaper package). ExitTimeout time.Duration } @@ -131,7 +131,7 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.Tracer = trace.NewNoopTracerProvider().Tracer("noop") } if options.ExitTimeout == 0 { - options.ExitTimeout = unhanger.HungJobExitTimeout + options.ExitTimeout = reaper.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ execMut: &sync.Mutex{}, From 38157271858139038a15c48129539eb55b81f55f Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:20:48 +0000 Subject: [PATCH 06/24] Revert "filling out started_at when unhanging not started jobs" This reverts commit 4b252eb0137c773e127866dc56a4698d20cb5919. --- coderd/database/queries.sql.go | 5 +---- coderd/database/queries/provisionerjobs.sql | 3 +-- coderd/reaper/detector.go | 13 +------------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index b46e62554a404..c525f7895b410 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8135,8 +8135,7 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5, - started_at = $6 + error_code = $5 WHERE id = $1 ` @@ -8147,7 +8146,6 @@ type UpdateProvisionerJobWithCompleteByIDParams struct { CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` Error sql.NullString `db:"error" json:"error"` ErrorCode sql.NullString `db:"error_code" json:"error_code"` - StartedAt sql.NullTime `db:"started_at" json:"started_at"` } func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error { @@ -8157,7 +8155,6 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a arg.CompletedAt, arg.Error, arg.ErrorCode, - arg.StartedAt, ) return err } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 9e9273bb05352..d125b195b88b0 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -252,8 +252,7 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5, - started_at = $6 + error_code = $5 WHERE id = $1; diff --git a/coderd/reaper/detector.go b/coderd/reaper/detector.go index a7736f2a2a070..3041f8e7b9961 100644 --- a/coderd/reaper/detector.go +++ b/coderd/reaper/detector.go @@ -309,22 +309,11 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } lowestLogID = newLogs[0].ID - now = dbtime.Now() - - // If we are failing a job that was never picked up by the - // provisioner, we need to set the started_at time to the current - // time so that the build duration is correct. - if !job.StartedAt.Valid { - job.StartedAt = sql.NullTime{ - Time: now, - Valid: true, - } - } // Mark the job as failed. + now = dbtime.Now() err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ ID: job.ID, UpdatedAt: now, - StartedAt: job.StartedAt, CompletedAt: sql.NullTime{ Time: now, Valid: true, From b65f620d556dd290c61af2a4530a8302f0c7fe42 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:31:15 +0000 Subject: [PATCH 07/24] created new ORM update to avoid forcing setting StartedAt on every Complete update --- coderd/database/dbauthz/dbauthz.go | 4 +++ coderd/database/dbmem/dbmem.go | 24 +++++++++++++++ coderd/database/dbmetrics/querymetrics.go | 7 +++++ coderd/database/dbmock/dbmock.go | 14 +++++++++ coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 34 +++++++++++++++++++++ coderd/database/queries/provisionerjobs.sql | 12 ++++++++ coderd/reaper/detector.go | 12 +++++++- 8 files changed, 107 insertions(+), 1 deletion(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 4bcb7076a5a69..5db272f62a856 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -4254,6 +4254,10 @@ func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } +func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) +} + func (q *querier) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 01eb2d66ed5de..43c804d1fdb46 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -10942,6 +10942,30 @@ func (q *FakeQuerier) UpdateProvisionerJobWithCompleteByID(_ context.Context, ar return sql.ErrNoRows } +func (q *FakeQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(_ context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + if err := validateDatabaseType(arg); err != nil { + return err + } + + q.mutex.Lock() + defer q.mutex.Unlock() + + for index, job := range q.provisionerJobs { + if arg.ID != job.ID { + continue + } + job.UpdatedAt = arg.UpdatedAt + job.CompletedAt = arg.CompletedAt + job.Error = arg.Error + job.ErrorCode = arg.ErrorCode + job.JobStatus = provisionerJobStatus(job) + job.StartedAt = arg.StartedAt + q.provisionerJobs[index] = job + return nil + } + return sql.ErrNoRows +} + func (q *FakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { if err := validateDatabaseType(arg); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 1761c0c6b9960..2d3aea66a6ad8 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -2706,6 +2706,13 @@ func (m queryMetricsStore) UpdateProvisionerJobWithCompleteByID(ctx context.Cont return err } +func (m queryMetricsStore) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + start := time.Now() + r0 := m.s.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) + m.queryLatencies.WithLabelValues("UpdateProvisionerJobWithCompleteWithStartedAtByID").Observe(time.Since(start).Seconds()) + return r0 +} + func (m queryMetricsStore) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { start := time.Now() replica, err := m.s.UpdateReplica(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 1e3a6cf15cfa8..0e1abb4d9dca6 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -5732,6 +5732,20 @@ func (mr *MockStoreMockRecorder) UpdateProvisionerJobWithCompleteByID(ctx, arg a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateProvisionerJobWithCompleteByID", reflect.TypeOf((*MockStore)(nil).UpdateProvisionerJobWithCompleteByID), ctx, arg) } +// UpdateProvisionerJobWithCompleteWithStartedAtByID mocks base method. +func (m *MockStore) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateProvisionerJobWithCompleteWithStartedAtByID", ctx, arg) + ret0, _ := ret[0].(error) + return ret0 +} + +// UpdateProvisionerJobWithCompleteWithStartedAtByID indicates an expected call of UpdateProvisionerJobWithCompleteWithStartedAtByID. +func (mr *MockStoreMockRecorder) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateProvisionerJobWithCompleteWithStartedAtByID", reflect.TypeOf((*MockStore)(nil).UpdateProvisionerJobWithCompleteWithStartedAtByID), ctx, arg) +} + // UpdateReplica mocks base method. func (m *MockStore) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 86647a834d20a..5167a7099dae3 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -567,6 +567,7 @@ type sqlcQuerier interface { UpdateProvisionerJobByID(ctx context.Context, arg UpdateProvisionerJobByIDParams) error UpdateProvisionerJobWithCancelByID(ctx context.Context, arg UpdateProvisionerJobWithCancelByIDParams) error UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error + UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) UpdateTailnetPeerStatusByCoordinator(ctx context.Context, arg UpdateTailnetPeerStatusByCoordinatorParams) error UpdateTemplateACLByID(ctx context.Context, arg UpdateTemplateACLByIDParams) error diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index c525f7895b410..b8ac4452beb19 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8159,6 +8159,40 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a return err } +const updateProvisionerJobWithCompleteWithStartedAtByID = `-- name: UpdateProvisionerJobWithCompleteWithStartedAtByID :exec +UPDATE + provisioner_jobs +SET + updated_at = $2, + completed_at = $3, + error = $4, + error_code = $5, + started_at = $6 +WHERE + id = $1 +` + +type UpdateProvisionerJobWithCompleteWithStartedAtByIDParams struct { + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` + Error sql.NullString `db:"error" json:"error"` + ErrorCode sql.NullString `db:"error_code" json:"error_code"` + StartedAt sql.NullTime `db:"started_at" json:"started_at"` +} + +func (q *sqlQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + _, err := q.db.ExecContext(ctx, updateProvisionerJobWithCompleteWithStartedAtByID, + arg.ID, + arg.UpdatedAt, + arg.CompletedAt, + arg.Error, + arg.ErrorCode, + arg.StartedAt, + ) + return err +} + const deleteProvisionerKey = `-- name: DeleteProvisionerKey :exec DELETE FROM provisioner_keys diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index d125b195b88b0..d464bdfa4d587 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -256,6 +256,18 @@ SET WHERE id = $1; +-- name: UpdateProvisionerJobWithCompleteWithStartedAtByID :exec +UPDATE + provisioner_jobs +SET + updated_at = $2, + completed_at = $3, + error = $4, + error_code = $5, + started_at = $6 +WHERE + id = $1; + -- name: GetHungProvisionerJobs :many SELECT * diff --git a/coderd/reaper/detector.go b/coderd/reaper/detector.go index 3041f8e7b9961..92d229205adc8 100644 --- a/coderd/reaper/detector.go +++ b/coderd/reaper/detector.go @@ -311,7 +311,16 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub // Mark the job as failed. now = dbtime.Now() - err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ + + // If the job was never started, set the StartedAt time to the current + // time so that the build duration is correct. + if !job.StartedAt.Valid { + job.StartedAt = sql.NullTime{ + Time: now, + Valid: true, + } + } + err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: job.ID, UpdatedAt: now, CompletedAt: sql.NullTime{ @@ -325,6 +334,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub ErrorCode: sql.NullString{ Valid: false, }, + StartedAt: job.StartedAt, }) if err != nil { return xerrors.Errorf("mark job as failed: %w", err) From 3c7c323ea8978b984a48feab25a9b01103f19e4a Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:40:38 +0000 Subject: [PATCH 08/24] added missing dbauthz tests --- coderd/database/dbauthz/dbauthz_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 6dc9a32f03943..01181dd6adc38 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4034,6 +4034,13 @@ func (s *MethodTestSuite) TestSystemFunctions() { ID: j.ID, }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) })) + s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { + // TODO: we need to create a ProvisionerJob resource + j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) + check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ + ID: j.ID, + }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) @@ -4204,6 +4211,9 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { check.Args(time.Time{}).Asserts() })) + s.Run("GetNotStartedProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { + check.Args(time.Time{}).Asserts() + })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { check.Args("foo").Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) From 35df01f247423ad544892c5696309c61d1b16dcb Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:45:40 +0000 Subject: [PATCH 09/24] added checks for StartedAt value in the updated jobs --- coderd/reaper/detector_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/coderd/reaper/detector_test.go b/coderd/reaper/detector_test.go index e57e0bdccd1be..1fec8a1a726e7 100644 --- a/coderd/reaper/detector_test.go +++ b/coderd/reaper/detector_test.go @@ -548,6 +548,8 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) require.True(t, job.CompletedAt.Valid) require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) require.Contains(t, job.Error.String, "Build has been detected as not started") require.False(t, job.ErrorCode.Valid) @@ -750,6 +752,8 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) require.True(t, job.CompletedAt.Valid) require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) require.Contains(t, job.Error.String, "Build has been detected as not started") require.False(t, job.ErrorCode.Valid) @@ -760,6 +764,8 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) require.True(t, job.CompletedAt.Valid) require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) require.Contains(t, job.Error.String, "Build has been detected as not started") require.False(t, job.ErrorCode.Valid) From 8aa1ee247b5eeeb0613af6d114bf9c37c304579f Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Wed, 14 May 2025 09:42:48 +0000 Subject: [PATCH 10/24] refactor from reaper to jobreaper --- cli/server.go | 4 +- coderd/coderdtest/coderdtest.go | 4 +- coderd/{reaper => jobreaper}/detector.go | 2 +- coderd/{reaper => jobreaper}/detector_test.go | 56 +++++++++---------- provisioner/terraform/serve.go | 4 +- 5 files changed, 35 insertions(+), 35 deletions(-) rename coderd/{reaper => jobreaper}/detector.go (99%) rename coderd/{reaper => jobreaper}/detector_test.go (94%) diff --git a/cli/server.go b/cli/server.go index e924439ec26b2..994886697ff64 100644 --- a/cli/server.go +++ b/cli/server.go @@ -87,12 +87,12 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/oauthpki" "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/prometheusmetrics/insights" "github.com/coder/coder/v2/coderd/promoauth" - "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" "github.com/coder/coder/v2/coderd/tracing" @@ -1129,7 +1129,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := reaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) + hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) hangDetector.Start() defer hangDetector.Close() diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index ff418f3abaff2..f8c9cedbed99e 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -68,11 +68,11 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" - "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" @@ -367,7 +367,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := reaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) + hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) hangDetector.Start() t.Cleanup(hangDetector.Close) diff --git a/coderd/reaper/detector.go b/coderd/jobreaper/detector.go similarity index 99% rename from coderd/reaper/detector.go rename to coderd/jobreaper/detector.go index 92d229205adc8..14d8068eadae6 100644 --- a/coderd/reaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -1,4 +1,4 @@ -package reaper +package jobreaper import ( "context" diff --git a/coderd/reaper/detector_test.go b/coderd/jobreaper/detector_test.go similarity index 94% rename from coderd/reaper/detector_test.go rename to coderd/jobreaper/detector_test.go index 1fec8a1a726e7..dba1c013b0766 100644 --- a/coderd/reaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -1,4 +1,4 @@ -package reaper_test +package jobreaper_test import ( "context" @@ -20,9 +20,9 @@ import ( "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" - "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/provisionersdk" "github.com/coder/coder/v2/testutil" ) @@ -49,10 +49,10 @@ func jobLogMessages(jobType jobType, threshold float64) []string { // reapParamsFromJob determines the type and threshold for a job being reaped func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { jobType := hungJobType - threshold := reaper.HungJobDuration.Minutes() + threshold := jobreaper.HungJobDuration.Minutes() if !job.StartedAt.Valid { jobType = notStartedJobType - threshold = reaper.NotStartedTimeElapsed.Minutes() + threshold = jobreaper.NotStartedTimeElapsed.Minutes() } return jobType, threshold } @@ -69,10 +69,10 @@ func TestDetectorNoJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- time.Now() @@ -92,7 +92,7 @@ func TestDetectorNoHungJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) // Insert some jobs that are running and haven't been updated in a while, @@ -119,7 +119,7 @@ func TestDetectorNoHungJobs(t *testing.T) { }) } - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -139,7 +139,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -225,7 +225,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -261,7 +261,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -348,7 +348,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -384,7 +384,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -441,7 +441,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -477,7 +477,7 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -533,7 +533,7 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -571,7 +571,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -633,7 +633,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -675,7 +675,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -736,7 +736,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -782,7 +782,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -822,7 +822,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -884,7 +884,7 @@ func TestDetectorPushesLogs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -937,7 +937,7 @@ func TestDetectorPushesLogs(t *testing.T) { require.Len(t, logs, 10) } - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() // Create pubsub subscription to listen for new log events. @@ -1004,7 +1004,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) org = dbgen.Organization(t, db, database.Organization{}) user = dbgen.User(t, db, database.User{}) file = dbgen.File(t, db, database.File{}) @@ -1012,7 +1012,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { // Create MaxJobsPerRun + 1 hung jobs. now := time.Now() - for i := 0; i < reaper.MaxJobsPerRun+1; i++ { + for i := 0; i < jobreaper.MaxJobsPerRun+1; i++ { pj := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ CreatedAt: now.Add(-time.Hour), UpdatedAt: now.Add(-time.Hour), @@ -1035,14 +1035,14 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { }) } - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now // Make sure that only MaxJobsPerRun jobs are terminated. stats := <-statsCh require.NoError(t, stats.Error) - require.Len(t, stats.TerminatedJobIDs, reaper.MaxJobsPerRun) + require.Len(t, stats.TerminatedJobIDs, jobreaper.MaxJobsPerRun) // Run the detector again and make sure that only the remaining job is // terminated. diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 01d63b3b90086..67eeaccaed05d 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/reaper" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/provisionersdk" ) @@ -131,7 +131,7 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.Tracer = trace.NewNoopTracerProvider().Tracer("noop") } if options.ExitTimeout == 0 { - options.ExitTimeout = reaper.HungJobExitTimeout + options.ExitTimeout = jobreaper.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ execMut: &sync.Mutex{}, From 4385933bd0f1c5d0741f3fc8f4909d1a1d3e336e Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Wed, 14 May 2025 11:47:24 +0000 Subject: [PATCH 11/24] WIP --- cli/server.go | 10 +- coderd/coderdtest/coderdtest.go | 10 +- coderd/database/dbauthz/dbauthz.go | 55 +++++----- coderd/database/dbauthz/dbauthz_test.go | 29 ++---- coderd/database/dbmem/dbmem.go | 38 +++---- coderd/database/dbmetrics/querymetrics.go | 14 +-- coderd/database/dbmock/dbmock.go | 30 +++--- coderd/database/querier.go | 2 +- coderd/database/queries.sql.go | 6 +- coderd/database/queries/provisionerjobs.sql | 2 +- coderd/httpmw/loggermw/logger.go | 2 +- coderd/jobreaper/detector.go | 106 +++++++++++--------- coderd/jobreaper/detector_test.go | 18 ++-- coderd/rbac/authz.go | 2 +- coderd/rbac/object_gen.go | 1 + coderd/rbac/policy/policy.go | 3 +- codersdk/deployment.go | 8 +- codersdk/rbacresources_gen.go | 2 +- 18 files changed, 177 insertions(+), 161 deletions(-) diff --git a/cli/server.go b/cli/server.go index 994886697ff64..c04514f7c76e5 100644 --- a/cli/server.go +++ b/cli/server.go @@ -1127,11 +1127,11 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. ctx, options.Database, options.Pubsub, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, logger, autobuildTicker.C, options.NotificationsEnqueuer) autobuildExecutor.Run() - hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) - defer hangDetectorTicker.Stop() - hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) - hangDetector.Start() - defer hangDetector.Close() + jobReaperTicker := time.NewTicker(vals.JobReaperDetectorInterval.Value()) + defer jobReaperTicker.Stop() + jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, logger, jobReaperTicker.C) + jobReaper.Start() + defer jobReaper.Close() waitForProvisionerJobs := false // Currently there is no way to ask the server to shut diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index f8c9cedbed99e..1c36bfba97f28 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -365,11 +365,11 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() - hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) - defer hangDetectorTicker.Stop() - hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) - hangDetector.Start() - t.Cleanup(hangDetector.Close) + jobReaperTicker := time.NewTicker(options.DeploymentValues.JobReaperDetectorInterval.Value()) + defer jobReaperTicker.Stop() + jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), jobReaperTicker.C) + jobReaper.Start() + t.Cleanup(jobReaper.Close) if options.TelemetryReporter == nil { options.TelemetryReporter = telemetry.NewNoop() diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 5db272f62a856..90ee7560776d0 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -220,18 +220,19 @@ var ( }.WithCachedASTValue() // See reaper package. - subjectHangDetector = rbac.Subject{ - Type: rbac.SubjectTypeHangDetector, - FriendlyName: "Hang Detector", + subjectJobReaper = rbac.Subject{ + Type: rbac.SubjectTypeJobReaper, + FriendlyName: "Job Reaper", ID: uuid.Nil.String(), Roles: rbac.Roles([]rbac.Role{ { - Identifier: rbac.RoleIdentifier{Name: "hangdetector"}, - DisplayName: "Hang Detector Daemon", + Identifier: rbac.RoleIdentifier{Name: "jobreaper"}, + DisplayName: "Job Reaper Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - rbac.ResourceSystem.Type: {policy.WildcardSymbol}, - rbac.ResourceTemplate.Type: {policy.ActionRead}, - rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceSystem.Type: {policy.WildcardSymbol}, + rbac.ResourceTemplate.Type: {policy.ActionRead}, + rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, }), Org: map[string][]rbac.Permission{}, User: []rbac.Permission{}, @@ -407,10 +408,10 @@ func AsAutostart(ctx context.Context) context.Context { return As(ctx, subjectAutostart) } -// AsHangDetector returns a context with an actor that has permissions required +// AsJobReaper returns a context with an actor that has permissions required // for reaper.Detector to function. -func AsHangDetector(ctx context.Context) context.Context { - return As(ctx, subjectHangDetector) +func AsJobReaper(ctx context.Context) context.Context { + return As(ctx, subjectJobReaper) } // AsKeyRotator returns a context with an actor that has permissions required for rotating crypto keys. @@ -1074,6 +1075,13 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole) return nil } +func (q *querier) GetPendingProvisionerJobs(ctx context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } + return q.db.GetPendingProvisionerJobs(ctx, lastUpdatedSince) +} + func (q *querier) AcquireLock(ctx context.Context, id int64) error { return q.db.AcquireLock(ctx, id) } @@ -1912,11 +1920,10 @@ func (q *querier) GetHealthSettings(ctx context.Context) (string, error) { return q.db.GetHealthSettings(ctx) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetHungProvisionerJobs(ctx, hungSince) } @@ -1992,10 +1999,6 @@ func (q *querier) GetLogoURL(ctx context.Context) (string, error) { return q.db.GetLogoURL(ctx) } -func (q *querier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { - return q.db.GetNotStartedProvisionerJobs(ctx, notStartedSince) -} - func (q *querier) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceNotificationMessage); err != nil { return nil, err @@ -4180,6 +4183,10 @@ func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.Upd } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } + job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { return err @@ -4246,15 +4253,17 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da return q.db.UpdateProvisionerJobWithCancelByID(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 01181dd6adc38..0bc116d63987d 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -3891,9 +3891,8 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args().Asserts(rbac.ResourceSystem, policy.ActionDelete) })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { - // TODO: add provisioner job resource type _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ) + check.Args(time.Now()).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3976,11 +3975,10 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - // TODO: add a ProvisionerJob resource type a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ). + Asserts(rbac.ResourceSystem, policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4015,7 +4013,6 @@ func (s *MethodTestSuite) TestSystemFunctions() { }).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns() })) s.Run("AcquireProvisionerJob", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{ StartedAt: sql.NullTime{Valid: false}, UpdatedAt: time.Now(), @@ -4025,54 +4022,48 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) - // TODO: we need to create a ProvisionerJob resource check.Args(database.InsertProvisionerJobParams{ ID: uuid.New(), Provisioner: database.ProvisionerTypeEcho, StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionCreate) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionCreate) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionCreate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4211,7 +4202,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { check.Args(time.Time{}).Asserts() })) - s.Run("GetNotStartedProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { + s.Run("GetPendingProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { check.Args(time.Time{}).Asserts() })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 43c804d1fdb46..ffc862d221c7d 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1386,6 +1386,10 @@ func isDeprecated(template database.Template) bool { return template.Deprecated != "" } +func (q *FakeQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + panic("not implemented") +} + func (*FakeQuerier) AcquireLock(_ context.Context, _ int64) error { return xerrors.New("AcquireLock must only be called within a transaction") } @@ -3897,23 +3901,6 @@ func (q *FakeQuerier) GetLogoURL(_ context.Context) (string, error) { return q.logoURL, nil } -func (q *FakeQuerier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - notStartedJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(notStartedSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - notStartedJobs = append(notStartedJobs, provisionerJob) - } - } - return notStartedJobs, nil -} - func (q *FakeQuerier) GetNotificationMessagesByStatus(_ context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { err := validateDatabaseType(arg) if err != nil { @@ -4291,6 +4278,23 @@ func (q *FakeQuerier) GetParameterSchemasByJobID(_ context.Context, jobID uuid.U return parameters, nil } +func (q *FakeQuerier) GetPendingProvisionerJobs(_ context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + pendingJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(lastUpdatedSince) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + pendingJobs = append(pendingJobs, provisionerJob) + } + } + return pendingJobs, nil +} + func (*FakeQuerier) GetPrebuildMetrics(_ context.Context) ([]database.GetPrebuildMetricsRow, error) { return nil, ErrUnimplemented } diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 2d3aea66a6ad8..7e31679d88a50 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -949,13 +949,6 @@ func (m queryMetricsStore) GetLogoURL(ctx context.Context) (string, error) { return url, err } -func (m queryMetricsStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - r0, r1 := m.s.GetNotStartedProvisionerJobs(ctx, updatedAt) - m.queryLatencies.WithLabelValues("GetNotStartedProvisionerJobs").Observe(time.Since(start).Seconds()) - return r0, r1 -} - func (m queryMetricsStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { start := time.Now() r0, r1 := m.s.GetNotificationMessagesByStatus(ctx, arg) @@ -1117,6 +1110,13 @@ func (m queryMetricsStore) GetParameterSchemasByJobID(ctx context.Context, jobID return schemas, err } +func (m queryMetricsStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetPendingProvisionerJobs(ctx, updatedAt) + m.queryLatencies.WithLabelValues("GetPendingProvisionerJobs").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { start := time.Now() r0, r1 := m.s.GetPrebuildMetrics(ctx) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 0e1abb4d9dca6..38c0f7b35f3e9 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1923,21 +1923,6 @@ func (mr *MockStoreMockRecorder) GetLogoURL(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetLogoURL", reflect.TypeOf((*MockStore)(nil).GetLogoURL), ctx) } -// GetNotStartedProvisionerJobs mocks base method. -func (m *MockStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetNotStartedProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetNotStartedProvisionerJobs indicates an expected call of GetNotStartedProvisionerJobs. -func (mr *MockStoreMockRecorder) GetNotStartedProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNotStartedProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetNotStartedProvisionerJobs), ctx, updatedAt) -} - // GetNotificationMessagesByStatus mocks base method. func (m *MockStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { m.ctrl.T.Helper() @@ -2283,6 +2268,21 @@ func (mr *MockStoreMockRecorder) GetParameterSchemasByJobID(ctx, jobID any) *gom return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetParameterSchemasByJobID", reflect.TypeOf((*MockStore)(nil).GetParameterSchemasByJobID), ctx, jobID) } +// GetPendingProvisionerJobs mocks base method. +func (m *MockStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetPendingProvisionerJobs", ctx, updatedAt) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetPendingProvisionerJobs indicates an expected call of GetPendingProvisionerJobs. +func (mr *MockStoreMockRecorder) GetPendingProvisionerJobs(ctx, updatedAt any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPendingProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetPendingProvisionerJobs), ctx, updatedAt) +} + // GetPrebuildMetrics mocks base method. func (m *MockStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 5167a7099dae3..52a7ff22daa98 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -213,7 +213,6 @@ type sqlcQuerier interface { GetLicenseByID(ctx context.Context, id int32) (License, error) GetLicenses(ctx context.Context) ([]License, error) GetLogoURL(ctx context.Context) (string, error) - GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetNotificationMessagesByStatus(ctx context.Context, arg GetNotificationMessagesByStatusParams) ([]NotificationMessage, error) // Fetch the notification report generator log indicating recent activity. GetNotificationReportGeneratorLogByTemplate(ctx context.Context, templateID uuid.UUID) (NotificationReportGeneratorLog, error) @@ -238,6 +237,7 @@ type sqlcQuerier interface { GetOrganizations(ctx context.Context, arg GetOrganizationsParams) ([]Organization, error) GetOrganizationsByUserID(ctx context.Context, arg GetOrganizationsByUserIDParams) ([]Organization, error) GetParameterSchemasByJobID(ctx context.Context, jobID uuid.UUID) ([]ParameterSchema, error) + GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetPrebuildMetrics(ctx context.Context) ([]GetPrebuildMetricsRow, error) GetPresetByID(ctx context.Context, presetID uuid.UUID) (GetPresetByIDRow, error) GetPresetByWorkspaceBuildID(ctx context.Context, workspaceBuildID uuid.UUID) (TemplateVersionPreset, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index b8ac4452beb19..dce0903e84b83 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7430,7 +7430,7 @@ func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time. return items, nil } -const getNotStartedProvisionerJobs = `-- name: GetNotStartedProvisionerJobs :many +const getPendingProvisionerJobs = `-- name: GetPendingProvisionerJobs :many SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status FROM @@ -7441,8 +7441,8 @@ WHERE AND completed_at IS NULL ` -func (q *sqlQuerier) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getNotStartedProvisionerJobs, updatedAt) +func (q *sqlQuerier) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getPendingProvisionerJobs, updatedAt) if err != nil { return nil, err } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index d464bdfa4d587..02654eb4ddc4e 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -278,7 +278,7 @@ WHERE AND started_at IS NOT NULL AND completed_at IS NULL; --- name: GetNotStartedProvisionerJobs :many +-- name: GetPendingProvisionerJobs :many SELECT * FROM diff --git a/coderd/httpmw/loggermw/logger.go b/coderd/httpmw/loggermw/logger.go index 9eeb07a5f10e5..30e5e2d811ad8 100644 --- a/coderd/httpmw/loggermw/logger.go +++ b/coderd/httpmw/loggermw/logger.go @@ -132,7 +132,7 @@ var actorLogOrder = []rbac.SubjectType{ rbac.SubjectTypeAutostart, rbac.SubjectTypeCryptoKeyReader, rbac.SubjectTypeCryptoKeyRotator, - rbac.SubjectTypeHangDetector, + rbac.SubjectTypeJobReaper, rbac.SubjectTypeNotifier, rbac.SubjectTypePrebuildsOrchestrator, rbac.SubjectTypeProvisionerd, diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index 14d8068eadae6..3ff1f6f6ddf86 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -25,9 +25,9 @@ const ( // before it is considered hung. HungJobDuration = 5 * time.Minute - // NotStartedTimeElapsed is the duration of time since the last update to a job + // PendingJobTimeElapsed is the duration of time since the last update to a job // before it is considered hung. - NotStartedTimeElapsed = 30 * time.Minute + PendingJobTimeElapsed = 30 * time.Minute // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to @@ -42,24 +42,23 @@ const ( MaxJobsPerRun = 10 ) -type jobType string - -const ( - hungJobType jobType = "hung" - notStartedJobType jobType = "not started" -) - // jobLogMessages are written to provisioner job logs when a job is reaped -func jobLogMessages(jobType jobType, threshold float64) []string { +func jobLogMessages(jobType database.ProvisionerJobStatus, threshold time.Duration) []string { return []string{ "", "====================", - fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold.Minutes()), "====================", "", } } +type jobToReap struct { + ID uuid.UUID + Threshold time.Duration + Status database.ProvisionerJobStatus +} + // acquireLockError is returned when the detector fails to acquire a lock and // cancels the current run. type acquireLockError struct{} @@ -108,7 +107,7 @@ type Stats struct { // New returns a new hang detector. func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector { //nolint:gocritic // Hang detector has a limited set of permissions. - ctx, cancel := context.WithCancel(dbauthz.AsHangDetector(ctx)) + ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx)) d := &Detector{ ctx: ctx, cancel: cancel, @@ -184,42 +183,55 @@ func (d *Detector) run(t time.Time) Stats { Error: nil, } + jobsToReap := make([]*jobToReap, 0, MaxJobsPerRun) + // Find all provisioner jobs that are currently running but have not // received an update in the last 5 minutes. - jobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) + hungJobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) if err != nil { - stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", hungJobType, err) + stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) return stats } + for _, job := range hungJobs { + jobsToReap = append(jobsToReap, &jobToReap{ + ID: job.ID, + Threshold: HungJobDuration, + }) + } + // Find all provisioner jobs that have not been started yet and have not // received an update in the last 30 minutes. - jobsNotStarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-NotStartedTimeElapsed)) + jobsPending, err := d.db.GetPendingProvisionerJobs(ctx, t.Add(-PendingJobTimeElapsed)) if err != nil { - stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", notStartedJobType, err) + stats.Error = xerrors.Errorf("get pending provisioner jobs: %w", err) return stats } - jobs = append(jobs, jobsNotStarted...) + for _, job := range jobsPending { + jobsToReap = append(jobsToReap, &jobToReap{ + ID: job.ID, + Threshold: PendingJobTimeElapsed, + }) + } - // Limit the number of jobs we'll unhang in a single run to avoid + // Limit the number of jobs we'll reap in a single run to avoid // timing out. - if len(jobs) > MaxJobsPerRun { - // Pick a random subset of the jobs to unhang. - rand.Shuffle(len(jobs), func(i, j int) { - jobs[i], jobs[j] = jobs[j], jobs[i] + if len(jobsToReap) > MaxJobsPerRun { + // Pick a random subset of the jobs to reap. + rand.Shuffle(len(jobsToReap), func(i, j int) { + jobsToReap[i], jobsToReap[j] = jobsToReap[j], jobsToReap[i] }) - jobs = jobs[:MaxJobsPerRun] + jobsToReap = jobsToReap[:MaxJobsPerRun] } - // Send a message into the build log for each hung or not started job saying that it + // Send a message into the build log for each hung or pending job saying that it // has been detected and will be terminated, then mark the job as failed. - for _, job := range jobs { + for _, job := range jobsToReap { log := d.log.With(slog.F("job_id", job.ID)) - err := reapJob(ctx, log, d.db, d.pubsub, job.ID) + err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { - jobType, _ := reapParamsFromJob(job) if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err)) + log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Status), slog.Error(err)) } continue } @@ -230,11 +242,11 @@ func (d *Detector) run(t time.Time) Stats { return stats } -func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { +func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobToReap *jobToReap) error { var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobID))) + locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobToReap.ID))) if err != nil { return xerrors.Errorf("acquire lock: %w", err) } @@ -244,26 +256,26 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } // Refetch the job while we hold the lock. - job, err := db.GetProvisionerJobByID(ctx, jobID) + job, err := db.GetProvisionerJobByID(ctx, jobToReap.ID) if err != nil { return xerrors.Errorf("get provisioner job: %w", err) } - jobType, threshold := reapParamsFromJob(job) + threshold := reapParamsFromJob(job) if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-time.Duration(threshold) * time.Minute)) { + if job.UpdatedAt.After(time.Now().Add(-threshold)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } } log.Warn( - ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobType), + ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", job.JobStatus), "threshold", threshold, ) @@ -274,7 +286,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for %s job: %w", jobType, err) + return xerrors.Errorf("get logs for %s job: %w", job.JobStatus, err) } logStage := "" if len(logs) != 0 { @@ -294,7 +306,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Output: nil, } now := dbtime.Now() - for i, msg := range jobLogMessages(jobType, threshold) { + for i, msg := range jobLogMessages(job.JobStatus, threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -305,16 +317,16 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams) if err != nil { - return xerrors.Errorf("insert logs for %s job: %w", jobType, err) + return xerrors.Errorf("insert logs for %s job: %w", job.JobStatus, err) } lowestLogID = newLogs[0].ID // Mark the job as failed. now = dbtime.Now() - // If the job was never started, set the StartedAt time to the current + // If the job was never started (pending), set the StartedAt time to the current // time so that the build duration is correct. - if !job.StartedAt.Valid { + if job.JobStatus == database.ProvisionerJobStatusPending { job.StartedAt = sql.NullTime{ Time: now, Valid: true, @@ -328,7 +340,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Valid: true, }, Error: sql.NullString{ - String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobType, threshold), + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", job.JobStatus, threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ @@ -388,7 +400,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub if err != nil { return xerrors.Errorf("marshal log notification: %w", err) } - err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobID), data) + err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobToReap.ID), data) if err != nil { return xerrors.Errorf("publish log notification: %w", err) } @@ -396,12 +408,10 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub return nil } -func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { - jobType := hungJobType - threshold := HungJobDuration.Minutes() - if !job.StartedAt.Valid { - jobType = notStartedJobType - threshold = NotStartedTimeElapsed.Minutes() +func reapParamsFromJob(job database.ProvisionerJob) time.Duration { + threshold := HungJobDuration + if job.JobStatus == database.ProvisionerJobStatusPending { + threshold = PendingJobTimeElapsed } - return jobType, threshold + return threshold } diff --git a/coderd/jobreaper/detector_test.go b/coderd/jobreaper/detector_test.go index dba1c013b0766..ada9a9853f032 100644 --- a/coderd/jobreaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -31,8 +31,8 @@ import ( type jobType string const ( - hungJobType jobType = "hung" - notStartedJobType jobType = "not started" + hungJobType jobType = "hung" + pendingJobType jobType = "pending" ) // jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped @@ -51,8 +51,8 @@ func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { jobType := hungJobType threshold := jobreaper.HungJobDuration.Minutes() if !job.StartedAt.Valid { - jobType = notStartedJobType - threshold = jobreaper.NotStartedTimeElapsed.Minutes() + jobType = pendingJobType + threshold = jobreaper.PendingJobTimeElapsed.Minutes() } return jobType, threshold } @@ -469,7 +469,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T detector.Wait() } -func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { +func TestDetectorPendingWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { t.Parallel() var ( @@ -551,7 +551,7 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes require.True(t, job.StartedAt.Valid) require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) - require.Contains(t, job.Error.String, "Build has been detected as not started") + require.Contains(t, job.Error.String, "Build has been detected as pending") require.False(t, job.ErrorCode.Valid) // Check that the provisioner state was NOT updated. @@ -667,7 +667,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { detector.Wait() } -func TestDetectorNotStartedOtherJobTypes(t *testing.T) { +func TestDetectorPendingOtherJobTypes(t *testing.T) { t.Parallel() var ( @@ -755,7 +755,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.True(t, job.StartedAt.Valid) require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) - require.Contains(t, job.Error.String, "Build has been detected as not started") + require.Contains(t, job.Error.String, "Build has been detected as pending") require.False(t, job.ErrorCode.Valid) // Check that the template dry-run job was updated. @@ -767,7 +767,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.True(t, job.StartedAt.Valid) require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) - require.Contains(t, job.Error.String, "Build has been detected as not started") + require.Contains(t, job.Error.String, "Build has been detected as pending") require.False(t, job.ErrorCode.Valid) detector.Close() diff --git a/coderd/rbac/authz.go b/coderd/rbac/authz.go index d2c6d5d0675be..c63042a2a1363 100644 --- a/coderd/rbac/authz.go +++ b/coderd/rbac/authz.go @@ -65,7 +65,7 @@ const ( SubjectTypeUser SubjectType = "user" SubjectTypeProvisionerd SubjectType = "provisionerd" SubjectTypeAutostart SubjectType = "autostart" - SubjectTypeHangDetector SubjectType = "hang_detector" + SubjectTypeJobReaper SubjectType = "job_reaper" SubjectTypeResourceMonitor SubjectType = "resource_monitor" SubjectTypeCryptoKeyRotator SubjectType = "crypto_key_rotator" SubjectTypeCryptoKeyReader SubjectType = "crypto_key_reader" diff --git a/coderd/rbac/object_gen.go b/coderd/rbac/object_gen.go index 40b7dc87a56f8..0336616b5e550 100644 --- a/coderd/rbac/object_gen.go +++ b/coderd/rbac/object_gen.go @@ -235,6 +235,7 @@ var ( // ResourceProvisionerJobs // Valid Actions // - "ActionRead" :: read provisioner jobs + // - "ActionUpdate" :: update provisioner jobs ResourceProvisionerJobs = Object{ Type: "provisioner_jobs", } diff --git a/coderd/rbac/policy/policy.go b/coderd/rbac/policy/policy.go index 35da0892abfdb..070692c5e50c6 100644 --- a/coderd/rbac/policy/policy.go +++ b/coderd/rbac/policy/policy.go @@ -182,7 +182,8 @@ var RBACPermissions = map[string]PermissionDefinition{ }, "provisioner_jobs": { Actions: map[Action]ActionDefinition{ - ActionRead: actDef("read provisioner jobs"), + ActionRead: actDef("read provisioner jobs"), + ActionUpdate: actDef("update provisioner jobs"), }, }, "organization": { diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 0741bf9e3844a..39b67feb2c73a 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -345,7 +345,7 @@ type DeploymentValues struct { // HTTPAddress is a string because it may be set to zero to disable. HTTPAddress serpent.String `json:"http_address,omitempty" typescript:",notnull"` AutobuildPollInterval serpent.Duration `json:"autobuild_poll_interval,omitempty"` - JobHangDetectorInterval serpent.Duration `json:"job_hang_detector_interval,omitempty"` + JobReaperDetectorInterval serpent.Duration `json:"job_hang_detector_interval,omitempty"` DERP DERP `json:"derp,omitempty" typescript:",notnull"` Prometheus PrometheusConfig `json:"prometheus,omitempty" typescript:",notnull"` Pprof PprofConfig `json:"pprof,omitempty" typescript:",notnull"` @@ -1287,13 +1287,13 @@ func (c *DeploymentValues) Options() serpent.OptionSet { Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), }, { - Name: "Job Hang Detector Interval", - Description: "Interval to poll for hung jobs and automatically terminate them.", + Name: "Job Reaper Detect Interval", + Description: "Interval to poll for hung and pending jobs and automatically terminate them.", Flag: "job-hang-detector-interval", Env: "CODER_JOB_HANG_DETECTOR_INTERVAL", Hidden: true, Default: time.Minute.String(), - Value: &c.JobHangDetectorInterval, + Value: &c.JobReaperDetectorInterval, YAML: "jobHangDetectorInterval", Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), }, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 54f65767928d6..737b7beeba29d 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead}, + ResourceProvisionerJobs: {ActionRead, ActionUpdate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, From 96fee5183e7848f01970c30709d56d13bf2a0e11 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Wed, 14 May 2025 13:12:31 +0000 Subject: [PATCH 12/24] WIP --- coderd/database/dbauthz/dbauthz.go | 61 ++++++++++++++---------------- coderd/rbac/roles.go | 2 +- coderd/rbac/roles_test.go | 2 +- codersdk/rbacresources_gen.go | 2 +- 4 files changed, 31 insertions(+), 36 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 90ee7560776d0..d01d86bd35f01 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -170,10 +170,10 @@ var ( Identifier: rbac.RoleIdentifier{Name: "provisionerd"}, DisplayName: "Provisioner Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - // TODO: Add ProvisionerJob resource type. - rbac.ResourceFile.Type: {policy.ActionRead}, - rbac.ResourceSystem.Type: {policy.WildcardSymbol}, - rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceFile.Type: {policy.ActionRead}, + rbac.ResourceSystem.Type: {policy.WildcardSymbol}, + rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, // Unsure why provisionerd needs update and read personal rbac.ResourceUser.Type: {policy.ActionRead, policy.ActionReadPersonal, policy.ActionUpdatePersonal}, rbac.ResourceWorkspaceDormant.Type: {policy.ActionDelete, policy.ActionRead, policy.ActionUpdate, policy.ActionWorkspaceStop}, @@ -1093,11 +1093,10 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. return q.db.AcquireNotificationMessages(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -2322,16 +2321,17 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui return q.db.GetProvisionerJobTimingsByJobID(ctx, jobID) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsByIDs(ctx, ids) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } @@ -2339,11 +2339,10 @@ func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndP return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } @@ -3531,27 +3530,24 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser return q.db.InsertPresetParameters(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.InsertProvisionerJob(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobLogs(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4174,11 +4170,10 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat return q.db.UpdateProvisionerDaemonLastSeenAt(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobByID(ctx, arg) } diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index 56124faee44e2..0b94a74201b16 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -503,7 +503,7 @@ func ReloadBuiltinRoles(opts *RoleOptions) { // the ability to create templates and provisioners has // a lot of overlap. ResourceProvisionerDaemon.Type: {policy.ActionCreate, policy.ActionRead, policy.ActionUpdate, policy.ActionDelete}, - ResourceProvisionerJobs.Type: {policy.ActionRead}, + ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, }), }, User: []Permission{}, diff --git a/coderd/rbac/roles_test.go b/coderd/rbac/roles_test.go index e90c89914fdec..5f1381caedf27 100644 --- a/coderd/rbac/roles_test.go +++ b/coderd/rbac/roles_test.go @@ -580,7 +580,7 @@ func TestRolePermissions(t *testing.T) { }, { Name: "ProvisionerJobs", - Actions: []policy.Action{policy.ActionRead}, + Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate}, Resource: rbac.ResourceProvisionerJobs.InOrg(orgID), AuthorizeMap: map[bool][]hasAuthSubjects{ true: {owner, orgTemplateAdmin, orgAdmin}, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 737b7beeba29d..8a89ac6d5e507 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead, ActionUpdate}, + ResourceProvisionerJobs: {ActionRead, ActionUpdate, ActionCreate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, From d8db119c6bcf3337d7c68b1740a176c8aa951558 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Thu, 15 May 2025 16:46:41 +0000 Subject: [PATCH 13/24] WIP --- cli/testdata/server-config.yaml.golden | 2 +- coderd/database/dbauthz/dbauthz.go | 30 ++-- coderd/database/dbmem/dbmem.go | 62 +++---- coderd/database/dbmetrics/querymetrics.go | 21 +-- coderd/database/dbmock/dbmock.go | 45 ++--- coderd/database/querier.go | 3 +- coderd/database/queries.sql.go | 181 ++++++++------------ coderd/database/queries/provisionerjobs.sql | 31 ++-- coderd/jobreaper/detector.go | 97 +++++------ coderd/jobreaper/detector_test.go | 2 +- coderd/rbac/roles.go | 1 + site/src/api/rbacresourcesGenerated.ts | 1 + 12 files changed, 194 insertions(+), 282 deletions(-) diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index fc76a6c2ec8a0..9995a7f389130 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -183,7 +183,7 @@ networking: # Interval to poll for scheduled workspace builds. # (default: 1m0s, type: duration) autobuildPollInterval: 1m0s -# Interval to poll for hung jobs and automatically terminate them. +# Interval to poll for hung and pending jobs and automatically terminate them. # (default: 1m0s, type: duration) jobHangDetectorInterval: 1m0s introspection: diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index d01d86bd35f01..31e21a0f7788e 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -170,7 +170,7 @@ var ( Identifier: rbac.RoleIdentifier{Name: "provisionerd"}, DisplayName: "Provisioner Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, rbac.ResourceFile.Type: {policy.ActionRead}, rbac.ResourceSystem.Type: {policy.WildcardSymbol}, rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, @@ -1075,13 +1075,6 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole) return nil } -func (q *querier) GetPendingProvisionerJobs(ctx context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } - return q.db.GetPendingProvisionerJobs(ctx, lastUpdatedSince) -} - func (q *querier) AcquireLock(ctx context.Context, id int64) error { return q.db.AcquireLock(ctx, id) } @@ -1919,13 +1912,6 @@ func (q *querier) GetHealthSettings(ctx context.Context) (string, error) { return q.db.GetHealthSettings(ctx) } -func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } - return q.db.GetHungProvisionerJobs(ctx, hungSince) -} - func (q *querier) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { return fetchWithAction(q.log, q.auth, policy.ActionRead, q.db.GetInboxNotificationByID)(ctx, id) } @@ -2336,6 +2322,9 @@ func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } @@ -2346,6 +2335,13 @@ func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } +func (q *querier) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } + return q.db.GetProvisionerJobsToBeReaped(ctx, arg) +} + func (q *querier) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { return fetch(q.log, q.auth, q.db.GetProvisionerKeyByHashedSecret)(ctx, hashedSecret) } @@ -3538,14 +3534,14 @@ func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertP } func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { return nil, err } return q.db.InsertProvisionerJobLogs(ctx, arg) } func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { return nil, err } return q.db.InsertProvisionerJobTimings(ctx, arg) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index ffc862d221c7d..33eeaf5f5f936 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1386,10 +1386,6 @@ func isDeprecated(template database.Template) bool { return template.Deprecated != "" } -func (q *FakeQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - panic("not implemented") -} - func (*FakeQuerier) AcquireLock(_ context.Context, _ int64) error { return xerrors.New("AcquireLock must only be called within a transaction") } @@ -3711,23 +3707,6 @@ func (q *FakeQuerier) GetHealthSettings(_ context.Context) (string, error) { return string(q.healthSettings), nil } -func (q *FakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - hungJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - hungJobs = append(hungJobs, provisionerJob) - } - } - return hungJobs, nil -} - func (q *FakeQuerier) GetInboxNotificationByID(_ context.Context, id uuid.UUID) (database.InboxNotification, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -4278,23 +4257,6 @@ func (q *FakeQuerier) GetParameterSchemasByJobID(_ context.Context, jobID uuid.U return parameters, nil } -func (q *FakeQuerier) GetPendingProvisionerJobs(_ context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - pendingJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(lastUpdatedSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - pendingJobs = append(pendingJobs, provisionerJob) - } - } - return pendingJobs, nil -} - func (*FakeQuerier) GetPrebuildMetrics(_ context.Context) ([]database.GetPrebuildMetricsRow, error) { return nil, ErrUnimplemented } @@ -4898,6 +4860,30 @@ func (q *FakeQuerier) GetProvisionerJobsCreatedAfter(_ context.Context, after ti return jobs, nil } +func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + maxJobs := arg.MaxJobs + + hungJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.CompletedAt.Valid { + if (provisionerJob.StartedAt.Valid && provisionerJob.UpdatedAt.Before(arg.HungSince)) || + (!provisionerJob.StartedAt.Valid && provisionerJob.UpdatedAt.Before(arg.PendingSince)) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + hungJobs = append(hungJobs, provisionerJob) + if len(hungJobs) >= int(maxJobs) { + return hungJobs, nil + } + } + } + } + return hungJobs, nil +} + func (q *FakeQuerier) GetProvisionerKeyByHashedSecret(_ context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { q.mutex.RLock() defer q.mutex.RUnlock() diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 7e31679d88a50..4be271533a97c 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -865,13 +865,6 @@ func (m queryMetricsStore) GetHealthSettings(ctx context.Context) (string, error return r0, r1 } -func (m queryMetricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - jobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince) - m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(time.Since(start).Seconds()) - return jobs, err -} - func (m queryMetricsStore) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { start := time.Now() r0, r1 := m.s.GetInboxNotificationByID(ctx, id) @@ -1110,13 +1103,6 @@ func (m queryMetricsStore) GetParameterSchemasByJobID(ctx context.Context, jobID return schemas, err } -func (m queryMetricsStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - r0, r1 := m.s.GetPendingProvisionerJobs(ctx, updatedAt) - m.queryLatencies.WithLabelValues("GetPendingProvisionerJobs").Observe(time.Since(start).Seconds()) - return r0, r1 -} - func (m queryMetricsStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { start := time.Now() r0, r1 := m.s.GetPrebuildMetrics(ctx) @@ -1236,6 +1222,13 @@ func (m queryMetricsStore) GetProvisionerJobsCreatedAfter(ctx context.Context, c return jobs, err } +func (m queryMetricsStore) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetProvisionerJobsToBeReaped(ctx, arg) + m.queryLatencies.WithLabelValues("GetProvisionerJobsToBeReaped").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { start := time.Now() r0, r1 := m.s.GetProvisionerKeyByHashedSecret(ctx, hashedSecret) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 38c0f7b35f3e9..5f0024aa22c42 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1743,21 +1743,6 @@ func (mr *MockStoreMockRecorder) GetHealthSettings(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHealthSettings", reflect.TypeOf((*MockStore)(nil).GetHealthSettings), ctx) } -// GetHungProvisionerJobs mocks base method. -func (m *MockStore) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetHungProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetHungProvisionerJobs indicates an expected call of GetHungProvisionerJobs. -func (mr *MockStoreMockRecorder) GetHungProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHungProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetHungProvisionerJobs), ctx, updatedAt) -} - // GetInboxNotificationByID mocks base method. func (m *MockStore) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { m.ctrl.T.Helper() @@ -2268,21 +2253,6 @@ func (mr *MockStoreMockRecorder) GetParameterSchemasByJobID(ctx, jobID any) *gom return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetParameterSchemasByJobID", reflect.TypeOf((*MockStore)(nil).GetParameterSchemasByJobID), ctx, jobID) } -// GetPendingProvisionerJobs mocks base method. -func (m *MockStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetPendingProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetPendingProvisionerJobs indicates an expected call of GetPendingProvisionerJobs. -func (mr *MockStoreMockRecorder) GetPendingProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPendingProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetPendingProvisionerJobs), ctx, updatedAt) -} - // GetPrebuildMetrics mocks base method. func (m *MockStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { m.ctrl.T.Helper() @@ -2538,6 +2508,21 @@ func (mr *MockStoreMockRecorder) GetProvisionerJobsCreatedAfter(ctx, createdAt a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobsCreatedAfter", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobsCreatedAfter), ctx, createdAt) } +// GetProvisionerJobsToBeReaped mocks base method. +func (m *MockStore) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProvisionerJobsToBeReaped", ctx, arg) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProvisionerJobsToBeReaped indicates an expected call of GetProvisionerJobsToBeReaped. +func (mr *MockStoreMockRecorder) GetProvisionerJobsToBeReaped(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobsToBeReaped", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobsToBeReaped), ctx, arg) +} + // GetProvisionerKeyByHashedSecret mocks base method. func (m *MockStore) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 52a7ff22daa98..af85e9d0d9c22 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -196,7 +196,6 @@ type sqlcQuerier interface { GetGroupMembersCountByGroupID(ctx context.Context, arg GetGroupMembersCountByGroupIDParams) (int64, error) GetGroups(ctx context.Context, arg GetGroupsParams) ([]GetGroupsRow, error) GetHealthSettings(ctx context.Context) (string, error) - GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (InboxNotification, error) // Fetches inbox notifications for a user filtered by templates and targets // param user_id: The user ID @@ -237,7 +236,6 @@ type sqlcQuerier interface { GetOrganizations(ctx context.Context, arg GetOrganizationsParams) ([]Organization, error) GetOrganizationsByUserID(ctx context.Context, arg GetOrganizationsByUserIDParams) ([]Organization, error) GetParameterSchemasByJobID(ctx context.Context, jobID uuid.UUID) ([]ParameterSchema, error) - GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetPrebuildMetrics(ctx context.Context) ([]GetPrebuildMetricsRow, error) GetPresetByID(ctx context.Context, presetID uuid.UUID) (GetPresetByIDRow, error) GetPresetByWorkspaceBuildID(ctx context.Context, workspaceBuildID uuid.UUID) (TemplateVersionPreset, error) @@ -271,6 +269,7 @@ type sqlcQuerier interface { GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) + GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (ProvisionerKey, error) GetProvisionerKeyByID(ctx context.Context, id uuid.UUID) (ProvisionerKey, error) GetProvisionerKeyByName(ctx context.Context, arg GetProvisionerKeyByNameParams) (ProvisionerKey, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index dce0903e84b83..10727fa75dec6 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7376,114 +7376,6 @@ func (q *sqlQuerier) AcquireProvisionerJob(ctx context.Context, arg AcquireProvi return i, err } -const getHungProvisionerJobs = `-- name: GetHungProvisionerJobs :many -SELECT - id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status -FROM - provisioner_jobs -WHERE - updated_at < $1 - AND started_at IS NOT NULL - AND completed_at IS NULL -` - -func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getHungProvisionerJobs, updatedAt) - if err != nil { - return nil, err - } - defer rows.Close() - var items []ProvisionerJob - for rows.Next() { - var i ProvisionerJob - if err := rows.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.StartedAt, - &i.CanceledAt, - &i.CompletedAt, - &i.Error, - &i.OrganizationID, - &i.InitiatorID, - &i.Provisioner, - &i.StorageMethod, - &i.Type, - &i.Input, - &i.WorkerID, - &i.FileID, - &i.Tags, - &i.ErrorCode, - &i.TraceMetadata, - &i.JobStatus, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getPendingProvisionerJobs = `-- name: GetPendingProvisionerJobs :many -SELECT - id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status -FROM - provisioner_jobs -WHERE - updated_at < $1 - AND started_at IS NULL - AND completed_at IS NULL -` - -func (q *sqlQuerier) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getPendingProvisionerJobs, updatedAt) - if err != nil { - return nil, err - } - defer rows.Close() - var items []ProvisionerJob - for rows.Next() { - var i ProvisionerJob - if err := rows.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.StartedAt, - &i.CanceledAt, - &i.CompletedAt, - &i.Error, - &i.OrganizationID, - &i.InitiatorID, - &i.Provisioner, - &i.StorageMethod, - &i.Type, - &i.Input, - &i.WorkerID, - &i.FileID, - &i.Tags, - &i.ErrorCode, - &i.TraceMetadata, - &i.JobStatus, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status @@ -7951,6 +7843,79 @@ func (q *sqlQuerier) GetProvisionerJobsCreatedAfter(ctx context.Context, created return items, nil } +const getProvisionerJobsToBeReaped = `-- name: GetProvisionerJobsToBeReaped :many +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + ( + -- If the job has not been started within $1, reap it. + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL + ) + OR + ( + -- If the job has been started but not completed within $2, reap it. + updated_at < $2 + AND started_at IS NOT NULL + AND completed_at IS NULL + ) +ORDER BY random() +LIMIT $3 +FOR UPDATE SKIP LOCKED +` + +type GetProvisionerJobsToBeReapedParams struct { + PendingSince time.Time `db:"pending_since" json:"pending_since"` + HungSince time.Time `db:"hung_since" json:"hung_since"` + MaxJobs int32 `db:"max_jobs" json:"max_jobs"` +} + +func (q *sqlQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getProvisionerJobsToBeReaped, arg.PendingSince, arg.HungSince, arg.MaxJobs) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ProvisionerJob + for rows.Next() { + var i ProvisionerJob + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const insertProvisionerJob = `-- name: InsertProvisionerJob :one INSERT INTO provisioner_jobs ( diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 02654eb4ddc4e..b455babf95be6 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -268,25 +268,28 @@ SET WHERE id = $1; --- name: GetHungProvisionerJobs :many +-- name: GetProvisionerJobsToBeReaped :many SELECT * FROM provisioner_jobs WHERE - updated_at < $1 - AND started_at IS NOT NULL - AND completed_at IS NULL; - --- name: GetPendingProvisionerJobs :many -SELECT - * -FROM - provisioner_jobs -WHERE - updated_at < $1 - AND started_at IS NULL - AND completed_at IS NULL; + ( + -- If the job has not been started within $1, reap it. + updated_at < @pending_since + AND started_at IS NULL + AND completed_at IS NULL + ) + OR + ( + -- If the job has been started but not completed within $2, reap it. + updated_at < @hung_since + AND started_at IS NOT NULL + AND completed_at IS NULL + ) +ORDER BY random() +LIMIT @max_jobs +FOR UPDATE SKIP LOCKED; -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index 3ff1f6f6ddf86..762a04c819230 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -4,8 +4,7 @@ import ( "context" "database/sql" "encoding/json" - "fmt" - "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to unhang + "fmt" //#nosec // this is only used for shuffling an array to pick random jobs to unhang "time" "golang.org/x/xerrors" @@ -25,9 +24,9 @@ const ( // before it is considered hung. HungJobDuration = 5 * time.Minute - // PendingJobTimeElapsed is the duration of time since the last update to a job + // PendingJobDuration is the duration of time since the last update to a job // before it is considered hung. - PendingJobTimeElapsed = 30 * time.Minute + PendingJobDuration = 30 * time.Minute // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to @@ -43,11 +42,11 @@ const ( ) // jobLogMessages are written to provisioner job logs when a job is reaped -func jobLogMessages(jobType database.ProvisionerJobStatus, threshold time.Duration) []string { +func jobLogMessages(reapType ReapType, threshold time.Duration) []string { return []string{ "", "====================", - fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold.Minutes()), + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", reapType, threshold.Minutes()), "====================", "", } @@ -56,9 +55,16 @@ func jobLogMessages(jobType database.ProvisionerJobStatus, threshold time.Durati type jobToReap struct { ID uuid.UUID Threshold time.Duration - Status database.ProvisionerJobStatus + Type ReapType } +type ReapType string + +const ( + Pending ReapType = "pending" + Hung ReapType = "hung" +) + // acquireLockError is returned when the detector fails to acquire a lock and // cancels the current run. type acquireLockError struct{} @@ -183,44 +189,31 @@ func (d *Detector) run(t time.Time) Stats { Error: nil, } - jobsToReap := make([]*jobToReap, 0, MaxJobsPerRun) - - // Find all provisioner jobs that are currently running but have not - // received an update in the last 5 minutes. - hungJobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) + // Find all provisioner jobs to be reaped + jobs, err := d.db.GetProvisionerJobsToBeReaped(ctx, database.GetProvisionerJobsToBeReapedParams{ + PendingSince: t.Add(-PendingJobDuration), + HungSince: t.Add(-HungJobDuration), + MaxJobs: MaxJobsPerRun, + }) if err != nil { - stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get provisioner jobs to be reaped: %w", err) return stats } - for _, job := range hungJobs { - jobsToReap = append(jobsToReap, &jobToReap{ - ID: job.ID, - Threshold: HungJobDuration, - }) - } - // Find all provisioner jobs that have not been started yet and have not - // received an update in the last 30 minutes. - jobsPending, err := d.db.GetPendingProvisionerJobs(ctx, t.Add(-PendingJobTimeElapsed)) - if err != nil { - stats.Error = xerrors.Errorf("get pending provisioner jobs: %w", err) - return stats - } - for _, job := range jobsPending { - jobsToReap = append(jobsToReap, &jobToReap{ - ID: job.ID, - Threshold: PendingJobTimeElapsed, - }) - } + jobsToReap := make([]*jobToReap, 0, len(jobs)) - // Limit the number of jobs we'll reap in a single run to avoid - // timing out. - if len(jobsToReap) > MaxJobsPerRun { - // Pick a random subset of the jobs to reap. - rand.Shuffle(len(jobsToReap), func(i, j int) { - jobsToReap[i], jobsToReap[j] = jobsToReap[j], jobsToReap[i] - }) - jobsToReap = jobsToReap[:MaxJobsPerRun] + for _, job := range jobs { + j := &jobToReap{ + ID: job.ID, + } + if job.JobStatus == database.ProvisionerJobStatusPending { + j.Threshold = PendingJobDuration + j.Type = Pending + } else { + j.Threshold = HungJobDuration + j.Type = Hung + } + jobsToReap = append(jobsToReap, j) } // Send a message into the build log for each hung or pending job saying that it @@ -231,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats { err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Status), slog.Error(err)) + log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Type), slog.Error(err)) } continue } @@ -261,22 +254,20 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub return xerrors.Errorf("get provisioner job: %w", err) } - threshold := reapParamsFromJob(job) - if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-threshold)) { + if job.UpdatedAt.After(time.Now().Add(-jobToReap.Threshold)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } } log.Warn( - ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", job.JobStatus), - "threshold", threshold, + ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobToReap.Type), + "threshold", jobToReap.Threshold, ) // First, get the latest logs from the build so we can make sure @@ -286,7 +277,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for %s job: %w", job.JobStatus, err) + return xerrors.Errorf("get logs for %s job: %w", jobToReap.Type, err) } logStage := "" if len(logs) != 0 { @@ -306,7 +297,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Output: nil, } now := dbtime.Now() - for i, msg := range jobLogMessages(job.JobStatus, threshold) { + for i, msg := range jobLogMessages(jobToReap.Type, jobToReap.Threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -340,7 +331,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Valid: true, }, Error: sql.NullString{ - String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", job.JobStatus, threshold.Minutes()), + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobToReap.Type, jobToReap.Threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ @@ -407,11 +398,3 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub return nil } - -func reapParamsFromJob(job database.ProvisionerJob) time.Duration { - threshold := HungJobDuration - if job.JobStatus == database.ProvisionerJobStatusPending { - threshold = PendingJobTimeElapsed - } - return threshold -} diff --git a/coderd/jobreaper/detector_test.go b/coderd/jobreaper/detector_test.go index ada9a9853f032..5d2a57b0fb47c 100644 --- a/coderd/jobreaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -52,7 +52,7 @@ func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { threshold := jobreaper.HungJobDuration.Minutes() if !job.StartedAt.Valid { jobType = pendingJobType - threshold = jobreaper.PendingJobTimeElapsed.Minutes() + threshold = jobreaper.PendingJobDuration.Minutes() } return jobType, threshold } diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index 0b94a74201b16..b9bcff5d42bdf 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -273,6 +273,7 @@ func ReloadBuiltinRoles(opts *RoleOptions) { Permissions(map[string][]policy.Action{ ResourceWorkspace.Type: ownerWorkspaceActions, ResourceWorkspaceDormant.Type: {policy.ActionRead, policy.ActionDelete, policy.ActionCreate, policy.ActionUpdate, policy.ActionWorkspaceStop}, + ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, })...), Org: map[string][]Permission{}, User: []Permission{}, diff --git a/site/src/api/rbacresourcesGenerated.ts b/site/src/api/rbacresourcesGenerated.ts index 079dcb4a87a61..7016b78d876e0 100644 --- a/site/src/api/rbacresourcesGenerated.ts +++ b/site/src/api/rbacresourcesGenerated.ts @@ -131,6 +131,7 @@ export const RBACResourceActions: Partial< }, provisioner_jobs: { read: "read provisioner jobs", + update: "update provisioner jobs", }, replicas: { read: "read replicas", From 5120fb1d66852d2e0c32a0c893b648e3dc4aab1a Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Thu, 15 May 2025 19:19:04 +0000 Subject: [PATCH 14/24] WIP --- coderd/database/dbauthz/dbauthz.go | 1 + coderd/database/dbauthz/dbauthz_test.go | 27 +++++++++++-------------- coderd/rbac/object_gen.go | 1 + coderd/rbac/policy/policy.go | 1 + coderd/rbac/roles_test.go | 2 +- codersdk/rbacresources_gen.go | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 31e21a0f7788e..24adf526f7010 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -347,6 +347,7 @@ var ( rbac.ResourceNotificationTemplate.Type: {policy.ActionCreate, policy.ActionUpdate, policy.ActionDelete}, rbac.ResourceCryptoKey.Type: {policy.ActionCreate, policy.ActionUpdate, policy.ActionDelete}, rbac.ResourceFile.Type: {policy.ActionCreate, policy.ActionRead}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, }), Org: map[string][]rbac.Permission{}, User: []rbac.Permission{}, diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 0bc116d63987d..517ef486c54a7 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -3892,7 +3892,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts(rbac.ResourceSystem, policy.ActionRead) + check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3978,7 +3978,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceSystem, policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4022,26 +4022,26 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4051,19 +4051,19 @@ func (s *MethodTestSuite) TestSystemFunctions() { StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts(rbac.ResourceSystem, policy.ActionCreate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionCreate) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionCreate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionCreate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4199,11 +4199,8 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetFileTemplates", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead) })) - s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { - check.Args(time.Time{}).Asserts() - })) - s.Run("GetPendingProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { - check.Args(time.Time{}).Asserts() + s.Run("GetProvisionerJobsToBeReaped", s.Subtest(func(db database.Store, check *expects) { + check.Args(database.GetProvisionerJobsToBeReapedParams{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { check.Args("foo").Asserts(rbac.ResourceSystem, policy.ActionUpdate) @@ -4282,7 +4279,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts() + check.Args([]uuid.UUID{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) diff --git a/coderd/rbac/object_gen.go b/coderd/rbac/object_gen.go index 0336616b5e550..ad1a510fd44bd 100644 --- a/coderd/rbac/object_gen.go +++ b/coderd/rbac/object_gen.go @@ -234,6 +234,7 @@ var ( // ResourceProvisionerJobs // Valid Actions + // - "ActionCreate" :: create provisioner jobs // - "ActionRead" :: read provisioner jobs // - "ActionUpdate" :: update provisioner jobs ResourceProvisionerJobs = Object{ diff --git a/coderd/rbac/policy/policy.go b/coderd/rbac/policy/policy.go index 070692c5e50c6..c37e84c48f964 100644 --- a/coderd/rbac/policy/policy.go +++ b/coderd/rbac/policy/policy.go @@ -184,6 +184,7 @@ var RBACPermissions = map[string]PermissionDefinition{ Actions: map[Action]ActionDefinition{ ActionRead: actDef("read provisioner jobs"), ActionUpdate: actDef("update provisioner jobs"), + ActionCreate: actDef("create provisioner jobs"), }, }, "organization": { diff --git a/coderd/rbac/roles_test.go b/coderd/rbac/roles_test.go index 5f1381caedf27..6d42a01474d1a 100644 --- a/coderd/rbac/roles_test.go +++ b/coderd/rbac/roles_test.go @@ -580,7 +580,7 @@ func TestRolePermissions(t *testing.T) { }, { Name: "ProvisionerJobs", - Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate}, + Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, Resource: rbac.ResourceProvisionerJobs.InOrg(orgID), AuthorizeMap: map[bool][]hasAuthSubjects{ true: {owner, orgTemplateAdmin, orgAdmin}, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 8a89ac6d5e507..6157281f21356 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead, ActionUpdate, ActionCreate}, + ResourceProvisionerJobs: {ActionCreate, ActionRead, ActionUpdate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, From 8d4fa5aaaf7af956917d81d58760d721c7c4b058 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Thu, 15 May 2025 19:20:53 +0000 Subject: [PATCH 15/24] fixed sql comments --- coderd/database/queries/provisionerjobs.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index b455babf95be6..2317ea38ac683 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -275,14 +275,14 @@ FROM provisioner_jobs WHERE ( - -- If the job has not been started within $1, reap it. + -- If the job has not been started before @pending_since, reap it. updated_at < @pending_since AND started_at IS NULL AND completed_at IS NULL ) OR ( - -- If the job has been started but not completed within $2, reap it. + -- If the job has been started but not completed before @hung_since, reap it. updated_at < @hung_since AND started_at IS NOT NULL AND completed_at IS NULL From 18b809c6438850ad0d808074b6d0f2098d7bee0c Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 14:16:49 +0000 Subject: [PATCH 16/24] taking a step back with RBAC --- coderd/database/dbauthz/dbauthz.go | 77 +++++++++++++------------ coderd/database/dbauthz/dbauthz_test.go | 20 +++---- coderd/database/queries.sql.go | 4 +- site/src/api/rbacresourcesGenerated.ts | 1 + 4 files changed, 54 insertions(+), 48 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 24adf526f7010..56f0479457cbf 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1088,9 +1088,9 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. } func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return database.ProvisionerJob{}, err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return database.ProvisionerJob{}, err + // } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -2309,30 +2309,31 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui } func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.GetProvisionerJobsByIDs(ctx, ids) } func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } + // policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(org.ID) return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } @@ -3528,23 +3529,27 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser } func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { - return database.ProvisionerJob{}, err - } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Currently ProvisionerJobs are not associated with a user, so we can't + // check for a user's permissions. We'd need to check for the associated workspace + // and verify ownership through that. + // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + // return database.ProvisionerJob{}, err + // } return q.db.InsertProvisionerJob(ctx, arg) } func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.InsertProvisionerJobLogs(ctx, arg) } func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4168,16 +4173,16 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat } func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } return q.db.UpdateProvisionerJobByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { @@ -4246,16 +4251,16 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da } func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 517ef486c54a7..3fb5e3d62f135 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -3892,7 +3892,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) + check.Args(time.Now()).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3978,7 +3978,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead). + Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4022,26 +4022,26 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4051,19 +4051,19 @@ func (s *MethodTestSuite) TestSystemFunctions() { StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionCreate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionCreate */ ) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4279,7 +4279,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) + check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 10727fa75dec6..06aba2e412071 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7850,14 +7850,14 @@ FROM provisioner_jobs WHERE ( - -- If the job has not been started within $1, reap it. + -- If the job has not been started before @pending_since, reap it. updated_at < $1 AND started_at IS NULL AND completed_at IS NULL ) OR ( - -- If the job has been started but not completed within $2, reap it. + -- If the job has been started but not completed before @hung_since, reap it. updated_at < $2 AND started_at IS NOT NULL AND completed_at IS NULL diff --git a/site/src/api/rbacresourcesGenerated.ts b/site/src/api/rbacresourcesGenerated.ts index 7016b78d876e0..3acb86c079908 100644 --- a/site/src/api/rbacresourcesGenerated.ts +++ b/site/src/api/rbacresourcesGenerated.ts @@ -130,6 +130,7 @@ export const RBACResourceActions: Partial< update: "update a provisioner daemon", }, provisioner_jobs: { + create: "create provisioner jobs", read: "read provisioner jobs", update: "update provisioner jobs", }, From 0fe1404bb3c5777fa9983fe7bf268b05873e25f8 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 14:40:32 +0000 Subject: [PATCH 17/24] WIP --- coderd/database/dbauthz/dbauthz.go | 7 ++++ coderd/database/dbauthz/dbauthz_test.go | 3 ++ coderd/database/dbmem/dbmem.go | 7 ++++ coderd/database/dbmetrics/querymetrics.go | 7 ++++ coderd/database/dbmock/dbmock.go | 15 ++++++++ coderd/database/querier.go | 3 ++ coderd/database/queries.sql.go | 41 ++++++++++++++++++++- coderd/database/queries/provisionerjobs.sql | 15 +++++++- coderd/jobreaper/detector.go | 14 ++----- 9 files changed, 99 insertions(+), 13 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 56f0479457cbf..43210084eec5e 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2300,6 +2300,13 @@ func (q *querier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (data return job, nil } +func (q *querier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } + return q.db.GetProvisionerJobByIDForUpdate(ctx, id) +} + func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { _, err := q.GetProvisionerJobByID(ctx, jobID) if err != nil { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 3fb5e3d62f135..57532855955ee 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4444,6 +4444,9 @@ func (s *MethodTestSuite) TestSystemFunctions() { VapidPrivateKey: "test", }).Asserts(rbac.ResourceDeploymentConfig, policy.ActionUpdate) })) + s.Run("GetProvisionerJobByIDForUpdate", s.Subtest(func(db database.Store, check *expects) { + check.Args(uuid.New()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead).Errors(sql.ErrNoRows) + })) } func (s *MethodTestSuite) TestNotifications() { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 33eeaf5f5f936..d6725b066b732 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -4625,6 +4625,13 @@ func (q *FakeQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) ( return q.getProvisionerJobByIDNoLock(ctx, id) } +func (q *FakeQuerier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + return q.getProvisionerJobByIDNoLock(ctx, id) +} + func (q *FakeQuerier) GetProvisionerJobTimingsByJobID(_ context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { q.mutex.RLock() defer q.mutex.RUnlock() diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 4be271533a97c..854d03fc666cf 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -1187,6 +1187,13 @@ func (m queryMetricsStore) GetProvisionerJobByID(ctx context.Context, id uuid.UU return job, err } +func (m queryMetricsStore) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetProvisionerJobByIDForUpdate(ctx, id) + m.queryLatencies.WithLabelValues("GetProvisionerJobByIDForUpdate").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { start := time.Now() r0, r1 := m.s.GetProvisionerJobTimingsByJobID(ctx, jobID) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 5f0024aa22c42..5dbe039b46758 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2433,6 +2433,21 @@ func (mr *MockStoreMockRecorder) GetProvisionerJobByID(ctx, id any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobByID", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobByID), ctx, id) } +// GetProvisionerJobByIDForUpdate mocks base method. +func (m *MockStore) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProvisionerJobByIDForUpdate", ctx, id) + ret0, _ := ret[0].(database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProvisionerJobByIDForUpdate indicates an expected call of GetProvisionerJobByIDForUpdate. +func (mr *MockStoreMockRecorder) GetProvisionerJobByIDForUpdate(ctx, id any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobByIDForUpdate", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobByIDForUpdate), ctx, id) +} + // GetProvisionerJobTimingsByJobID mocks base method. func (m *MockStore) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index af85e9d0d9c22..b956aea67c130 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -264,6 +264,9 @@ type sqlcQuerier interface { // Previous job information. GetProvisionerDaemonsWithStatusByOrganization(ctx context.Context, arg GetProvisionerDaemonsWithStatusByOrganizationParams) ([]GetProvisionerDaemonsWithStatusByOrganizationRow, error) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) + // Gets a single provisioner job by ID for update. + // This is used to securely reap jobs that have been hung/pending for a long time. + GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]ProvisionerJobTiming, error) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 06aba2e412071..e119c0f002ea2 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7412,6 +7412,46 @@ func (q *sqlQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (P return i, err } +const getProvisionerJobByIDForUpdate = `-- name: GetProvisionerJobByIDForUpdate :one +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + id = $1 +FOR UPDATE +SKIP LOCKED +` + +// Gets a single provisioner job by ID for update. +// This is used to securely reap jobs that have been hung/pending for a long time. +func (q *sqlQuerier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) { + row := q.db.QueryRowContext(ctx, getProvisionerJobByIDForUpdate, id) + var i ProvisionerJob + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ) + return i, err +} + const getProvisionerJobTimingsByJobID = `-- name: GetProvisionerJobTimingsByJobID :many SELECT job_id, started_at, ended_at, stage, source, action, resource FROM provisioner_job_timings WHERE job_id = $1 @@ -7864,7 +7904,6 @@ WHERE ) ORDER BY random() LIMIT $3 -FOR UPDATE SKIP LOCKED ` type GetProvisionerJobsToBeReapedParams struct { diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 2317ea38ac683..1a01cbdf8ddd4 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -41,6 +41,18 @@ FROM WHERE id = $1; +-- name: GetProvisionerJobByIDForUpdate :one +-- Gets a single provisioner job by ID for update. +-- This is used to securely reap jobs that have been hung/pending for a long time. +SELECT + * +FROM + provisioner_jobs +WHERE + id = $1 +FOR UPDATE +SKIP LOCKED; + -- name: GetProvisionerJobsByIDs :many SELECT * @@ -288,8 +300,7 @@ WHERE AND completed_at IS NULL ) ORDER BY random() -LIMIT @max_jobs -FOR UPDATE SKIP LOCKED; +LIMIT @max_jobs; -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index 762a04c819230..da33d2f537905 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -239,18 +239,12 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobToReap.ID))) - if err != nil { - return xerrors.Errorf("acquire lock: %w", err) - } - if !locked { - // This error is ignored. - return acquireLockError{} - } - // Refetch the job while we hold the lock. - job, err := db.GetProvisionerJobByID(ctx, jobToReap.ID) + job, err := db.GetProvisionerJobByIDForUpdate(ctx, jobToReap.ID) if err != nil { + if xerrors.Is(err, sql.ErrNoRows) { + return acquireLockError{} + } return xerrors.Errorf("get provisioner job: %w", err) } From 77be34e62b1e0f498d5873f1fe20bad4f2c4f2b2 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 16:32:46 +0000 Subject: [PATCH 18/24] WIP --- coderd/database/dbauthz/dbauthz.go | 19 ++++++++++++------- coderd/database/dbauthz/dbauthz_test.go | 20 ++++++++++++-------- coderd/rbac/roles.go | 1 - 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 43210084eec5e..a2aee40ba801d 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2316,9 +2316,15 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui } func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + provisionerJobs, err := q.db.GetProvisionerJobsByIDs(ctx, ids) + if err != nil { + return nil, err + } + for _, job := range provisionerJobs { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(job.OrganizationID)); err != nil { + return nil, err + } + } return q.db.GetProvisionerJobsByIDs(ctx, ids) } @@ -2326,7 +2332,6 @@ func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { // return nil, err // } - // policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(org.ID) return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } @@ -2338,9 +2343,9 @@ func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndP } func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 57532855955ee..381c9b55b8352 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -694,9 +694,12 @@ func (s *MethodTestSuite) TestProvisionerJob() { Asserts(v.RBACObject(tpl), []policy.Action{policy.ActionRead, policy.ActionUpdate}).Returns() })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - check.Args([]uuid.UUID{a.ID, b.ID}).Asserts().Returns(slice.New(a, b)) + orgID := uuid.New() + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + check.Args([]uuid.UUID{a.ID, b.ID}). + Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). + Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { u := dbgen.User(s.T(), db, database.User{}) @@ -3892,7 +3895,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) + check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3975,10 +3978,11 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) + orgID := uuid.New() + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ). + Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4279,7 +4283,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) + check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead */ ) })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index b9bcff5d42bdf..0b94a74201b16 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -273,7 +273,6 @@ func ReloadBuiltinRoles(opts *RoleOptions) { Permissions(map[string][]policy.Action{ ResourceWorkspace.Type: ownerWorkspaceActions, ResourceWorkspaceDormant.Type: {policy.ActionRead, policy.ActionDelete, policy.ActionCreate, policy.ActionUpdate, policy.ActionWorkspaceStop}, - ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, })...), Org: map[string][]Permission{}, User: []Permission{}, From 4351529183f4c507a9ef8d35e073ad5573a5e33f Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 17:02:49 +0000 Subject: [PATCH 19/24] WIP --- coderd/database/dbauthz/dbauthz.go | 30 ++++++++++++------------- coderd/database/dbauthz/dbauthz_test.go | 10 ++++----- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index a2aee40ba801d..e2b72a544595e 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1088,9 +1088,9 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. } func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -3559,9 +3559,9 @@ func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.Ins } func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4185,9 +4185,9 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat } func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobByID(ctx, arg) } @@ -4263,16 +4263,16 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da } func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 381c9b55b8352..07344b30261ad 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4026,26 +4026,26 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4067,7 +4067,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) From c03bfa39ac791e899a931c0c31f9bd0e7c2fe916 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 07:57:25 +0000 Subject: [PATCH 20/24] fixed InOrg check for provisionerjob resource --- coderd/database/dbauthz/dbauthz.go | 2 +- coderd/database/dbauthz/dbauthz_test.go | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index e2b72a544595e..032f37339c21a 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2325,7 +2325,7 @@ func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) return nil, err } } - return q.db.GetProvisionerJobsByIDs(ctx, ids) + return provisionerJobs, nil } func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 07344b30261ad..5eb5115c51ef2 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -694,11 +694,11 @@ func (s *MethodTestSuite) TestProvisionerJob() { Asserts(v.RBACObject(tpl), []policy.Action{policy.ActionRead, policy.ActionUpdate}).Returns() })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - orgID := uuid.New() - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + o := dbgen.Organization(s.T(), db, database.Organization{}) + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { @@ -3978,11 +3978,11 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - orgID := uuid.New() - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + o := dbgen.Organization(s.T(), db, database.Organization{}) + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4283,7 +4283,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead */ ) + check.Args([]uuid.UUID{}).Asserts() })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) From a15bd1cff30f2916ffbfd07c5b34623e2e1a52fb Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 14:31:33 +0000 Subject: [PATCH 21/24] PR review; naming in the comments, added comments for SQL, less verbose logging --- coderd/database/dbauthz/dbauthz.go | 7 +--- coderd/database/dbmem/dbmem.go | 8 +++-- coderd/database/queries/provisionerjobs.sql | 1 + coderd/jobreaper/detector.go | 23 ++++++------ coderd/jobreaper/detector_test.go | 39 ++++----------------- provisioner/terraform/serve.go | 4 +-- 6 files changed, 29 insertions(+), 53 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 032f37339c21a..35727f14b8665 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -3542,12 +3542,7 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { // TODO: Remove this once we have a proper rbac check for provisioner jobs. - // Currently ProvisionerJobs are not associated with a user, so we can't - // check for a user's permissions. We'd need to check for the associated workspace - // and verify ownership through that. - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { - // return database.ProvisionerJob{}, err - // } + // Details in https://github.com/coder/coder/issues/16160 return q.db.InsertProvisionerJob(ctx, arg) } diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index d6725b066b732..3475fd3bf9fbe 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "math" + "math/rand/v2" "reflect" "regexp" "slices" @@ -4883,11 +4884,14 @@ func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg databa provisionerJob.Tags = maps.Clone(provisionerJob.Tags) hungJobs = append(hungJobs, provisionerJob) if len(hungJobs) >= int(maxJobs) { - return hungJobs, nil + break } } } } + rand.Shuffle(len(hungJobs), func(i, j int) { + hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i] + }) return hungJobs, nil } @@ -10955,8 +10959,8 @@ func (q *FakeQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(_ contex job.CompletedAt = arg.CompletedAt job.Error = arg.Error job.ErrorCode = arg.ErrorCode - job.JobStatus = provisionerJobStatus(job) job.StartedAt = arg.StartedAt + job.JobStatus = provisionerJobStatus(job) q.provisionerJobs[index] = job return nil } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 1a01cbdf8ddd4..4096fa5b2755a 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -299,6 +299,7 @@ WHERE AND started_at IS NOT NULL AND completed_at IS NULL ) +-- To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. ORDER BY random() LIMIT @max_jobs; diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index da33d2f537905..ad5774ee6b95d 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -20,12 +20,12 @@ import ( ) const ( - // HungJobDuration is the duration of time since the last update to a job - // before it is considered hung. + // HungJobDuration is the duration of time since the last update + // to a RUNNING job before it is considered hung. HungJobDuration = 5 * time.Minute - // PendingJobDuration is the duration of time since the last update to a job - // before it is considered hung. + // PendingJobDuration is the duration of time since last update + // to a PENDING job before it is considered dead. PendingJobDuration = 30 * time.Minute // HungJobExitTimeout is the duration of time that provisioners should allow @@ -42,7 +42,7 @@ const ( ) // jobLogMessages are written to provisioner job logs when a job is reaped -func jobLogMessages(reapType ReapType, threshold time.Duration) []string { +func JobLogMessages(reapType ReapType, threshold time.Duration) []string { return []string{ "", "====================", @@ -110,9 +110,9 @@ type Stats struct { Error error } -// New returns a new hang detector. +// New returns a new job reaper. func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector { - //nolint:gocritic // Hang detector has a limited set of permissions. + //nolint:gocritic // Job reaper has a limited set of permissions. ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx)) d := &Detector{ ctx: ctx, @@ -224,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats { err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Type), slog.Error(err)) + log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err)) } continue } @@ -260,7 +260,8 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } log.Warn( - ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobToReap.Type), + ctx, "forcefully terminating provisioner job", + "type", jobToReap.Type, "threshold", jobToReap.Threshold, ) @@ -291,7 +292,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Output: nil, } now := dbtime.Now() - for i, msg := range jobLogMessages(jobToReap.Type, jobToReap.Threshold) { + for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -325,7 +326,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Valid: true, }, Error: sql.NullString{ - String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobToReap.Type, jobToReap.Threshold.Minutes()), + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ diff --git a/coderd/jobreaper/detector_test.go b/coderd/jobreaper/detector_test.go index 5d2a57b0fb47c..28457aeeca3a8 100644 --- a/coderd/jobreaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -27,36 +27,6 @@ import ( "github.com/coder/coder/v2/testutil" ) -// jobType represents the type of job being reaped -type jobType string - -const ( - hungJobType jobType = "hung" - pendingJobType jobType = "pending" -) - -// jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped -func jobLogMessages(jobType jobType, threshold float64) []string { - return []string{ - "", - "====================", - fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), - "====================", - "", - } -} - -// reapParamsFromJob determines the type and threshold for a job being reaped -func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { - jobType := hungJobType - threshold := jobreaper.HungJobDuration.Minutes() - if !job.StartedAt.Valid { - jobType = pendingJobType - threshold = jobreaper.PendingJobDuration.Minutes() - } - return jobType, threshold -} - func TestMain(m *testing.M) { goleak.VerifyTestMain(m, testutil.GoleakOptions...) } @@ -972,8 +942,13 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: after, }) require.NoError(t, err) - jobType, threshold := reapParamsFromJob(templateImportJob) - expectedLogs := jobLogMessages(jobType, threshold) + threshold := jobreaper.HungJobDuration + jobType := jobreaper.Hung + if templateImportJob.JobStatus == database.ProvisionerJobStatusPending { + threshold = jobreaper.PendingJobDuration + jobType = jobreaper.Pending + } + expectedLogs := jobreaper.JobLogMessages(jobType, threshold) require.Len(t, logs, len(expectedLogs)) for i, log := range logs { assert.Equal(t, database.LogLevelError, log.Level) diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 67eeaccaed05d..3e671b0c68e56 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -39,9 +39,9 @@ type ServeOptions struct { // // This is a no-op on Windows where the process can't be interrupted. // - // Default value: 3 minutes (reaper.HungJobExitTimeout). This value should + // Default value: 3 minutes (jobreaper.HungJobExitTimeout). This value should // be kept less than the value that Coder uses to mark hung jobs as failed, - // which is 5 minutes (see reaper package). + // which is 5 minutes (see jobreaper package). ExitTimeout time.Duration } From 5b9348f40a8a4e0189066977cb5de30202fd2bbf Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 15:00:58 +0000 Subject: [PATCH 22/24] fixes to tests after lint remove rand --- coderd/database/dbmem/dbmem.go | 2 +- coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 3475fd3bf9fbe..5486f930b8321 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" "math" - "math/rand/v2" + "math/rand/v2" //nolint:staticcheck // used only in tests to randomize list order "reflect" "regexp" "slices" diff --git a/coderd/database/querier.go b/coderd/database/querier.go index b956aea67c130..e42f2b259cd08 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -272,6 +272,7 @@ type sqlcQuerier interface { GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) + // To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (ProvisionerKey, error) GetProvisionerKeyByID(ctx context.Context, id uuid.UUID) (ProvisionerKey, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index e119c0f002ea2..00922cc3f7ff0 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7912,6 +7912,7 @@ type GetProvisionerJobsToBeReapedParams struct { MaxJobs int32 `db:"max_jobs" json:"max_jobs"` } +// To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. func (q *sqlQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) { rows, err := q.db.QueryContext(ctx, getProvisionerJobsToBeReaped, arg.PendingSince, arg.HungSince, arg.MaxJobs) if err != nil { From 91d2d3201c86531c121c7981cfc0d76c19e26afd Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 15:22:38 +0000 Subject: [PATCH 23/24] readded rand to fix gen failing in CI --- coderd/database/dbmem/dbmem.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 5486f930b8321..d3b518b2a6460 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" "math" - "math/rand/v2" //nolint:staticcheck // used only in tests to randomize list order + "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap "reflect" "regexp" "slices" From 767cb77cf4bdac0827cf9b90a9e781d3bb6f32b4 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 20 May 2025 13:04:35 +0000 Subject: [PATCH 24/24] adjusted TODOs --- coderd/database/dbauthz/dbauthz.go | 26 ++++++++++++------------- coderd/database/dbauthz/dbauthz_test.go | 4 ++-- coderd/database/dbmem/dbmem.go | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 35727f14b8665..16d707c466104 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2320,8 +2320,12 @@ func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) if err != nil { return nil, err } + orgIDs := make(map[uuid.UUID]struct{}) for _, job := range provisionerJobs { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(job.OrganizationID)); err != nil { + orgIDs[job.OrganizationID] = struct{}{} + } + for orgID := range orgIDs { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID)); err != nil { return nil, err } } @@ -2329,16 +2333,14 @@ func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) } func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } @@ -3547,9 +3549,8 @@ func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertP } func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.InsertProvisionerJobLogs(ctx, arg) } @@ -4187,9 +4188,8 @@ func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.Upd } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 5eb5115c51ef2..f851d0d6d1c08 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -698,7 +698,7 @@ func (s *MethodTestSuite) TestProvisionerJob() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { @@ -3982,7 +3982,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index d3b518b2a6460..794d46c0083d0 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" "math" - "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap + insecurerand "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap "reflect" "regexp" "slices" @@ -4889,7 +4889,7 @@ func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg databa } } } - rand.Shuffle(len(hungJobs), func(i, j int) { + insecurerand.Shuffle(len(hungJobs), func(i, j int) { hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i] }) return hungJobs, nil pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy