diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go
index 7a7734b6f8093..90257c26dd580 100644
--- a/enterprise/coderd/prebuilds/metricscollector.go
+++ b/enterprise/coderd/prebuilds/metricscollector.go
@@ -27,6 +27,7 @@ const (
 	MetricDesiredGauge           = namespace + "desired"
 	MetricRunningGauge           = namespace + "running"
 	MetricEligibleGauge          = namespace + "eligible"
+	MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
 	MetricLastUpdatedGauge       = namespace + "metrics_last_updated"
 )
 
@@ -82,6 +83,12 @@ var (
 		labels,
 		nil,
 	)
+	presetHardLimitedDesc = prometheus.NewDesc(
+		MetricPresetHardLimitedGauge,
+		"Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.",
+		labels,
+		nil,
+	)
 	lastUpdateDesc = prometheus.NewDesc(
 		MetricLastUpdatedGauge,
 		"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
@@ -104,17 +111,22 @@ type MetricsCollector struct {
 
 	replacementsCounter   map[replacementKey]float64
 	replacementsCounterMu sync.Mutex
+
+	isPresetHardLimited   map[hardLimitedPresetKey]bool
+	isPresetHardLimitedMu sync.Mutex
 }
 
 var _ prometheus.Collector = new(MetricsCollector)
 
 func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
 	log := logger.Named("prebuilds_metrics_collector")
+
 	return &MetricsCollector{
 		database:            db,
 		logger:              log,
 		snapshotter:         snapshotter,
 		replacementsCounter: make(map[replacementKey]float64),
+		isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
 	}
 }
 
@@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
 	descCh <- desiredPrebuildsDesc
 	descCh <- runningPrebuildsDesc
 	descCh <- eligiblePrebuildsDesc
+	descCh <- presetHardLimitedDesc
 	descCh <- lastUpdateDesc
 }
 
@@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
 		metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
 	}
 
+	mc.isPresetHardLimitedMu.Lock()
+	for key, isHardLimited := range mc.isPresetHardLimited {
+		var val float64
+		if isHardLimited {
+			val = 1
+		}
+
+		metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
+	}
+	mc.isPresetHardLimitedMu.Unlock()
+
 	metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
 }
 
@@ -247,3 +271,25 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres
 	// cause an issue (or indeed if either would), so we just track the replacement.
 	mc.replacementsCounter[key]++
 }
+
+type hardLimitedPresetKey struct {
+	orgName, templateName, presetName string
+}
+
+func (k hardLimitedPresetKey) String() string {
+	return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
+}
+
+// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
+func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
+	mc.isPresetHardLimitedMu.Lock()
+	defer mc.isPresetHardLimitedMu.Unlock()
+
+	key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
+
+	if isHardLimited {
+		mc.isPresetHardLimited[key] = true
+	} else {
+		delete(mc.isPresetHardLimited, key)
+	}
+}
diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go
index 7796e43777951..603c0da3588a6 100644
--- a/enterprise/coderd/prebuilds/reconcile.go
+++ b/enterprise/coderd/prebuilds/reconcile.go
@@ -361,17 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
 		slog.F("preset_name", ps.Preset.Name),
 	)
 
-	// If the preset was previously hard-limited, log it and exit early.
-	if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
-		logger.Warn(ctx, "skipping hard limited preset")
-		return nil
-	}
+	// Report a preset as hard-limited only if all the following conditions are met:
+	// - The preset is marked as hard-limited
+	// - The preset is using the active version of its template, and the template has not been deleted
+	//
+	// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
+	// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
+	// as hard-limited to the admin.
+	reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted
+	c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited)
 
 	// If the preset reached the hard failure limit for the first time during this iteration:
 	// - Mark it as hard-limited in the database
 	// - Send notifications to template admins
-	if ps.IsHardLimited {
-		logger.Warn(ctx, "skipping hard limited preset")
+	// - Continue execution; only the creation operation is disallowed for hard-limited presets, deletion is still allowed.
+	if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
+		logger.Warn(ctx, "preset is hard limited, notifying template admins")
 
 		err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
 			Status:   database.PrebuildStatusHardLimited,
@@ -384,10 +389,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
 		err = c.notifyPrebuildFailureLimitReached(ctx, ps)
 		if err != nil {
 			logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
-			return nil
 		}
-
-		return nil
 	}
 
 	state := ps.CalculateState()
@@ -452,6 +454,13 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
 		actions.Create = desired
 	}
 
+	// If the preset is hard-limited and a create operation is required, log it and exit early.
+	// Creation is disallowed for hard-limited presets; deletion is still allowed.
+	if ps.IsHardLimited && actions.Create > 0 {
+		logger.Warn(ctx, "skipping hard limited preset for create operation")
+		return nil
+	}
+
 	var multiErr multierror.Error
 
 	for range actions.Create {
diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go
index f52a77ca500b9..b7f5974197150 100644
--- a/enterprise/coderd/prebuilds/reconcile_test.go
+++ b/enterprise/coderd/prebuilds/reconcile_test.go
@@ -696,7 +696,8 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
 			).Leveled(slog.LevelDebug)
 			db, pubSub := dbtestutil.NewDB(t)
 			fakeEnqueuer := newFakeEnqueuer()
-			controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer)
+			registry := prometheus.NewRegistry()
+			controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer)
 
 			// Template admin to receive a notification.
 			templateAdmin := dbgen.User(t, db, database.User{
@@ -732,6 +733,17 @@
 			workspaceCount := len(workspaces)
 			require.Equal(t, 1, workspaceCount)
 
+			// Verify the initial state: the metric is not set, meaning the preset is not hard-limited.
+			require.NoError(t, controller.ForceMetricsUpdate(ctx))
+			mf, err := registry.Gather()
+			require.NoError(t, err)
+			metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+				"template_name": template.Name,
+				"preset_name":   preset.Name,
+				"org_name":      org.Name,
+			})
+			require.Nil(t, metric)
+
 			// We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called.
 			// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
 			clock.Advance(time.Nanosecond).MustWait(ctx)
@@ -755,6 +767,16 @@
 				// When hard limit is not reached, a new workspace should be created.
 				require.Equal(t, 2, len(workspaces))
 				require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus)
+
+				// When the hard limit is not reached, the metric is not set.
+				mf, err = registry.Gather()
+				require.NoError(t, err)
+				metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+					"template_name": template.Name,
+					"preset_name":   preset.Name,
+					"org_name":      org.Name,
+				})
+				require.Nil(t, metric)
 				return
 			}
 
@@ -775,6 +797,252 @@
 				return true
 			})
 			require.Len(t, matching, 1)
+
+			// When the hard limit is reached, the metric is set to 1.
+			mf, err = registry.Gather()
+			require.NoError(t, err)
+			metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+				"template_name": template.Name,
+				"preset_name":   preset.Name,
+				"org_name":      org.Name,
+			})
+			require.NotNil(t, metric)
+			require.NotNil(t, metric.GetGauge())
+			require.EqualValues(t, 1, metric.GetGauge().GetValue())
+		})
+	}
+}
+
+func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) {
+	t.Parallel()
+
+	if !dbtestutil.WillUsePostgres() {
+		t.Skip("This test requires postgres")
+	}
+
+	// Test cases verify that a hard-limited preset still allows deletion of its outdated prebuilt workspaces.
+	testCases := []struct {
+		name                     string
+		hardLimit                int64
+		createNewTemplateVersion bool
+		deleteTemplate           bool
+	}{
+		{
+			// The hard limit is hit, but deletion of the prebuilt workspace is allowed because it is outdated (a new template version was created).
+			name:                     "new template version is created",
+			hardLimit:                1,
+			createNewTemplateVersion: true,
+			deleteTemplate:           false,
+		},
+		{
+			// The hard limit is hit, but deletion of the prebuilt workspace is allowed because the template is deleted.
+			name:                     "template is deleted",
+			hardLimit:                1,
+			createNewTemplateVersion: false,
+			deleteTemplate:           true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			clock := quartz.NewMock(t)
+			ctx := testutil.Context(t, testutil.WaitShort)
+			cfg := codersdk.PrebuildsConfig{
+				FailureHardLimit:              serpent.Int64(tc.hardLimit),
+				ReconciliationBackoffInterval: 0,
+			}
+			logger := slogtest.Make(
+				t, &slogtest.Options{IgnoreErrors: true},
+			).Leveled(slog.LevelDebug)
+			db, pubSub := dbtestutil.NewDB(t)
+			fakeEnqueuer := newFakeEnqueuer()
+			registry := prometheus.NewRegistry()
+			controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer)
+
+			// Template admin to receive a notification.
+			templateAdmin := dbgen.User(t, db, database.User{
+				RBACRoles: []string{codersdk.RoleTemplateAdmin},
+			})
+
+			// Set up test environment with a template, version, and preset.
+			ownerID := uuid.New()
+			dbgen.User(t, db, database.User{
+				ID: ownerID,
+			})
+			org, template := setupTestDBTemplate(t, db, ownerID, false)
+			templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
+			preset := setupTestDBPreset(t, db, templateVersionID, 2, uuid.New().String())
+
+			// Create a successful prebuilt workspace.
+			successfulWorkspace, _ := setupTestDBPrebuild(
+				t,
+				clock,
+				db,
+				pubSub,
+				database.WorkspaceTransitionStart,
+				database.ProvisionerJobStatusSucceeded,
+				org.ID,
+				preset,
+				template.ID,
+				templateVersionID,
+			)
+
+			// Make sure the prebuilt workspaces are created in this order: [successful, failed].
+			clock.Advance(time.Second).MustWait(ctx)
+
+			// Create a failed prebuilt workspace that counts toward the hard failure limit.
+			setupTestDBPrebuild(
+				t,
+				clock,
+				db,
+				pubSub,
+				database.WorkspaceTransitionStart,
+				database.ProvisionerJobStatusFailed,
+				org.ID,
+				preset,
+				template.ID,
+				templateVersionID,
+			)
+
+			getJobStatusMap := func(workspaces []database.WorkspaceTable) map[database.ProvisionerJobStatus]int {
+				jobStatusMap := make(map[database.ProvisionerJobStatus]int)
+				for _, workspace := range workspaces {
+					workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
+						WorkspaceID: workspace.ID,
+					})
+					require.NoError(t, err)
+
+					for _, workspaceBuild := range workspaceBuilds {
+						job, err := db.GetProvisionerJobByID(ctx, workspaceBuild.JobID)
+						require.NoError(t, err)
+						jobStatusMap[job.JobStatus]++
+					}
+				}
+				return jobStatusMap
+			}
+
+			// Verify initial state: two workspaces exist, one successful, one failed.
+			workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID)
+			require.NoError(t, err)
+			require.Equal(t, 2, len(workspaces))
+			jobStatusMap := getJobStatusMap(workspaces)
+			require.Len(t, jobStatusMap, 2)
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
+
+			// Verify the initial state: the metric is not set, meaning the preset is not hard-limited.
+			require.NoError(t, controller.ForceMetricsUpdate(ctx))
+			mf, err := registry.Gather()
+			require.NoError(t, err)
+			metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+				"template_name": template.Name,
+				"preset_name":   preset.Name,
+				"org_name":      org.Name,
+			})
+			require.Nil(t, metric)
+
+			// We simulate a failed prebuild in the test; consequently, the backoff mechanism is triggered when ReconcileAll is called.
+			// Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond.
+			clock.Advance(time.Nanosecond).MustWait(ctx)
+
+			// Trigger reconciliation to attempt creating a new prebuild.
+			// The outcome depends on whether the hard limit has been reached.
+			require.NoError(t, controller.ReconcileAll(ctx))
+
+			// These two additional calls to ReconcileAll should not trigger any notifications.
+			// A notification is only sent once.
+			require.NoError(t, controller.ReconcileAll(ctx))
+			require.NoError(t, controller.ReconcileAll(ctx))
+
+			// Verify the final state after reconciliation.
+			// When the hard limit is reached, no new workspace should be created.
+			workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
+			require.NoError(t, err)
+			require.Equal(t, 2, len(workspaces))
+			jobStatusMap = getJobStatusMap(workspaces)
+			require.Len(t, jobStatusMap, 2)
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
+
+			updatedPreset, err := db.GetPresetByID(ctx, preset.ID)
+			require.NoError(t, err)
+			require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus)
+
+			// When the hard limit is reached, a notification should be sent.
+			matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool {
+				if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") {
+					return false
+				}
+
+				if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") {
+					return false
+				}
+
+				return true
+			})
+			require.Len(t, matching, 1)
+
+			// When the hard limit is reached, the metric is set to 1.
+			mf, err = registry.Gather()
+			require.NoError(t, err)
+			metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+				"template_name": template.Name,
+				"preset_name":   preset.Name,
+				"org_name":      org.Name,
+			})
+			require.NotNil(t, metric)
+			require.NotNil(t, metric.GetGauge())
+			require.EqualValues(t, 1, metric.GetGauge().GetValue())
+
+			if tc.createNewTemplateVersion {
+				// Create a new template version and mark it as active.
+				// This marks the template version that we care about as inactive.
+				setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID)
+			}
+
+			if tc.deleteTemplate {
+				require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{
+					ID:        template.ID,
+					Deleted:   true,
+					UpdatedAt: dbtime.Now(),
+				}))
+			}
+
+			// Trigger reconciliation to make sure that the successful but outdated prebuilt workspace is deleted.
+			require.NoError(t, controller.ReconcileAll(ctx))
+
+			workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
+			require.NoError(t, err)
+			require.Equal(t, 2, len(workspaces))
+
+			jobStatusMap = getJobStatusMap(workspaces)
+			require.Len(t, jobStatusMap, 3)
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded])
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed])
+			// The pending job should be the one that deletes the successful but outdated prebuilt workspace.
+			// The prebuilt workspace MUST be deleted, even though the preset is marked as hard-limited.
+			require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusPending])
+
+			workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{
+				WorkspaceID: successfulWorkspace.ID,
+			})
+			require.NoError(t, err)
+			require.Equal(t, 2, len(workspaceBuilds))
+			// Make sure the successfully created but outdated prebuilt workspace was scheduled for deletion.
+			require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition)
+			require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition)
+
+			// The metric is removed once the preset becomes outdated.
+			mf, err = registry.Gather()
+			require.NoError(t, err)
+			metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{
+				"template_name": template.Name,
+				"preset_name":   preset.Name,
+				"org_name":      org.Name,
+			})
+			require.Nil(t, metric)
 		})
 	}
 }
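A note on how the new gauge behaves: trackHardLimitedStatus stores a key in isPresetHardLimited only while a preset should be reported and deletes it otherwise, so Collect simply stops emitting the series instead of publishing 0 (which is why the tests treat a nil result from findMetric as "not hard-limited"). The standalone Go sketch below illustrates that set-or-delete collector pattern with the Prometheus client; the package layout, type name (hardLimitCollector), and metric name (example_preset_hard_limited) are illustrative assumptions, not part of the coderd code.

package main

import (
	"fmt"
	"sync"

	"github.com/prometheus/client_golang/prometheus"
)

// hardLimitedGaugeDesc mirrors the shape of presetHardLimitedDesc in the diff:
// one gauge with template/preset/org labels. The metric name is hypothetical.
var hardLimitedGaugeDesc = prometheus.NewDesc(
	"example_preset_hard_limited",
	"1 when a preset is hard-limited; the series is absent otherwise.",
	[]string{"template_name", "preset_name", "org_name"},
	nil,
)

type presetKey struct{ org, template, preset string }

// hardLimitCollector keeps only the presets that are currently hard-limited.
// Removing an entry drops the time series from the next scrape entirely,
// which is the same behavior the diff implements with isPresetHardLimited.
type hardLimitCollector struct {
	mu      sync.Mutex
	limited map[presetKey]bool
}

func (c *hardLimitCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- hardLimitedGaugeDesc
}

func (c *hardLimitCollector) Collect(ch chan<- prometheus.Metric) {
	c.mu.Lock()
	defer c.mu.Unlock()
	// Entries only exist while a preset is hard-limited, so the value is always 1.
	for k := range c.limited {
		ch <- prometheus.MustNewConstMetric(hardLimitedGaugeDesc, prometheus.GaugeValue, 1, k.template, k.preset, k.org)
	}
}

// track sets or clears the hard-limited flag, mirroring trackHardLimitedStatus.
func (c *hardLimitCollector) track(org, template, preset string, hardLimited bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	key := presetKey{org: org, template: template, preset: preset}
	if hardLimited {
		c.limited[key] = true
	} else {
		delete(c.limited, key)
	}
}

func main() {
	reg := prometheus.NewRegistry()
	col := &hardLimitCollector{limited: map[presetKey]bool{}}
	reg.MustRegister(col)

	col.track("default", "docker", "large", true)
	mfs, _ := reg.Gather()
	fmt.Println(len(mfs)) // 1: the gauge is exported while hard-limited

	col.track("default", "docker", "large", false)
	mfs, _ = reg.Gather()
	fmt.Println(len(mfs)) // 0: the series disappears instead of reporting 0
}

Omitting the series rather than exporting 0 keeps per-preset cardinality down, and a missing series can be treated as "healthy" on dashboards, which appears to be the intent behind deleting the map entry instead of flipping it to false.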