diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 7b55227effffa..76089c025243d 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -2,14 +2,17 @@ package prebuilds import ( "context" + "fmt" + "sync/atomic" "time" - "cdr.dev/slog" - "github.com/prometheus/client_golang/prometheus" + "golang.org/x/xerrors" + + "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/prebuilds" ) @@ -55,20 +58,34 @@ var ( labels, nil, ) + lastUpdateDesc = prometheus.NewDesc( + "coderd_prebuilt_workspaces_metrics_last_updated", + "The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", + []string{}, + nil, + ) +) + +const ( + metricsUpdateInterval = time.Second * 15 + metricsUpdateTimeout = time.Second * 10 ) type MetricsCollector struct { database database.Store logger slog.Logger snapshotter prebuilds.StateSnapshotter + + latestState atomic.Pointer[metricsState] } var _ prometheus.Collector = new(MetricsCollector) func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { + log := logger.Named("prebuilds_metrics_collector") return &MetricsCollector{ database: db, - logger: logger.Named("prebuilds_metrics_collector"), + logger: log, snapshotter: snapshotter, } } @@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- desiredPrebuildsDesc descCh <- runningPrebuildsDesc descCh <- eligiblePrebuildsDesc + descCh <- lastUpdateDesc } +// Collect uses the cached state to set configured metrics. +// The state is cached because this function can be called multiple times per second and retrieving the current state +// is an expensive operation. func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { - // nolint:gocritic // We need to set an authz context to read metrics from the db. - ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second) - defer cancel() - prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx) - if err != nil { - mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err)) + currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func. + if currentState == nil { + mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set") + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0) return } - for _, metric := range prebuildMetrics { + for _, metric := range currentState.prebuildMetrics { metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName) } - snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database) - if err != nil { - mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err)) - return - } - - for _, preset := range snapshot.Presets { + for _, preset := range currentState.snapshot.Presets { if !preset.UsingActiveVersion { continue } - presetSnapshot, err := snapshot.FilterByPreset(preset.ID) + presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID) if err != nil { - mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err)) + mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err)) continue } state := presetSnapshot.CalculateState() @@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName) metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) } + + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) +} + +type metricsState struct { + prebuildMetrics []database.GetPrebuildMetricsRow + snapshot *prebuilds.GlobalSnapshot + createdAt time.Time +} + +// BackgroundFetch updates the metrics state every given interval. +func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) { + tick := time.NewTicker(time.Nanosecond) + defer tick.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-tick.C: + // Tick immediately, then set regular interval. + tick.Reset(updateInterval) + + if err := mc.UpdateState(ctx, updateTimeout); err != nil { + mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err)) + } + } + } +} + +// UpdateState builds the current metrics state. +func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error { + start := time.Now() + fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout) + defer fetchCancel() + + prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx) + if err != nil { + return xerrors.Errorf("fetch prebuild metrics: %w", err) + } + + snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database) + if err != nil { + return xerrors.Errorf("snapshot state: %w", err) + } + mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds()))) + + mc.latestState.Store(&metricsState{ + prebuildMetrics: prebuildMetrics, + snapshot: snapshot, + createdAt: dbtime.Now(), + }) + return nil } diff --git a/enterprise/coderd/prebuilds/metricscollector_test.go b/enterprise/coderd/prebuilds/metricscollector_test.go index 859509ced6635..de3f5d017f715 100644 --- a/enterprise/coderd/prebuilds/metricscollector_test.go +++ b/enterprise/coderd/prebuilds/metricscollector_test.go @@ -16,6 +16,7 @@ import ( "github.com/coder/quartz" "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds" @@ -248,6 +249,10 @@ func TestMetricsCollector(t *testing.T) { setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible) } + // Force an update to the metrics state to allow the collector to collect fresh metrics. + // nolint:gocritic // Authz context needed to retrieve state. + require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong)) + metricsFamilies, err := registry.Gather() require.NoError(t, err) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index c31da695637ba..79a8baa337e72 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -5,6 +5,7 @@ import ( "database/sql" "fmt" "math" + "sync" "sync/atomic" "time" @@ -67,10 +68,12 @@ func NewStoreReconciler(store database.Store, provisionNotifyCh: make(chan database.ProvisionerJob, 10), } - reconciler.metrics = NewMetricsCollector(store, logger, reconciler) - if err := registerer.Register(reconciler.metrics); err != nil { - // If the registerer fails to register the metrics collector, it's not fatal. - logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) + if registerer != nil { + reconciler.metrics = NewMetricsCollector(store, logger, reconciler) + if err := registerer.Register(reconciler.metrics); err != nil { + // If the registerer fails to register the metrics collector, it's not fatal. + logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err)) + } } return reconciler @@ -87,9 +90,11 @@ func (c *StoreReconciler) Run(ctx context.Context) { slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()), slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String())) + var wg sync.WaitGroup ticker := c.clock.NewTicker(reconciliationInterval) defer ticker.Stop() defer func() { + wg.Wait() c.done <- struct{}{} }() @@ -97,6 +102,15 @@ func (c *StoreReconciler) Run(ctx context.Context) { ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx)) c.cancelFn = cancel + // Start updating metrics in the background. + if c.metrics != nil { + wg.Add(1) + go func() { + defer wg.Done() + c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout) + }() + } + // Everything is in place, reconciler can now be considered as running. // // NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above.
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: