Skip to content

Commit b2a1de9

Browse files
authored
feat: fetch prebuilds metrics state in background (#17792)
`Collect()` is called whenever the `/metrics` endpoint is hit to retrieve metrics. The queries used in prebuilds metrics collection are quite heavy, and we want to avoid having them running concurrently / too often to keep db load down. Here I'm moving towards a background retrieval of the state required to set the metrics, which gets invalidated every interval. Also introduces `coderd_prebuilt_workspaces_metrics_last_updated` which operators can use to determine when these metrics go stale. See #17789 as well. --------- Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 709445e commit b2a1de9

File tree

3 files changed

+109
-24
lines changed

3 files changed

+109
-24
lines changed

enterprise/coderd/prebuilds/metricscollector.go

Lines changed: 86 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@ package prebuilds
22

33
import (
44
"context"
5+
"fmt"
6+
"sync/atomic"
57
"time"
68

7-
"cdr.dev/slog"
8-
99
"github.com/prometheus/client_golang/prometheus"
10+
"golang.org/x/xerrors"
11+
12+
"cdr.dev/slog"
1013

1114
"github.com/coder/coder/v2/coderd/database"
12-
"github.com/coder/coder/v2/coderd/database/dbauthz"
15+
"github.com/coder/coder/v2/coderd/database/dbtime"
1316
"github.com/coder/coder/v2/coderd/prebuilds"
1417
)
1518

@@ -55,20 +58,34 @@ var (
5558
labels,
5659
nil,
5760
)
61+
lastUpdateDesc = prometheus.NewDesc(
62+
"coderd_prebuilt_workspaces_metrics_last_updated",
63+
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
64+
[]string{},
65+
nil,
66+
)
67+
)
68+
69+
const (
70+
metricsUpdateInterval = time.Second * 15
71+
metricsUpdateTimeout = time.Second * 10
5872
)
5973

6074
type MetricsCollector struct {
6175
database database.Store
6276
logger slog.Logger
6377
snapshotter prebuilds.StateSnapshotter
78+
79+
latestState atomic.Pointer[metricsState]
6480
}
6581

6682
var _ prometheus.Collector = new(MetricsCollector)
6783

6884
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
85+
log := logger.Named("prebuilds_metrics_collector")
6986
return &MetricsCollector{
7087
database: db,
71-
logger: logger.Named("prebuilds_metrics_collector"),
88+
logger: log,
7289
snapshotter: snapshotter,
7390
}
7491
}
@@ -80,38 +97,34 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
8097
descCh <- desiredPrebuildsDesc
8198
descCh <- runningPrebuildsDesc
8299
descCh <- eligiblePrebuildsDesc
100+
descCh <- lastUpdateDesc
83101
}
84102

103+
// Collect uses the cached state to set configured metrics.
104+
// The state is cached because this function can be called multiple times per second and retrieving the current state
105+
// is an expensive operation.
85106
func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
86-
// nolint:gocritic // We need to set an authz context to read metrics from the db.
87-
ctx, cancel := context.WithTimeout(dbauthz.AsPrebuildsOrchestrator(context.Background()), 10*time.Second)
88-
defer cancel()
89-
prebuildMetrics, err := mc.database.GetPrebuildMetrics(ctx)
90-
if err != nil {
91-
mc.logger.Error(ctx, "failed to get prebuild metrics", slog.Error(err))
107+
currentState := mc.latestState.Load() // Grab a copy; it's ok if it goes stale during the course of this func.
108+
if currentState == nil {
109+
mc.logger.Warn(context.Background(), "failed to set prebuilds metrics; state not set")
110+
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, 0)
92111
return
93112
}
94113

95-
for _, metric := range prebuildMetrics {
114+
for _, metric := range currentState.prebuildMetrics {
96115
metricsCh <- prometheus.MustNewConstMetric(createdPrebuildsDesc, prometheus.CounterValue, float64(metric.CreatedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
97116
metricsCh <- prometheus.MustNewConstMetric(failedPrebuildsDesc, prometheus.CounterValue, float64(metric.FailedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
98117
metricsCh <- prometheus.MustNewConstMetric(claimedPrebuildsDesc, prometheus.CounterValue, float64(metric.ClaimedCount), metric.TemplateName, metric.PresetName, metric.OrganizationName)
99118
}
100119

101-
snapshot, err := mc.snapshotter.SnapshotState(ctx, mc.database)
102-
if err != nil {
103-
mc.logger.Error(ctx, "failed to get latest prebuild state", slog.Error(err))
104-
return
105-
}
106-
107-
for _, preset := range snapshot.Presets {
120+
for _, preset := range currentState.snapshot.Presets {
108121
if !preset.UsingActiveVersion {
109122
continue
110123
}
111124

112-
presetSnapshot, err := snapshot.FilterByPreset(preset.ID)
125+
presetSnapshot, err := currentState.snapshot.FilterByPreset(preset.ID)
113126
if err != nil {
114-
mc.logger.Error(ctx, "failed to filter by preset", slog.Error(err))
127+
mc.logger.Error(context.Background(), "failed to filter by preset", slog.Error(err))
115128
continue
116129
}
117130
state := presetSnapshot.CalculateState()
@@ -120,4 +133,57 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
120133
metricsCh <- prometheus.MustNewConstMetric(runningPrebuildsDesc, prometheus.GaugeValue, float64(state.Actual), preset.TemplateName, preset.Name, preset.OrganizationName)
121134
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
122135
}
136+
137+
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
138+
}
139+
140+
type metricsState struct {
141+
prebuildMetrics []database.GetPrebuildMetricsRow
142+
snapshot *prebuilds.GlobalSnapshot
143+
createdAt time.Time
144+
}
145+
146+
// BackgroundFetch updates the metrics state every given interval.
147+
func (mc *MetricsCollector) BackgroundFetch(ctx context.Context, updateInterval, updateTimeout time.Duration) {
148+
tick := time.NewTicker(time.Nanosecond)
149+
defer tick.Stop()
150+
151+
for {
152+
select {
153+
case <-ctx.Done():
154+
return
155+
case <-tick.C:
156+
// Tick immediately, then set regular interval.
157+
tick.Reset(updateInterval)
158+
159+
if err := mc.UpdateState(ctx, updateTimeout); err != nil {
160+
mc.logger.Error(ctx, "failed to update prebuilds metrics state", slog.Error(err))
161+
}
162+
}
163+
}
164+
}
165+
166+
// UpdateState builds the current metrics state.
167+
func (mc *MetricsCollector) UpdateState(ctx context.Context, timeout time.Duration) error {
168+
start := time.Now()
169+
fetchCtx, fetchCancel := context.WithTimeout(ctx, timeout)
170+
defer fetchCancel()
171+
172+
prebuildMetrics, err := mc.database.GetPrebuildMetrics(fetchCtx)
173+
if err != nil {
174+
return xerrors.Errorf("fetch prebuild metrics: %w", err)
175+
}
176+
177+
snapshot, err := mc.snapshotter.SnapshotState(fetchCtx, mc.database)
178+
if err != nil {
179+
return xerrors.Errorf("snapshot state: %w", err)
180+
}
181+
mc.logger.Debug(ctx, "fetched prebuilds metrics state", slog.F("duration_secs", fmt.Sprintf("%.2f", time.Since(start).Seconds())))
182+
183+
mc.latestState.Store(&metricsState{
184+
prebuildMetrics: prebuildMetrics,
185+
snapshot: snapshot,
186+
createdAt: dbtime.Now(),
187+
})
188+
return nil
123189
}

enterprise/coderd/prebuilds/metricscollector_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/coder/quartz"
1717

1818
"github.com/coder/coder/v2/coderd/database"
19+
"github.com/coder/coder/v2/coderd/database/dbauthz"
1920
"github.com/coder/coder/v2/coderd/database/dbgen"
2021
"github.com/coder/coder/v2/coderd/database/dbtestutil"
2122
agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds"
@@ -248,6 +249,10 @@ func TestMetricsCollector(t *testing.T) {
248249
setupTestDBWorkspaceAgent(t, db, workspace.ID, eligible)
249250
}
250251

252+
// Force an update to the metrics state to allow the collector to collect fresh metrics.
253+
// nolint:gocritic // Authz context needed to retrieve state.
254+
require.NoError(t, collector.UpdateState(dbauthz.AsPrebuildsOrchestrator(ctx), testutil.WaitLong))
255+
251256
metricsFamilies, err := registry.Gather()
252257
require.NoError(t, err)
253258

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"database/sql"
66
"fmt"
77
"math"
8+
"sync"
89
"sync/atomic"
910
"time"
1011

@@ -67,10 +68,12 @@ func NewStoreReconciler(store database.Store,
6768
provisionNotifyCh: make(chan database.ProvisionerJob, 10),
6869
}
6970

70-
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
71-
if err := registerer.Register(reconciler.metrics); err != nil {
72-
// If the registerer fails to register the metrics collector, it's not fatal.
73-
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
71+
if registerer != nil {
72+
reconciler.metrics = NewMetricsCollector(store, logger, reconciler)
73+
if err := registerer.Register(reconciler.metrics); err != nil {
74+
// If the registerer fails to register the metrics collector, it's not fatal.
75+
logger.Error(context.Background(), "failed to register prometheus metrics", slog.Error(err))
76+
}
7477
}
7578

7679
return reconciler
@@ -87,16 +90,27 @@ func (c *StoreReconciler) Run(ctx context.Context) {
8790
slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()),
8891
slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String()))
8992

93+
var wg sync.WaitGroup
9094
ticker := c.clock.NewTicker(reconciliationInterval)
9195
defer ticker.Stop()
9296
defer func() {
97+
wg.Wait()
9398
c.done <- struct{}{}
9499
}()
95100

96101
// nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions.
97102
ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx))
98103
c.cancelFn = cancel
99104

105+
// Start updating metrics in the background.
106+
if c.metrics != nil {
107+
wg.Add(1)
108+
go func() {
109+
defer wg.Done()
110+
c.metrics.BackgroundFetch(ctx, metricsUpdateInterval, metricsUpdateTimeout)
111+
}()
112+
}
113+
100114
// Everything is in place, reconciler can now be considered as running.
101115
//
102116
// NOTE: without this atomic bool, Stop might race with Run for the c.cancelFn above.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy