Skip to content

Commit 0347231

Browse files
authored
feat: expose agent metrics via Prometheus endpoint (coder#7011)
* WIP * WIP * WIP * Agents * fix * 1min * fix * WIP * Test * docs * fmt * Add timer to measure the metrics collection * Use CachedGaugeVec * Unit tests * Address PR comments
1 parent dd85ea8 commit 0347231

File tree

7 files changed

+629
-48
lines changed

7 files changed

+629
-48
lines changed

cli/server.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
896896
return xerrors.Errorf("create coder API: %w", err)
897897
}
898898

899+
if cfg.Prometheus.Enable {
900+
// Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API.
901+
closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
902+
if err != nil {
903+
return xerrors.Errorf("register agents prometheus metric: %w", err)
904+
}
905+
defer closeAgentsFunc()
906+
}
907+
899908
client := codersdk.New(localURL)
900909
if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
901910
// The certificate will likely be self-signed or for a different

coderd/prometheusmetrics/collector.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package prometheusmetrics
2+
3+
import (
4+
"sync"
5+
6+
"github.com/prometheus/client_golang/prometheus"
7+
)
8+
9+
// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows
10+
// for staging changes in the metrics vector. Calling "WithLabelValues(...)"
11+
// will update the internal gauge value, but it will not be returned by
12+
// "Collect(...)" until the "Commit()" method is called. The "Commit()" method
13+
// resets the internal gauge and applies all staged changes to it.
14+
//
15+
// The Use of CachedGaugeVec is recommended for use cases when there is a risk
16+
// that the Prometheus collector receives incomplete metrics, collected
17+
// in the middle of metrics recalculation, between "Reset()" and the last
18+
// "WithLabelValues()" call.
19+
type CachedGaugeVec struct {
20+
m sync.Mutex
21+
22+
gaugeVec *prometheus.GaugeVec
23+
records []vectorRecord
24+
}
25+
26+
// Compile-time check that *CachedGaugeVec satisfies prometheus.Collector.
var _ prometheus.Collector = (*CachedGaugeVec)(nil)
27+
28+
// VectorOperation identifies how a staged value is applied to its gauge when
// the staged changes are committed.
type VectorOperation int

const (
	// VectorOperationAdd adds the staged value to the gauge.
	VectorOperationAdd VectorOperation = iota
	// VectorOperationSet overwrites the gauge with the staged value.
	VectorOperationSet
)
34+
35+
type vectorRecord struct {
36+
operation VectorOperation
37+
value float64
38+
labelValues []string
39+
}
40+
41+
func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
42+
return &CachedGaugeVec{
43+
gaugeVec: gaugeVec,
44+
}
45+
}
46+
47+
func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
48+
v.gaugeVec.Describe(desc)
49+
}
50+
51+
func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
52+
v.m.Lock()
53+
defer v.m.Unlock()
54+
55+
v.gaugeVec.Collect(ch)
56+
}
57+
58+
func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
59+
switch operation {
60+
case VectorOperationAdd:
61+
case VectorOperationSet:
62+
default:
63+
panic("unsupported vector operation")
64+
}
65+
66+
v.m.Lock()
67+
defer v.m.Unlock()
68+
69+
v.records = append(v.records, vectorRecord{
70+
operation: operation,
71+
value: value,
72+
labelValues: labelValues,
73+
})
74+
}
75+
76+
// Commit will set the internal value as the cached value to return from "Collect()".
77+
// The internal metric value is completely reset, so the caller should expect
78+
// the gauge to be empty for the next 'WithLabelValues' values.
79+
func (v *CachedGaugeVec) Commit() {
80+
v.m.Lock()
81+
defer v.m.Unlock()
82+
83+
v.gaugeVec.Reset()
84+
for _, record := range v.records {
85+
g := v.gaugeVec.WithLabelValues(record.labelValues...)
86+
switch record.operation {
87+
case VectorOperationAdd:
88+
g.Add(record.value)
89+
case VectorOperationSet:
90+
g.Set(record.value)
91+
}
92+
}
93+
94+
v.records = nil
95+
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
package prometheusmetrics_test
2+
3+
import (
4+
"sort"
5+
"testing"
6+
7+
"github.com/prometheus/client_golang/prometheus"
8+
dto "github.com/prometheus/client_model/go"
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
12+
"github.com/coder/coder/coderd/prometheusmetrics"
13+
)
14+
15+
func TestCollector_Add(t *testing.T) {
16+
t.Parallel()
17+
18+
// given
19+
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
20+
Namespace: "coderd",
21+
Subsystem: "agents",
22+
Name: "up",
23+
Help: "The number of active agents per workspace.",
24+
}, []string{"username", "workspace_name"}))
25+
26+
// when
27+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
28+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace")
29+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace")
30+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace")
31+
agentsGauge.Commit()
32+
33+
// then
34+
ch := make(chan prometheus.Metric, 2)
35+
agentsGauge.Collect(ch)
36+
37+
metrics := collectAndSortMetrics(t, agentsGauge, 2)
38+
39+
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
40+
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
41+
assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value
42+
43+
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
44+
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
45+
assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value
46+
}
47+
48+
func TestCollector_Set(t *testing.T) {
49+
t.Parallel()
50+
51+
// given
52+
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
53+
Namespace: "coderd",
54+
Subsystem: "agents",
55+
Name: "up",
56+
Help: "The number of active agents per workspace.",
57+
}, []string{"username", "workspace_name"}))
58+
59+
// when
60+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace")
61+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
62+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
63+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace")
64+
agentsGauge.Commit()
65+
66+
// then
67+
ch := make(chan prometheus.Metric, 2)
68+
agentsGauge.Collect(ch)
69+
70+
metrics := collectAndSortMetrics(t, agentsGauge, 2)
71+
72+
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
73+
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
74+
assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value
75+
76+
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
77+
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
78+
assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value
79+
}
80+
81+
func TestCollector_Set_Add(t *testing.T) {
82+
t.Parallel()
83+
84+
// given
85+
agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
86+
Namespace: "coderd",
87+
Subsystem: "agents",
88+
Name: "up",
89+
Help: "The number of active agents per workspace.",
90+
}, []string{"username", "workspace_name"}))
91+
92+
// when
93+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace")
94+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace")
95+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
96+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace")
97+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
98+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
99+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace")
100+
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace")
101+
agentsGauge.Commit()
102+
103+
// then
104+
ch := make(chan prometheus.Metric, 2)
105+
agentsGauge.Collect(ch)
106+
107+
metrics := collectAndSortMetrics(t, agentsGauge, 2)
108+
109+
assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username
110+
assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name
111+
assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value
112+
113+
assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username
114+
assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
115+
assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value
116+
}
117+
118+
func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric {
119+
ch := make(chan prometheus.Metric, count)
120+
defer close(ch)
121+
122+
var metrics []dto.Metric
123+
124+
collector.Collect(ch)
125+
for i := 0; i < count; i++ {
126+
m := <-ch
127+
128+
var metric dto.Metric
129+
err := m.Write(&metric)
130+
require.NoError(t, err)
131+
132+
metrics = append(metrics, metric)
133+
}
134+
135+
// Ensure always the same order of metrics
136+
sort.Slice(metrics, func(i, j int) bool {
137+
return sort.StringsAreSorted([]string{metrics[i].Label[0].GetValue(), metrics[j].Label[1].GetValue()})
138+
})
139+
return metrics
140+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy