diff --git a/cli/server.go b/cli/server.go index 7d4261a2e2a7f..c93064f34c8ef 100644 --- a/cli/server.go +++ b/cli/server.go @@ -889,6 +889,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. return xerrors.Errorf("create coder API: %w", err) } + if cfg.Prometheus.Enable { + // Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API. + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + } + client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go new file mode 100644 index 0000000000000..8839553a1ffdd --- /dev/null +++ b/coderd/prometheusmetrics/collector.go @@ -0,0 +1,95 @@ +package prometheusmetrics + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows +// for staging changes in the metrics vector. Calling "WithLabelValues(...)" +// will update the internal gauge value, but it will not be returned by +// "Collect(...)" until the "Commit()" method is called. The "Commit()" method +// resets the internal gauge and applies all staged changes to it. +// +// The Use of CachedGaugeVec is recommended for use cases when there is a risk +// that the Prometheus collector receives incomplete metrics, collected +// in the middle of metrics recalculation, between "Reset()" and the last +// "WithLabelValues()" call. 
+type CachedGaugeVec struct { + m sync.Mutex + + gaugeVec *prometheus.GaugeVec + records []vectorRecord +} + +var _ prometheus.Collector = new(CachedGaugeVec) + +type VectorOperation int + +const ( + VectorOperationAdd VectorOperation = iota + VectorOperationSet +) + +type vectorRecord struct { + operation VectorOperation + value float64 + labelValues []string +} + +func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { + return &CachedGaugeVec{ + gaugeVec: gaugeVec, + } +} + +func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { + v.gaugeVec.Describe(desc) +} + +func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Collect(ch) +} + +func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { + switch operation { + case VectorOperationAdd: + case VectorOperationSet: + default: + panic("unsupported vector operation") + } + + v.m.Lock() + defer v.m.Unlock() + + v.records = append(v.records, vectorRecord{ + operation: operation, + value: value, + labelValues: labelValues, + }) +} + +// Commit will set the internal value as the cached value to return from "Collect()". +// The internal metric value is completely reset, so the caller should expect +// the gauge to be empty for the next 'WithLabelValues' values. +func (v *CachedGaugeVec) Commit() { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Reset() + for _, record := range v.records { + g := v.gaugeVec.WithLabelValues(record.labelValues...) 
+ switch record.operation { + case VectorOperationAdd: + g.Add(record.value) + case VectorOperationSet: + g.Set(record.value) + } + } + + v.records = nil +} diff --git a/coderd/prometheusmetrics/collector_test.go b/coderd/prometheusmetrics/collector_test.go new file mode 100644 index 0000000000000..9d63f6669113d --- /dev/null +++ b/coderd/prometheusmetrics/collector_test.go @@ -0,0 +1,140 @@ +package prometheusmetrics_test + +import ( + "sort" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/prometheusmetrics" +) + +func TestCollector_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace 
name + assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your 
workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric { + ch := make(chan prometheus.Metric, count) + defer close(ch) + + var metrics []dto.Metric + + collector.Collect(ch) + for i := 0; i < count; i++ { + m := <-ch + + var metric dto.Metric + err := m.Write(&metric) + require.NoError(t, err) + + metrics = append(metrics, metric) + } + + // Order by the first label (username) with a strict comparison so the assertions see a stable order + sort.Slice(metrics, func(i, j int) bool { + return metrics[i].Label[0].GetValue() < metrics[j].Label[0].GetValue() + }) + return metrics +} diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 
536522bf73e04..83e4af90d0765 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,13 +2,24 @@ package prometheusmetrics import ( "context" + "database/sql" + "errors" + "fmt" + "strconv" + "strings" + "sync/atomic" "time" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "tailscale.com/tailcfg" + + "cdr.dev/slog" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" + "github.com/coder/coder/tailnet" ) // ActiveUsers tracks the number of users that have authenticated within the past hour. @@ -106,3 +117,181 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa }() return cancelFunc, nil } + +// Agents registers the coderd_agents_up, coderd_agents_connections, +// coderd_agents_connection_latencies_seconds and coderd_agents_apps gauges and +// the coderd_prometheusmetrics_agents_execution_seconds histogram, then starts +// a goroutine that recomputes the gauges on every tick of "duration" (one +// minute when zero is passed). The returned CancelFunc stops that goroutine. +func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 1 * time.Minute + } + + agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + err := registerer.Register(agentsGauge) + if err != nil { + return nil, err + } + + agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connections", + Help: "Agent connections with statuses.", + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})) + err = registerer.Register(agentsConnectionsGauge) + if err != nil { + return nil, err + } + + agentsConnectionLatenciesGauge := 
NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection_latencies_seconds", + Help: "Agent connection latencies in seconds.", + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})) + err = registerer.Register(agentsConnectionLatenciesGauge) + if err != nil { + return nil, err + } + + agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "apps", + Help: "Agent applications with statuses.", + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"})) + err = registerer.Register(agentsAppsGauge) + if err != nil { + return nil, err + } + + metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agents_execution_seconds", + Help: "Histogram for duration of agents metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err = registerer.Register(metricsCollectorAgents) + if err != nil { + return nil, err + } + + // nolint:gocritic // Prometheus must collect metrics for all Coder users. 
+ ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgents) + + workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ + AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), + }) + if err != nil { + logger.Error(ctx, "can't get workspace rows", slog.Error(err)) + continue + } + + for _, workspace := range workspaceRows { + user, err := db.GetUserByID(ctx, workspace.OwnerID) + if err != nil { + logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) + // The user row is meaningless on error, so there is no username to label a sample with. + continue + } + + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) + if err != nil { + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) + continue + } + + if len(agents) == 0 { + logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) + continue + } + + for _, agent := range agents { + // Collect information about agents + agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name) + + connectionStatus := agent.Status(agentInactiveDisconnectTimeout) + node := (*coordinator.Load()).Node(agent.ID) + + tailnetNode := "unknown" + if node != nil { + tailnetNode = node.ID.String() + } + + agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode) + + if node 
== nil { + logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) + } else { + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", rawRegion), slog.Error(err)) + continue + } + + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + // NOTE(review): the first label is declared as "agent_id" but receives agent.Name; confirm which is intended. + agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)) + } + } + + // Collect information about registered applications + apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) + continue + } + + for _, app := range apps { + agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)) + } + } + } + + agentsGauge.Commit() + agentsConnectionsGauge.Commit() + agentsConnectionLatenciesGauge.Commit() + agentsAppsGauge.Commit() + + logger.Debug(ctx, "Agent metrics collection is done") + // ObserveDuration already records the elapsed time into the histogram; observing again would count the run twice. + timer.ObserveDuration() + } + }() + return cancelFunc, nil +} diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index a0b375ccf8622..e765c5f2a1128 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++
b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -3,6 +3,7 @@ package prometheusmetrics_test import ( "context" "database/sql" + "sync/atomic" "testing" "time" @@ -11,11 +12,18 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "cdr.dev/slog/sloggers/slogtest" + + "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/tailnet" + "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" ) @@ -239,3 +247,108 @@ func TestWorkspaces(t *testing.T) { }) } } + +func TestAgents(t *testing.T) { + t.Parallel() + + // Build a sample workspace with test agent and fake application + client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + db := api.Database + + user := coderdtest.CreateFirstUser(t, client) + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: []*proto.Provision_Response{{ + Type: &proto.Provision_Response_Complete{ + Complete: &proto.Provision_Complete{ + Resources: []*proto.Resource{{ + Name: "example", + Type: "aws_instance", + Agents: []*proto.Agent{{ + Id: uuid.NewString(), + Name: "testagent", + Directory: t.TempDir(), + Auth: &proto.Agent_Token{ + Token: uuid.NewString(), + }, + Apps: []*proto.App{ + { + Slug: "fake-app", + DisplayName: "Fake application", + SharingLevel: proto.AppSharingLevel_OWNER, + // Hopefully this IP and port doesn't exist. 
+ Url: "http://127.1.0.1:65535", + }, + }, + }}, + }}, + }, + }, + }}, + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + + // given + coordinator := tailnet.NewCoordinator() + coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} + coordinatorPtr.Store(&coordinator) + derpMap := tailnettest.RunDERPAndSTUN(t) + agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests + registry := prometheus.NewRegistry() + + // when + cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + t.Cleanup(cancel) + + // then + require.NoError(t, err) + + var agentsUp bool + var agentsConnections bool + var agentsApps bool + var agentsExecutionInSeconds bool + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + if len(metrics) < 1 { + return false + } + + for _, metric := range metrics { + switch metric.GetName() { + case "coderd_agents_up": + assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsUp = true + case "coderd_agents_connections": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state + assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node + assert.Equal(t, "testuser", 
metric.Metric[0].Label[4].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsConnections = true + case "coderd_agents_apps": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name + assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health + assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsApps = true + case "coderd_prometheusmetrics_agents_execution_seconds": + agentsExecutionInSeconds = true + default: + require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + } + } + return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds + }, testutil.WaitShort, testutil.IntervalFast) +} diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f35ba5d1c5182..2898f8f4a469c 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,53 +29,58 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. 
| | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. 
| | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. 
| | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. 
| `status` | +| `coderd_prometheusmetrics_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. 
| | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. 
| `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 50bbc87990dda..7e598b17abe56 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -1,3 +1,23 @@ +# HELP coderd_agents_apps Agent applications with statuses. +# TYPE coderd_agents_apps gauge +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds. +# TYPE coderd_agents_connection_latencies_seconds gauge +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416 +# HELP coderd_agents_connections Agent connections with statuses. 
+# TYPE coderd_agents_connections gauge +coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1 +# HELP coderd_agents_up The number of active agents per workspace. +# TYPE coderd_agents_up gauge +coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 # HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds. # TYPE coderd_api_websocket_durations_seconds histogram coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0 @@ -568,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files # HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status. # TYPE coderd_api_workspace_latest_build_total gauge coderd_api_workspace_latest_build_total{status="succeeded"} 1 +# HELP coderd_metrics_collector_agents_execution_seconds Histogram for duration of agents metrics collection in seconds. 
+# TYPE coderd_metrics_collector_agents_execution_seconds histogram +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.001"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.005"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.01"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.025"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.05"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="10"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="30"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="+Inf"} 2 +coderd_metrics_collector_agents_execution_seconds_sum 0.0592915 +coderd_metrics_collector_agents_execution_seconds_count 2 # HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds. # TYPE coderd_provisionerd_job_timings_seconds histogram coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0 pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy