From 8d4e67d3a07ff3e40d62139fcd71f8488aa31884 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Mon, 3 Apr 2023 21:43:18 +0200 Subject: [PATCH 01/15] WIP --- cli/server.go | 6 ++ coderd/prometheusmetrics/prometheusmetrics.go | 75 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/cli/server.go b/cli/server.go index b6fa7c31b647c..5ce53af185e1d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -868,6 +868,12 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, options.Database, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 536522bf73e04..2c1c36dddbc34 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -106,3 +106,78 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa }() return cancelFunc, nil } + +// Agents tracks the total number of workspaces with labels on status. +func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 15 * time.Second // TODO 5 * time.Minute + } + + agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection", + Help: "The agent connection with a status.", + }, []string{"agent_name", "workspace_name", "status"}) + err := registerer.Register(agentsConnectionGauge) + if err != nil { + return nil, err + } + + agentsUserLatenciesHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "user_latencies_seconds", + Help: "The user's agent latency in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }, []string{"agent_id", "workspace", "connection_type", "ide"}) + err = registerer.Register(agentsUserLatenciesHistogram) + if err != nil { + return nil, err + } + + ctx, cancelFunc := context.WithCancel(ctx) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + // FIXME Optimize this routine: SQL db calls + + builds, err := db.GetLatestWorkspaceBuilds(ctx) + if err != nil { + continue + } + + agentsConnectionGauge.Reset() + for _, build := range builds { + workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) + if err != nil { + continue + } + + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) + if err != nil { + continue + } + + if len(agents) == 0 { + continue + } + + for _, agent := range agents { + connectionStatus := agent.Status(6 * time.Second) + + // FIXME AgentInactiveDisconnectTimeout + agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + } + } + } + }() + return cancelFunc, nil +} From 9ad09b20f8e830c346eb3e41c4eaf4c9e7d29475 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Tue, 4 Apr 2023 19:47:27 +0200 Subject: [PATCH 02/15] WIP --- cli/server.go | 20 
++++--- coderd/prometheusmetrics/prometheusmetrics.go | 60 +++++++++++++++++-- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/cli/server.go b/cli/server.go index 30ceb06aefc46..b3961b52e4e53 100644 --- a/cli/server.go +++ b/cli/server.go @@ -849,6 +849,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer options.Telemetry.Close() } + databaseStoreWithoutAuth := options.Database + + // We use a separate coderAPICloser so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, coderAPICloser, err := newAPI(ctx, options) + if err != nil { + return xerrors.Errorf("create coder API: %w", err) + } + // This prevents the pprof import from being accidentally deleted. _ = pprof.Handler if cfg.Pprof.Enable { @@ -871,7 +881,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, options.Database, 0) + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0) if err != nil { return xerrors.Errorf("register agents prometheus metric: %w", err) } @@ -887,14 +897,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } - // We use a separate coderAPICloser so the Enterprise API - // can have it's own close functions. This is cleaner - // than abstracting the Coder API itself. - coderAPI, coderAPICloser, err := newAPI(ctx, options) - if err != nil { - return xerrors.Errorf("create coder API: %w", err) - } - client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 2c1c36dddbc34..5ce6fbd51f6c5 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,13 +2,20 @@ package prometheusmetrics import ( "context" + "fmt" + "log" + "strconv" + "strings" + "sync/atomic" "time" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "tailscale.com/tailcfg" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/tailnet" ) // ActiveUsers tracks the number of users that have authenticated within the past hour. @@ -108,7 +115,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. 
-func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 15 * time.Second // TODO 5 * time.Minute } @@ -124,23 +131,26 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S return nil, err } - agentsUserLatenciesHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "user_latencies_seconds", Help: "The user's agent latency in seconds.", - Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, - }, []string{"agent_id", "workspace", "connection_type", "ide"}) - err = registerer.Register(agentsUserLatenciesHistogram) + }, []string{"agent_id", "workspace_name", "derp_region", "preferred"}) + err = registerer.Register(agentsUserLatenciesGauge) if err != nil { return nil, err } + // FIXME connection_type ide + ctx, cancelFunc := context.WithCancel(ctx) ticker := time.NewTicker(duration) go func() { defer ticker.Stop() for { + log.Println("Agents!!!") + select { case <-ctx.Done(): return @@ -151,18 +161,22 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S builds, err := db.GetLatestWorkspaceBuilds(ctx) if err != nil { + log.Println("1", err) continue } agentsConnectionGauge.Reset() + agentsUserLatenciesGauge.Reset() for _, build := range builds { workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) if err != nil { + log.Println("2", err) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) if err != nil { + log.Println("3", err) continue } @@ -170,11 +184,47 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S continue } + // FIXME publish workspace even if no agents + for _, agent := range agents { connectionStatus := agent.Status(6 * time.Second) // FIXME AgentInactiveDisconnectTimeout + log.Println("with value " + agent.Name) agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + + node := (*coordinator.Load()).Node(agent.ID) + if node != nil { + log.Println("coordinator") + + for rawRegion, latency := range node.DERPLatency { + log.Println(rawRegion, latency) + + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err) + } + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + log.Println(region, latency) + agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + } + } else { + log.Println("node is null") + } + + // FIXME publish agent even if DERP is missing + // FIXME IDE? 
} } } From 440657c16195948c17a72751193043ce44a15531 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 10:43:30 +0200 Subject: [PATCH 03/15] WIP --- coderd/prometheusmetrics/prometheusmetrics.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 5ce6fbd51f6c5..d1af83a4a4439 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -190,6 +190,8 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S connectionStatus := agent.Status(6 * time.Second) // FIXME AgentInactiveDisconnectTimeout + // ? connection_timeout_seconds + // obok latency lifecycle_state log.Println("with value " + agent.Name) agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) @@ -225,6 +227,7 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S // FIXME publish agent even if DERP is missing // FIXME IDE? + // FIXME agent connection zero } } } From 8764f8975d75ebb45d9e5996fac9b7509c953e33 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:08:27 +0200 Subject: [PATCH 04/15] Agents --- cli/server.go | 33 ++--- coderd/prometheusmetrics/prometheusmetrics.go | 133 ++++++++++-------- 2 files changed, 92 insertions(+), 74 deletions(-) diff --git a/cli/server.go b/cli/server.go index b3961b52e4e53..c93064f34c8ef 100644 --- a/cli/server.go +++ b/cli/server.go @@ -849,16 +849,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer options.Telemetry.Close() } - databaseStoreWithoutAuth := options.Database - - // We use a separate coderAPICloser so the Enterprise API - // can have it's own close functions. This is cleaner - // than abstracting the Coder API itself. - coderAPI, coderAPICloser, err := newAPI(ctx, options) - if err != nil { - return xerrors.Errorf("create coder API: %w", err) - } - // This prevents the pprof import from being accidentally deleted. _ = pprof.Handler if cfg.Pprof.Enable { @@ -881,12 +871,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0) - if err != nil { - return xerrors.Errorf("register agents prometheus metric: %w", err) - } - defer closeAgentsFunc() - //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), @@ -897,6 +881,23 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } + // We use a separate coderAPICloser so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, coderAPICloser, err := newAPI(ctx, options) + if err != nil { + return xerrors.Errorf("create coder API: %w", err) + } + + if cfg.Prometheus.Enable { + // Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API. 
+ closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + } + client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index d1af83a4a4439..31a0b712a1511 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -3,7 +3,6 @@ package prometheusmetrics import ( "context" "fmt" - "log" "strconv" "strings" "sync/atomic" @@ -13,8 +12,11 @@ import ( "github.com/prometheus/client_golang/prometheus" "tailscale.com/tailcfg" + "cdr.dev/slog" + "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" "github.com/coder/coder/tailnet" ) @@ -115,119 +117,134 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. -func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 15 * time.Second // TODO 5 * time.Minute } - agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", - Name: "connection", - Help: "The agent connection with a status.", - }, []string{"agent_name", "workspace_name", "status"}) - err := registerer.Register(agentsConnectionGauge) + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"}) + err := registerer.Register(workspaceAgentsGauge) if err != nil { return nil, err } - agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", - Name: "user_latencies_seconds", - Help: "The user's agent latency in seconds.", - }, []string{"agent_id", "workspace_name", "derp_region", "preferred"}) - err = registerer.Register(agentsUserLatenciesGauge) + Name: "connections", + Help: "Agent connections with statuses.", + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) + err = registerer.Register(agentsConnectionGauge) if err != nil { return nil, err } - // FIXME connection_type ide + agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection_latencies_seconds", + Help: "Agent connection latencies in seconds.", + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}) + err = registerer.Register(agentsConnectionLatenciesGauge) + if err != nil { + return nil, err + } - ctx, cancelFunc := 
context.WithCancel(ctx) + // nolint:gocritic // Prometheus must collect metrics for all Coder users. + ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) go func() { defer ticker.Stop() for { - log.Println("Agents!!!") - select { case <-ctx.Done(): return case <-ticker.C: } - // FIXME Optimize this routine: SQL db calls + logger.Info(ctx, "Collect agent metrics now") - builds, err := db.GetLatestWorkspaceBuilds(ctx) + workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ + AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), + }) if err != nil { - log.Println("1", err) + logger.Error(ctx, "can't get workspace rows", slog.Error(err)) continue } + workspaceAgentsGauge.Reset() agentsConnectionGauge.Reset() - agentsUserLatenciesGauge.Reset() - for _, build := range builds { - workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) + agentsConnectionLatenciesGauge.Reset() + + for _, workspace := range workspaceRows { + user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { - log.Println("2", err) + logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } - agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { - log.Println("3", err) + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } if len(agents) == 0 { + logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } - // FIXME publish workspace even if no agents - for _, agent := range agents { - connectionStatus := agent.Status(6 * time.Second) - - // FIXME AgentInactiveDisconnectTimeout - // ? connection_timeout_seconds - // obok latency lifecycle_state - log.Println("with value " + agent.Name) - agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + // Collect information about agents + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) + + tailnetNode := "unknown" if node != nil { - log.Println("coordinator") + tailnetNode = node.ID.String() + } - for rawRegion, latency := range node.DERPLatency { - log.Println(rawRegion, latency) + agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) - regionParts := strings.SplitN(rawRegion, "-", 2) - regionID, err := strconv.Atoi(regionParts[0]) - if err != nil { - continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err) - } - region, found := derpMap.Regions[regionID] - if !found { - // It's possible that a workspace agent is using an old DERPMap - // and reports regions that do not exist. If that's the case, - // report the region as unknown! 
- region = &tailcfg.DERPRegion{ - RegionID: regionID, - RegionName: fmt.Sprintf("Unnamed %d", regionID), - } - } + if node == nil { + logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name)) + continue + } - log.Println(region, latency) - agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion)) + continue } - } else { - log.Println("node is null") + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) } - // FIXME publish agent even if DERP is missing // FIXME IDE? - // FIXME agent connection zero + // FIXME connection_type ide } } } From 663b5d5d7a78fce6f6d99943fc30759684a36618 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:31:06 +0200 Subject: [PATCH 05/15] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 93 ++++++++++++------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 31a0b712a1511..4e4d828981bc6 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -122,24 +122,24 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis duration = 15 * time.Second // TODO 5 * time.Minute } - workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", }, []string{"username", "workspace_name"}) - err := registerer.Register(workspaceAgentsGauge) + err := registerer.Register(agentsGauge) if err != nil { return nil, err } - agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) - err = registerer.Register(agentsConnectionGauge) + err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err } @@ -155,6 +155,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "apps", + Help: "Agent applications with statuses.", + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"}) + err = registerer.Register(agentsAppsGauge) + if err != nil { + 
return nil, err + } + // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) @@ -167,7 +178,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis case <-ticker.C: } - logger.Info(ctx, "Collect agent metrics now") + logger.Debug(ctx, "Collect agent metrics now") workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), @@ -177,34 +188,35 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis continue } - workspaceAgentsGauge.Reset() - agentsConnectionGauge.Reset() + agentsGauge.Reset() + agentsConnectionsGauge.Reset() agentsConnectionLatenciesGauge.Reset() + agentsAppsGauge.Reset() for _, workspace := range workspaceRows { user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { - logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { - logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } if len(agents) == 0 { - logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } for _, agent := range agents { // Collect information about agents - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) @@ -214,37 +226,46 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis tailnetNode = node.ID.String() } - agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) + agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) if node == nil { - logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name)) + logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) continue - } + } else { + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", 
rawRegion), slog.Error(err)) + continue + } - // Collect information about connection latencies - for rawRegion, latency := range node.DERPLatency { - regionParts := strings.SplitN(rawRegion, "-", 2) - regionID, err := strconv.Atoi(regionParts[0]) - if err != nil { - logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion)) - continue - } - region, found := derpMap.Regions[regionID] - if !found { - // It's possible that a workspace agent is using an old DERPMap - // and reports regions that do not exist. If that's the case, - // report the region as unknown! - region = &tailcfg.DERPRegion{ - RegionID: regionID, - RegionName: fmt.Sprintf("Unnamed %d", regionID), + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } } + + agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) } + } - agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + // Collect information about registered applications + apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) + if err != nil { + logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) + continue } - // FIXME IDE? - // FIXME connection_type ide + for _, app := range apps { + agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1) + } } } } From 63aff5ebf2f2c0695c1289fa557b9bd5fbfdb80f Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:32:49 +0200 Subject: [PATCH 06/15] 1min --- coderd/prometheusmetrics/prometheusmetrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 4e4d828981bc6..a9dc1933addc1 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -119,7 +119,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa // Agents tracks the total number of workspaces with labels on status. 
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { - duration = 15 * time.Second // TODO 5 * time.Minute + duration = 1 * time.Minute } agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ From 390548159b6fc8cf9d9f10b02e276384ff974b7d Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:45:16 +0200 Subject: [PATCH 07/15] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index a9dc1933addc1..5dddd4d20e388 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -230,7 +230,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis if node == nil { logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) - continue } else { // Collect information about connection latencies for rawRegion, latency := range node.DERPLatency { From f8d6f464fe93d3f7799463be9c6e0c9c6c6f5135 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 13:35:45 +0200 Subject: [PATCH 08/15] WIP --- .../prometheusmetrics_test.go | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index a0b375ccf8622..7b828e0514fd8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -6,16 +6,22 @@ import ( "testing" "time" + "sync/atomic" + "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/tailnet" + "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" ) @@ -239,3 +245,37 @@ func TestWorkspaces(t *testing.T) { }) } } + +func TestAgents(t *testing.T) { + t.Parallel() + + // given + db := dbfake.New() + + coordinator := tailnet.NewCoordinator() + coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} + coordinatorPtr.Store(&coordinator) + derpMap := tailnettest.RunDERPAndSTUN(t) + agentInactiveDisconnectTimeout := 1 * time.Hour + registry := prometheus.NewRegistry() + + // when + cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + t.Cleanup(cancel) + + // then + require.NoError(t, err) + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + if len(metrics) < 1 { + return false + } + + for _, metric := range metrics[0].Metric { + panic(metric) + } + return true + }, testutil.WaitShort, testutil.IntervalFast) +} From d487a77372e703abb9be64b38f4a8f8366bddf74 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:18:06 +0200 Subject: [PATCH 09/15] Test --- coderd/prometheusmetrics/prometheusmetrics.go | 4 +- .../prometheusmetrics_test.go | 83 
+++++++++++++++++-- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 5dddd4d20e388..222809db00bf8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,6 +2,8 @@ package prometheusmetrics import ( "context" + "database/sql" + "errors" "fmt" "strconv" "strings" @@ -257,7 +259,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis // Collect information about registered applications apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) - if err != nil { + if err != nil && !errors.Is(err, sql.ErrNoRows) { logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) continue } diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 7b828e0514fd8..e7829d8a34a18 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -15,11 +15,14 @@ import ( "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/provisionersdk/proto" "github.com/coder/coder/tailnet" "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" @@ -249,14 +252,53 @@ func TestWorkspaces(t *testing.T) { func TestAgents(t *testing.T) { t.Parallel() - // given - db := dbfake.New() + // Build a sample workspace with test agent and fake application + client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + db := api.Database + + user := coderdtest.CreateFirstUser(t, client) + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: []*proto.Provision_Response{{ + Type: &proto.Provision_Response_Complete{ + Complete: &proto.Provision_Complete{ + Resources: []*proto.Resource{{ + Name: "example", + Type: "aws_instance", + Agents: []*proto.Agent{{ + Id: uuid.NewString(), + Name: "testagent", + Directory: t.TempDir(), + Auth: &proto.Agent_Token{ + Token: uuid.NewString(), + }, + Apps: []*proto.App{ + { + Slug: "fake-app", + DisplayName: "Fake application", + SharingLevel: proto.AppSharingLevel_OWNER, + // Hopefully this IP and port doesn't exist. 
+ Url: "http://127.1.0.1:65535", + }, + }, + }}, + }}, + }, + }, + }}, + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + // given coordinator := tailnet.NewCoordinator() coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} coordinatorPtr.Store(&coordinator) derpMap := tailnettest.RunDERPAndSTUN(t) - agentInactiveDisconnectTimeout := 1 * time.Hour + agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests registry := prometheus.NewRegistry() // when @@ -265,6 +307,10 @@ func TestAgents(t *testing.T) { // then require.NoError(t, err) + + var agentsUp bool + var agentsConnections bool + var agentsApps bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -273,9 +319,34 @@ func TestAgents(t *testing.T) { return false } - for _, metric := range metrics[0].Metric { - panic(metric) + for _, metric := range metrics { + switch metric.GetName() { + case "coderd_agents_up": + assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsUp = true + case "coderd_agents_connections": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state + assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node + assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsConnections = true + case "coderd_agents_apps": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name + assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health + assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsApps = true + default: + require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + } } - return true + return agentsUp && agentsConnections && agentsApps }, testutil.WaitShort, testutil.IntervalFast) } From 7acbaf09ac86a9179d75949588099fecb708b3ad Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:19:39 +0200 Subject: [PATCH 10/15] docs --- docs/admin/prometheus.md | 4 ++++ scripts/metricsdocgen/metrics | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f35ba5d1c5182..e6b23a12702f4 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -31,6 +31,10 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically | Name | Type | Description | Labels | | -------------------------------------------- | --------- | 
------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | | `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | | `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | | `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 50bbc87990dda..9a5fc20dff8e1 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -1,3 +1,23 @@ +# HELP coderd_agents_apps Agent applications with statuses. +# TYPE coderd_agents_apps gauge +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds. +# TYPE coderd_agents_connection_latencies_seconds gauge +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416 +# HELP coderd_agents_connections Agent connections with statuses. +# TYPE coderd_agents_connections gauge +coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1 +# HELP coderd_agents_up The number of active agents per workspace. +# TYPE coderd_agents_up gauge +coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 # HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds. 
# TYPE coderd_api_websocket_durations_seconds histogram coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0 From 7418779f7b202e2df48e0fbca70906488d6db9ca Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:26:14 +0200 Subject: [PATCH 11/15] fmt --- coderd/prometheusmetrics/prometheusmetrics_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e7829d8a34a18..f7c5fc689e937 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -3,11 +3,10 @@ package prometheusmetrics_test import ( "context" "database/sql" + "sync/atomic" "testing" "time" - "sync/atomic" - "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" From 3a8e4e6ddf9df8987858f8c30ac8c7f1d10c5495 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 11:52:01 +0200 Subject: [PATCH 12/15] Add timer to measure the metrics collection --- coderd/prometheusmetrics/prometheusmetrics.go | 18 ++- .../prometheusmetrics_test.go | 5 +- docs/admin/prometheus.md | 105 +++++++++--------- scripts/metricsdocgen/metrics | 16 +++ 4 files changed, 90 insertions(+), 54 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 222809db00bf8..94c47230a4579 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -168,6 +168,18 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agents_execution_seconds", + Help: "Histogram for duration of agents metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err = registerer.Register(metricsCollectorAgents) + if err != nil { + return nil, err + } + // nolint:gocritic // Prometheus must collect metrics for all Coder users. 
ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) @@ -180,7 +192,8 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis case <-ticker.C: } - logger.Debug(ctx, "Collect agent metrics now") + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgents) workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), @@ -269,6 +282,9 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } } + + logger.Debug(ctx, "Agent metrics collection is done") + metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } }() return cancelFunc, nil diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index f7c5fc689e937..e765c5f2a1128 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -310,6 +310,7 @@ func TestAgents(t *testing.T) { var agentsUp bool var agentsConnections bool var agentsApps bool + var agentsExecutionInSeconds bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -342,10 +343,12 @@ func TestAgents(t *testing.T) { assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value agentsApps = true + case "coderd_prometheusmetrics_agents_execution_seconds": + agentsExecutionInSeconds = true default: require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - return agentsUp && agentsConnections && agentsApps + return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds }, testutil.WaitShort, testutil.IntervalFast) } diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index e6b23a12702f4..2898f8f4a469c 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,57 +29,58 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. 
| `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. 
| | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. 
| | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 9a5fc20dff8e1..7e598b17abe56 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -588,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files # HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status. # TYPE coderd_api_workspace_latest_build_total gauge coderd_api_workspace_latest_build_total{status="succeeded"} 1 +# HELP coderd_metrics_collector_agents_execution_seconds Histogram for duration of agents metrics collection in seconds. 
+# TYPE coderd_metrics_collector_agents_execution_seconds histogram +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.001"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.005"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.01"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.025"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.05"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="10"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="30"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="+Inf"} 2 +coderd_metrics_collector_agents_execution_seconds_sum 0.0592915 +coderd_metrics_collector_agents_execution_seconds_count 2 # HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds. # TYPE coderd_provisionerd_job_timings_seconds histogram coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0 From b5d0581caec093a09ae40e0477af35e095202e82 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 13:26:11 +0200 Subject: [PATCH 13/15] Use CachedGaugeVec --- coderd/prometheusmetrics/collector.go | 80 +++++++++++++++++++ coderd/prometheusmetrics/prometheusmetrics.go | 40 +++++----- 2 files changed, 100 insertions(+), 20 deletions(-) create mode 100644 coderd/prometheusmetrics/collector.go diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go new file mode 100644 index 0000000000000..b92f992a6a7ec --- /dev/null +++ b/coderd/prometheusmetrics/collector.go @@ -0,0 +1,80 @@ +package prometheusmetrics + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +type CachedGaugeVec struct { + m sync.Mutex + + gaugeVec *prometheus.GaugeVec + records []vectorRecord +} + +var _ prometheus.Collector = new(CachedGaugeVec) + +type VectorOperation int + +const ( + VectorOperationAdd VectorOperation = iota + VectorOperationSet +) + +type vectorRecord struct { + operation VectorOperation + value float64 + labelValues []string +} + +func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { + return &CachedGaugeVec{ + gaugeVec: gaugeVec, + } +} + +func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Describe(desc) +} + +func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Collect(ch) +} + +func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { + v.m.Lock() + defer v.m.Unlock() + + v.records = append(v.records, vectorRecord{ + operation: operation, + value: value, + labelValues: labelValues, + }) +} + +func (v *CachedGaugeVec) Commit() { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Reset() + for _, record := range v.records { + g := v.gaugeVec.WithLabelValues(record.labelValues...) 
+ switch record.operation { + case VectorOperationAdd: + g.Add(record.value) + case VectorOperationSet: + g.Set(record.value) + default: + panic("unsupported vector operation") + } + } + + v.records = nil +} diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 94c47230a4579..83e4af90d0765 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -124,45 +124,45 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis duration = 1 * time.Minute } - agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{"username", "workspace_name"}) + }, []string{"username", "workspace_name"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err } - agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", - }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})) err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err } - agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}) + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err } - agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "apps", Help: "Agent applications with statuses.", - }, []string{"agent_name", "username", "workspace_name", "app_name", "health"}) + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"})) err = registerer.Register(agentsAppsGauge) if err != nil { return nil, err @@ -203,35 +203,30 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis continue } - agentsGauge.Reset() - agentsConnectionsGauge.Reset() - agentsConnectionLatenciesGauge.Reset() - agentsAppsGauge.Reset() - for _, workspace := range workspaceRows { user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, 
user.Username, workspace.Name) continue } if len(agents) == 0 { logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } for _, agent := range agents { // Collect information about agents - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name) connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) @@ -241,7 +236,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis tailnetNode = node.ID.String() } - agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) + agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode) if node == nil { logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) @@ -266,7 +261,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } - agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)) } } @@ -278,11 +273,16 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } for _, app := range apps { - agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1) + agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)) } } } + agentsGauge.Commit() + agentsConnectionsGauge.Commit() + agentsConnectionLatenciesGauge.Commit() + agentsAppsGauge.Commit() + logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } From e4d708b8f36214712989605f1589ee0f1b9dc613 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 14:40:36 +0200 Subject: [PATCH 14/15] Unit tests --- coderd/prometheusmetrics/collector_test.go | 140 +++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 coderd/prometheusmetrics/collector_test.go diff --git a/coderd/prometheusmetrics/collector_test.go b/coderd/prometheusmetrics/collector_test.go new file mode 100644 index 0000000000000..9d63f6669113d --- /dev/null +++ b/coderd/prometheusmetrics/collector_test.go @@ -0,0 +1,140 @@ +package prometheusmetrics_test + +import ( + "sort" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/prometheusmetrics" +) + +func TestCollector_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) 
+ + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace") + 
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric { + ch := make(chan prometheus.Metric, count) + defer close(ch) + + var metrics []dto.Metric + + collector.Collect(ch) + for i := 0; i < count; i++ { + m := <-ch + + var metric dto.Metric + err := m.Write(&metric) + require.NoError(t, err) + + metrics = append(metrics, metric) + } + + // Ensure always the same order of metrics + sort.Slice(metrics, func(i, j int) bool { + return sort.StringsAreSorted([]string{metrics[i].Label[0].GetValue(), metrics[j].Label[1].GetValue()}) + }) + return metrics +} From e0669f013cc947f8719ddd53b49e18086b8eb54d Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 7 Apr 2023 17:38:01 +0200 Subject: [PATCH 15/15] Address PR comments --- coderd/prometheusmetrics/collector.go | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go index b92f992a6a7ec..8839553a1ffdd 100644 --- a/coderd/prometheusmetrics/collector.go +++ b/coderd/prometheusmetrics/collector.go @@ -6,6 +6,16 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows +// for staging changes in the metrics vector. Calling "WithLabelValues(...)" +// will update the internal gauge value, but it will not be returned by +// "Collect(...)" until the "Commit()" method is called. The "Commit()" method +// resets the internal gauge and applies all staged changes to it. +// +// The Use of CachedGaugeVec is recommended for use cases when there is a risk +// that the Prometheus collector receives incomplete metrics, collected +// in the middle of metrics recalculation, between "Reset()" and the last +// "WithLabelValues()" call. type CachedGaugeVec struct { m sync.Mutex @@ -35,9 +45,6 @@ func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { } func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { - v.m.Lock() - defer v.m.Unlock() - v.gaugeVec.Describe(desc) } @@ -49,6 +56,13 @@ func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { } func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { + switch operation { + case VectorOperationAdd: + case VectorOperationSet: + default: + panic("unsupported vector operation") + } + v.m.Lock() defer v.m.Unlock() @@ -59,6 +73,9 @@ func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float6 }) } +// Commit will set the internal value as the cached value to return from "Collect()". +// The internal metric value is completely reset, so the caller should expect +// the gauge to be empty for the next 'WithLabelValues' values. 
 func (v *CachedGaugeVec) Commit() {
 	v.m.Lock()
 	defer v.m.Unlock()
@@ -71,8 +88,6 @@ func (v *CachedGaugeVec) Commit() {
 			g.Add(record.value)
 		case VectorOperationSet:
 			g.Set(record.value)
-		default:
-			panic("unsupported vector operation")
 		}
 	}
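
A minimal usage sketch of the CachedGaugeVec introduced in the patches above (not part of the patch series itself; the registry, label values, and listen address are illustrative). It shows the intended stage-then-commit flow: WithLabelValues only records staged operations, and a scrape sees them only after Commit replaces the previously committed values.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/coder/coder/coderd/prometheusmetrics"
)

func main() {
	registry := prometheus.NewRegistry()

	// Wrap a plain GaugeVec; CachedGaugeVec implements prometheus.Collector,
	// so it can be registered directly.
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))
	registry.MustRegister(agentsGauge)

	// Stage operations; scrapes do not observe them until Commit is called.
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, "alice", "dev")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "alice", "dev")

	// Reset the underlying gauge and apply all staged records; Collect and
	// Commit share a mutex, so a scrape sees either the old or the new set.
	agentsGauge.Commit()

	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
	log.Fatal(http.ListenAndServe(":2112", nil))
}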