Skip to content

Commit bc609d0

Browse files
authored
feat: integrate agentAPI with resources monitoring logic (#16438)
As part of the new resources monitoring logic - more specifically for OOM & OOD Notifications , we need to update the AgentAPI , and the agents logic. This PR aims to do it, and more specifically : We are updating the AgentAPI & TailnetAPI to version 24 to add two new methods in the AgentAPI : - One method to fetch the resources monitoring configuration - One method to push the datapoints for the resources monitoring. Also, this PR adds a new logic on the agent side, with a routine running and ticking - fetching the resources usage each time , but also storing it in a FIFO like queue. Finally, this PR fixes a problem we had with RBAC logic on the resources monitoring model, applying the same logic than we have for similar entities.
1 parent edd982e commit bc609d0

File tree

19 files changed

+1830
-218
lines changed

19 files changed

+1830
-218
lines changed

agent/agent.go

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,18 @@ import (
3838
"github.com/coder/coder/v2/agent/agentscripts"
3939
"github.com/coder/coder/v2/agent/agentssh"
4040
"github.com/coder/coder/v2/agent/proto"
41+
"github.com/coder/coder/v2/agent/proto/resourcesmonitor"
4142
"github.com/coder/coder/v2/agent/reconnectingpty"
4243
"github.com/coder/coder/v2/buildinfo"
44+
"github.com/coder/coder/v2/cli/clistat"
4345
"github.com/coder/coder/v2/cli/gitauth"
4446
"github.com/coder/coder/v2/coderd/database/dbtime"
4547
"github.com/coder/coder/v2/codersdk"
4648
"github.com/coder/coder/v2/codersdk/agentsdk"
4749
"github.com/coder/coder/v2/codersdk/workspacesdk"
4850
"github.com/coder/coder/v2/tailnet"
4951
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
52+
"github.com/coder/quartz"
5053
"github.com/coder/retry"
5154
)
5255

@@ -87,8 +90,8 @@ type Options struct {
8790
}
8891

8992
type Client interface {
90-
ConnectRPC23(ctx context.Context) (
91-
proto.DRPCAgentClient23, tailnetproto.DRPCTailnetClient23, error,
93+
ConnectRPC24(ctx context.Context) (
94+
proto.DRPCAgentClient24, tailnetproto.DRPCTailnetClient24, error,
9295
)
9396
RewriteDERPMap(derpMap *tailcfg.DERPMap)
9497
}
@@ -406,7 +409,7 @@ func (t *trySingleflight) Do(key string, fn func()) {
406409
fn()
407410
}
408411

409-
func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
412+
func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
410413
tickerDone := make(chan struct{})
411414
collectDone := make(chan struct{})
412415
ctx, cancel := context.WithCancel(ctx)
@@ -622,7 +625,7 @@ func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient23
622625

623626
// reportLifecycle reports the current lifecycle state once. All state
624627
// changes are reported in order.
625-
func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
628+
func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
626629
for {
627630
select {
628631
case <-a.lifecycleUpdate:
@@ -704,7 +707,7 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
704707
// fetchServiceBannerLoop fetches the service banner on an interval. It will
705708
// not be fetched immediately; the expectation is that it is primed elsewhere
706709
// (and must be done before the session actually starts).
707-
func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
710+
func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
708711
ticker := time.NewTicker(a.announcementBannersRefreshInterval)
709712
defer ticker.Stop()
710713
for {
@@ -740,7 +743,7 @@ func (a *agent) run() (retErr error) {
740743
a.sessionToken.Store(&sessionToken)
741744

742745
// ConnectRPC returns the dRPC connection we use for the Agent and Tailnet v2+ APIs
743-
aAPI, tAPI, err := a.client.ConnectRPC23(a.hardCtx)
746+
aAPI, tAPI, err := a.client.ConnectRPC24(a.hardCtx)
744747
if err != nil {
745748
return err
746749
}
@@ -757,7 +760,7 @@ func (a *agent) run() (retErr error) {
757760
connMan := newAPIConnRoutineManager(a.gracefulCtx, a.hardCtx, a.logger, aAPI, tAPI)
758761

759762
connMan.startAgentAPI("init notification banners", gracefulShutdownBehaviorStop,
760-
func(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
763+
func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
761764
bannersProto, err := aAPI.GetAnnouncementBanners(ctx, &proto.GetAnnouncementBannersRequest{})
762765
if err != nil {
763766
return xerrors.Errorf("fetch service banner: %w", err)
@@ -774,7 +777,7 @@ func (a *agent) run() (retErr error) {
774777
// sending logs gets gracefulShutdownBehaviorRemain because we want to send logs generated by
775778
// shutdown scripts.
776779
connMan.startAgentAPI("send logs", gracefulShutdownBehaviorRemain,
777-
func(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
780+
func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
778781
err := a.logSender.SendLoop(ctx, aAPI)
779782
if xerrors.Is(err, agentsdk.LogLimitExceededError) {
780783
// we don't want this error to tear down the API connection and propagate to the
@@ -792,6 +795,25 @@ func (a *agent) run() (retErr error) {
792795
// metadata reporting can cease as soon as we start gracefully shutting down
793796
connMan.startAgentAPI("report metadata", gracefulShutdownBehaviorStop, a.reportMetadata)
794797

798+
// resources monitor can cease as soon as we start gracefully shutting down.
799+
connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
800+
logger := a.logger.Named("resources_monitor")
801+
clk := quartz.NewReal()
802+
config, err := aAPI.GetResourcesMonitoringConfiguration(ctx, &proto.GetResourcesMonitoringConfigurationRequest{})
803+
if err != nil {
804+
return xerrors.Errorf("failed to get resources monitoring configuration: %w", err)
805+
}
806+
807+
statfetcher, err := clistat.New()
808+
if err != nil {
809+
return xerrors.Errorf("failed to create resources fetcher: %w", err)
810+
}
811+
resourcesFetcher := resourcesmonitor.NewFetcher(statfetcher)
812+
813+
resourcesmonitor := resourcesmonitor.NewResourcesMonitor(logger, clk, config, resourcesFetcher, aAPI)
814+
return resourcesmonitor.Start(ctx)
815+
})
816+
795817
// channels to sync goroutines below
796818
// handle manifest
797819
// |
@@ -814,7 +836,7 @@ func (a *agent) run() (retErr error) {
814836
connMan.startAgentAPI("handle manifest", gracefulShutdownBehaviorStop, a.handleManifest(manifestOK))
815837

816838
connMan.startAgentAPI("app health reporter", gracefulShutdownBehaviorStop,
817-
func(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
839+
func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
818840
if err := manifestOK.wait(ctx); err != nil {
819841
return xerrors.Errorf("no manifest: %w", err)
820842
}
@@ -829,7 +851,7 @@ func (a *agent) run() (retErr error) {
829851
a.createOrUpdateNetwork(manifestOK, networkOK))
830852

831853
connMan.startTailnetAPI("coordination", gracefulShutdownBehaviorStop,
832-
func(ctx context.Context, tAPI tailnetproto.DRPCTailnetClient23) error {
854+
func(ctx context.Context, tAPI tailnetproto.DRPCTailnetClient24) error {
833855
if err := networkOK.wait(ctx); err != nil {
834856
return xerrors.Errorf("no network: %w", err)
835857
}
@@ -838,7 +860,7 @@ func (a *agent) run() (retErr error) {
838860
)
839861

840862
connMan.startTailnetAPI("derp map subscriber", gracefulShutdownBehaviorStop,
841-
func(ctx context.Context, tAPI tailnetproto.DRPCTailnetClient23) error {
863+
func(ctx context.Context, tAPI tailnetproto.DRPCTailnetClient24) error {
842864
if err := networkOK.wait(ctx); err != nil {
843865
return xerrors.Errorf("no network: %w", err)
844866
}
@@ -847,7 +869,7 @@ func (a *agent) run() (retErr error) {
847869

848870
connMan.startAgentAPI("fetch service banner loop", gracefulShutdownBehaviorStop, a.fetchServiceBannerLoop)
849871

850-
connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
872+
connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
851873
if err := networkOK.wait(ctx); err != nil {
852874
return xerrors.Errorf("no network: %w", err)
853875
}
@@ -858,8 +880,8 @@ func (a *agent) run() (retErr error) {
858880
}
859881

860882
// handleManifest returns a function that fetches and processes the manifest
861-
func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
862-
return func(ctx context.Context, aAPI proto.DRPCAgentClient23) error {
883+
func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
884+
return func(ctx context.Context, aAPI proto.DRPCAgentClient24) error {
863885
var (
864886
sentResult = false
865887
err error
@@ -968,8 +990,8 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
968990

969991
// createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates
970992
// the tailnet using the information in the manifest
971-
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient23) error {
972-
return func(ctx context.Context, _ proto.DRPCAgentClient23) (retErr error) {
993+
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient24) error {
994+
return func(ctx context.Context, _ proto.DRPCAgentClient24) (retErr error) {
973995
if err := manifestOK.wait(ctx); err != nil {
974996
return xerrors.Errorf("no manifest: %w", err)
975997
}
@@ -1273,7 +1295,7 @@ func (a *agent) createTailnet(ctx context.Context, agentID uuid.UUID, derpMap *t
12731295

12741296
// runCoordinator runs a coordinator and returns whether a reconnect
12751297
// should occur.
1276-
func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient23, network *tailnet.Conn) error {
1298+
func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error {
12771299
defer a.logger.Debug(ctx, "disconnected from coordination RPC")
12781300
// we run the RPC on the hardCtx so that we have a chance to send the disconnect message if we
12791301
// gracefully shut down.
@@ -1320,7 +1342,7 @@ func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTai
13201342
}
13211343

13221344
// runDERPMapSubscriber runs a coordinator and returns if a reconnect should occur.
1323-
func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient23, network *tailnet.Conn) error {
1345+
func (a *agent) runDERPMapSubscriber(ctx context.Context, tClient tailnetproto.DRPCTailnetClient24, network *tailnet.Conn) error {
13241346
defer a.logger.Debug(ctx, "disconnected from derp map RPC")
13251347
ctx, cancel := context.WithCancel(ctx)
13261348
defer cancel()
@@ -1690,16 +1712,16 @@ const (
16901712

16911713
type apiConnRoutineManager struct {
16921714
logger slog.Logger
1693-
aAPI proto.DRPCAgentClient23
1694-
tAPI tailnetproto.DRPCTailnetClient23
1715+
aAPI proto.DRPCAgentClient24
1716+
tAPI tailnetproto.DRPCTailnetClient24
16951717
eg *errgroup.Group
16961718
stopCtx context.Context
16971719
remainCtx context.Context
16981720
}
16991721

17001722
func newAPIConnRoutineManager(
17011723
gracefulCtx, hardCtx context.Context, logger slog.Logger,
1702-
aAPI proto.DRPCAgentClient23, tAPI tailnetproto.DRPCTailnetClient23,
1724+
aAPI proto.DRPCAgentClient24, tAPI tailnetproto.DRPCTailnetClient24,
17031725
) *apiConnRoutineManager {
17041726
// routines that remain in operation during graceful shutdown use the remainCtx. They'll still
17051727
// exit if the errgroup hits an error, which usually means a problem with the conn.
@@ -1732,7 +1754,7 @@ func newAPIConnRoutineManager(
17321754
// but for Tailnet.
17331755
func (a *apiConnRoutineManager) startAgentAPI(
17341756
name string, behavior gracefulShutdownBehavior,
1735-
f func(context.Context, proto.DRPCAgentClient23) error,
1757+
f func(context.Context, proto.DRPCAgentClient24) error,
17361758
) {
17371759
logger := a.logger.With(slog.F("name", name))
17381760
var ctx context.Context
@@ -1769,7 +1791,7 @@ func (a *apiConnRoutineManager) startAgentAPI(
17691791
// but for the Agent API.
17701792
func (a *apiConnRoutineManager) startTailnetAPI(
17711793
name string, behavior gracefulShutdownBehavior,
1772-
f func(context.Context, tailnetproto.DRPCTailnetClient23) error,
1794+
f func(context.Context, tailnetproto.DRPCTailnetClient24) error,
17731795
) {
17741796
logger := a.logger.With(slog.F("name", name))
17751797
var ctx context.Context

agent/agenttest/client.go

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ func (c *Client) Close() {
9696
c.derpMapOnce.Do(func() { close(c.derpMapUpdates) })
9797
}
9898

99-
func (c *Client) ConnectRPC23(ctx context.Context) (
100-
agentproto.DRPCAgentClient23, proto.DRPCTailnetClient23, error,
99+
func (c *Client) ConnectRPC24(ctx context.Context) (
100+
agentproto.DRPCAgentClient24, proto.DRPCTailnetClient24, error,
101101
) {
102102
conn, lis := drpcsdk.MemTransportPipe()
103103
c.LastWorkspaceAgent = func() {
@@ -171,7 +171,9 @@ type FakeAgentAPI struct {
171171
metadata map[string]agentsdk.Metadata
172172
timings []*agentproto.Timing
173173

174-
getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
174+
getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
175+
getResourcesMonitoringConfigurationFunc func() (*agentproto.GetResourcesMonitoringConfigurationResponse, error)
176+
pushResourcesMonitoringUsageFunc func(*agentproto.PushResourcesMonitoringUsageRequest) (*agentproto.PushResourcesMonitoringUsageResponse, error)
175177
}
176178

177179
func (f *FakeAgentAPI) GetManifest(context.Context, *agentproto.GetManifestRequest) (*agentproto.Manifest, error) {
@@ -212,6 +214,33 @@ func (f *FakeAgentAPI) GetAnnouncementBanners(context.Context, *agentproto.GetAn
212214
return &agentproto.GetAnnouncementBannersResponse{AnnouncementBanners: bannersProto}, nil
213215
}
214216

217+
func (f *FakeAgentAPI) GetResourcesMonitoringConfiguration(_ context.Context, _ *agentproto.GetResourcesMonitoringConfigurationRequest) (*agentproto.GetResourcesMonitoringConfigurationResponse, error) {
218+
f.Lock()
219+
defer f.Unlock()
220+
221+
if f.getResourcesMonitoringConfigurationFunc == nil {
222+
return &agentproto.GetResourcesMonitoringConfigurationResponse{
223+
Config: &agentproto.GetResourcesMonitoringConfigurationResponse_Config{
224+
CollectionIntervalSeconds: 10,
225+
NumDatapoints: 20,
226+
},
227+
}, nil
228+
}
229+
230+
return f.getResourcesMonitoringConfigurationFunc()
231+
}
232+
233+
func (f *FakeAgentAPI) PushResourcesMonitoringUsage(_ context.Context, req *agentproto.PushResourcesMonitoringUsageRequest) (*agentproto.PushResourcesMonitoringUsageResponse, error) {
234+
f.Lock()
235+
defer f.Unlock()
236+
237+
if f.pushResourcesMonitoringUsageFunc == nil {
238+
return &agentproto.PushResourcesMonitoringUsageResponse{}, nil
239+
}
240+
241+
return f.pushResourcesMonitoringUsageFunc(req)
242+
}
243+
215244
func (f *FakeAgentAPI) UpdateStats(ctx context.Context, req *agentproto.UpdateStatsRequest) (*agentproto.UpdateStatsResponse, error) {
216245
f.logger.Debug(ctx, "update stats called", slog.F("req", req))
217246
// empty request is sent to get the interval; but our tests don't want empty stats requests

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy