Skip to content

Commit ae522c5

Browse files
feat: add agent timings (#14713)
* feat: begin impl of agent script timings * feat: add job_id and display_name to script timings * fix: increment migration number * fix: rename migrations from 251 to 254 * test: get tests compiling * fix: appease the linter * fix: get tests passing again * fix: drop column from correct table * test: add fixture for agent script timings * fix: typo * fix: use job id used in provisioner job timings * fix: increment migration number * test: behaviour of script runner * test: rewrite test * test: does exit 1 script break things? * test: rewrite test again * fix: revert change Not sure how this came to be, I do not recall manually changing these files. * fix: let code breathe * fix: wrap errors * fix: justify nolint * fix: swap require.Equal argument order * fix: add mutex operations * feat: add 'ran_on_start' and 'blocked_login' fields * fix: update testdata fixture * fix: refer to agent_id instead of job_id in timings * fix: JobID -> AgentID in dbauthz_test * fix: add 'id' to scripts, make timing refer to script id * fix: fix broken tests and convert bug * fix: update testdata fixtures * fix: update testdata fixtures again * feat: capture stage and if script timed out * fix: update migration number * test: add test for script api * fix: fake db query * fix: use UTC time * fix: ensure r.scriptComplete is not nil * fix: move err check to right after call * fix: uppercase sql * fix: use dbtime.Now() * fix: debug log on r.scriptCompleted being nil * fix: ensure correct rbac permissions * chore: remove DisplayName * fix: get tests passing * fix: remove space in sql up * docs: document ExecuteOption * fix: drop 'RETURNING' from sql * chore: remove 'display_name' from timing table * fix: testdata fixture * fix: put r.scriptCompleted call in goroutine * fix: track goroutine for test + use separate context for reporting * fix: appease linter, handle trackCommandGoroutine error * fix: resolve race condition * feat: replace timed_out column with status column * test: update testdata fixture * fix: apply suggestions from review * revert: linter changes
1 parent b894407 commit ae522c5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1367
-232
lines changed

agent/agent.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -941,17 +941,15 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
941941
}
942942
}
943943

944-
err = a.scriptRunner.Init(manifest.Scripts)
944+
err = a.scriptRunner.Init(manifest.Scripts, aAPI.ScriptCompleted)
945945
if err != nil {
946946
return xerrors.Errorf("init script runner: %w", err)
947947
}
948948
err = a.trackGoroutine(func() {
949949
start := time.Now()
950950
// here we use the graceful context because the script runner is not directly tied
951951
// to the agent API.
952-
err := a.scriptRunner.Execute(a.gracefulCtx, func(script codersdk.WorkspaceAgentScript) bool {
953-
return script.RunOnStart
954-
})
952+
err := a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecuteStartScripts)
955953
// Measure the time immediately after the script has finished
956954
dur := time.Since(start).Seconds()
957955
if err != nil {
@@ -1844,9 +1842,7 @@ func (a *agent) Close() error {
18441842
a.gracefulCancel()
18451843

18461844
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
1847-
err = a.scriptRunner.Execute(a.hardCtx, func(script codersdk.WorkspaceAgentScript) bool {
1848-
return script.RunOnStop
1849-
})
1845+
err = a.scriptRunner.Execute(a.hardCtx, agentscripts.ExecuteStopScripts)
18501846
if err != nil {
18511847
a.logger.Warn(a.hardCtx, "shutdown script(s) failed", slog.Error(err))
18521848
if errors.Is(err, agentscripts.ErrTimeout) {

agent/agent_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,10 +1517,12 @@ func TestAgent_Lifecycle(t *testing.T) {
15171517
agentsdk.Manifest{
15181518
DERPMap: derpMap,
15191519
Scripts: []codersdk.WorkspaceAgentScript{{
1520+
ID: uuid.New(),
15201521
LogPath: "coder-startup-script.log",
15211522
Script: "echo 1",
15221523
RunOnStart: true,
15231524
}, {
1525+
ID: uuid.New(),
15241526
LogPath: "coder-shutdown-script.log",
15251527
Script: "echo " + expected,
15261528
RunOnStop: true,

agent/agentscripts/agentscripts.go

Lines changed: 97 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ import (
1919
"github.com/spf13/afero"
2020
"golang.org/x/sync/errgroup"
2121
"golang.org/x/xerrors"
22+
"google.golang.org/protobuf/types/known/timestamppb"
2223

2324
"cdr.dev/slog"
2425

2526
"github.com/coder/coder/v2/agent/agentssh"
27+
"github.com/coder/coder/v2/agent/proto"
28+
"github.com/coder/coder/v2/coderd/database/dbtime"
2629
"github.com/coder/coder/v2/codersdk"
2730
"github.com/coder/coder/v2/codersdk/agentsdk"
2831
)
@@ -75,18 +78,21 @@ func New(opts Options) *Runner {
7578
}
7679
}
7780

81+
type ScriptCompletedFunc func(context.Context, *proto.WorkspaceAgentScriptCompletedRequest) (*proto.WorkspaceAgentScriptCompletedResponse, error)
82+
7883
type Runner struct {
7984
Options
8085

81-
cronCtx context.Context
82-
cronCtxCancel context.CancelFunc
83-
cmdCloseWait sync.WaitGroup
84-
closed chan struct{}
85-
closeMutex sync.Mutex
86-
cron *cron.Cron
87-
initialized atomic.Bool
88-
scripts []codersdk.WorkspaceAgentScript
89-
dataDir string
86+
cronCtx context.Context
87+
cronCtxCancel context.CancelFunc
88+
cmdCloseWait sync.WaitGroup
89+
closed chan struct{}
90+
closeMutex sync.Mutex
91+
cron *cron.Cron
92+
initialized atomic.Bool
93+
scripts []codersdk.WorkspaceAgentScript
94+
dataDir string
95+
scriptCompleted ScriptCompletedFunc
9096

9197
// scriptsExecuted includes all scripts executed by the workspace agent. Agents
9298
// execute startup scripts, and scripts on a cron schedule. Both will increment
@@ -116,12 +122,13 @@ func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
116122
// Init initializes the runner with the provided scripts.
117123
// It also schedules any scripts that have a schedule.
118124
// This function must be called before Execute.
119-
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
125+
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript, scriptCompleted ScriptCompletedFunc) error {
120126
if r.initialized.Load() {
121127
return xerrors.New("init: already initialized")
122128
}
123129
r.initialized.Store(true)
124130
r.scripts = scripts
131+
r.scriptCompleted = scriptCompleted
125132
r.Logger.Info(r.cronCtx, "initializing agent scripts", slog.F("script_count", len(scripts)), slog.F("log_dir", r.LogDir))
126133

127134
err := r.Filesystem.MkdirAll(r.ScriptBinDir(), 0o700)
@@ -135,7 +142,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
135142
}
136143
script := script
137144
_, err := r.cron.AddFunc(script.Cron, func() {
138-
err := r.trackRun(r.cronCtx, script)
145+
err := r.trackRun(r.cronCtx, script, ExecuteCronScripts)
139146
if err != nil {
140147
r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
141148
}
@@ -172,22 +179,33 @@ func (r *Runner) StartCron() {
172179
}
173180
}
174181

182+
// ExecuteOption describes what scripts we want to execute.
183+
type ExecuteOption int
184+
185+
// ExecuteOption enums.
186+
const (
187+
ExecuteAllScripts ExecuteOption = iota
188+
ExecuteStartScripts
189+
ExecuteStopScripts
190+
ExecuteCronScripts
191+
)
192+
175193
// Execute runs a set of scripts according to a filter.
176-
func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.WorkspaceAgentScript) bool) error {
177-
if filter == nil {
178-
// Execute em' all!
179-
filter = func(script codersdk.WorkspaceAgentScript) bool {
180-
return true
181-
}
182-
}
194+
func (r *Runner) Execute(ctx context.Context, option ExecuteOption) error {
183195
var eg errgroup.Group
184196
for _, script := range r.scripts {
185-
if !filter(script) {
197+
runScript := (option == ExecuteStartScripts && script.RunOnStart) ||
198+
(option == ExecuteStopScripts && script.RunOnStop) ||
199+
(option == ExecuteCronScripts && script.Cron != "") ||
200+
option == ExecuteAllScripts
201+
202+
if !runScript {
186203
continue
187204
}
205+
188206
script := script
189207
eg.Go(func() error {
190-
err := r.trackRun(ctx, script)
208+
err := r.trackRun(ctx, script, option)
191209
if err != nil {
192210
return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
193211
}
@@ -198,8 +216,8 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
198216
}
199217

200218
// trackRun wraps "run" with metrics.
201-
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
202-
err := r.run(ctx, script)
219+
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
220+
err := r.run(ctx, script, option)
203221
if err != nil {
204222
r.scriptsExecuted.WithLabelValues("false").Add(1)
205223
} else {
@@ -212,7 +230,7 @@ func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScr
212230
// If the timeout is exceeded, the process is sent an interrupt signal.
213231
// If the process does not exit after a few seconds, it is forcefully killed.
214232
// This function immediately returns after a timeout, and does not wait for the process to exit.
215-
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
233+
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
216234
logPath := script.LogPath
217235
if logPath == "" {
218236
logPath = fmt.Sprintf("coder-script-%s.log", script.LogSourceID)
@@ -299,9 +317,9 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
299317
cmd.Stdout = io.MultiWriter(fileWriter, infoW)
300318
cmd.Stderr = io.MultiWriter(fileWriter, errW)
301319

302-
start := time.Now()
320+
start := dbtime.Now()
303321
defer func() {
304-
end := time.Now()
322+
end := dbtime.Now()
305323
execTime := end.Sub(start)
306324
exitCode := 0
307325
if err != nil {
@@ -314,6 +332,60 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
314332
} else {
315333
logger.Info(ctx, fmt.Sprintf("%s script completed", logPath), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
316334
}
335+
336+
if r.scriptCompleted == nil {
337+
logger.Debug(ctx, "r.scriptCompleted unexpectedly nil")
338+
return
339+
}
340+
341+
// We want to check this outside of the goroutine to avoid a race condition
342+
timedOut := errors.Is(err, ErrTimeout)
343+
pipesLeftOpen := errors.Is(err, ErrOutputPipesOpen)
344+
345+
err = r.trackCommandGoroutine(func() {
346+
var stage proto.Timing_Stage
347+
switch option {
348+
case ExecuteStartScripts:
349+
stage = proto.Timing_START
350+
case ExecuteStopScripts:
351+
stage = proto.Timing_STOP
352+
case ExecuteCronScripts:
353+
stage = proto.Timing_CRON
354+
}
355+
356+
var status proto.Timing_Status
357+
switch {
358+
case timedOut:
359+
status = proto.Timing_TIMED_OUT
360+
case pipesLeftOpen:
361+
status = proto.Timing_PIPES_LEFT_OPEN
362+
case exitCode != 0:
363+
status = proto.Timing_EXIT_FAILURE
364+
default:
365+
status = proto.Timing_OK
366+
}
367+
368+
reportTimeout := 30 * time.Second
369+
reportCtx, cancel := context.WithTimeout(context.Background(), reportTimeout)
370+
defer cancel()
371+
372+
_, err := r.scriptCompleted(reportCtx, &proto.WorkspaceAgentScriptCompletedRequest{
373+
Timing: &proto.Timing{
374+
ScriptId: script.ID[:],
375+
Start: timestamppb.New(start),
376+
End: timestamppb.New(end),
377+
ExitCode: int32(exitCode),
378+
Stage: stage,
379+
Status: status,
380+
},
381+
})
382+
if err != nil {
383+
logger.Error(ctx, fmt.Sprintf("reporting script completed: %s", err.Error()))
384+
}
385+
})
386+
if err != nil {
387+
logger.Error(ctx, fmt.Sprintf("reporting script completed: track command goroutine: %s", err.Error()))
388+
}
317389
}()
318390

319391
err = cmd.Start()

agent/agentscripts/agentscripts_test.go

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"cdr.dev/slog/sloggers/slogtest"
1818
"github.com/coder/coder/v2/agent/agentscripts"
1919
"github.com/coder/coder/v2/agent/agentssh"
20+
"github.com/coder/coder/v2/agent/agenttest"
2021
"github.com/coder/coder/v2/codersdk"
2122
"github.com/coder/coder/v2/codersdk/agentsdk"
2223
"github.com/coder/coder/v2/testutil"
@@ -34,14 +35,13 @@ func TestExecuteBasic(t *testing.T) {
3435
return fLogger
3536
})
3637
defer runner.Close()
38+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
3739
err := runner.Init([]codersdk.WorkspaceAgentScript{{
3840
LogSourceID: uuid.New(),
3941
Script: "echo hello",
40-
}})
42+
}}, aAPI.ScriptCompleted)
4143
require.NoError(t, err)
42-
require.NoError(t, runner.Execute(context.Background(), func(script codersdk.WorkspaceAgentScript) bool {
43-
return true
44-
}))
44+
require.NoError(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts))
4545
log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
4646
require.Equal(t, "hello", log.Output)
4747
}
@@ -61,18 +61,17 @@ func TestEnv(t *testing.T) {
6161
cmd.exe /c echo %CODER_SCRIPT_BIN_DIR%
6262
`
6363
}
64+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
6465
err := runner.Init([]codersdk.WorkspaceAgentScript{{
6566
LogSourceID: id,
6667
Script: script,
67-
}})
68+
}}, aAPI.ScriptCompleted)
6869
require.NoError(t, err)
6970

7071
ctx := testutil.Context(t, testutil.WaitLong)
7172

7273
done := testutil.Go(t, func() {
73-
err := runner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
74-
return true
75-
})
74+
err := runner.Execute(ctx, agentscripts.ExecuteAllScripts)
7675
assert.NoError(t, err)
7776
})
7877
defer func() {
@@ -103,13 +102,44 @@ func TestTimeout(t *testing.T) {
103102
t.Parallel()
104103
runner := setup(t, nil)
105104
defer runner.Close()
105+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
106106
err := runner.Init([]codersdk.WorkspaceAgentScript{{
107107
LogSourceID: uuid.New(),
108108
Script: "sleep infinity",
109109
Timeout: time.Millisecond,
110-
}})
110+
}}, aAPI.ScriptCompleted)
111111
require.NoError(t, err)
112-
require.ErrorIs(t, runner.Execute(context.Background(), nil), agentscripts.ErrTimeout)
112+
require.ErrorIs(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts), agentscripts.ErrTimeout)
113+
}
114+
115+
func TestScriptReportsTiming(t *testing.T) {
116+
t.Parallel()
117+
118+
ctx := testutil.Context(t, testutil.WaitShort)
119+
fLogger := newFakeScriptLogger()
120+
runner := setup(t, func(uuid2 uuid.UUID) agentscripts.ScriptLogger {
121+
return fLogger
122+
})
123+
124+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
125+
err := runner.Init([]codersdk.WorkspaceAgentScript{{
126+
DisplayName: "say-hello",
127+
LogSourceID: uuid.New(),
128+
Script: "echo hello",
129+
}}, aAPI.ScriptCompleted)
130+
require.NoError(t, err)
131+
require.NoError(t, runner.Execute(ctx, agentscripts.ExecuteAllScripts))
132+
runner.Close()
133+
134+
log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
135+
require.Equal(t, "hello", log.Output)
136+
137+
timings := aAPI.GetTimings()
138+
require.Equal(t, 1, len(timings))
139+
140+
timing := timings[0]
141+
require.Equal(t, int32(0), timing.ExitCode)
142+
require.GreaterOrEqual(t, timing.End.AsTime(), timing.Start.AsTime())
113143
}
114144

115145
// TestCronClose exists because cron.Run() can happen after cron.Close().

agent/agenttest/client.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ type FakeAgentAPI struct {
170170
logsCh chan<- *agentproto.BatchCreateLogsRequest
171171
lifecycleStates []codersdk.WorkspaceAgentLifecycle
172172
metadata map[string]agentsdk.Metadata
173+
timings []*agentproto.Timing
173174

174175
getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
175176
}
@@ -182,6 +183,12 @@ func (*FakeAgentAPI) GetServiceBanner(context.Context, *agentproto.GetServiceBan
182183
return &agentproto.ServiceBanner{}, nil
183184
}
184185

186+
func (f *FakeAgentAPI) GetTimings() []*agentproto.Timing {
187+
f.Lock()
188+
defer f.Unlock()
189+
return slices.Clone(f.timings)
190+
}
191+
185192
func (f *FakeAgentAPI) SetAnnouncementBannersFunc(fn func() ([]codersdk.BannerConfig, error)) {
186193
f.Lock()
187194
defer f.Unlock()
@@ -301,6 +308,14 @@ func (f *FakeAgentAPI) BatchCreateLogs(ctx context.Context, req *agentproto.Batc
301308
return &agentproto.BatchCreateLogsResponse{}, nil
302309
}
303310

311+
func (f *FakeAgentAPI) ScriptCompleted(_ context.Context, req *agentproto.WorkspaceAgentScriptCompletedRequest) (*agentproto.WorkspaceAgentScriptCompletedResponse, error) {
312+
f.Lock()
313+
f.timings = append(f.timings, req.Timing)
314+
f.Unlock()
315+
316+
return &agentproto.WorkspaceAgentScriptCompletedResponse{}, nil
317+
}
318+
304319
func NewFakeAgentAPI(t testing.TB, logger slog.Logger, manifest *agentproto.Manifest, statsCh chan *agentproto.Stats) *FakeAgentAPI {
305320
return &FakeAgentAPI{
306321
t: t,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy