Skip to content

Commit 6df164e

Browse files
committed
Back off acquiring provisioner jobs when the database is unreachable
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 60fbe67 commit 6df164e

File tree

1 file changed

+13
-8
lines changed

1 file changed

+13
-8
lines changed

provisionerd/provisionerd.go

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
defer p.wg.Done()
291292
defer func() { close(p.acquireDoneCh) }()
292293
ctx := p.closeContext
293-
for {
294+
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); {
294295
if p.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,10 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx, "shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err := p.acquireAndRunOne(client)
304+
if err != nil && ctx.Err() == nil { // Only log if context is not done.
305+
p.opts.Logger.Debug(ctx, "retrying to acquire job", slog.F("retry_in_ms", retrier.Delay.Milliseconds()), slog.Error(err))
306+
}
303307
}
304308
}
305309

@@ -318,7 +322,7 @@ func (p *Server) acquireExit() bool {
318322
return false
319323
}
320324

321-
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
325+
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error {
322326
ctx := p.closeContext
323327
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne")
324328
job, err := p.acquireGraceful(client)
@@ -327,15 +331,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327331
if errors.Is(err, context.Canceled) ||
328332
errors.Is(err, yamux.ErrSessionShutdown) ||
329333
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) {
330-
return
334+
return err
331335
}
332336

333337
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err))
334-
return
338+
return xerrors.Errorf("failed to acquire job: %w", err)
335339
}
336340
if job.JobId == "" {
337341
p.opts.Logger.Debug(ctx, "acquire job successfully canceled")
338-
return
342+
return xerrors.New("canceled")
339343
}
340344

341345
if len(job.TraceMetadata) > 0 {
@@ -392,7 +396,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
392396
if err != nil {
393397
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
394398
}
395-
return
399+
return xerrors.Errorf("provisioner job failed: %w", err)
396400
}
397401

398402
p.mutex.Lock()
@@ -416,6 +420,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
416420
p.mutex.Lock()
417421
p.activeJob = nil
418422
p.mutex.Unlock()
423+
return nil
419424
}
420425

421426
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy