Skip to content

Commit 9ea3910

Browse files
fix: reduce excessive logging when database is unreachable (cherry-pick #17363) (#17411)
Co-authored-by: Danny Kopping <danny@coder.com> Fixes #17045
1 parent 8b5adaa commit 9ea3910

File tree

10 files changed

+160
-20
lines changed

10 files changed

+160
-20
lines changed

coderd/coderd.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -665,10 +665,11 @@ func New(options *Options) *API {
665665
api.Auditor.Store(&options.Auditor)
666666
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
667667
dialer := &InmemTailnetDialer{
668-
CoordPtr: &api.TailnetCoordinator,
669-
DERPFn: api.DERPMap,
670-
Logger: options.Logger,
671-
ClientID: uuid.New(),
668+
CoordPtr: &api.TailnetCoordinator,
669+
DERPFn: api.DERPMap,
670+
Logger: options.Logger,
671+
ClientID: uuid.New(),
672+
DatabaseHealthCheck: api.Database,
672673
}
673674
stn, err := NewServerTailnet(api.ctx,
674675
options.Logger,

coderd/tailnet.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
534536
return m
535537
}
536538

539+
type Pinger interface {
540+
Ping(context.Context) (time.Duration, error)
541+
}
542+
537543
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538544
// service running in the same memory space.
539545
type InmemTailnetDialer struct {
540546
CoordPtr *atomic.Pointer[tailnet.Coordinator]
541547
DERPFn func() *tailcfg.DERPMap
542548
Logger slog.Logger
543549
ClientID uuid.UUID
550+
// DatabaseHealthCheck is used to validate that the store is reachable.
551+
DatabaseHealthCheck Pinger
544552
}
545553

546-
func (a *InmemTailnetDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
554+
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
555+
if a.DatabaseHealthCheck != nil {
556+
if _, err := a.DatabaseHealthCheck.Ping(ctx); err != nil {
557+
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
558+
}
559+
}
560+
547561
coord := a.CoordPtr.Load()
548562
if coord == nil {
549563
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized")

coderd/tailnet_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,23 @@ import (
1111
"strconv"
1212
"sync/atomic"
1313
"testing"
14+
"time"
1415

1516
"github.com/google/uuid"
1617
"github.com/prometheus/client_golang/prometheus"
1718
"github.com/spf13/afero"
1819
"github.com/stretchr/testify/assert"
1920
"github.com/stretchr/testify/require"
2021
"go.opentelemetry.io/otel/trace"
22+
"golang.org/x/xerrors"
2123
"tailscale.com/tailcfg"
2224

2325
"github.com/coder/coder/v2/agent"
2426
"github.com/coder/coder/v2/agent/agenttest"
2527
"github.com/coder/coder/v2/agent/proto"
2628
"github.com/coder/coder/v2/coderd"
2729
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
30+
"github.com/coder/coder/v2/codersdk"
2831
"github.com/coder/coder/v2/codersdk/agentsdk"
2932
"github.com/coder/coder/v2/codersdk/workspacesdk"
3033
"github.com/coder/coder/v2/tailnet"
@@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365368
})
366369
}
367370

371+
func TestDialFailure(t *testing.T) {
372+
t.Parallel()
373+
374+
// Setup.
375+
ctx := testutil.Context(t, testutil.WaitShort)
376+
logger := testutil.Logger(t)
377+
378+
// Given: a tailnet coordinator.
379+
coord := tailnet.NewCoordinator(logger)
380+
t.Cleanup(func() {
381+
_ = coord.Close()
382+
})
383+
coordPtr := atomic.Pointer[tailnet.Coordinator]{}
384+
coordPtr.Store(&coord)
385+
386+
// Given: a fake DB healthchecker which will always fail.
387+
fch := &failingHealthcheck{}
388+
389+
// When: dialing the in-memory coordinator.
390+
dialer := &coderd.InmemTailnetDialer{
391+
CoordPtr: &coordPtr,
392+
Logger: logger,
393+
ClientID: uuid.UUID{5},
394+
DatabaseHealthCheck: fch,
395+
}
396+
_, err := dialer.Dial(ctx, nil)
397+
398+
// Then: the error returned reflects the database has failed its healthcheck.
399+
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
400+
}
401+
402+
type failingHealthcheck struct{}
403+
404+
func (failingHealthcheck) Ping(context.Context) (time.Duration, error) {
405+
// Simulate a database connection error.
406+
return 0, xerrors.New("oops")
407+
}
408+
368409
type wrappedListener struct {
369410
net.Listener
370411
dials int32

coderd/workspaceagents.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
989989
func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.Request) {
990990
ctx := r.Context()
991991

992+
// Ensure the database is reachable before proceeding.
993+
_, err := api.Database.Ping(ctx)
994+
if err != nil {
995+
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
996+
Message: codersdk.DatabaseNotReachable,
997+
Detail: err.Error(),
998+
})
999+
return
1000+
}
1001+
9921002
// This route accepts user API key auth and workspace proxy auth. The moon actor has
9931003
// full permissions so should be able to pass this authz check.
9941004
workspace := httpmw.WorkspaceParam(r)

codersdk/database.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import "golang.org/x/xerrors"
4+
5+
const DatabaseNotReachable = "database not reachable"
6+
7+
var ErrDatabaseNotReachable = xerrors.New(DatabaseNotReachable)

codersdk/workspacesdk/dialer.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
var permanentErrorStatuses = []int{
22-
http.StatusConflict, // returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest, // returned if API mismatch
24-
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict, // returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest, // returned if API mismatch
25+
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError, // returned if database is not reachable,
2527
}
2628

2729
type WebsocketDialer struct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
if sdkErr.Message == codersdk.DatabaseNotReachable &&
96+
sdkErr.StatusCode() == http.StatusInternalServerError {
97+
err = xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
98+
}
9299
}
93100
w.connected <- err
94101
return tailnet.ControlProtocolClients{}, err

codersdk/workspacesdk/workspacesdk_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
package workspacesdk_test
22

33
import (
4+
"net/http"
5+
"net/http/httptest"
46
"net/url"
57
"testing"
68

79
"github.com/stretchr/testify/require"
810
"tailscale.com/tailcfg"
911

12+
"github.com/coder/websocket"
13+
14+
"github.com/coder/coder/v2/coderd/httpapi"
15+
"github.com/coder/coder/v2/codersdk"
1016
"github.com/coder/coder/v2/codersdk/agentsdk"
17+
"github.com/coder/coder/v2/codersdk/workspacesdk"
18+
"github.com/coder/coder/v2/testutil"
1119
)
1220

1321
func TestWorkspaceRewriteDERPMap(t *testing.T) {
@@ -37,3 +45,30 @@ func TestWorkspaceRewriteDERPMap(t *testing.T) {
3745
require.Equal(t, "coconuts.org", node.HostName)
3846
require.Equal(t, 44558, node.DERPPort)
3947
}
48+
49+
func TestWorkspaceDialerFailure(t *testing.T) {
50+
t.Parallel()
51+
52+
// Setup.
53+
ctx := testutil.Context(t, testutil.WaitShort)
54+
logger := testutil.Logger(t)
55+
56+
// Given: a mock HTTP server which mimicks an unreachable database when calling the coordination endpoint.
57+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
58+
httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{
59+
Message: codersdk.DatabaseNotReachable,
60+
Detail: "oops",
61+
})
62+
}))
63+
t.Cleanup(srv.Close)
64+
65+
u, err := url.Parse(srv.URL)
66+
require.NoError(t, err)
67+
68+
// When: calling the coordination endpoint.
69+
dialer := workspacesdk.NewWebsocketDialer(logger, u, &websocket.DialOptions{})
70+
_, err = dialer.Dial(ctx, nil)
71+
72+
// Then: an error indicating a database issue is returned, to conditionalize the behavior of the caller.
73+
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
74+
}

provisionerd/provisionerd.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
defer p.wg.Done()
291292
defer func() { close(p.acquireDoneCh) }()
292293
ctx := p.closeContext
293-
for {
294+
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); {
294295
if p.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx, "shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err := p.acquireAndRunOne(client)
304+
if err != nil && ctx.Err() == nil { // Only log if context is not done.
305+
// Short-circuit: don't wait for the retry delay to exit, if required.
306+
if p.acquireExit() {
307+
return
308+
}
309+
p.opts.Logger.Warn(ctx, "failed to acquire job, retrying", slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())), slog.Error(err))
310+
} else {
311+
// Reset the retrier after each successful acquisition.
312+
retrier.Reset()
313+
}
303314
}
304315
}
305316

@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
318329
return false
319330
}
320331

321-
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
332+
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error {
322333
ctx := p.closeContext
323334
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne")
324335
job, err := p.acquireGraceful(client)
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327338
if errors.Is(err, context.Canceled) ||
328339
errors.Is(err, yamux.ErrSessionShutdown) ||
329340
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) {
330-
return
341+
return err
331342
}
332343

333344
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err))
334-
return
345+
return xerrors.Errorf("failed to acquire job: %w", err)
335346
}
336347
if job.JobId == "" {
337348
p.opts.Logger.Debug(ctx, "acquire job successfully canceled")
338-
return
349+
return nil
339350
}
340351

341352
if len(job.TraceMetadata) > 0 {
@@ -390,9 +401,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
390401
Error: fmt.Sprintf("failed to connect to provisioner: %s", resp.Error),
391402
})
392403
if err != nil {
393-
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
404+
p.opts.Logger.Error(ctx, "failed to report provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
394405
}
395-
return
406+
return xerrors.Errorf("failed to report provisioner job failed: %w", err)
396407
}
397408

398409
p.mutex.Lock()
@@ -416,6 +427,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
416427
p.mutex.Lock()
417428
p.activeJob = nil
418429
p.mutex.Unlock()
430+
return nil
419431
}
420432

421433
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

site/src/api/typesGenerated.ts

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tailnet/controllers.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tailnet
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"io"
78
"maps"
@@ -20,11 +21,12 @@ import (
2021
"tailscale.com/util/dnsname"
2122

2223
"cdr.dev/slog"
24+
"github.com/coder/quartz"
25+
"github.com/coder/retry"
26+
2327
"github.com/coder/coder/v2/coderd/util/ptr"
2428
"github.com/coder/coder/v2/codersdk"
2529
"github.com/coder/coder/v2/tailnet/proto"
26-
"github.com/coder/quartz"
27-
"github.com/coder/retry"
2830
)
2931

3032
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1362,6 +1364,14 @@ func (c *Controller) Run(ctx context.Context) {
13621364
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
13631365
return
13641366
}
1367+
1368+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1369+
if errors.Is(err, codersdk.ErrDatabaseNotReachable) {
1370+
c.logger.Warn(c.ctx, "control plane lost connection to database, retrying",
1371+
slog.Error(err), slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())))
1372+
continue
1373+
}
1374+
13651375
errF := slog.Error(err)
13661376
var sdkErr *codersdk.Error
13671377
if xerrors.As(err, &sdkErr) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy