Skip to content

Commit 0b18e45

Browse files
authored
fix: reduce excessive logging when database is unreachable (#17363)
Fixes #17045 --------- Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 2f99d70 commit 0b18e45

File tree

10 files changed

+160
-20
lines changed

10 files changed

+160
-20
lines changed

coderd/coderd.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -675,10 +675,11 @@ func New(options *Options) *API {
675675
api.Auditor.Store(&options.Auditor)
676676
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
677677
dialer := &InmemTailnetDialer{
678-
CoordPtr: &api.TailnetCoordinator,
679-
DERPFn: api.DERPMap,
680-
Logger: options.Logger,
681-
ClientID: uuid.New(),
678+
CoordPtr: &api.TailnetCoordinator,
679+
DERPFn: api.DERPMap,
680+
Logger: options.Logger,
681+
ClientID: uuid.New(),
682+
DatabaseHealthCheck: api.Database,
682683
}
683684
stn, err := NewServerTailnet(api.ctx,
684685
options.Logger,

coderd/tailnet.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -534,16 +536,28 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
534536
return m
535537
}
536538

539+
type Pinger interface {
540+
Ping(context.Context) (time.Duration, error)
541+
}
542+
537543
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538544
// service running in the same memory space.
539545
type InmemTailnetDialer struct {
540546
CoordPtr *atomic.Pointer[tailnet.Coordinator]
541547
DERPFn func() *tailcfg.DERPMap
542548
Logger slog.Logger
543549
ClientID uuid.UUID
550+
// DatabaseHealthCheck is used to validate that the store is reachable.
551+
DatabaseHealthCheck Pinger
544552
}
545553

546-
func (a *InmemTailnetDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
554+
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
555+
if a.DatabaseHealthCheck != nil {
556+
if _, err := a.DatabaseHealthCheck.Ping(ctx); err != nil {
557+
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
558+
}
559+
}
560+
547561
coord := a.CoordPtr.Load()
548562
if coord == nil {
549563
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized")

coderd/tailnet_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,23 @@ import (
1111
"strconv"
1212
"sync/atomic"
1313
"testing"
14+
"time"
1415

1516
"github.com/google/uuid"
1617
"github.com/prometheus/client_golang/prometheus"
1718
"github.com/spf13/afero"
1819
"github.com/stretchr/testify/assert"
1920
"github.com/stretchr/testify/require"
2021
"go.opentelemetry.io/otel/trace"
22+
"golang.org/x/xerrors"
2123
"tailscale.com/tailcfg"
2224

2325
"github.com/coder/coder/v2/agent"
2426
"github.com/coder/coder/v2/agent/agenttest"
2527
"github.com/coder/coder/v2/agent/proto"
2628
"github.com/coder/coder/v2/coderd"
2729
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
30+
"github.com/coder/coder/v2/codersdk"
2831
"github.com/coder/coder/v2/codersdk/agentsdk"
2932
"github.com/coder/coder/v2/codersdk/workspacesdk"
3033
"github.com/coder/coder/v2/tailnet"
@@ -365,6 +368,44 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365368
})
366369
}
367370

371+
func TestDialFailure(t *testing.T) {
372+
t.Parallel()
373+
374+
// Setup.
375+
ctx := testutil.Context(t, testutil.WaitShort)
376+
logger := testutil.Logger(t)
377+
378+
// Given: a tailnet coordinator.
379+
coord := tailnet.NewCoordinator(logger)
380+
t.Cleanup(func() {
381+
_ = coord.Close()
382+
})
383+
coordPtr := atomic.Pointer[tailnet.Coordinator]{}
384+
coordPtr.Store(&coord)
385+
386+
// Given: a fake DB healthchecker which will always fail.
387+
fch := &failingHealthcheck{}
388+
389+
// When: dialing the in-memory coordinator.
390+
dialer := &coderd.InmemTailnetDialer{
391+
CoordPtr: &coordPtr,
392+
Logger: logger,
393+
ClientID: uuid.UUID{5},
394+
DatabaseHealthCheck: fch,
395+
}
396+
_, err := dialer.Dial(ctx, nil)
397+
398+
// Then: the error returned reflects the database has failed its healthcheck.
399+
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
400+
}
401+
402+
type failingHealthcheck struct{}
403+
404+
func (failingHealthcheck) Ping(context.Context) (time.Duration, error) {
405+
// Simulate a database connection error.
406+
return 0, xerrors.New("oops")
407+
}
408+
368409
type wrappedListener struct {
369410
net.Listener
370411
dials int32

coderd/workspaceagents.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
997997
func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.Request) {
998998
ctx := r.Context()
999999

1000+
// Ensure the database is reachable before proceeding.
1001+
_, err := api.Database.Ping(ctx)
1002+
if err != nil {
1003+
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
1004+
Message: codersdk.DatabaseNotReachable,
1005+
Detail: err.Error(),
1006+
})
1007+
return
1008+
}
1009+
10001010
// This route accepts user API key auth and workspace proxy auth. The moon actor has
10011011
// full permissions so should be able to pass this authz check.
10021012
workspace := httpmw.WorkspaceParam(r)

codersdk/database.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import "golang.org/x/xerrors"
4+
5+
const DatabaseNotReachable = "database not reachable"
6+
7+
var ErrDatabaseNotReachable = xerrors.New(DatabaseNotReachable)

codersdk/workspacesdk/dialer.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
var permanentErrorStatuses = []int{
22-
http.StatusConflict, // returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest, // returned if API mismatch
24-
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict, // returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest, // returned if API mismatch
25+
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError, // returned if database is not reachable,
2527
}
2628

2729
type WebsocketDialer struct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
if sdkErr.Message == codersdk.DatabaseNotReachable &&
96+
sdkErr.StatusCode() == http.StatusInternalServerError {
97+
err = xerrors.Errorf("%w: %v", codersdk.ErrDatabaseNotReachable, err)
98+
}
9299
}
93100
w.connected <- err
94101
return tailnet.ControlProtocolClients{}, err

codersdk/workspacesdk/workspacesdk_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
package workspacesdk_test
22

33
import (
4+
"net/http"
5+
"net/http/httptest"
46
"net/url"
57
"testing"
68

79
"github.com/stretchr/testify/require"
810
"tailscale.com/tailcfg"
911

12+
"github.com/coder/websocket"
13+
14+
"github.com/coder/coder/v2/coderd/httpapi"
15+
"github.com/coder/coder/v2/codersdk"
1016
"github.com/coder/coder/v2/codersdk/agentsdk"
17+
"github.com/coder/coder/v2/codersdk/workspacesdk"
18+
"github.com/coder/coder/v2/testutil"
1119
)
1220

1321
func TestWorkspaceRewriteDERPMap(t *testing.T) {
@@ -37,3 +45,30 @@ func TestWorkspaceRewriteDERPMap(t *testing.T) {
3745
require.Equal(t, "coconuts.org", node.HostName)
3846
require.Equal(t, 44558, node.DERPPort)
3947
}
48+
49+
func TestWorkspaceDialerFailure(t *testing.T) {
50+
t.Parallel()
51+
52+
// Setup.
53+
ctx := testutil.Context(t, testutil.WaitShort)
54+
logger := testutil.Logger(t)
55+
56+
// Given: a mock HTTP server which mimicks an unreachable database when calling the coordination endpoint.
57+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
58+
httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{
59+
Message: codersdk.DatabaseNotReachable,
60+
Detail: "oops",
61+
})
62+
}))
63+
t.Cleanup(srv.Close)
64+
65+
u, err := url.Parse(srv.URL)
66+
require.NoError(t, err)
67+
68+
// When: calling the coordination endpoint.
69+
dialer := workspacesdk.NewWebsocketDialer(logger, u, &websocket.DialOptions{})
70+
_, err = dialer.Dial(ctx, nil)
71+
72+
// Then: an error indicating a database issue is returned, to conditionalize the behavior of the caller.
73+
require.ErrorIs(t, err, codersdk.ErrDatabaseNotReachable)
74+
}

provisionerd/provisionerd.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,13 @@ import (
2020
"golang.org/x/xerrors"
2121

2222
"cdr.dev/slog"
23+
"github.com/coder/retry"
24+
2325
"github.com/coder/coder/v2/coderd/tracing"
2426
"github.com/coder/coder/v2/codersdk"
2527
"github.com/coder/coder/v2/provisionerd/proto"
2628
"github.com/coder/coder/v2/provisionerd/runner"
2729
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
28-
"github.com/coder/retry"
2930
)
3031

3132
// Dialer represents the function to create a daemon client connection.
@@ -290,7 +291,7 @@ func (p *Server) acquireLoop() {
290291
defer p.wg.Done()
291292
defer func() { close(p.acquireDoneCh) }()
292293
ctx := p.closeContext
293-
for {
294+
for retrier := retry.New(10*time.Millisecond, 1*time.Second); retrier.Wait(ctx); {
294295
if p.acquireExit() {
295296
return
296297
}
@@ -299,7 +300,17 @@ func (p *Server) acquireLoop() {
299300
p.opts.Logger.Debug(ctx, "shut down before client (re) connected")
300301
return
301302
}
302-
p.acquireAndRunOne(client)
303+
err := p.acquireAndRunOne(client)
304+
if err != nil && ctx.Err() == nil { // Only log if context is not done.
305+
// Short-circuit: don't wait for the retry delay to exit, if required.
306+
if p.acquireExit() {
307+
return
308+
}
309+
p.opts.Logger.Warn(ctx, "failed to acquire job, retrying", slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())), slog.Error(err))
310+
} else {
311+
// Reset the retrier after each successful acquisition.
312+
retrier.Reset()
313+
}
303314
}
304315
}
305316

@@ -318,7 +329,7 @@ func (p *Server) acquireExit() bool {
318329
return false
319330
}
320331

321-
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
332+
func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) error {
322333
ctx := p.closeContext
323334
p.opts.Logger.Debug(ctx, "start of acquireAndRunOne")
324335
job, err := p.acquireGraceful(client)
@@ -327,15 +338,15 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
327338
if errors.Is(err, context.Canceled) ||
328339
errors.Is(err, yamux.ErrSessionShutdown) ||
329340
errors.Is(err, fasthttputil.ErrInmemoryListenerClosed) {
330-
return
341+
return err
331342
}
332343

333344
p.opts.Logger.Warn(ctx, "provisionerd was unable to acquire job", slog.Error(err))
334-
return
345+
return xerrors.Errorf("failed to acquire job: %w", err)
335346
}
336347
if job.JobId == "" {
337348
p.opts.Logger.Debug(ctx, "acquire job successfully canceled")
338-
return
349+
return nil
339350
}
340351

341352
if len(job.TraceMetadata) > 0 {
@@ -392,9 +403,9 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
392403
Error: fmt.Sprintf("failed to connect to provisioner: %s", resp.Error),
393404
})
394405
if err != nil {
395-
p.opts.Logger.Error(ctx, "provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
406+
p.opts.Logger.Error(ctx, "failed to report provisioner job failed", slog.F("job_id", job.JobId), slog.Error(err))
396407
}
397-
return
408+
return xerrors.Errorf("failed to report provisioner job failed: %w", err)
398409
}
399410

400411
p.mutex.Lock()
@@ -418,6 +429,7 @@ func (p *Server) acquireAndRunOne(client proto.DRPCProvisionerDaemonClient) {
418429
p.mutex.Lock()
419430
p.activeJob = nil
420431
p.mutex.Unlock()
432+
return nil
421433
}
422434

423435
// acquireGraceful attempts to acquire a job from the server, handling canceling the acquisition if we gracefully shut

site/src/api/typesGenerated.ts

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tailnet/controllers.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tailnet
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"io"
78
"maps"
@@ -21,11 +22,12 @@ import (
2122
"tailscale.com/util/dnsname"
2223

2324
"cdr.dev/slog"
25+
"github.com/coder/quartz"
26+
"github.com/coder/retry"
27+
2428
"github.com/coder/coder/v2/coderd/util/ptr"
2529
"github.com/coder/coder/v2/codersdk"
2630
"github.com/coder/coder/v2/tailnet/proto"
27-
"github.com/coder/quartz"
28-
"github.com/coder/retry"
2931
)
3032

3133
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1381,6 +1383,14 @@ func (c *Controller) Run(ctx context.Context) {
13811383
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
13821384
return
13831385
}
1386+
1387+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1388+
if errors.Is(err, codersdk.ErrDatabaseNotReachable) {
1389+
c.logger.Warn(c.ctx, "control plane lost connection to database, retrying",
1390+
slog.Error(err), slog.F("delay", fmt.Sprintf("%vms", retrier.Delay.Milliseconds())))
1391+
continue
1392+
}
1393+
13841394
errF := slog.Error(err)
13851395
var sdkErr *codersdk.Error
13861396
if xerrors.As(err, &sdkErr) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy