Skip to content

Commit cb302b6

Browse files
committed
Checking for, and specifically handling, database unreachability in tailnet control protocol dialer
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 3f95841 commit cb302b6

File tree

9 files changed

+194
-20
lines changed

9 files changed

+194
-20
lines changed

coderd/coderd.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,10 @@ func New(options *Options) *API {
679679
DERPFn: api.DERPMap,
680680
Logger: options.Logger,
681681
ClientID: uuid.New(),
682+
DatabaseHealthcheckFn: func(ctx context.Context) error {
683+
_, err := api.Database.Ping(ctx)
684+
return err
685+
},
682686
}
683687
stn, err := NewServerTailnet(api.ctx,
684688
options.Logger,

coderd/tailnet.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ import (
2424
"tailscale.com/tailcfg"
2525

2626
"cdr.dev/slog"
27+
2728
"github.com/coder/coder/v2/coderd/tracing"
2829
"github.com/coder/coder/v2/coderd/workspaceapps"
2930
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
31+
"github.com/coder/coder/v2/codersdk"
3032
"github.com/coder/coder/v2/codersdk/workspacesdk"
3133
"github.com/coder/coder/v2/site"
3234
"github.com/coder/coder/v2/tailnet"
@@ -537,13 +539,20 @@ func NewMultiAgentController(ctx context.Context, logger slog.Logger, tracer tra
537539
// InmemTailnetDialer is a tailnet.ControlProtocolDialer that connects to a Coordinator and DERPMap
538540
// service running in the same memory space.
539541
type InmemTailnetDialer struct {
540-
CoordPtr *atomic.Pointer[tailnet.Coordinator]
541-
DERPFn func() *tailcfg.DERPMap
542-
Logger slog.Logger
543-
ClientID uuid.UUID
542+
CoordPtr *atomic.Pointer[tailnet.Coordinator]
543+
DERPFn func() *tailcfg.DERPMap
544+
Logger slog.Logger
545+
ClientID uuid.UUID
546+
DatabaseHealthcheckFn func(ctx context.Context) error
544547
}
545548

546-
func (a *InmemTailnetDialer) Dial(_ context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
549+
func (a *InmemTailnetDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
550+
if a.DatabaseHealthcheckFn != nil {
551+
if err := a.DatabaseHealthcheckFn(ctx); err != nil {
552+
return tailnet.ControlProtocolClients{}, xerrors.Errorf("%s: %w", codersdk.DatabaseNotReachable, err)
553+
}
554+
}
555+
547556
coord := a.CoordPtr.Load()
548557
if coord == nil {
549558
return tailnet.ControlProtocolClients{}, xerrors.Errorf("tailnet coordinator not initialized")

coderd/tailnet_test.go

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/stretchr/testify/assert"
1919
"github.com/stretchr/testify/require"
2020
"go.opentelemetry.io/otel/trace"
21+
"golang.org/x/xerrors"
2122
"tailscale.com/tailcfg"
2223

2324
"github.com/coder/coder/v2/agent"
@@ -56,8 +57,7 @@ func TestServerTailnet_AgentConn_NoSTUN(t *testing.T) {
5657
defer cancel()
5758

5859
// Connect through the ServerTailnet
59-
agents, serverTailnet := setupServerTailnetAgent(t, 1,
60-
tailnettest.DisableSTUN, tailnettest.DERPIsEmbedded)
60+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withDERPAndStunOptions(tailnettest.DisableSTUN, tailnettest.DERPIsEmbedded))
6161
a := agents[0]
6262

6363
conn, release, err := serverTailnet.AgentConn(ctx, a.id)
@@ -340,7 +340,7 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
340340
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
341341
defer cancel()
342342

343-
agents, serverTailnet := setupServerTailnetAgent(t, 1, tailnettest.DisableSTUN)
343+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withDERPAndStunOptions(tailnettest.DisableSTUN))
344344
a := agents[0]
345345

346346
require.True(t, serverTailnet.Conn().GetBlockEndpoints(), "expected BlockEndpoints to be set")
@@ -365,6 +365,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
365365
})
366366
}
367367

368+
func TestServerTailnet_Healthcheck(t *testing.T) {
369+
t.Parallel()
370+
371+
// Verifies that a non-nil healthcheck which returns a non-error response behaves as expected.
372+
t.Run("Passing", func(t *testing.T) {
373+
t.Parallel()
374+
375+
ctx := testutil.Context(t, testutil.WaitMedium)
376+
fn := func(ctx context.Context) error { return nil }
377+
378+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withHealthcheckFn(fn))
379+
380+
a := agents[0]
381+
conn, release, err := serverTailnet.AgentConn(ctx, a.id)
382+
t.Cleanup(release)
383+
require.NoError(t, err)
384+
assert.True(t, conn.AwaitReachable(ctx))
385+
})
386+
387+
// If the healthcheck fails, we have no insight into this at this level.
388+
// The dial against the control plane is retried, so we wait for the context to time out as an indication that the
389+
// healthcheck is performing as expected.
390+
t.Run("Failing", func(t *testing.T) {
391+
t.Parallel()
392+
393+
ctx := testutil.Context(t, testutil.WaitMedium)
394+
fn := func(ctx context.Context) error { return xerrors.Errorf("oops, db gone") }
395+
396+
agents, serverTailnet := setupServerTailnetAgent(t, 1, withHealthcheckFn(fn))
397+
398+
a := agents[0]
399+
_, release, err := serverTailnet.AgentConn(ctx, a.id)
400+
require.Nil(t, release)
401+
require.ErrorContains(t, err, "agent is unreachable")
402+
})
403+
}
404+
368405
type wrappedListener struct {
369406
net.Listener
370407
dials int32
@@ -389,9 +426,36 @@ type agentWithID struct {
389426
agent.Agent
390427
}
391428

392-
func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...tailnettest.DERPAndStunOption) ([]agentWithID, *coderd.ServerTailnet) {
429+
type serverOption struct {
430+
HealthcheckFn func(ctx context.Context) error
431+
DERPAndStunOptions []tailnettest.DERPAndStunOption
432+
}
433+
434+
func withHealthcheckFn(fn func(ctx context.Context) error) serverOption {
435+
return serverOption{
436+
HealthcheckFn: fn,
437+
}
438+
}
439+
440+
func withDERPAndStunOptions(opts ...tailnettest.DERPAndStunOption) serverOption {
441+
return serverOption{
442+
DERPAndStunOptions: opts,
443+
}
444+
}
445+
446+
func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...serverOption) ([]agentWithID, *coderd.ServerTailnet) {
393447
logger := testutil.Logger(t)
394-
derpMap, derpServer := tailnettest.RunDERPAndSTUN(t, opts...)
448+
449+
var healthcheckFn func(ctx context.Context) error
450+
var derpAndStunOptions []tailnettest.DERPAndStunOption
451+
for _, opt := range opts {
452+
derpAndStunOptions = append(derpAndStunOptions, opt.DERPAndStunOptions...)
453+
if opt.HealthcheckFn != nil {
454+
healthcheckFn = opt.HealthcheckFn
455+
}
456+
}
457+
458+
derpMap, derpServer := tailnettest.RunDERPAndSTUN(t, derpAndStunOptions...)
395459

396460
coord := tailnet.NewCoordinator(logger)
397461
t.Cleanup(func() {
@@ -431,10 +495,11 @@ func setupServerTailnetAgent(t *testing.T, agentNum int, opts ...tailnettest.DER
431495
}
432496

433497
dialer := &coderd.InmemTailnetDialer{
434-
CoordPtr: &coordPtr,
435-
DERPFn: func() *tailcfg.DERPMap { return derpMap },
436-
Logger: logger,
437-
ClientID: uuid.UUID{5},
498+
CoordPtr: &coordPtr,
499+
DERPFn: func() *tailcfg.DERPMap { return derpMap },
500+
Logger: logger,
501+
ClientID: uuid.UUID{5},
502+
DatabaseHealthcheckFn: healthcheckFn,
438503
}
439504
serverTailnet, err := coderd.NewServerTailnet(
440505
context.Background(),

coderd/workspaceagents.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,16 @@ func (api *API) derpMapUpdates(rw http.ResponseWriter, r *http.Request) {
997997
func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.Request) {
998998
ctx := r.Context()
999999

1000+
// Ensure the database is reachable before proceeding.
1001+
_, err := api.Database.Ping(ctx)
1002+
if err != nil {
1003+
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
1004+
Message: codersdk.DatabaseNotReachable,
1005+
Detail: err.Error(),
1006+
})
1007+
return
1008+
}
1009+
10001010
// This route accepts user API key auth and workspace proxy auth. The moon actor has
10011011
// full permissions so should be able to pass this authz check.
10021012
workspace := httpmw.WorkspaceParam(r)

coderd/workspaceagents_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/coder/coder/v2/coderd/database/dbfake"
4646
"github.com/coder/coder/v2/coderd/database/dbgen"
4747
"github.com/coder/coder/v2/coderd/database/dbmem"
48+
"github.com/coder/coder/v2/coderd/database/dbtestutil"
4849
"github.com/coder/coder/v2/coderd/database/dbtime"
4950
"github.com/coder/coder/v2/coderd/database/pubsub"
5051
"github.com/coder/coder/v2/coderd/externalauth"
@@ -55,6 +56,7 @@ import (
5556
"github.com/coder/coder/v2/codersdk"
5657
"github.com/coder/coder/v2/codersdk/agentsdk"
5758
"github.com/coder/coder/v2/codersdk/workspacesdk"
59+
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
5860
"github.com/coder/coder/v2/provisioner/echo"
5961
"github.com/coder/coder/v2/provisionersdk/proto"
6062
"github.com/coder/coder/v2/tailnet"
@@ -495,6 +497,45 @@ func TestWorkspaceAgentConnectRPC(t *testing.T) {
495497
// Then: we should get a 401 Unauthorized response
496498
require.Equal(t, http.StatusUnauthorized, sdkErr.StatusCode())
497499
})
500+
501+
// This test validates that the tailnet controller will retry connecting to the control plane until context timeout
502+
// when the dialer fails its healthcheck.
503+
t.Run("DatabaseUnreachable", func(t *testing.T) {
504+
t.Parallel()
505+
506+
store, ps := dbtestutil.NewDB(t)
507+
508+
// Given: a database which will fail its Ping(ctx) call.
509+
// NOTE: The Ping(ctx) call is made by the Dialer.
510+
pdb := &pingFailingDB{
511+
Store: store,
512+
}
513+
client, user := coderdenttest.New(t, &coderdenttest.Options{
514+
Options: &coderdtest.Options{
515+
Database: pdb,
516+
Pubsub: ps,
517+
IncludeProvisionerDaemon: true,
518+
},
519+
})
520+
521+
// When: a workspace agent is set up and we try to dial it.
522+
r := dbfake.WorkspaceBuild(t, pdb, database.WorkspaceTable{
523+
OrganizationID: user.OrganizationID,
524+
OwnerID: user.UserID,
525+
}).WithAgent().Do()
526+
_ = agenttest.New(t, client.URL, r.AgentToken)
527+
resources := coderdtest.AwaitWorkspaceAgents(t, client, r.Workspace.ID)
528+
529+
// When: the db is marked as unhealthy (i.e. will fail its Ping).
530+
// This needs to be done *after* the server "starts" otherwise it'll fail straight away when trying to initialize.
531+
pdb.MarkUnhealthy()
532+
533+
// Then: the tailnet controller will continually try to dial the coordination endpoint, exceeding its context timeout.
534+
ctx := testutil.Context(t, testutil.WaitMedium)
535+
conn, err := workspacesdk.New(client).DialAgent(ctx, resources[0].Agents[0].ID, nil)
536+
require.ErrorContains(t, err, codersdk.DatabaseNotReachable)
537+
require.Nil(t, conn)
538+
})
498539
}
499540

500541
func TestWorkspaceAgentTailnet(t *testing.T) {
@@ -2591,3 +2632,22 @@ func TestAgentConnectionInfo(t *testing.T) {
25912632
require.True(t, info.DisableDirectConnections)
25922633
require.True(t, info.DERPForceWebSockets)
25932634
}
2635+
2636+
type pingFailingDB struct {
2637+
database.Store
2638+
2639+
unhealthy bool
2640+
}
2641+
2642+
func (p *pingFailingDB) Ping(context.Context) (time.Duration, error) {
2643+
if !p.unhealthy {
2644+
return time.Nanosecond, nil
2645+
}
2646+
2647+
// Simulate a database connection error.
2648+
return 0, xerrors.New("oops")
2649+
}
2650+
2651+
func (p *pingFailingDB) MarkUnhealthy() {
2652+
p.unhealthy = true
2653+
}

codersdk/database.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package codersdk
2+
3+
import "errors"
4+
5+
const DatabaseNotReachable = "database not reachable"
6+
7+
var ErrDatabaseNotReachable = errors.New(DatabaseNotReachable)

codersdk/workspacesdk/dialer.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ import (
1111
"golang.org/x/xerrors"
1212

1313
"cdr.dev/slog"
14+
"github.com/coder/websocket"
15+
1416
"github.com/coder/coder/v2/buildinfo"
1517
"github.com/coder/coder/v2/codersdk"
1618
"github.com/coder/coder/v2/tailnet"
1719
"github.com/coder/coder/v2/tailnet/proto"
18-
"github.com/coder/websocket"
1920
)
2021

2122
var permanentErrorStatuses = []int{
22-
http.StatusConflict, // returned if client/agent connections disabled (browser only)
23-
http.StatusBadRequest, // returned if API mismatch
24-
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
23+
http.StatusConflict, // returned if client/agent connections disabled (browser only)
24+
http.StatusBadRequest, // returned if API mismatch
25+
http.StatusNotFound, // returned if user doesn't have permission or agent doesn't exist
26+
http.StatusInternalServerError, // returned if database is not reachable
2527
}
2628

2729
type WebsocketDialer struct {
@@ -89,6 +91,11 @@ func (w *WebsocketDialer) Dial(ctx context.Context, r tailnet.ResumeTokenControl
8991
"Ensure your client release version (%s, different than the API version) matches the server release version",
9092
buildinfo.Version())
9193
}
94+
95+
if sdkErr.Message == codersdk.DatabaseNotReachable &&
96+
sdkErr.StatusCode() == http.StatusInternalServerError {
97+
err = xerrors.Errorf("%s: %w", codersdk.DatabaseNotReachable, err)
98+
}
9299
}
93100
w.connected <- err
94101
return tailnet.ControlProtocolClients{}, err

site/src/api/typesGenerated.ts

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tailnet/controllers.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ import (
2121
"tailscale.com/util/dnsname"
2222

2323
"cdr.dev/slog"
24+
"github.com/coder/quartz"
25+
"github.com/coder/retry"
26+
2427
"github.com/coder/coder/v2/coderd/util/ptr"
2528
"github.com/coder/coder/v2/codersdk"
2629
"github.com/coder/coder/v2/tailnet/proto"
27-
"github.com/coder/quartz"
28-
"github.com/coder/retry"
2930
)
3031

3132
// A Controller connects to the tailnet control plane, and then uses the control protocols to
@@ -1381,6 +1382,14 @@ func (c *Controller) Run(ctx context.Context) {
13811382
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
13821383
return
13831384
}
1385+
1386+
// If the database is unreachable by the control plane, there's not much we can do, so we'll just retry later.
1387+
if strings.Contains(err.Error(), codersdk.DatabaseNotReachable) {
1388+
c.logger.Warn(c.ctx, "control plane lost connection to database, retrying",
1389+
slog.Error(err), slog.F("retry_in_ms", retrier.Delay.Milliseconds()))
1390+
continue
1391+
}
1392+
13841393
errF := slog.Error(err)
13851394
var sdkErr *codersdk.Error
13861395
if xerrors.As(err, &sdkErr) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy