From 35b2fed6860da12707266979f14286f73fe1468a Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Thu, 22 Sep 2022 17:40:59 -0500 Subject: [PATCH 01/79] feat: HA tailnet coordinator --- agent/agent_test.go | 2 +- coderd/coderd.go | 4 +- coderd/database/pubsub_memory.go | 3 +- coderd/workspaceagents.go | 2 +- coderd/wsconncache/wsconncache_test.go | 2 +- codersdk/workspaceagents.go | 1 - enterprise/tailnet/coordinator.go | 426 +++++++++++++++++++++++++ enterprise/tailnet/coordinator_test.go | 267 ++++++++++++++++ tailnet/coordinator.go | 203 +++++++----- tailnet/coordinator_test.go | 10 +- 10 files changed, 834 insertions(+), 86 deletions(-) create mode 100644 enterprise/tailnet/coordinator.go create mode 100644 enterprise/tailnet/coordinator_test.go diff --git a/agent/agent_test.go b/agent/agent_test.go index afed644f78e5e..d6ff21cdcd33d 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -572,7 +572,7 @@ func setupAgent(t *testing.T, metadata agent.Metadata, ptyTimeout time.Duration) if metadata.DERPMap == nil { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) } - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() agentID := uuid.New() statsCh := make(chan *agent.Stats) closer := agent.New(agent.Options{ diff --git a/coderd/coderd.go b/coderd/coderd.go index 25ac1afec2f36..f183e4d9b9ab7 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -74,7 +74,7 @@ type Options struct { TracerProvider trace.TracerProvider AutoImportTemplates []AutoImportTemplate - TailnetCoordinator *tailnet.Coordinator + TailnetCoordinator tailnet.Coordinator DERPMap *tailcfg.DERPMap MetricsCacheRefreshInterval time.Duration @@ -121,7 +121,7 @@ func New(options *Options) *API { options.PrometheusRegistry = prometheus.NewRegistry() } if options.TailnetCoordinator == nil { - options.TailnetCoordinator = tailnet.NewCoordinator() + options.TailnetCoordinator = tailnet.NewMemoryCoordinator() } if options.Auditor == nil { options.Auditor = audit.NewNop() diff --git a/coderd/database/pubsub_memory.go b/coderd/database/pubsub_memory.go index 148d2f57b129f..de5a940414d6c 100644 --- a/coderd/database/pubsub_memory.go +++ b/coderd/database/pubsub_memory.go @@ -47,8 +47,9 @@ func (m *memoryPubsub) Publish(event string, message []byte) error { return nil } for _, listener := range listeners { - listener(context.Background(), message) + go listener(context.Background(), message) } + return nil } diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 6167790fb8bb7..dd777913c452d 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -447,7 +447,7 @@ func convertApps(dbApps []database.WorkspaceApp) []codersdk.WorkspaceApp { return apps } -func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator *tailnet.Coordinator, dbAgent database.WorkspaceAgent, apps []codersdk.WorkspaceApp, agentInactiveDisconnectTimeout time.Duration) (codersdk.WorkspaceAgent, error) { +func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordinator, dbAgent database.WorkspaceAgent, apps []codersdk.WorkspaceApp, agentInactiveDisconnectTimeout time.Duration) (codersdk.WorkspaceAgent, error) { var envs map[string]string if dbAgent.EnvironmentVariables.Valid { err := json.Unmarshal(dbAgent.EnvironmentVariables.RawMessage, &envs) diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go index a9ea85a2492ac..e4c7d58413110 100644 --- a/coderd/wsconncache/wsconncache_test.go +++ b/coderd/wsconncache/wsconncache_test.go @@ 
-142,7 +142,7 @@ func TestCache(t *testing.T) { func setupAgent(t *testing.T, metadata agent.Metadata, ptyTimeout time.Duration) *agent.Conn { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() agentID := uuid.New() closer := agent.New(agent.Options{ FetchMetadata: func(ctx context.Context) (agent.Metadata, error) { diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 46d8ead8d2d6d..72e9767713c7c 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -20,7 +20,6 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" - "github.com/coder/coder/agent" "github.com/coder/coder/tailnet" "github.com/coder/retry" diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go new file mode 100644 index 0000000000000..8824f584d60da --- /dev/null +++ b/enterprise/tailnet/coordinator.go @@ -0,0 +1,426 @@ +package tailnet + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net" + "sync" + "time" + + "github.com/google/uuid" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/coderd/database" + agpl "github.com/coder/coder/tailnet" +) + +func NewHACoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { + coord := &haCoordinator{ + id: uuid.New(), + log: logger, + pubsub: pubsub, + close: make(chan struct{}), + nodes: map[uuid.UUID]*agpl.Node{}, + agentSockets: map[uuid.UUID]net.Conn{}, + agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, + } + + if err := coord.runPubsub(); err != nil { + return nil, xerrors.Errorf("run coordinator pubsub: %w", err) + } + + return coord, nil +} + +type haCoordinator struct { + id uuid.UUID + log slog.Logger + mutex sync.RWMutex + pubsub database.Pubsub + close chan struct{} + + // nodes maps agent and connection IDs their respective node. + nodes map[uuid.UUID]*agpl.Node + // agentSockets maps agent IDs to their open websocket. + agentSockets map[uuid.UUID]net.Conn + // agentToConnectionSockets maps agent IDs to connection IDs of conns that + // are subscribed to updates for that agent. + agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn +} + +// Node returns an in-memory node by ID. +func (c *haCoordinator) Node(id uuid.UUID) *agpl.Node { + c.mutex.RLock() + defer c.mutex.RUnlock() + node := c.nodes[id] + return node +} + +// ServeClient accepts a WebSocket connection that wants to connect to an agent +// with the specified ID. +func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { + c.mutex.Lock() + // When a new connection is requested, we update it with the latest + // node of the agent. This allows the connection to establish. + node, ok := c.nodes[agent] + if ok { + data, err := json.Marshal([]*agpl.Node{node}) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("marshal node: %w", err) + } + _, err = conn.Write(data) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("write nodes: %w", err) + } + } + + connectionSockets, ok := c.agentToConnectionSockets[agent] + if !ok { + connectionSockets = map[uuid.UUID]net.Conn{} + c.agentToConnectionSockets[agent] = connectionSockets + } + + // Insert this connection into a map so the agent can publish node updates. + connectionSockets[id] = conn + c.mutex.Unlock() + + defer func() { + c.mutex.Lock() + defer c.mutex.Unlock() + // Clean all traces of this connection from the map. 
+ delete(c.nodes, id) + connectionSockets, ok := c.agentToConnectionSockets[agent] + if !ok { + return + } + delete(connectionSockets, id) + if len(connectionSockets) != 0 { + return + } + delete(c.agentToConnectionSockets, agent) + }() + + decoder := json.NewDecoder(conn) + // Indefinitely handle messages from the client websocket. + for { + err := c.handleNextClientMessage(id, agent, decoder) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next client message: %w", err) + } + } +} + +func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { + var node agpl.Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + // Update the node of this client in our in-memory map. If an agent entirely + // shuts down and reconnects, it needs to be aware of all clients attempting + // to establish connections. + c.nodes[id] = &node + + // Write the new node from this client to the actively connected agent. + err = c.writeNodeToAgent(agent, &node) + if err != nil { + return xerrors.Errorf("write node to agent: %w", err) + } + + return nil +} + +func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error { + agentSocket, ok := c.agentSockets[agent] + if !ok { + // If we don't own the agent locally, send it over pubsub to a node that + // owns the agent. + err := c.publishNodeToAgent(agent, node) + if err != nil { + return xerrors.Errorf("publish node to agent") + } + return nil + } + + // Write the new node from this client to the actively + // connected agent. + data, err := json.Marshal([]*agpl.Node{node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + _, err = agentSocket.Write(data) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("write json: %w", err) + } + return nil +} + +// ServeAgent accepts a WebSocket connection to an agent that listens to +// incoming connections and publishes node updates. +func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { + c.mutex.Lock() + sockets, ok := c.agentToConnectionSockets[id] + if ok { + // Publish all nodes that want to connect to the + // desired agent ID. + nodes := make([]*agpl.Node, 0, len(sockets)) + for targetID := range sockets { + node, ok := c.nodes[targetID] + if !ok { + continue + } + nodes = append(nodes, node) + } + data, err := json.Marshal(nodes) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("marshal json: %w", err) + } + _, err = conn.Write(data) + if err != nil { + c.mutex.Unlock() + return xerrors.Errorf("write nodes: %w", err) + } + } + + // If an old agent socket is connected, we close it + // to avoid any leaks. This shouldn't ever occur because + // we expect one agent to be running. 
+ oldAgentSocket, ok := c.agentSockets[id] + if ok { + _ = oldAgentSocket.Close() + } + c.agentSockets[id] = conn + c.mutex.Unlock() + defer func() { + c.mutex.Lock() + defer c.mutex.Unlock() + delete(c.agentSockets, id) + delete(c.nodes, id) + }() + + decoder := json.NewDecoder(conn) + for { + err := c.hangleAgentUpdate(id, decoder, false) + if err != nil { + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next agent message: %w", err) + } + } +} + +func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, fromPubsub bool) error { + var node agpl.Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + c.nodes[id] = &node + + // Don't send the agent back over pubsub if that's where we received it from! + if !fromPubsub { + err = c.publishAgentToNodes(id, &node) + if err != nil { + return xerrors.Errorf("publish agent to nodes: %w", err) + } + } + + connectionSockets, ok := c.agentToConnectionSockets[id] + if !ok { + return nil + } + + data, err := json.Marshal([]*agpl.Node{&node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + // Publish the new node to every listening socket. + var wg sync.WaitGroup + wg.Add(len(connectionSockets)) + for _, connectionSocket := range connectionSockets { + connectionSocket := connectionSocket + go func() { + _ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second)) + _, _ = connectionSocket.Write(data) + wg.Done() + }() + } + + wg.Wait() + return nil +} + +func (c *haCoordinator) Close() error { + close(c.close) + return nil +} + +func (c *haCoordinator) publishNodeToAgent(recipient uuid.UUID, node *agpl.Node) error { + msg, err := c.formatCallMeMaybe(recipient, node) + if err != nil { + return xerrors.Errorf("format publish message: %w", err) + } + + fmt.Println("publishing callmemaybe", c.id.String()) + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish message: %w", err) + } + + return nil +} + +func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error { + msg, err := c.formatAgentUpdate(id, node) + if err != nil { + return xerrors.Errorf("format publish message: %w", err) + } + + fmt.Println("publishing agentupdate", c.id.String()) + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish message: %w", err) + } + + return nil +} + +func (c *haCoordinator) runPubsub() error { + cancelSub, err := c.pubsub.Subscribe("wireguard_peers", func(ctx context.Context, message []byte) { + sp := bytes.Split(message, []byte("|")) + if len(sp) != 4 { + c.log.Error(ctx, "invalid wireguard peer message", slog.F("msg", string(message))) + return + } + + var ( + coordinatorID = sp[0] + eventType = sp[1] + agentID = sp[2] + nodeJSON = sp[3] + ) + + sender, err := uuid.ParseBytes(coordinatorID) + if err != nil { + c.log.Error(ctx, "invalid sender id", slog.F("id", string(coordinatorID)), slog.F("msg", string(message))) + return + } + + // We sent this message! 
+ if sender == c.id { + return + } + + switch string(eventType) { + case "callmemaybe": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } + + fmt.Println("got callmemaybe", agentUUID.String()) + c.mutex.Lock() + defer c.mutex.Unlock() + + fmt.Println("process callmemaybe", agentUUID.String()) + agentSocket, ok := c.agentSockets[agentUUID] + if !ok { + fmt.Println("no socket") + return + } + + // We get a single node over pubsub, so turn into an array. + _, err = agentSocket.Write(bytes.Join([][]byte{[]byte("["), nodeJSON, []byte("]")}, []byte{})) + if err != nil { + if errors.Is(err, io.EOF) { + return + } + c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) + return + } + fmt.Println("success callmemaybe", agentUUID.String()) + + case "agentupdate": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + } + + decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) + err = c.hangleAgentUpdate(agentUUID, decoder, true) + if err != nil { + c.log.Error(ctx, "handle agent update", slog.Error(err)) + } + + default: + c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) + } + }) + if err != nil { + return xerrors.Errorf("subscribe wireguard peers") + } + + go func() { + defer cancelSub() + <-c.close + }() + + return nil +} + +// format: <coordinator id>|callmemaybe|<recipient id>|<node json> +func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, node *agpl.Node) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("callmemaybe|") + buf.WriteString(recipient.String() + "|") + err := json.NewEncoder(&buf).Encode(node) + if err != nil { + return nil, xerrors.Errorf("encode node: %w", err) + } + + return buf.Bytes(), nil +} + +// format: <coordinator id>|agentupdate|<agent id>|<node json> +func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("agentupdate|") + buf.WriteString(id.String() + "|") + err := json.NewEncoder(&buf).Encode(node) + if err != nil { + return nil, xerrors.Errorf("encode node: %w", err) + } + + return buf.Bytes(), nil +} diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/tailnet/coordinator_test.go new file mode 100644 index 0000000000000..48fce5bfd0f6f --- /dev/null +++ b/enterprise/tailnet/coordinator_test.go @@ -0,0 +1,267 @@ +package tailnet_test + +import ( + "fmt" + "net" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "cdr.dev/slog/sloggers/slogtest" + + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/enterprise/tailnet" + agpl "github.com/coder/coder/tailnet" + "github.com/coder/coder/testutil" +) + +func TestCoordinatorSingle(t *testing.T) { + t.Parallel() + t.Run("ClientWithoutAgent", func(t *testing.T) { + t.Parallel() + coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + require.NoError(t, err) + defer coordinator.Close() + + client, server := net.Pipe() + sendNode, errChan := agpl.ServeCoordinator(client, func(node []*agpl.Node) error { + return nil + }) + id := uuid.New() + closeChan := make(chan struct{}) + go func() { + err := coordinator.ServeClient(server, id, uuid.New()) + assert.NoError(t, err) + close(closeChan) + }() + sendNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return
coordinator.Node(id) != nil + }, testutil.WaitShort, testutil.IntervalFast) + + err = client.Close() + require.NoError(t, err) + <-errChan + <-closeChan + }) + + t.Run("AgentWithoutClients", func(t *testing.T) { + t.Parallel() + coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + require.NoError(t, err) + defer coordinator.Close() + + client, server := net.Pipe() + sendNode, errChan := agpl.ServeCoordinator(client, func(node []*agpl.Node) error { + return nil + }) + id := uuid.New() + closeChan := make(chan struct{}) + go func() { + err := coordinator.ServeAgent(server, id) + assert.NoError(t, err) + close(closeChan) + }() + sendNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return coordinator.Node(id) != nil + }, testutil.WaitShort, testutil.IntervalFast) + err = client.Close() + require.NoError(t, err) + <-errChan + <-closeChan + }) + + t.Run("AgentWithClient", func(t *testing.T) { + t.Parallel() + + coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + require.NoError(t, err) + defer coordinator.Close() + + agentWS, agentServerWS := net.Pipe() + defer agentWS.Close() + agentNodeChan := make(chan []*agpl.Node) + sendAgentNode, agentErrChan := agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + agentNodeChan <- nodes + return nil + }) + agentID := uuid.New() + closeAgentChan := make(chan struct{}) + go func() { + err := coordinator.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + sendAgentNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return coordinator.Node(agentID) != nil + }, testutil.WaitShort, testutil.IntervalFast) + + clientWS, clientServerWS := net.Pipe() + defer clientWS.Close() + defer clientServerWS.Close() + clientNodeChan := make(chan []*agpl.Node) + sendClientNode, clientErrChan := agpl.ServeCoordinator(clientWS, func(nodes []*agpl.Node) error { + clientNodeChan <- nodes + return nil + }) + clientID := uuid.New() + closeClientChan := make(chan struct{}) + go func() { + err := coordinator.ServeClient(clientServerWS, clientID, agentID) + assert.NoError(t, err) + close(closeClientChan) + }() + agentNodes := <-clientNodeChan + require.Len(t, agentNodes, 1) + sendClientNode(&agpl.Node{}) + clientNodes := <-agentNodeChan + require.Len(t, clientNodes, 1) + + // Ensure an update to the agent node reaches the client! + sendAgentNode(&agpl.Node{}) + agentNodes = <-clientNodeChan + require.Len(t, agentNodes, 1) + + // Close the agent WebSocket so a new one can connect. + err = agentWS.Close() + require.NoError(t, err) + <-agentErrChan + <-closeAgentChan + + // Create a new agent connection. This is to simulate a reconnect! + agentWS, agentServerWS = net.Pipe() + defer agentWS.Close() + agentNodeChan = make(chan []*agpl.Node) + _, agentErrChan = agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + agentNodeChan <- nodes + return nil + }) + closeAgentChan = make(chan struct{}) + go func() { + err := coordinator.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + // Ensure the existing listening client sends it's node immediately! 
+ clientNodes = <-agentNodeChan + require.Len(t, clientNodes, 1) + + err = agentWS.Close() + require.NoError(t, err) + <-agentErrChan + <-closeAgentChan + + err = clientWS.Close() + require.NoError(t, err) + <-clientErrChan + <-closeClientChan + }) +} + +func TestCoordinatorHA(t *testing.T) { + t.Parallel() + + t.Run("AgentWithClient", func(t *testing.T) { + t.Parallel() + + pubsub := database.NewPubsubInMemory() + + coordinator1, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + require.NoError(t, err) + defer coordinator1.Close() + + coordinator2, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + require.NoError(t, err) + defer coordinator2.Close() + + agentWS, agentServerWS := net.Pipe() + defer agentWS.Close() + agentNodeChan := make(chan []*agpl.Node) + sendAgentNode, agentErrChan := agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + fmt.Println("got agent node") + agentNodeChan <- nodes + fmt.Println("sent agent node") + return nil + }) + agentID := uuid.New() + closeAgentChan := make(chan struct{}) + go func() { + err := coordinator1.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + sendAgentNode(&agpl.Node{}) + require.Eventually(t, func() bool { + return coordinator1.Node(agentID) != nil + }, testutil.WaitShort, testutil.IntervalFast) + + clientWS, clientServerWS := net.Pipe() + defer clientWS.Close() + defer clientServerWS.Close() + clientNodeChan := make(chan []*agpl.Node) + sendClientNode, clientErrChan := agpl.ServeCoordinator(clientWS, func(nodes []*agpl.Node) error { + fmt.Println("got client node") + clientNodeChan <- nodes + fmt.Println("sent client node") + return nil + }) + clientID := uuid.New() + closeClientChan := make(chan struct{}) + go func() { + err := coordinator2.ServeClient(clientServerWS, clientID, agentID) + assert.NoError(t, err) + close(closeClientChan) + }() + agentNodes := <-clientNodeChan + require.Len(t, agentNodes, 1) + sendClientNode(&agpl.Node{}) + _ = sendClientNode + clientNodes := <-agentNodeChan + require.Len(t, clientNodes, 1) + + // Ensure an update to the agent node reaches the client! + sendAgentNode(&agpl.Node{}) + agentNodes = <-clientNodeChan + require.Len(t, agentNodes, 1) + + // Close the agent WebSocket so a new one can connect. + require.NoError(t, agentWS.Close()) + require.NoError(t, agentServerWS.Close()) + <-agentErrChan + <-closeAgentChan + + // Create a new agent connection. This is to simulate a reconnect! + agentWS, agentServerWS = net.Pipe() + defer agentWS.Close() + agentNodeChan = make(chan []*agpl.Node) + _, agentErrChan = agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { + fmt.Println("got agent node") + agentNodeChan <- nodes + fmt.Println("sent agent node") + return nil + }) + closeAgentChan = make(chan struct{}) + go func() { + err := coordinator1.ServeAgent(agentServerWS, agentID) + assert.NoError(t, err) + close(closeAgentChan) + }() + // Ensure the existing listening client sends it's node immediately! 
+ clientNodes = <-agentNodeChan + require.Len(t, clientNodes, 1) + + err = agentWS.Close() + require.NoError(t, err) + <-agentErrChan + <-closeAgentChan + + err = clientWS.Close() + require.NoError(t, err) + <-clientErrChan + <-closeClientChan + }) +} diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 95209d56559ff..af6a5fee58288 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -7,6 +7,7 @@ import ( "net" "net/netip" "sync" + "time" "github.com/google/uuid" "golang.org/x/xerrors" @@ -14,6 +15,24 @@ import ( "tailscale.com/types/key" ) +// Coordinator exchanges nodes with agents to establish connections. +// ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐ +// │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│ +// └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘ +// Coordinators have different guarantees for HA support. +type Coordinator interface { + // Node returns an in-memory node by ID. + Node(id uuid.UUID) *Node + // ServeClient accepts a WebSocket connection that wants to connect to an agent + // with the specified ID. + ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error + // ServeAgent accepts a WebSocket connection to an agent that listens to + // incoming connections and publishes node updates. + ServeAgent(conn net.Conn, id uuid.UUID) error + // Close closes the coordinator. + Close() error +} + // Node represents a node in the network. type Node struct { ID tailcfg.NodeID `json:"id"` @@ -64,44 +83,46 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func }, errChan } -// NewCoordinator constructs a new in-memory connection coordinator. -func NewCoordinator() *Coordinator { - return &Coordinator{ +// NewMemoryCoordinator constructs a new in-memory connection coordinator. This +// coordinator is incompatible with multiple Coder replicas as all node data is +// in-memory. +func NewMemoryCoordinator() Coordinator { + return &memoryCoordinator{ nodes: map[uuid.UUID]*Node{}, agentSockets: map[uuid.UUID]net.Conn{}, agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, } } -// Coordinator exchanges nodes with agents to establish connections. +// MemoryCoordinator exchanges nodes with agents to establish connections. // ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐ // │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│ // └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘ // This coordinator is incompatible with multiple Coder // replicas as all node data is in-memory. -type Coordinator struct { +type memoryCoordinator struct { mutex sync.Mutex - // Maps agent and connection IDs to a node. + // nodes maps agent and connection IDs their respective node. nodes map[uuid.UUID]*Node - // Maps agent ID to an open socket. + // agentSockets maps agent IDs to their open websocket. agentSockets map[uuid.UUID]net.Conn - // Maps agent ID to connection ID for sending - // new node data as it comes in! + // agentToConnectionSockets maps agent IDs to connection IDs of conns that + // are subscribed to updates for that agent. agentToConnectionSockets map[uuid.UUID]map[uuid.UUID]net.Conn } // Node returns an in-memory node by ID. 
-func (c *Coordinator) Node(id uuid.UUID) *Node { +func (c *memoryCoordinator) Node(id uuid.UUID) *Node { c.mutex.Lock() defer c.mutex.Unlock() node := c.nodes[id] return node } -// ServeClient accepts a WebSocket connection that wants to -// connect to an agent with the specified ID. -func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { +// ServeClient accepts a WebSocket connection that wants to connect to an agent +// with the specified ID. +func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. @@ -145,48 +166,67 @@ func (c *Coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) decoder := json.NewDecoder(conn) for { - var node Node - err := decoder.Decode(&node) - if errors.Is(err, io.EOF) { - return nil - } - if err != nil { - return xerrors.Errorf("read json: %w", err) - } - c.mutex.Lock() - // Update the node of this client in our in-memory map. - // If an agent entirely shuts down and reconnects, it - // needs to be aware of all clients attempting to - // establish connections. - c.nodes[id] = &node - agentSocket, ok := c.agentSockets[agent] - if !ok { - c.mutex.Unlock() - continue - } - // Write the new node from this client to the actively - // connected agent. - data, err := json.Marshal([]*Node{&node}) + err := c.handleNextClientMessage(id, agent, decoder) if err != nil { - c.mutex.Unlock() - return xerrors.Errorf("marshal nodes: %w", err) + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next client message: %w", err) } - _, err = agentSocket.Write(data) + } +} + +func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { + var node Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + // Update the node of this client in our in-memory map. If an agent + // entirely shuts down and reconnects, it needs to be aware of all clients + // attempting to establish connections. + c.nodes[id] = &node + + // Write the new node from this client to the actively + // connected agent. + err = c.writeNodeToAgent(agent, &node) + if err != nil { + return xerrors.Errorf("write node to agent: %w", err) + } + + return nil +} + +func (c *memoryCoordinator) writeNodeToAgent(agent uuid.UUID, node *Node) error { + agentSocket, ok := c.agentSockets[agent] + if !ok { + return nil + } + + // Write the new node from this client to the actively + // connected agent. + data, err := json.Marshal([]*Node{node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + _, err = agentSocket.Write(data) + if err != nil { if errors.Is(err, io.EOF) { - c.mutex.Unlock() return nil } - if err != nil { - c.mutex.Unlock() - return xerrors.Errorf("write json: %w", err) - } - c.mutex.Unlock() + return xerrors.Errorf("write json: %w", err) } + return nil } // ServeAgent accepts a WebSocket connection to an agent that // listens to incoming connections and publishes node updates. 
-func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { +func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() sockets, ok := c.agentToConnectionSockets[id] if ok { @@ -230,36 +270,51 @@ func (c *Coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { decoder := json.NewDecoder(conn) for { - var node Node - err := decoder.Decode(&node) - if errors.Is(err, io.EOF) { - return nil - } - if err != nil { - return xerrors.Errorf("read json: %w", err) - } - c.mutex.Lock() - c.nodes[id] = &node - connectionSockets, ok := c.agentToConnectionSockets[id] - if !ok { - c.mutex.Unlock() - continue - } - data, err := json.Marshal([]*Node{&node}) + err := c.handleNextAgentMessage(id, decoder) if err != nil { - return xerrors.Errorf("marshal nodes: %w", err) - } - // Publish the new node to every listening socket. - var wg sync.WaitGroup - wg.Add(len(connectionSockets)) - for _, connectionSocket := range connectionSockets { - connectionSocket := connectionSocket - go func() { - _, _ = connectionSocket.Write(data) - wg.Done() - }() + if errors.Is(err, io.EOF) { + return nil + } + return xerrors.Errorf("handle next agent message: %w", err) } - wg.Wait() - c.mutex.Unlock() } } + +func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error { + var node Node + err := decoder.Decode(&node) + if err != nil { + return xerrors.Errorf("read json: %w", err) + } + + c.mutex.Lock() + defer c.mutex.Unlock() + + c.nodes[id] = &node + connectionSockets, ok := c.agentToConnectionSockets[id] + if !ok { + return nil + } + + data, err := json.Marshal([]*Node{&node}) + if err != nil { + return xerrors.Errorf("marshal nodes: %w", err) + } + + // Publish the new node to every listening socket. + var wg sync.WaitGroup + wg.Add(len(connectionSockets)) + for _, connectionSocket := range connectionSockets { + connectionSocket := connectionSocket + go func() { + _ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second)) + _, _ = connectionSocket.Write(data) + wg.Done() + }() + } + + wg.Wait() + return nil +} + +func (*memoryCoordinator) Close() error { return nil } diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index f3fdab88d5ef8..e0ed44420ede2 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -16,7 +16,7 @@ func TestCoordinator(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -32,15 +32,15 @@ func TestCoordinator(t *testing.T) { require.Eventually(t, func() bool { return coordinator.Node(id) != nil }, testutil.WaitShort, testutil.IntervalFast) - err := client.Close() - require.NoError(t, err) + require.NoError(t, client.Close()) + require.NoError(t, server.Close()) <-errChan <-closeChan }) t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -64,7 +64,7 @@ func TestCoordinator(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewCoordinator() + coordinator := tailnet.NewMemoryCoordinator() agentWS, agentServerWS := net.Pipe() defer 
agentWS.Close() From 68a812b134d43b3777d7173fdeadc503eea9ad4e Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 23 Sep 2022 13:26:25 -0500 Subject: [PATCH 02/79] fixup! feat: HA tailnet coordinator --- enterprise/tailnet/coordinator.go | 132 +++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 40 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 8824f584d60da..6999fa7157d48 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -5,7 +5,6 @@ import ( "context" "encoding/json" "errors" - "fmt" "io" "net" "sync" @@ -150,7 +149,7 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error if !ok { // If we don't own the agent locally, send it over pubsub to a node that // owns the agent. - err := c.publishNodeToAgent(agent, node) + err := c.publishNodesToAgent(agent, []*agpl.Node{node}) if err != nil { return xerrors.Errorf("publish node to agent") } @@ -178,18 +177,15 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error // incoming connections and publishes node updates. func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() - sockets, ok := c.agentToConnectionSockets[id] - if ok { - // Publish all nodes that want to connect to the - // desired agent ID. - nodes := make([]*agpl.Node, 0, len(sockets)) - for targetID := range sockets { - node, ok := c.nodes[targetID] - if !ok { - continue - } - nodes = append(nodes, node) - } + + // Tell clients on other instances to send a callmemaybe to us. + err := c.publishAgentHello(id) + if err != nil { + return xerrors.Errorf("publish agent hello: %w", err) + } + + nodes := c.nodesSubscribedToAgent(id) + if len(nodes) > 0 { data, err := json.Marshal(nodes) if err != nil { c.mutex.Unlock() @@ -220,21 +216,46 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { decoder := json.NewDecoder(conn) for { - err := c.hangleAgentUpdate(id, decoder, false) + node, err := c.hangleAgentUpdate(id, decoder) if err != nil { if errors.Is(err, io.EOF) { return nil } return xerrors.Errorf("handle next agent message: %w", err) } + + err = c.publishAgentToNodes(id, node) + if err != nil { + return xerrors.Errorf("publish agent to nodes: %w", err) + } } } -func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, fromPubsub bool) error { +func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { + sockets, ok := c.agentToConnectionSockets[agentID] + if !ok { + return nil + } + + // Publish all nodes that want to connect to the + // desired agent ID. + nodes := make([]*agpl.Node, 0, len(sockets)) + for targetID := range sockets { + node, ok := c.nodes[targetID] + if !ok { + continue + } + nodes = append(nodes, node) + } + + return nodes +} + +func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) (*agpl.Node, error) { var node agpl.Node err := decoder.Decode(&node) if err != nil { - return xerrors.Errorf("read json: %w", err) + return nil, xerrors.Errorf("read json: %w", err) } c.mutex.Lock() @@ -242,22 +263,14 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, f c.nodes[id] = &node - // Don't send the agent back over pubsub if that's where we received it from! 
- if !fromPubsub { - err = c.publishAgentToNodes(id, &node) - if err != nil { - return xerrors.Errorf("publish agent to nodes: %w", err) - } - } - connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { - return nil + return &node, nil } data, err := json.Marshal([]*agpl.Node{&node}) if err != nil { - return xerrors.Errorf("marshal nodes: %w", err) + return nil, xerrors.Errorf("marshal nodes: %w", err) } // Publish the new node to every listening socket. @@ -273,7 +286,7 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder, f } wg.Wait() - return nil + return &node, nil } func (c *haCoordinator) Close() error { @@ -281,13 +294,26 @@ func (c *haCoordinator) Close() error { return nil } -func (c *haCoordinator) publishNodeToAgent(recipient uuid.UUID, node *agpl.Node) error { - msg, err := c.formatCallMeMaybe(recipient, node) +func (c *haCoordinator) publishNodesToAgent(recipient uuid.UUID, nodes []*agpl.Node) error { + msg, err := c.formatCallMeMaybe(recipient, nodes) + if err != nil { + return xerrors.Errorf("format publish message: %w", err) + } + + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish message: %w", err) + } + + return nil +} + +func (c *haCoordinator) publishAgentHello(id uuid.UUID) error { + msg, err := c.formatAgentHello(id) if err != nil { return xerrors.Errorf("format publish message: %w", err) } - fmt.Println("publishing callmemaybe", c.id.String()) err = c.pubsub.Publish("wireguard_peers", msg) if err != nil { return xerrors.Errorf("publish message: %w", err) @@ -302,7 +328,6 @@ func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error return xerrors.Errorf("format publish message: %w", err) } - fmt.Println("publishing agentupdate", c.id.String()) err = c.pubsub.Publish("wireguard_peers", msg) if err != nil { return xerrors.Errorf("publish message: %w", err) @@ -345,19 +370,16 @@ func (c *haCoordinator) runPubsub() error { return } - fmt.Println("got callmemaybe", agentUUID.String()) c.mutex.Lock() defer c.mutex.Unlock() - fmt.Println("process callmemaybe", agentUUID.String()) agentSocket, ok := c.agentSockets[agentUUID] if !ok { - fmt.Println("no socket") return } // We get a single node over pubsub, so turn into an array. 
- _, err = agentSocket.Write(bytes.Join([][]byte{[]byte("["), nodeJSON, []byte("]")}, []byte{})) + _, err = agentSocket.Write(nodeJSON) if err != nil { if errors.Is(err, io.EOF) { return @@ -365,18 +387,37 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) return } - fmt.Println("success callmemaybe", agentUUID.String()) + + case "agenthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } + + c.mutex.Lock() + nodes := c.nodesSubscribedToAgent(agentUUID) + c.mutex.Unlock() + if len(nodes) > 0 { + err := c.publishNodesToAgent(agentUUID, nodes) + if err != nil { + c.log.Error(ctx, "publish nodes to agent", slog.Error(err)) + return + } + } case "agentupdate": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return } decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) - err = c.hangleAgentUpdate(agentUUID, decoder, true) + _, err = c.hangleAgentUpdate(agentUUID, decoder) if err != nil { c.log.Error(ctx, "handle agent update", slog.Error(err)) + return } default: @@ -396,13 +437,13 @@ func (c *haCoordinator) runPubsub() error { } // format: <coordinator id>|callmemaybe|<recipient id>|<node json> -func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, node *agpl.Node) ([]byte, error) { +func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, nodes []*agpl.Node) ([]byte, error) { buf := bytes.Buffer{} buf.WriteString(c.id.String() + "|") buf.WriteString("callmemaybe|") buf.WriteString(recipient.String() + "|") - err := json.NewEncoder(&buf).Encode(node) + err := json.NewEncoder(&buf).Encode(nodes) if err != nil { return nil, xerrors.Errorf("encode node: %w", err) } @@ -410,6 +451,17 @@ func (c *haCoordinator) formatCallMeMaybe(recipient uuid.UUID, node *agpl.Node) return buf.Bytes(), nil } +// format: <coordinator id>|agenthello|<agent id>| +func (c *haCoordinator) formatAgentHello(id uuid.UUID) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("agenthello|") + buf.WriteString(id.String() + "|") + + return buf.Bytes(), nil +} + // format: <coordinator id>|agentupdate|<agent id>|<node json> func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte, error) { buf := bytes.Buffer{} From 774c5dafe3cb41a9eca1819531f073fd8ff9c9b9 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 23 Sep 2022 13:29:40 -0500 Subject: [PATCH 03/79] fixup! feat: HA tailnet coordinator --- enterprise/tailnet/coordinator.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 6999fa7157d48..61b4bd5759ace 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -184,6 +184,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { return xerrors.Errorf("publish agent hello: %w", err) } + // Publish all nodes on this instance that want to connect to this agent. nodes := c.nodesSubscribedToAgent(id) if len(nodes) > 0 { data, err := json.Marshal(nodes) if err != nil { c.mutex.Unlock() @@ -237,8 +238,6 @@ func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { return nil } - // Publish all nodes that want to connect to the - // desired agent ID.
nodes := make([]*agpl.Node, 0, len(sockets)) for targetID := range sockets { node, ok := c.nodes[targetID] From bd82c5e36c79c080954b38255c8d198a0f0b925f Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 23 Sep 2022 13:30:58 -0500 Subject: [PATCH 04/79] remove printlns --- enterprise/tailnet/coordinator_test.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/tailnet/coordinator_test.go index 48fce5bfd0f6f..4889cd1c8ba60 100644 --- a/enterprise/tailnet/coordinator_test.go +++ b/enterprise/tailnet/coordinator_test.go @@ -1,7 +1,6 @@ package tailnet_test import ( - "fmt" "net" "testing" @@ -182,9 +181,7 @@ func TestCoordinatorHA(t *testing.T) { defer agentWS.Close() agentNodeChan := make(chan []*agpl.Node) sendAgentNode, agentErrChan := agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { - fmt.Println("got agent node") agentNodeChan <- nodes - fmt.Println("sent agent node") return nil }) agentID := uuid.New() @@ -204,9 +201,7 @@ func TestCoordinatorHA(t *testing.T) { defer clientServerWS.Close() clientNodeChan := make(chan []*agpl.Node) sendClientNode, clientErrChan := agpl.ServeCoordinator(clientWS, func(nodes []*agpl.Node) error { - fmt.Println("got client node") clientNodeChan <- nodes - fmt.Println("sent client node") return nil }) clientID := uuid.New() @@ -239,9 +234,7 @@ func TestCoordinatorHA(t *testing.T) { defer agentWS.Close() agentNodeChan = make(chan []*agpl.Node) _, agentErrChan = agpl.ServeCoordinator(agentWS, func(nodes []*agpl.Node) error { - fmt.Println("got agent node") agentNodeChan <- nodes - fmt.Println("sent agent node") return nil }) closeAgentChan = make(chan struct{}) From fbad8d075ddfb47d99c9cd7f2d1696ded78266ed Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 11:49:19 -0500 Subject: [PATCH 05/79] close all connections on coordinator --- codersdk/features.go | 12 ++++--- enterprise/coderd/coderd.go | 8 +++++ enterprise/coderd/license/license.go | 25 +++++++++++---- enterprise/tailnet/coordinator.go | 30 ++++++++++++++++- tailnet/coordinator.go | 48 ++++++++++++++++++++++++++-- 5 files changed, 109 insertions(+), 14 deletions(-) diff --git a/codersdk/features.go b/codersdk/features.go index fe8673ef028fd..6884f44087629 100644 --- a/codersdk/features.go +++ b/codersdk/features.go @@ -15,11 +15,12 @@ const ( ) const ( - FeatureUserLimit = "user_limit" - FeatureAuditLog = "audit_log" - FeatureBrowserOnly = "browser_only" - FeatureSCIM = "scim" - FeatureWorkspaceQuota = "workspace_quota" + FeatureUserLimit = "user_limit" + FeatureAuditLog = "audit_log" + FeatureBrowserOnly = "browser_only" + FeatureSCIM = "scim" + FeatureWorkspaceQuota = "workspace_quota" + FeatureHighAvailability = "high_availability" ) var FeatureNames = []string{ @@ -28,6 +29,7 @@ var FeatureNames = []string{ FeatureBrowserOnly, FeatureSCIM, FeatureWorkspaceQuota, + FeatureHighAvailability, } type Feature struct { diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 11cceef98f0db..a6595e8bd6554 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -170,6 +170,14 @@ func (api *API) updateEntitlements(ctx context.Context) error { api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer) } + if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { + enforcer := workspacequota.NewNop() + if enabled { + enforcer = NewEnforcer(api.Options.UserWorkspaceQuota) + } + api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer) + } + api.entitlements = entitlements 
return nil diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index 55a62eee17eee..84d28dfcccb21 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -17,7 +17,13 @@ import ( ) // Entitlements processes licenses to return whether features are enabled or not. -func Entitlements(ctx context.Context, db database.Store, logger slog.Logger, keys map[string]ed25519.PublicKey, enablements map[string]bool) (codersdk.Entitlements, error) { +func Entitlements( + ctx context.Context, + db database.Store, + logger slog.Logger, + keys map[string]ed25519.PublicKey, + enablements map[string]bool, +) (codersdk.Entitlements, error) { now := time.Now() // Default all entitlements to be disabled. entitlements := codersdk.Entitlements{ @@ -96,6 +102,12 @@ func Entitlements(ctx context.Context, db database.Store, logger slog.Logger, ke Enabled: enablements[codersdk.FeatureWorkspaceQuota], } } + if claims.Features.HighAvailability > 0 { + entitlements.Features[codersdk.FeatureHighAvailability] = codersdk.Feature{ + Entitlement: entitlement, + Enabled: enablements[codersdk.FeatureHighAvailability], + } + } if claims.AllFeatures { allFeatures = true } @@ -165,11 +177,12 @@ var ( ) type Features struct { - UserLimit int64 `json:"user_limit"` - AuditLog int64 `json:"audit_log"` - BrowserOnly int64 `json:"browser_only"` - SCIM int64 `json:"scim"` - WorkspaceQuota int64 `json:"workspace_quota"` + UserLimit int64 `json:"user_limit"` + AuditLog int64 `json:"audit_log"` + BrowserOnly int64 `json:"browser_only"` + SCIM int64 `json:"scim"` + WorkspaceQuota int64 `json:"workspace_quota"` + HighAvailability int64 `json:"high_availability"` } type Claims struct { diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 61b4bd5759ace..6bf2327507165 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -14,7 +14,6 @@ import ( "golang.org/x/xerrors" "cdr.dev/slog" - "github.com/coder/coder/coderd/database" agpl "github.com/coder/coder/tailnet" ) @@ -288,8 +287,37 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( return &node, nil } +// Close closes all of the open connections in the coordinator and stops the +// coordinator from accepting new connections. func (c *haCoordinator) Close() error { + c.mutex.Lock() + defer c.mutex.Unlock() + close(c.close) + + wg := sync.WaitGroup{} + + wg.Add(len(c.agentSockets)) + for _, socket := range c.agentSockets { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + + for _, connMap := range c.agentToConnectionSockets { + wg.Add(len(connMap)) + for _, socket := range connMap { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + } + + wg.Wait() return nil } diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index d79ffa34a5a3b..150a323bcfe52 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -99,6 +99,7 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func // in-memory. func NewMemoryCoordinator() Coordinator { return &memoryCoordinator{ + closed: false, nodes: map[uuid.UUID]*Node{}, agentSockets: map[uuid.UUID]net.Conn{}, agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, @@ -112,7 +113,8 @@ func NewMemoryCoordinator() Coordinator { // This coordinator is incompatible with multiple Coder // replicas as all node data is in-memory. 
type memoryCoordinator struct { - mutex sync.Mutex + mutex sync.Mutex + closed bool // nodes maps agent and connection IDs their respective node. nodes map[uuid.UUID]*Node @@ -135,6 +137,11 @@ func (c *memoryCoordinator) Node(id uuid.UUID) *Node { // with the specified ID. func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() + + if c.closed { + return xerrors.New("coordinator is closed") + } + // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. node, ok := c.nodes[agent] @@ -229,6 +236,11 @@ func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder // listens to incoming connections and publishes node updates. func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() + + if c.closed { + return xerrors.New("coordinator is closed") + } + sockets, ok := c.agentToConnectionSockets[id] if ok { // Publish all nodes that want to connect to the @@ -320,4 +332,36 @@ func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.D return nil } -func (*memoryCoordinator) Close() error { return nil } +// Close closes all of the open connections in the coordinator and stops the +// coordinator from accepting new connections. +func (c *memoryCoordinator) Close() error { + c.mutex.Lock() + defer c.mutex.Unlock() + + c.closed = true + + wg := sync.WaitGroup{} + + wg.Add(len(c.agentSockets)) + for _, socket := range c.agentSockets { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + + for _, connMap := range c.agentToConnectionSockets { + wg.Add(len(connMap)) + for _, socket := range connMap { + socket := socket + go func() { + _ = socket.Close() + wg.Done() + }() + } + } + + wg.Wait() + return nil +} From 46803aa38ba2d4189f687bda248f01bf933bf18e Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 12:22:44 -0500 Subject: [PATCH 06/79] implement high availability feature --- coderd/coderd.go | 2 ++ coderd/provisionerjobs.go | 2 +- coderd/workspaceagents.go | 16 ++++++++-------- coderd/workspacebuilds.go | 2 +- enterprise/coderd/coderd.go | 24 +++++++++++++++++++++--- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 58686ae66fbcd..f3cdab0caea04 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -158,6 +158,7 @@ func New(options *Options) *API { api.Auditor.Store(&options.Auditor) api.WorkspaceQuotaEnforcer.Store(&options.WorkspaceQuotaEnforcer) api.workspaceAgentCache = wsconncache.New(api.dialWorkspaceAgentTailnet, 0) + api.TailnetCoordinator.Store(&options.TailnetCoordinator) api.derpServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) oauthConfigs := &httpmw.OAuth2Configs{ Github: options.GithubOAuth2Config, @@ -525,6 +526,7 @@ type API struct { Auditor atomic.Pointer[audit.Auditor] WorkspaceClientCoordinateOverride atomic.Pointer[func(rw http.ResponseWriter) bool] WorkspaceQuotaEnforcer atomic.Pointer[workspacequota.Enforcer] + TailnetCoordinator atomic.Pointer[tailnet.Coordinator] HTTPAuth *HTTPAuthorizer // APIHandler serves "/api/v2" diff --git a/coderd/provisionerjobs.go b/coderd/provisionerjobs.go index 56a825ea09a3a..68802df04e5ec 100644 --- a/coderd/provisionerjobs.go +++ b/coderd/provisionerjobs.go @@ -270,7 +270,7 @@ func (api *API) provisionerJobResources(rw http.ResponseWriter, r *http.Request, } } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, 
api.TailnetCoordinator, agent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), agent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading job agent.", diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 247915db99592..29943c8701ec8 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -48,7 +48,7 @@ func (api *API) workspaceAgent(rw http.ResponseWriter, r *http.Request) { }) return } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, convertApps(dbApps), api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -77,7 +77,7 @@ func (api *API) workspaceAgentApps(rw http.ResponseWriter, r *http.Request) { func (api *API) workspaceAgentMetadata(rw http.ResponseWriter, r *http.Request) { ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -97,7 +97,7 @@ func (api *API) workspaceAgentMetadata(rw http.ResponseWriter, r *http.Request) func (api *API) postWorkspaceAgentVersion(rw http.ResponseWriter, r *http.Request) { ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -151,7 +151,7 @@ func (api *API) workspaceAgentPTY(rw http.ResponseWriter, r *http.Request) { httpapi.ResourceNotFound(rw) return } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -228,7 +228,7 @@ func (api *API) workspaceAgentListeningPorts(rw http.ResponseWriter, r *http.Req return } - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), workspaceAgent, nil, api.AgentInactiveDisconnectTimeout) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error reading workspace agent.", @@ -322,7 +322,7 @@ func (api *API) 
dialWorkspaceAgentTailnet(r *http.Request, agentID uuid.UUID) (* }) conn.SetNodeCallback(sendNodes) go func() { - err := api.TailnetCoordinator.ServeClient(serverConn, uuid.New(), agentID) + err := (*api.TailnetCoordinator.Load()).ServeClient(serverConn, uuid.New(), agentID) if err != nil { _ = conn.Close() } @@ -460,7 +460,7 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request closeChan := make(chan struct{}) go func() { defer close(closeChan) - err := api.TailnetCoordinator.ServeAgent(wsNetConn, workspaceAgent.ID) + err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID) if err != nil { _ = conn.Close(websocket.StatusInternalError, err.Error()) return @@ -529,7 +529,7 @@ func (api *API) workspaceAgentClientCoordinate(rw http.ResponseWriter, r *http.R go httpapi.Heartbeat(ctx, conn) defer conn.Close(websocket.StatusNormalClosure, "") - err = api.TailnetCoordinator.ServeClient(websocket.NetConn(ctx, conn, websocket.MessageBinary), uuid.New(), workspaceAgent.ID) + err = (*api.TailnetCoordinator.Load()).ServeClient(websocket.NetConn(ctx, conn, websocket.MessageBinary), uuid.New(), workspaceAgent.ID) if err != nil { _ = conn.Close(websocket.StatusInternalError, err.Error()) return diff --git a/coderd/workspacebuilds.go b/coderd/workspacebuilds.go index 6ece8d379b153..88e162fa7db94 100644 --- a/coderd/workspacebuilds.go +++ b/coderd/workspacebuilds.go @@ -831,7 +831,7 @@ func (api *API) convertWorkspaceBuild( apiAgents := make([]codersdk.WorkspaceAgent, 0) for _, agent := range agents { apps := appsByAgentID[agent.ID] - apiAgent, err := convertWorkspaceAgent(api.DERPMap, api.TailnetCoordinator, agent, convertApps(apps), api.AgentInactiveDisconnectTimeout) + apiAgent, err := convertWorkspaceAgent(api.DERPMap, *api.TailnetCoordinator.Load(), agent, convertApps(apps), api.AgentInactiveDisconnectTimeout) if err != nil { return codersdk.WorkspaceBuild{}, xerrors.Errorf("converting workspace agent: %w", err) } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index a6595e8bd6554..8eddcf42e325b 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -22,6 +22,8 @@ import ( "github.com/coder/coder/enterprise/audit" "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" + "github.com/coder/coder/enterprise/tailnet" + agpltailnet "github.com/coder/coder/tailnet" ) // New constructs an Enterprise coderd API instance. @@ -171,11 +173,27 @@ func (api *API) updateEntitlements(ctx context.Context) error { } if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { - enforcer := workspacequota.NewNop() + coordinator := agpltailnet.NewMemoryCoordinator() if enabled { - enforcer = NewEnforcer(api.Options.UserWorkspaceQuota) + haCoordinator, err := tailnet.NewHACoordinator(api.Logger, api.Pubsub) + if err != nil { + api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + // If we try to setup the HA coordinator and it fails, nothing + // is actually changing. + changed = false + } else { + coordinator = haCoordinator + } + } + + // Recheck changed in case the HA coordinator failed to set up. 
+ if changed { + oldCoordinator := *api.AGPL.TailnetCoordinator.Swap(&coordinator) + err := oldCoordinator.Close() + if err != nil { + api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + } } - api.AGPL.WorkspaceQuotaEnforcer.Store(&enforcer) } api.entitlements = entitlements From d38391e9f6ff27351e33017540efcc21f3dcc7d8 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 12:45:29 -0500 Subject: [PATCH 07/79] fixup! impelement high availability feature --- enterprise/coderd/coderd.go | 2 +- .../coderd/coderdenttest/coderdenttest.go | 23 ++++++++++--------- enterprise/coderd/license/license_test.go | 9 ++++---- enterprise/coderd/licenses_test.go | 22 ++++++++++-------- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 8eddcf42e325b..d52596c547027 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -191,7 +191,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { oldCoordinator := *api.AGPL.TailnetCoordinator.Swap(&coordinator) err := oldCoordinator.Close() if err != nil { - api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + api.Logger.Error(ctx, "close old tailnet coordinator", slog.Error(err)) } } } diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 90d09fd5c9c85..a9e08b4aac088 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -85,17 +85,18 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } type LicenseOptions struct { - AccountType string - AccountID string - Trial bool - AllFeatures bool - GraceAt time.Time - ExpiresAt time.Time - UserLimit int64 - AuditLog bool - BrowserOnly bool - SCIM bool - WorkspaceQuota bool + AccountType string + AccountID string + Trial bool + AllFeatures bool + GraceAt time.Time + ExpiresAt time.Time + UserLimit int64 + AuditLog bool + BrowserOnly bool + SCIM bool + WorkspaceQuota bool + HighAvailability bool } // AddLicense generates a new license with the options provided and inserts it. 
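
Editor's note — a minimal sketch, not part of the patch: a test that wants the new high-availability feature entitled would set the added option when generating a license with the helpers shown in this hunk. This assumes the usual test imports (testing, time, require, coderdenttest); other options are omitted for brevity.

// Sketch only: exercising the new HighAvailability license option.
func TestHighAvailabilityLicense(t *testing.T) {
	t.Parallel()
	// GenerateLicense returns the signed license JWT; AddLicense (above)
	// would insert it into a running deployment instead.
	licenseJWT := coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{
		UserLimit:        100,
		HighAvailability: true,
		GraceAt:          time.Now().Add(time.Hour),
		ExpiresAt:        time.Now().Add(2 * time.Hour),
	})
	require.NotEmpty(t, licenseJWT)
}
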
diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 85958fbf4f60d..39d6e05fb50d3 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -20,10 +20,11 @@ import ( func TestEntitlements(t *testing.T) { t.Parallel() all := map[string]bool{ - codersdk.FeatureAuditLog: true, - codersdk.FeatureBrowserOnly: true, - codersdk.FeatureSCIM: true, - codersdk.FeatureWorkspaceQuota: true, + codersdk.FeatureAuditLog: true, + codersdk.FeatureBrowserOnly: true, + codersdk.FeatureSCIM: true, + codersdk.FeatureWorkspaceQuota: true, + codersdk.FeatureHighAvailability: true, } t.Run("Defaults", func(t *testing.T) { diff --git a/enterprise/coderd/licenses_test.go b/enterprise/coderd/licenses_test.go index 59d36cc9157a6..5b4c89212578d 100644 --- a/enterprise/coderd/licenses_test.go +++ b/enterprise/coderd/licenses_test.go @@ -99,21 +99,23 @@ func TestGetLicense(t *testing.T) { assert.Equal(t, int32(1), licenses[0].ID) assert.Equal(t, "testing", licenses[0].Claims["account_id"]) assert.Equal(t, map[string]interface{}{ - codersdk.FeatureUserLimit: json.Number("0"), - codersdk.FeatureAuditLog: json.Number("1"), - codersdk.FeatureSCIM: json.Number("1"), - codersdk.FeatureBrowserOnly: json.Number("1"), - codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureUserLimit: json.Number("0"), + codersdk.FeatureAuditLog: json.Number("1"), + codersdk.FeatureSCIM: json.Number("1"), + codersdk.FeatureBrowserOnly: json.Number("1"), + codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureHighAvailability: json.Number("0"), }, licenses[0].Claims["features"]) assert.Equal(t, int32(2), licenses[1].ID) assert.Equal(t, "testing2", licenses[1].Claims["account_id"]) assert.Equal(t, true, licenses[1].Claims["trial"]) assert.Equal(t, map[string]interface{}{ - codersdk.FeatureUserLimit: json.Number("200"), - codersdk.FeatureAuditLog: json.Number("1"), - codersdk.FeatureSCIM: json.Number("1"), - codersdk.FeatureBrowserOnly: json.Number("1"), - codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureUserLimit: json.Number("200"), + codersdk.FeatureAuditLog: json.Number("1"), + codersdk.FeatureSCIM: json.Number("1"), + codersdk.FeatureBrowserOnly: json.Number("1"), + codersdk.FeatureWorkspaceQuota: json.Number("0"), + codersdk.FeatureHighAvailability: json.Number("0"), }, licenses[1].Claims["features"]) }) } From a0bcd6464f16483c9524a69137de1dcc7d309095 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 12:53:18 -0500 Subject: [PATCH 08/79] fixup! 
impelement high availability feature --- enterprise/coderd/license/license_test.go | 26 ++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 39d6e05fb50d3..204c6e7c3f5a2 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -60,11 +60,12 @@ func TestEntitlements(t *testing.T) { db := databasefake.New() db.InsertLicense(context.Background(), database.InsertLicenseParams{ JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ - UserLimit: 100, - AuditLog: true, - BrowserOnly: true, - SCIM: true, - WorkspaceQuota: true, + UserLimit: 100, + AuditLog: true, + BrowserOnly: true, + SCIM: true, + WorkspaceQuota: true, + HighAvailability: true, }), Exp: time.Now().Add(time.Hour), }) @@ -81,13 +82,14 @@ func TestEntitlements(t *testing.T) { db := databasefake.New() db.InsertLicense(context.Background(), database.InsertLicenseParams{ JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ - UserLimit: 100, - AuditLog: true, - BrowserOnly: true, - SCIM: true, - WorkspaceQuota: true, - GraceAt: time.Now().Add(-time.Hour), - ExpiresAt: time.Now().Add(time.Hour), + UserLimit: 100, + AuditLog: true, + BrowserOnly: true, + SCIM: true, + WorkspaceQuota: true, + HighAvailability: true, + GraceAt: time.Now().Add(-time.Hour), + ExpiresAt: time.Now().Add(time.Hour), }), Exp: time.Now().Add(time.Hour), }) From 1f33018bd1c586956c748e65c08e2049fcfdee78 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 13:02:40 -0500 Subject: [PATCH 09/79] fixup! impelement high availability feature --- enterprise/coderd/coderdenttest/coderdenttest.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index a9e08b4aac088..2c4250325b567 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -132,6 +132,10 @@ func GenerateLicense(t *testing.T, options LicenseOptions) string { if options.WorkspaceQuota { workspaceQuota = 1 } + highAvailability := int64(0) + if options.HighAvailability { + highAvailability = 1 + } c := &license.Claims{ RegisteredClaims: jwt.RegisteredClaims{ @@ -147,11 +151,12 @@ func GenerateLicense(t *testing.T, options LicenseOptions) string { Version: license.CurrentVersion, AllFeatures: options.AllFeatures, Features: license.Features{ - UserLimit: options.UserLimit, - AuditLog: auditLog, - BrowserOnly: browserOnly, - SCIM: scim, - WorkspaceQuota: workspaceQuota, + UserLimit: options.UserLimit, + AuditLog: auditLog, + BrowserOnly: browserOnly, + SCIM: scim, + WorkspaceQuota: workspaceQuota, + HighAvailability: highAvailability, }, } tok := jwt.NewWithClaims(jwt.SigningMethodEdDSA, c) From b6a507020417a5704d7d1336336cb5b961fa42eb Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Fri, 7 Oct 2022 13:11:20 -0500 Subject: [PATCH 10/79] fixup! 
impelement high availability feature --- enterprise/cli/features_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/enterprise/cli/features_test.go b/enterprise/cli/features_test.go index f5e7b1ff3520a..f892182f164fe 100644 --- a/enterprise/cli/features_test.go +++ b/enterprise/cli/features_test.go @@ -57,7 +57,7 @@ func TestFeaturesList(t *testing.T) { var entitlements codersdk.Entitlements err := json.Unmarshal(buf.Bytes(), &entitlements) require.NoError(t, err, "unmarshal JSON output") - assert.Len(t, entitlements.Features, 5) + assert.Len(t, entitlements.Features, 6) assert.Empty(t, entitlements.Warnings) assert.Equal(t, codersdk.EntitlementNotEntitled, entitlements.Features[codersdk.FeatureUserLimit].Entitlement) @@ -69,6 +69,8 @@ func TestFeaturesList(t *testing.T) { entitlements.Features[codersdk.FeatureWorkspaceQuota].Entitlement) assert.Equal(t, codersdk.EntitlementNotEntitled, entitlements.Features[codersdk.FeatureSCIM].Entitlement) + assert.Equal(t, codersdk.EntitlementNotEntitled, + entitlements.Features[codersdk.FeatureHighAvailability].Entitlement) assert.False(t, entitlements.HasLicense) assert.False(t, entitlements.Experimental) }) From 1883430c952607d28b188ca41c51fee7006e2250 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 04:22:37 +0000 Subject: [PATCH 11/79] Add replicas --- coderd/database/databasefake/databasefake.go | 78 ++++ coderd/database/dump.sql | 16 +- .../migrations/000059_replicas.down.sql | 2 + .../migrations/000059_replicas.up.sql | 26 ++ coderd/database/models.go | 14 + coderd/database/querier.go | 5 + coderd/database/queries.sql.go | 183 ++++++++- coderd/database/queries/replicas.sql | 33 ++ enterprise/replica/replica.go | 371 ++++++++++++++++++ enterprise/replica/replica_test.go | 193 +++++++++ enterprise/tailmesh/tailmesh.go | 32 ++ 11 files changed, 949 insertions(+), 4 deletions(-) create mode 100644 coderd/database/migrations/000059_replicas.down.sql create mode 100644 coderd/database/migrations/000059_replicas.up.sql create mode 100644 coderd/database/queries/replicas.sql create mode 100644 enterprise/replica/replica.go create mode 100644 enterprise/replica/replica_test.go create mode 100644 enterprise/tailmesh/tailmesh.go diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index 1a2a919925ec2..ae41d9c23620b 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -107,6 +107,7 @@ type data struct { workspaceApps []database.WorkspaceApp workspaces []database.Workspace licenses []database.License + replicas []database.Replica deploymentID string lastLicenseID int32 @@ -3025,3 +3026,80 @@ func (q *fakeQuerier) DeleteGroupByID(_ context.Context, id uuid.UUID) error { return sql.ErrNoRows } + +func (q *fakeQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, before time.Time) error { + q.mutex.Lock() + defer q.mutex.Unlock() + + for i, replica := range q.replicas { + if replica.UpdatedAt.Before(before) { + q.replicas = append(q.replicas[:i], q.replicas[i+1:]...) 
+ } + } + + return nil +} + +func (q *fakeQuerier) InsertReplica(_ context.Context, arg database.InsertReplicaParams) (database.Replica, error) { + q.mutex.Lock() + defer q.mutex.Unlock() + + replica := database.Replica{ + ID: arg.ID, + CreatedAt: arg.CreatedAt, + StartedAt: arg.StartedAt, + UpdatedAt: arg.UpdatedAt, + Hostname: arg.Hostname, + RegionID: arg.RegionID, + RelayAddress: arg.RelayAddress, + Version: arg.Version, + } + q.replicas = append(q.replicas, replica) + return replica, nil +} + +func (q *fakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { + q.mutex.Lock() + defer q.mutex.Unlock() + + for index, replica := range q.replicas { + if replica.ID != arg.ID { + continue + } + replica.Hostname = arg.Hostname + replica.StartedAt = arg.StartedAt + replica.StoppedAt = arg.StoppedAt + replica.UpdatedAt = arg.UpdatedAt + replica.RelayAddress = arg.RelayAddress + replica.RegionID = arg.RegionID + replica.Version = arg.Version + replica.Error = arg.Error + q.replicas[index] = replica + return replica, nil + } + return database.Replica{}, sql.ErrNoRows +} + +func (q *fakeQuerier) GetReplicasUpdatedAfter(_ context.Context, updatedAt time.Time) ([]database.Replica, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + replicas := make([]database.Replica, 0) + for _, replica := range q.replicas { + if replica.UpdatedAt.After(updatedAt) && !replica.StoppedAt.Valid { + replicas = append(replicas, replica) + } + } + return replicas, nil +} + +func (q *fakeQuerier) GetReplicaByID(_ context.Context, id uuid.UUID) (database.Replica, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + for _, replica := range q.replicas { + if replica.ID == id { + return replica, nil + } + } + return database.Replica{}, sql.ErrNoRows +} diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index eb16074e90525..4b956fb64f10e 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -245,7 +245,8 @@ CREATE TABLE provisioner_daemons ( created_at timestamp with time zone NOT NULL, updated_at timestamp with time zone, name character varying(64) NOT NULL, - provisioners provisioner_type[] NOT NULL + provisioners provisioner_type[] NOT NULL, + replica_id uuid ); CREATE TABLE provisioner_job_logs ( @@ -276,6 +277,19 @@ CREATE TABLE provisioner_jobs ( worker_id uuid ); +CREATE TABLE replicas ( + id uuid NOT NULL, + created_at timestamp with time zone NOT NULL, + started_at timestamp with time zone NOT NULL, + stopped_at timestamp with time zone, + updated_at timestamp with time zone NOT NULL, + hostname text NOT NULL, + region_id integer NOT NULL, + relay_address text NOT NULL, + version text NOT NULL, + error text +); + CREATE TABLE site_configs ( key character varying(256) NOT NULL, value character varying(8192) NOT NULL diff --git a/coderd/database/migrations/000059_replicas.down.sql b/coderd/database/migrations/000059_replicas.down.sql new file mode 100644 index 0000000000000..4cca6615d4213 --- /dev/null +++ b/coderd/database/migrations/000059_replicas.down.sql @@ -0,0 +1,2 @@ +DROP TABLE replicas; +ALTER TABLE provisioner_daemons DROP COLUMN replica_id; diff --git a/coderd/database/migrations/000059_replicas.up.sql b/coderd/database/migrations/000059_replicas.up.sql new file mode 100644 index 0000000000000..a07587f35a234 --- /dev/null +++ b/coderd/database/migrations/000059_replicas.up.sql @@ -0,0 +1,26 @@ +CREATE TABLE IF NOT EXISTS replicas ( + -- A unique identifier for the replica that is stored on disk. 
+ -- For persistent replicas, this will be reused. + -- For ephemeral replicas, this will be a new UUID for each one. + id uuid NOT NULL, + created_at timestamp with time zone NOT NULL, + -- The time the replica was created. + started_at timestamp with time zone NOT NULL, + -- The time the replica was last seen. + stopped_at timestamp with time zone, + -- Updated periodically to ensure the replica is still alive. + updated_at timestamp with time zone NOT NULL, + -- Hostname is the hostname of the replica. + hostname text NOT NULL, + -- Region is the region the replica is in. + -- We only DERP mesh to the same region ID of a running replica. + region_id integer NOT NULL, + -- An address that should be accessible to other replicas. + relay_address text NOT NULL, + -- Version is the Coder version of the replica. + version text NOT NULL, + error text +); + +-- Associates a provisioner daemon with a replica. +ALTER TABLE provisioner_daemons ADD COLUMN replica_id uuid; diff --git a/coderd/database/models.go b/coderd/database/models.go index f669b5e618138..9d73e097bfe0f 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -487,6 +487,7 @@ type ProvisionerDaemon struct { UpdatedAt sql.NullTime `db:"updated_at" json:"updated_at"` Name string `db:"name" json:"name"` Provisioners []ProvisionerType `db:"provisioners" json:"provisioners"` + ReplicaID uuid.NullUUID `db:"replica_id" json:"replica_id"` } type ProvisionerJob struct { @@ -517,6 +518,19 @@ type ProvisionerJobLog struct { Output string `db:"output" json:"output"` } +type Replica struct { + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` +} + type SiteConfig struct { Key string `db:"key" json:"key"` Value string `db:"value" json:"value"` diff --git a/coderd/database/querier.go b/coderd/database/querier.go index b58f6abbccfb8..db789e3399939 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -26,6 +26,7 @@ type sqlcQuerier interface { DeleteLicense(ctx context.Context, id int32) (int32, error) DeleteOldAgentStats(ctx context.Context) error DeleteParameterValueByID(ctx context.Context, id uuid.UUID) error + DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt time.Time) error GetAPIKeyByID(ctx context.Context, id string) (APIKey, error) GetAPIKeysByLoginType(ctx context.Context, loginType LoginType) ([]APIKey, error) GetAPIKeysLastUsedAfter(ctx context.Context, lastUsed time.Time) ([]APIKey, error) @@ -66,6 +67,8 @@ type sqlcQuerier interface { GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) GetProvisionerLogsByIDBetween(ctx context.Context, arg GetProvisionerLogsByIDBetweenParams) ([]ProvisionerJobLog, error) + GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) + GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) GetTemplateByID(ctx context.Context, id uuid.UUID) (Template, error) GetTemplateByOrganizationAndName(ctx context.Context, arg 
GetTemplateByOrganizationAndNameParams) (Template, error) GetTemplateDAUs(ctx context.Context, templateID uuid.UUID) ([]GetTemplateDAUsRow, error) @@ -134,6 +137,7 @@ type sqlcQuerier interface { InsertProvisionerDaemon(ctx context.Context, arg InsertProvisionerDaemonParams) (ProvisionerDaemon, error) InsertProvisionerJob(ctx context.Context, arg InsertProvisionerJobParams) (ProvisionerJob, error) InsertProvisionerJobLogs(ctx context.Context, arg InsertProvisionerJobLogsParams) ([]ProvisionerJobLog, error) + InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) InsertTemplate(ctx context.Context, arg InsertTemplateParams) (Template, error) InsertTemplateVersion(ctx context.Context, arg InsertTemplateVersionParams) (TemplateVersion, error) InsertUser(ctx context.Context, arg InsertUserParams) (User, error) @@ -154,6 +158,7 @@ type sqlcQuerier interface { UpdateProvisionerJobByID(ctx context.Context, arg UpdateProvisionerJobByIDParams) error UpdateProvisionerJobWithCancelByID(ctx context.Context, arg UpdateProvisionerJobWithCancelByIDParams) error UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error + UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) UpdateTemplateActiveVersionByID(ctx context.Context, arg UpdateTemplateActiveVersionByIDParams) error UpdateTemplateDeletedByID(ctx context.Context, arg UpdateTemplateDeletedByIDParams) error UpdateTemplateMetaByID(ctx context.Context, arg UpdateTemplateMetaByIDParams) (Template, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index ba90e102b819a..adfe532446a4e 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1985,7 +1985,7 @@ func (q *sqlQuerier) ParameterValues(ctx context.Context, arg ParameterValuesPar const getProvisionerDaemonByID = `-- name: GetProvisionerDaemonByID :one SELECT - id, created_at, updated_at, name, provisioners + id, created_at, updated_at, name, provisioners, replica_id FROM provisioner_daemons WHERE @@ -2001,13 +2001,14 @@ func (q *sqlQuerier) GetProvisionerDaemonByID(ctx context.Context, id uuid.UUID) &i.UpdatedAt, &i.Name, pq.Array(&i.Provisioners), + &i.ReplicaID, ) return i, err } const getProvisionerDaemons = `-- name: GetProvisionerDaemons :many SELECT - id, created_at, updated_at, name, provisioners + id, created_at, updated_at, name, provisioners, replica_id FROM provisioner_daemons ` @@ -2027,6 +2028,7 @@ func (q *sqlQuerier) GetProvisionerDaemons(ctx context.Context) ([]ProvisionerDa &i.UpdatedAt, &i.Name, pq.Array(&i.Provisioners), + &i.ReplicaID, ); err != nil { return nil, err } @@ -2050,7 +2052,7 @@ INSERT INTO provisioners ) VALUES - ($1, $2, $3, $4) RETURNING id, created_at, updated_at, name, provisioners + ($1, $2, $3, $4) RETURNING id, created_at, updated_at, name, provisioners, replica_id ` type InsertProvisionerDaemonParams struct { @@ -2074,6 +2076,7 @@ func (q *sqlQuerier) InsertProvisionerDaemon(ctx context.Context, arg InsertProv &i.UpdatedAt, &i.Name, pq.Array(&i.Provisioners), + &i.ReplicaID, ) return i, err } @@ -2531,6 +2534,180 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a return err } +const deleteReplicasUpdatedBefore = `-- name: DeleteReplicasUpdatedBefore :exec +DELETE FROM replicas WHERE updated_at < $1 +` + +func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt time.Time) error { + _, err := q.db.ExecContext(ctx, deleteReplicasUpdatedBefore, 
updatedAt) + return err +} + +const getReplicaByID = `-- name: GetReplicaByID :one +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE id = $1 +` + +func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { + row := q.db.QueryRowContext(ctx, getReplicaByID, id) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ) + return i, err +} + +const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL +` + +func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) { + rows, err := q.db.QueryContext(ctx, getReplicasUpdatedAfter, updatedAt) + if err != nil { + return nil, err + } + defer rows.Close() + var items []Replica + for rows.Next() { + var i Replica + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const insertReplica = `-- name: InsertReplica :one +INSERT INTO replicas ( + id, + created_at, + started_at, + updated_at, + hostname, + region_id, + relay_address, + version + +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error +` + +type InsertReplicaParams struct { + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + Version string `db:"version" json:"version"` +} + +func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) { + row := q.db.QueryRowContext(ctx, insertReplica, + arg.ID, + arg.CreatedAt, + arg.StartedAt, + arg.UpdatedAt, + arg.Hostname, + arg.RegionID, + arg.RelayAddress, + arg.Version, + ) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ) + return i, err +} + +const updateReplica = `-- name: UpdateReplica :one +UPDATE replicas SET + updated_at = $2, + started_at = $3, + stopped_at = $4, + relay_address = $5, + region_id = $6, + hostname = $7, + version = $8, + error = $9 +WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error +` + +type UpdateReplicaParams struct { + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + RelayAddress string `db:"relay_address" json:"relay_address"` + RegionID int32 `db:"region_id" json:"region_id"` + Hostname string `db:"hostname" 
json:"hostname"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` +} + +func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { + row := q.db.QueryRowContext(ctx, updateReplica, + arg.ID, + arg.UpdatedAt, + arg.StartedAt, + arg.StoppedAt, + arg.RelayAddress, + arg.RegionID, + arg.Hostname, + arg.Version, + arg.Error, + ) + var i Replica + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.StartedAt, + &i.StoppedAt, + &i.UpdatedAt, + &i.Hostname, + &i.RegionID, + &i.RelayAddress, + &i.Version, + &i.Error, + ) + return i, err +} + const getDeploymentID = `-- name: GetDeploymentID :one SELECT value FROM site_configs WHERE key = 'deployment_id' ` diff --git a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql new file mode 100644 index 0000000000000..a7aa5b0aa1dee --- /dev/null +++ b/coderd/database/queries/replicas.sql @@ -0,0 +1,33 @@ +-- name: GetReplicasUpdatedAfter :many +SELECT * FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL; + +-- name: GetReplicaByID :one +SELECT * FROM replicas WHERE id = $1; + +-- name: InsertReplica :one +INSERT INTO replicas ( + id, + created_at, + started_at, + updated_at, + hostname, + region_id, + relay_address, + version + +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING *; + +-- name: UpdateReplica :one +UPDATE replicas SET + updated_at = $2, + started_at = $3, + stopped_at = $4, + relay_address = $5, + region_id = $6, + hostname = $7, + version = $8, + error = $9 +WHERE id = $1 RETURNING *; + +-- name: DeleteReplicasUpdatedBefore :exec +DELETE FROM replicas WHERE updated_at < $1; diff --git a/enterprise/replica/replica.go b/enterprise/replica/replica.go new file mode 100644 index 0000000000000..ca0c450651e64 --- /dev/null +++ b/enterprise/replica/replica.go @@ -0,0 +1,371 @@ +package replica + +import ( + "context" + "database/sql" + "errors" + "fmt" + "net/http" + "os" + "strings" + "sync" + "time" + + "github.com/google/uuid" + "golang.org/x/xerrors" + + "cdr.dev/slog" + + "github.com/coder/coder/buildinfo" + "github.com/coder/coder/coderd/database" +) + +var ( + PubsubEvent = "replica" +) + +type Options struct { + ID uuid.UUID + UpdateInterval time.Duration + PeerTimeout time.Duration + // Mesh will dial active replicas with the same region ID to ensure + // they are reachable. If not, an error will be updated on the replica. + Mesh bool + RelayAddress string + RegionID int32 +} + +// New registers the replica with the database and periodically updates to ensure +// it's healthy. It contacts all other alive replicas to ensure they are reachable. 
+func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Server, error) { + if options.ID == uuid.Nil { + panic("An ID must be provided!") + } + if options.PeerTimeout == 0 { + options.PeerTimeout = 3 * time.Second + } + if options.UpdateInterval == 0 { + options.UpdateInterval = 5 * time.Second + } + hostname, err := os.Hostname() + if err != nil { + return nil, xerrors.Errorf("get hostname: %w", err) + } + var replica database.Replica + _, err = db.GetReplicaByID(ctx, options.ID) + if err != nil { + if !errors.Is(err, sql.ErrNoRows) { + return nil, xerrors.Errorf("get replica: %w", err) + } + replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ + ID: options.ID, + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: hostname, + RegionID: options.RegionID, + RelayAddress: options.RelayAddress, + Version: buildinfo.Version(), + }) + if err != nil { + return nil, xerrors.Errorf("insert replica: %w", err) + } + } else { + replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: options.ID, + UpdatedAt: database.Now(), + StartedAt: database.Now(), + StoppedAt: sql.NullTime{}, + RelayAddress: options.RelayAddress, + RegionID: options.RegionID, + Hostname: hostname, + Version: buildinfo.Version(), + Error: sql.NullString{}, + }) + if err != nil { + return nil, xerrors.Errorf("update replica: %w", err) + } + } + err = pubsub.Publish(PubsubEvent, []byte(options.ID.String())) + if err != nil { + return nil, xerrors.Errorf("publish new replica: %w", err) + } + ctx, cancelFunc := context.WithCancel(ctx) + server := &Server{ + options: &options, + db: db, + pubsub: pubsub, + self: replica, + logger: logger, + closed: make(chan struct{}), + closeCancel: cancelFunc, + } + err = server.run(ctx) + if err != nil { + return nil, xerrors.Errorf("run replica: %w", err) + } + err = server.subscribe(ctx) + if err != nil { + return nil, xerrors.Errorf("subscribe: %w", err) + } + server.closeWait.Add(1) + go server.loop(ctx) + return server, nil +} + +type Server struct { + options *Options + db database.Store + pubsub database.Pubsub + logger slog.Logger + + closeWait sync.WaitGroup + closeMutex sync.Mutex + closed chan (struct{}) + closeCancel context.CancelFunc + + self database.Replica + mutex sync.Mutex + peers []database.Replica + callback func() +} + +// loop runs the replica update sequence on an update interval. +func (s *Server) loop(ctx context.Context) { + defer s.closeWait.Done() + ticker := time.NewTicker(s.options.UpdateInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err := s.run(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + s.logger.Warn(ctx, "run replica update loop", slog.Error(err)) + } + } +} + +// subscribe listens for new replica information! +func (s *Server) subscribe(ctx context.Context) error { + needsUpdate := false + updating := false + updateMutex := sync.Mutex{} + + // This loop will continually update nodes as updates are processed. + // The intent is to always be up to date without spamming the run + // function, so if a new update comes in while one is being processed, + // it will reprocess afterwards. 
+ var update func() + update = func() { + err := s.run(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + s.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) + } + updateMutex.Lock() + if needsUpdate { + needsUpdate = false + updateMutex.Unlock() + update() + return + } + updating = false + updateMutex.Unlock() + } + cancelFunc, err := s.pubsub.Subscribe(PubsubEvent, func(ctx context.Context, message []byte) { + updateMutex.Lock() + defer updateMutex.Unlock() + id, err := uuid.Parse(string(message)) + if err != nil { + return + } + // Don't process updates for ourself! + if id == s.options.ID { + return + } + if updating { + needsUpdate = true + return + } + updating = true + go update() + }) + if err != nil { + return err + } + go func() { + <-ctx.Done() + cancelFunc() + }() + return nil +} + +func (s *Server) run(ctx context.Context) error { + s.closeMutex.Lock() + s.closeWait.Add(1) + s.closeMutex.Unlock() + go func() { + s.closeWait.Done() + }() + // Expect replicas to update once every three times the interval... + // If they don't, assume death! + replicas, err := s.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*s.options.UpdateInterval)) + if err != nil { + return xerrors.Errorf("get replicas: %w", err) + } + + s.mutex.Lock() + s.peers = make([]database.Replica, 0, len(replicas)) + for _, replica := range replicas { + if replica.ID == s.options.ID { + continue + } + s.peers = append(s.peers, replica) + } + s.mutex.Unlock() + + var wg sync.WaitGroup + var mu sync.Mutex + failed := make([]string, 0) + for _, peer := range s.Regional() { + wg.Add(1) + peer := peer + go func() { + defer wg.Done() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) + if err != nil { + s.logger.Error(ctx, "create http request for relay probe", + slog.F("relay_address", peer.RelayAddress), slog.Error(err)) + return + } + client := http.Client{ + Timeout: s.options.PeerTimeout, + } + res, err := client.Do(req) + if err != nil { + mu.Lock() + failed = append(failed, fmt.Sprintf("relay %s (%s): %s", peer.Hostname, peer.RelayAddress, err)) + mu.Unlock() + return + } + _ = res.Body.Close() + }() + } + wg.Wait() + replicaError := sql.NullString{} + if len(failed) > 0 { + replicaError = sql.NullString{ + Valid: true, + String: fmt.Sprintf("Failed to dial peers: %s", strings.Join(failed, ", ")), + } + } + + replica, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: s.self.ID, + UpdatedAt: database.Now(), + StartedAt: s.self.StartedAt, + StoppedAt: s.self.StoppedAt, + RelayAddress: s.self.RelayAddress, + RegionID: s.self.RegionID, + Hostname: s.self.Hostname, + Version: s.self.Version, + Error: replicaError, + }) + if err != nil { + return xerrors.Errorf("update replica: %w", err) + } + s.mutex.Lock() + if s.self.Error.String != replica.Error.String { + // Publish an update occurred! + err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + if err != nil { + s.mutex.Unlock() + return xerrors.Errorf("publish replica update: %w", err) + } + } + s.self = replica + if s.callback != nil { + go s.callback() + } + s.mutex.Unlock() + return nil +} + +// Self represents the current replica. +func (s *Server) Self() database.Replica { + s.mutex.Lock() + defer s.mutex.Unlock() + return s.self +} + +// All returns every replica, including itself. 
+func (s *Server) All() []database.Replica { + s.mutex.Lock() + defer s.mutex.Unlock() + return append(s.peers, s.self) +} + +// Regional returns all replicas in the same region excluding itself. +func (s *Server) Regional() []database.Replica { + s.mutex.Lock() + defer s.mutex.Unlock() + replicas := make([]database.Replica, 0) + for _, replica := range s.peers { + if replica.RegionID != s.self.RegionID { + continue + } + replicas = append(replicas, replica) + } + return replicas +} + +// SetCallback sets a function to execute whenever new peers +// are refreshed or updated. +func (s *Server) SetCallback(callback func()) { + s.mutex.Lock() + defer s.mutex.Unlock() + s.callback = callback + // Instantly call the callback to inform replicas! + go callback() +} + +func (s *Server) Close() error { + s.closeMutex.Lock() + select { + case <-s.closed: + s.closeMutex.Unlock() + return nil + default: + } + close(s.closed) + s.closeCancel() + s.closeWait.Wait() + s.closeMutex.Unlock() + + ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelFunc() + _, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: s.self.ID, + UpdatedAt: database.Now(), + StartedAt: s.self.StartedAt, + StoppedAt: sql.NullTime{ + Time: database.Now(), + Valid: true, + }, + RelayAddress: s.self.RelayAddress, + RegionID: s.self.RegionID, + Hostname: s.self.Hostname, + Version: s.self.Version, + Error: s.self.Error, + }) + if err != nil { + return xerrors.Errorf("update replica: %w", err) + } + err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + if err != nil { + return xerrors.Errorf("publish replica update: %w", err) + } + return nil +} diff --git a/enterprise/replica/replica_test.go b/enterprise/replica/replica_test.go new file mode 100644 index 0000000000000..74efb3b40470e --- /dev/null +++ b/enterprise/replica/replica_test.go @@ -0,0 +1,193 @@ +package replica_test + +import ( + "context" + "net/http" + "net/http/httptest" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/goleak" + + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/enterprise/replica" + "github.com/coder/coder/testutil" +) + +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + +func TestReplica(t *testing.T) { + t.Parallel() + t.Run("CreateOnNew", func(t *testing.T) { + // This ensures that a new replica is created on New. + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + id := uuid.New() + cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + assert.Equal(t, []byte(id.String()), message) + }) + require.NoError(t, err) + defer cancel() + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: id, + }) + require.NoError(t, err) + _ = server.Close() + require.NoError(t, err) + }) + t.Run("UpdatesOnNew", func(t *testing.T) { + // This ensures that a replica is updated when it initially connects + // and immediately publishes it's existence! 
+ t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + id := uuid.New() + _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: id, + }) + require.NoError(t, err) + cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + assert.Equal(t, []byte(id.String()), message) + }) + require.NoError(t, err) + defer cancel() + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: id, + }) + require.NoError(t, err) + _ = server.Close() + require.NoError(t, err) + }) + t.Run("ConnectsToPeerReplica", func(t *testing.T) { + // Ensures that the replica reports a successful status for + // accessing all of its peers. + t.Parallel() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + db, pubsub := dbtestutil.NewDB(t) + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + RelayAddress: srv.URL, + }) + require.NoError(t, err) + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: uuid.New(), + }) + require.NoError(t, err) + require.Len(t, server.Regional(), 1) + require.Equal(t, peer.ID, server.Regional()[0].ID) + require.False(t, server.Self().Error.Valid) + _ = server.Close() + }) + t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + var count atomic.Int32 + cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + count.Add(1) + }) + require.NoError(t, err) + defer cancel() + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + // Fake address to hit! + RelayAddress: "http://169.254.169.254", + }) + require.NoError(t, err) + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: uuid.New(), + PeerTimeout: 1 * time.Millisecond, + }) + require.NoError(t, err) + require.Len(t, server.Regional(), 1) + require.Equal(t, peer.ID, server.Regional()[0].ID) + require.True(t, server.Self().Error.Valid) + require.Contains(t, server.Self().Error.String, "Failed to dial peers") + // Once for the initial creation of a replica, and another time for the error. + require.Equal(t, int32(2), count.Load()) + _ = server.Close() + }) + t.Run("RefreshOnPublish", func(t *testing.T) { + // Refresh when a new replica appears! + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + id := uuid.New() + server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + ID: id, + }) + require.NoError(t, err) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + RelayAddress: srv.URL, + UpdatedAt: database.Now(), + }) + require.NoError(t, err) + // Publish multiple times to ensure it can handle that case. 
+ err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + require.NoError(t, err) + err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + require.NoError(t, err) + require.Eventually(t, func() bool { + return len(server.Regional()) == 1 + }, testutil.WaitShort, testutil.IntervalFast) + _ = server.Close() + }) + t.Run("TwentyConcurrent", func(t *testing.T) { + // Ensures that twenty concurrent replicas can spawn and all + // discover each other in parallel! + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + logger := slogtest.Make(t, nil) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + var wg sync.WaitGroup + count := 20 + wg.Add(count) + for i := 0; i < count; i++ { + server, err := replica.New(context.Background(), logger, db, pubsub, replica.Options{ + ID: uuid.New(), + RelayAddress: srv.URL, + }) + require.NoError(t, err) + t.Cleanup(func() { + _ = server.Close() + }) + done := false + server.SetCallback(func() { + if len(server.All()) != count { + return + } + if done { + return + } + done = true + wg.Done() + }) + } + wg.Wait() + }) +} diff --git a/enterprise/tailmesh/tailmesh.go b/enterprise/tailmesh/tailmesh.go new file mode 100644 index 0000000000000..46e1c97fffcc9 --- /dev/null +++ b/enterprise/tailmesh/tailmesh.go @@ -0,0 +1,32 @@ +package tailmesh + +import ( + "context" + + "cdr.dev/slog" + "github.com/coder/coder/tailnet" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" +) + +func New(logger slog.Logger, server *derp.Server) *Mesh { + +} + +type Mesh struct { + logger slog.Logger + server *derp.Server + ctx context.Context + + active map[string]context.CancelFunc +} + +func (m *Mesh) SetAddresses(addresses []string) { + for _, address := range addresses { + client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) + if err != nil { + + } + go client.RunWatchConnectionLoop() + } +} From 7dc968c52313e1dbe485ea5b0b2d9f8edc01c0b3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 17:53:00 +0000 Subject: [PATCH 12/79] Add DERP meshing to arbitrary addresses --- enterprise/derpmesh/derpmesh.go | 124 +++++++++++++++++++++++ enterprise/derpmesh/derpmesh_test.go | 146 +++++++++++++++++++++++++++ enterprise/tailmesh/tailmesh.go | 32 ------ 3 files changed, 270 insertions(+), 32 deletions(-) create mode 100644 enterprise/derpmesh/derpmesh.go create mode 100644 enterprise/derpmesh/derpmesh_test.go delete mode 100644 enterprise/tailmesh/tailmesh.go diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go new file mode 100644 index 0000000000000..610fd749132cc --- /dev/null +++ b/enterprise/derpmesh/derpmesh.go @@ -0,0 +1,124 @@ +package derpmesh + +import ( + "context" + "sync" + + "golang.org/x/xerrors" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" + "tailscale.com/types/key" + + "github.com/coder/coder/tailnet" + + "cdr.dev/slog" +) + +func New(logger slog.Logger, server *derp.Server) *Mesh { + return &Mesh{ + logger: logger, + server: server, + ctx: context.Background(), + closed: make(chan struct{}), + active: make(map[string]context.CancelFunc), + } +} + +type Mesh struct { + logger slog.Logger + server *derp.Server + ctx context.Context + + mutex sync.Mutex + closed chan struct{} + active map[string]context.CancelFunc +} + +// SetAddresses performs a diff of the incoming addresses and adds +// or removes DERP clients from the mesh. 
+func (m *Mesh) SetAddresses(addresses []string) { + total := make(map[string]struct{}, 0) + for _, address := range addresses { + total[address] = struct{}{} + added, err := m.addAddress(address) + if err != nil { + m.logger.Error(m.ctx, "failed to add address", slog.F("address", address), slog.Error(err)) + continue + } + if added { + m.logger.Debug(m.ctx, "added mesh address", slog.F("address", address)) + } + } + + m.mutex.Lock() + for address := range m.active { + _, found := total[address] + if found { + continue + } + removed := m.removeAddress(address) + if removed { + m.logger.Debug(m.ctx, "removed mesh address", slog.F("address", address)) + } + } + m.mutex.Unlock() +} + +// addAddress begins meshing with a new address. +// It's expected that this is a full HTTP address with a path. +// e.g. http://127.0.0.1:8080/derp +func (m *Mesh) addAddress(address string) (bool, error) { + m.mutex.Lock() + defer m.mutex.Unlock() + _, isActive := m.active[address] + if isActive { + return false, nil + } + client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) + if err != nil { + return false, xerrors.Errorf("create derp client: %w", err) + } + client.MeshKey = m.server.MeshKey() + ctx, cancelFunc := context.WithCancel(m.ctx) + closed := make(chan struct{}) + closeFunc := func() { + cancelFunc() + _ = client.Close() + <-closed + } + m.active[address] = closeFunc + go func() { + defer close(closed) + client.RunWatchConnectionLoop(ctx, m.server.PublicKey(), tailnet.Logger(m.logger), func(np key.NodePublic) { + m.server.AddPacketForwarder(np, client) + }, func(np key.NodePublic) { + m.server.RemovePacketForwarder(np, client) + }) + }() + return true, nil +} + +// removeAddress stops meshing with a given address. +func (m *Mesh) removeAddress(address string) bool { + cancelFunc, isActive := m.active[address] + if isActive { + cancelFunc() + } + return isActive +} + +// Close ends all active meshes with the DERP server. +func (m *Mesh) Close() error { + m.mutex.Lock() + defer m.mutex.Unlock() + select { + case <-m.closed: + return nil + default: + } + close(m.closed) + for _, cancelFunc := range m.active { + cancelFunc() + } + return nil +} diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go new file mode 100644 index 0000000000000..313c33da99bad --- /dev/null +++ b/enterprise/derpmesh/derpmesh_test.go @@ -0,0 +1,146 @@ +package derpmesh_test + +import ( + "context" + "errors" + "io" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/goleak" + "tailscale.com/derp" + "tailscale.com/derp/derphttp" + "tailscale.com/types/key" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/enterprise/derpmesh" + "github.com/coder/coder/tailnet" +) + +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + +func TestDERPMesh(t *testing.T) { + t.Parallel() + t.Run("ExchangeMessages", func(t *testing.T) { + // This tests messages passing through multiple DERP servers. 
+ t.Parallel() + firstServer, firstServerURL := startDERP(t) + defer firstServer.Close() + secondServer, secondServerURL := startDERP(t) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer) + firstMesh.SetAddresses([]string{secondServerURL}) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer) + secondMesh.SetAddresses([]string{firstServerURL}) + defer firstMesh.Close() + defer secondMesh.Close() + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) + t.Run("RemoveAddress", func(t *testing.T) { + // This tests messages passing through multiple DERP servers. + t.Parallel() + server, serverURL := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server) + mesh.SetAddresses([]string{"http://fake.com"}) + // This should trigger a removal... + mesh.SetAddresses([]string{}) + defer mesh.Close() + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, serverURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, serverURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) + t.Run("TwentyMeshes", func(t *testing.T) { + t.Parallel() + meshes := make([]*derpmesh.Mesh, 0, 20) + serverURLs := make([]string, 0, 20) + for i := 0; i < 20; i++ { + server, url := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server) + t.Cleanup(func() { + _ = server.Close() + _ = mesh.Close() + }) + serverURLs = append(serverURLs, url) + meshes = append(meshes, mesh) + } + for _, mesh := range meshes { + mesh.SetAddresses(serverURLs) + } + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, serverURLs[9], tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, serverURLs[16], tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) +} + +func recvData(t *testing.T, client *derphttp.Client) []byte { + for { + msg, err := client.Recv() + if errors.Is(err, io.EOF) { + return nil + } + assert.NoError(t, err) + t.Logf("derp: %T", msg) + switch msg := msg.(type) { + case derp.ReceivedPacket: + return msg.Data + default: + // Drop all others! 
+ } + } +} + +func startDERP(t *testing.T) (*derp.Server, string) { + logf := tailnet.Logger(slogtest.Make(t, nil)) + d := derp.NewServer(key.NewNode(), logf) + d.SetMeshKey("some-key") + server := httptest.NewUnstartedServer(derphttp.Handler(d)) + server.Start() + t.Cleanup(func() { + _ = d.Close() + }) + t.Cleanup(server.Close) + return d, server.URL +} diff --git a/enterprise/tailmesh/tailmesh.go b/enterprise/tailmesh/tailmesh.go deleted file mode 100644 index 46e1c97fffcc9..0000000000000 --- a/enterprise/tailmesh/tailmesh.go +++ /dev/null @@ -1,32 +0,0 @@ -package tailmesh - -import ( - "context" - - "cdr.dev/slog" - "github.com/coder/coder/tailnet" - "tailscale.com/derp" - "tailscale.com/derp/derphttp" -) - -func New(logger slog.Logger, server *derp.Server) *Mesh { - -} - -type Mesh struct { - logger slog.Logger - server *derp.Server - ctx context.Context - - active map[string]context.CancelFunc -} - -func (m *Mesh) SetAddresses(addresses []string) { - for _, address := range addresses { - client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) - if err != nil { - - } - go client.RunWatchConnectionLoop() - } -} From 1dcf0d01899a2544cbf0e3a1134b0a0c6d0e4bce Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 17:55:22 +0000 Subject: [PATCH 13/79] Move packages to highavailability folder --- enterprise/{ => highavailability}/derpmesh/derpmesh.go | 0 enterprise/{ => highavailability}/derpmesh/derpmesh_test.go | 2 +- enterprise/{ => highavailability}/replica/replica.go | 0 enterprise/{ => highavailability}/replica/replica_test.go | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename enterprise/{ => highavailability}/derpmesh/derpmesh.go (100%) rename enterprise/{ => highavailability}/derpmesh/derpmesh_test.go (98%) rename enterprise/{ => highavailability}/replica/replica.go (100%) rename enterprise/{ => highavailability}/replica/replica_test.go (98%) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/highavailability/derpmesh/derpmesh.go similarity index 100% rename from enterprise/derpmesh/derpmesh.go rename to enterprise/highavailability/derpmesh/derpmesh.go diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/highavailability/derpmesh/derpmesh_test.go similarity index 98% rename from enterprise/derpmesh/derpmesh_test.go rename to enterprise/highavailability/derpmesh/derpmesh_test.go index 313c33da99bad..6e1154fc3d6a8 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/highavailability/derpmesh/derpmesh_test.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/enterprise/derpmesh" + "github.com/coder/coder/enterprise/highavailability/derpmesh" "github.com/coder/coder/tailnet" ) diff --git a/enterprise/replica/replica.go b/enterprise/highavailability/replica/replica.go similarity index 100% rename from enterprise/replica/replica.go rename to enterprise/highavailability/replica/replica.go diff --git a/enterprise/replica/replica_test.go b/enterprise/highavailability/replica/replica_test.go similarity index 98% rename from enterprise/replica/replica_test.go rename to enterprise/highavailability/replica/replica_test.go index 74efb3b40470e..a5bda874ea166 100644 --- a/enterprise/replica/replica_test.go +++ b/enterprise/highavailability/replica/replica_test.go @@ -17,7 +17,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" - 
"github.com/coder/coder/enterprise/replica" + "github.com/coder/coder/enterprise/highavailability/replica" "github.com/coder/coder/testutil" ) From 289e13913a8586f2e160d56406e5e2a3e4d47254 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 18:03:51 +0000 Subject: [PATCH 14/79] Move coordinator to high availability package --- agent/agent_test.go | 2 +- coderd/coderd.go | 2 +- coderd/wsconncache/wsconncache_test.go | 2 +- enterprise/coderd/coderd.go | 8 +++---- .../coordinator.go | 6 +++-- .../coordinator_test.go | 14 +++++------ tailnet/coordinator.go | 23 ++++++++++--------- tailnet/coordinator_test.go | 6 ++--- 8 files changed, 33 insertions(+), 30 deletions(-) rename enterprise/{tailnet => highavailability}/coordinator.go (98%) rename enterprise/{tailnet => highavailability}/coordinator_test.go (92%) diff --git a/agent/agent_test.go b/agent/agent_test.go index 38d70846dfd8b..06a33598b755f 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -560,7 +560,7 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo if metadata.DERPMap == nil { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) } - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() agentID := uuid.New() statsCh := make(chan *codersdk.AgentStats) closer := agent.New(agent.Options{ diff --git a/coderd/coderd.go b/coderd/coderd.go index d6dba70b9f80b..4976b3a58cde2 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -119,7 +119,7 @@ func New(options *Options) *API { options.PrometheusRegistry = prometheus.NewRegistry() } if options.TailnetCoordinator == nil { - options.TailnetCoordinator = tailnet.NewMemoryCoordinator() + options.TailnetCoordinator = tailnet.NewCoordinator() } if options.Auditor == nil { options.Auditor = audit.NewNop() diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go index 2b5ed06b45784..003d3cddb8b7a 100644 --- a/coderd/wsconncache/wsconncache_test.go +++ b/coderd/wsconncache/wsconncache_test.go @@ -143,7 +143,7 @@ func TestCache(t *testing.T) { func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeout time.Duration) *codersdk.AgentConn { metadata.DERPMap = tailnettest.RunDERPAndSTUN(t) - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() agentID := uuid.New() closer := agent.New(agent.Options{ FetchMetadata: func(ctx context.Context) (codersdk.WorkspaceAgentMetadata, error) { diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index d49a6cd2c8a9d..dfa9c25e4cf77 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -23,8 +23,8 @@ import ( "github.com/coder/coder/enterprise/audit" "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" - "github.com/coder/coder/enterprise/tailnet" - agpltailnet "github.com/coder/coder/tailnet" + "github.com/coder/coder/enterprise/highavailability" + "github.com/coder/coder/tailnet" ) // New constructs an Enterprise coderd API instance. 
@@ -206,9 +206,9 @@ func (api *API) updateEntitlements(ctx context.Context) error { } if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { - coordinator := agpltailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() if enabled { - haCoordinator, err := tailnet.NewHACoordinator(api.Logger, api.Pubsub) + haCoordinator, err := highavailability.NewCoordinator(api.Logger, api.Pubsub) if err != nil { api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) // If we try to setup the HA coordinator and it fails, nothing diff --git a/enterprise/tailnet/coordinator.go b/enterprise/highavailability/coordinator.go similarity index 98% rename from enterprise/tailnet/coordinator.go rename to enterprise/highavailability/coordinator.go index 6bf2327507165..7c41e47d44f1d 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/highavailability/coordinator.go @@ -1,4 +1,4 @@ -package tailnet +package highavailability import ( "bytes" @@ -18,7 +18,9 @@ import ( agpl "github.com/coder/coder/tailnet" ) -func NewHACoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { +// NewCoordinator creates a new high availability coordinator +// that uses PostgreSQL pubsub to exchange handshakes. +func NewCoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { coord := &haCoordinator{ id: uuid.New(), log: logger, diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/highavailability/coordinator_test.go similarity index 92% rename from enterprise/tailnet/coordinator_test.go rename to enterprise/highavailability/coordinator_test.go index 4889cd1c8ba60..1e86c08f1b1ed 100644 --- a/enterprise/tailnet/coordinator_test.go +++ b/enterprise/highavailability/coordinator_test.go @@ -1,4 +1,4 @@ -package tailnet_test +package highavailability_test import ( "net" @@ -11,7 +11,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" - "github.com/coder/coder/enterprise/tailnet" + "github.com/coder/coder/enterprise/highavailability" agpl "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" ) @@ -20,7 +20,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -48,7 +48,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -76,7 +76,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -169,11 +169,11 @@ func TestCoordinatorHA(t *testing.T) { pubsub := database.NewPubsubInMemory() - coordinator1, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + coordinator1, err := 
highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator1.Close() - coordinator2, err := tailnet.NewHACoordinator(slogtest.Make(t, nil), pubsub) + coordinator2, err := highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator2.Close() diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 150a323bcfe52..96de8d295162e 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -94,11 +94,11 @@ func ServeCoordinator(conn net.Conn, updateNodes func(node []*Node) error) (func }, errChan } -// NewMemoryCoordinator constructs a new in-memory connection coordinator. This +// NewCoordinator constructs a new in-memory connection coordinator. This // coordinator is incompatible with multiple Coder replicas as all node data is // in-memory. -func NewMemoryCoordinator() Coordinator { - return &memoryCoordinator{ +func NewCoordinator() Coordinator { + return &coordinator{ closed: false, nodes: map[uuid.UUID]*Node{}, agentSockets: map[uuid.UUID]net.Conn{}, @@ -106,13 +106,14 @@ func NewMemoryCoordinator() Coordinator { } } -// MemoryCoordinator exchanges nodes with agents to establish connections. +// coordinator exchanges nodes with agents to establish connections entirely in-memory. +// The Enterprise implementation provides this for high-availability. // ┌──────────────────┐ ┌────────────────────┐ ┌───────────────────┐ ┌──────────────────┐ // │tailnet.Coordinate├──►│tailnet.AcceptClient│◄─►│tailnet.AcceptAgent│◄──┤tailnet.Coordinate│ // └──────────────────┘ └────────────────────┘ └───────────────────┘ └──────────────────┘ // This coordinator is incompatible with multiple Coder // replicas as all node data is in-memory. -type memoryCoordinator struct { +type coordinator struct { mutex sync.Mutex closed bool @@ -126,7 +127,7 @@ type memoryCoordinator struct { } // Node returns an in-memory node by ID. -func (c *memoryCoordinator) Node(id uuid.UUID) *Node { +func (c *coordinator) Node(id uuid.UUID) *Node { c.mutex.Lock() defer c.mutex.Unlock() node := c.nodes[id] @@ -135,7 +136,7 @@ func (c *memoryCoordinator) Node(id uuid.UUID) *Node { // ServeClient accepts a WebSocket connection that wants to connect to an agent // with the specified ID. -func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { +func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() if c.closed { @@ -194,7 +195,7 @@ func (c *memoryCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid. } } -func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { +func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json.Decoder) error { var node Node err := decoder.Decode(&node) if err != nil { @@ -234,7 +235,7 @@ func (c *memoryCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder // ServeAgent accepts a WebSocket connection to an agent that // listens to incoming connections and publishes node updates. 
-func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { +func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() if c.closed { @@ -293,7 +294,7 @@ func (c *memoryCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } } -func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error { +func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder) error { var node Node err := decoder.Decode(&node) if err != nil { @@ -334,7 +335,7 @@ func (c *memoryCoordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.D // Close closes all of the open connections in the coordinator and stops the // coordinator from accepting new connections. -func (c *memoryCoordinator) Close() error { +func (c *coordinator) Close() error { c.mutex.Lock() defer c.mutex.Unlock() diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index e0ed44420ede2..a4a020deadf93 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -16,7 +16,7 @@ func TestCoordinator(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -40,7 +40,7 @@ func TestCoordinator(t *testing.T) { t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() client, server := net.Pipe() sendNode, errChan := tailnet.ServeCoordinator(client, func(node []*tailnet.Node) error { return nil @@ -64,7 +64,7 @@ func TestCoordinator(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator := tailnet.NewMemoryCoordinator() + coordinator := tailnet.NewCoordinator() agentWS, agentServerWS := net.Pipe() defer agentWS.Close() From 585bc1dfc81996b9967475de7f4249c93b24aa46 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Wed, 12 Oct 2022 22:36:05 +0000 Subject: [PATCH 15/79] Add flags for HA --- cli/config/file.go | 5 ++ cli/deployment/flags.go | 15 +++++ cli/root.go | 2 +- cli/server.go | 4 +- coderd/coderd.go | 8 ++- coderd/coderdtest/coderdtest.go | 17 +++-- codersdk/flags.go | 2 + codersdk/replicas.go | 22 +++++++ enterprise/cli/server.go | 25 +++++++- enterprise/coderd/coderd.go | 64 ++++++++++++++++--- .../coderd/coderdenttest/coderdenttest.go | 7 +- enterprise/coderd/replicas.go | 1 + enterprise/coderd/replicas_test.go | 38 +++++++++++ .../highavailability/derpmesh/derpmesh.go | 1 + .../highavailability/replica/replica.go | 7 +- 15 files changed, 190 insertions(+), 28 deletions(-) create mode 100644 codersdk/replicas.go create mode 100644 enterprise/coderd/replicas.go create mode 100644 enterprise/coderd/replicas_test.go diff --git a/cli/config/file.go b/cli/config/file.go index a98237afed22b..388ce0881f304 100644 --- a/cli/config/file.go +++ b/cli/config/file.go @@ -13,6 +13,11 @@ func (r Root) Session() File { return File(filepath.Join(string(r), "session")) } +// ReplicaID is a unique identifier for the Coder server. 
+func (r Root) ReplicaID() File { + return File(filepath.Join(string(r), "replica_id")) +} + func (r Root) URL() File { return File(filepath.Join(string(r), "url")) } diff --git a/cli/deployment/flags.go b/cli/deployment/flags.go index 3a03bea762b1c..35ae248a0a722 100644 --- a/cli/deployment/flags.go +++ b/cli/deployment/flags.go @@ -85,6 +85,13 @@ func Flags() *codersdk.DeploymentFlags { Description: "Addresses for STUN servers to establish P2P connections. Set empty to disable P2P connections.", Default: []string{"stun.l.google.com:19302"}, }, + DerpServerRelayAddress: &codersdk.StringFlag{ + Name: "DERP Server Relay Address", + Flag: "derp-server-relay-address", + EnvVar: "CODER_DERP_SERVER_RELAY_ADDRESS", + Description: "An HTTP address that is accessible by other replicas to relay DERP traffic. Required for high availability.", + Enterprise: true, + }, DerpConfigURL: &codersdk.StringFlag{ Name: "DERP Config URL", Flag: "derp-config-url", @@ -123,6 +130,14 @@ func Flags() *codersdk.DeploymentFlags { Description: "The bind address to serve pprof.", Default: "127.0.0.1:6060", }, + HighAvailability: &codersdk.BoolFlag{ + Name: "High Availability", + Flag: "high-availability", + EnvVar: "CODER_HIGH_AVAILABILITY", + Description: "Specifies whether high availability is enabled.", + Default: true, + Enterprise: true, + }, CacheDir: &codersdk.StringFlag{ Name: "Cache Directory", Flag: "cache-dir", diff --git a/cli/root.go b/cli/root.go index e7104e64284eb..e29aa534da0a8 100644 --- a/cli/root.go +++ b/cli/root.go @@ -100,7 +100,7 @@ func Core() []*cobra.Command { } func AGPL() []*cobra.Command { - all := append(Core(), Server(deployment.Flags(), func(_ context.Context, o *coderd.Options) (*coderd.API, error) { + all := append(Core(), Server(deployment.Flags(), func(_ context.Context, _ config.Root, o *coderd.Options) (*coderd.API, error) { return coderd.New(o), nil })) return all diff --git a/cli/server.go b/cli/server.go index e3cad09ca27ff..fc5f131da3d7b 100644 --- a/cli/server.go +++ b/cli/server.go @@ -67,7 +67,7 @@ import ( ) // nolint:gocyclo -func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *coderd.Options) (*coderd.API, error)) *cobra.Command { +func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, config.Root, *coderd.Options) (*coderd.API, error)) *cobra.Command { root := &cobra.Command{ Use: "server", Short: "Start a Coder server", @@ -463,7 +463,7 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code ), dflags.PromAddress.Value, "prometheus")() } - coderAPI, err := newAPI(ctx, options) + coderAPI, err := newAPI(ctx, config, options) if err != nil { return err } diff --git a/coderd/coderd.go b/coderd/coderd.go index 4976b3a58cde2..57b78520d1b50 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -77,6 +77,7 @@ type Options struct { AutoImportTemplates []AutoImportTemplate TailnetCoordinator tailnet.Coordinator + DERPServer *derp.Server DERPMap *tailcfg.DERPMap MetricsCacheRefreshInterval time.Duration @@ -121,6 +122,9 @@ func New(options *Options) *API { if options.TailnetCoordinator == nil { options.TailnetCoordinator = tailnet.NewCoordinator() } + if options.DERPServer == nil { + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) + } if options.Auditor == nil { options.Auditor = audit.NewNop() } @@ -160,7 +164,6 @@ func New(options *Options) *API { api.WorkspaceQuotaEnforcer.Store(&options.WorkspaceQuotaEnforcer) api.workspaceAgentCache = 
wsconncache.New(api.dialWorkspaceAgentTailnet, 0) api.TailnetCoordinator.Store(&options.TailnetCoordinator) - api.derpServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) oauthConfigs := &httpmw.OAuth2Configs{ Github: options.GithubOAuth2Config, OIDC: options.OIDCConfig, @@ -228,7 +231,7 @@ func New(options *Options) *API { r.Route("/%40{user}/{workspace_and_agent}/apps/{workspaceapp}", apps) r.Route("/@{user}/{workspace_and_agent}/apps/{workspaceapp}", apps) r.Route("/derp", func(r chi.Router) { - r.Get("/", derphttp.Handler(api.derpServer).ServeHTTP) + r.Get("/", derphttp.Handler(api.DERPServer).ServeHTTP) // This is used when UDP is blocked, and latency must be checked via HTTP(s). r.Get("/latency-check", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) @@ -540,7 +543,6 @@ type API struct { // RootHandler serves "/" RootHandler chi.Router - derpServer *derp.Server metricsCache *metricscache.Cache siteHandler http.Handler websocketWaitMutex sync.Mutex diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index d7ac4eb14be97..23305cbcbab36 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -81,6 +81,12 @@ type Options struct { MetricsCacheRefreshInterval time.Duration AgentStatsRefreshInterval time.Duration DeploymentFlags *codersdk.DeploymentFlags + + // Overriding the database is heavily discouraged. + // It should only be used in cases where multiple Coder + // test instances are running against the same database. + Database database.Store + Pubsub database.Pubsub } // New constructs a codersdk client connected to an in-memory API instance. @@ -135,13 +141,14 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance close(options.AutobuildStats) }) } - - db, pubsub := dbtestutil.NewDB(t) + if options.Database == nil { + options.Database, options.Pubsub = dbtestutil.NewDB(t) + } ctx, cancelFunc := context.WithCancel(context.Background()) lifecycleExecutor := executor.New( ctx, - db, + options.Database, slogtest.Make(t, nil).Named("autobuild.executor").Leveled(slog.LevelDebug), options.AutobuildTicker, ).WithStatsChannel(options.AutobuildStats) @@ -181,8 +188,8 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance AppHostname: options.AppHostname, Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), CacheDir: t.TempDir(), - Database: db, - Pubsub: pubsub, + Database: options.Database, + Pubsub: options.Pubsub, Auditor: options.Auditor, AWSCertificates: options.AWSCertificates, diff --git a/codersdk/flags.go b/codersdk/flags.go index 92f02941a57f8..2dd1323a1fddc 100644 --- a/codersdk/flags.go +++ b/codersdk/flags.go @@ -19,6 +19,7 @@ type DeploymentFlags struct { DerpServerRegionCode *StringFlag `json:"derp_server_region_code" typescript:",notnull"` DerpServerRegionName *StringFlag `json:"derp_server_region_name" typescript:",notnull"` DerpServerSTUNAddresses *StringArrayFlag `json:"derp_server_stun_address" typescript:",notnull"` + DerpServerRelayAddress *StringFlag `json:"derp_server_relay_address" typescript:",notnull"` DerpConfigURL *StringFlag `json:"derp_config_url" typescript:",notnull"` DerpConfigPath *StringFlag `json:"derp_config_path" typescript:",notnull"` PromEnabled *BoolFlag `json:"prom_enabled" typescript:",notnull"` @@ -59,6 +60,7 @@ type DeploymentFlags struct { Verbose *BoolFlag `json:"verbose" typescript:",notnull"` AuditLogging *BoolFlag `json:"audit_logging" typescript:",notnull"` BrowserOnly *BoolFlag 
`json:"browser_only" typescript:",notnull"` + HighAvailability *BoolFlag `json:"high_availability" typescript:",notnull"` SCIMAuthHeader *StringFlag `json:"scim_auth_header" typescript:",notnull"` UserWorkspaceQuota *IntFlag `json:"user_workspace_quota" typescript:",notnull"` } diff --git a/codersdk/replicas.go b/codersdk/replicas.go new file mode 100644 index 0000000000000..341b460792ddd --- /dev/null +++ b/codersdk/replicas.go @@ -0,0 +1,22 @@ +package codersdk + +import ( + "time" + + "github.com/google/uuid" +) + +type Replica struct { + // ID is the unique identifier for the replica. + ID uuid.UUID `json:"id"` + // Hostname is the hostname of the replica. + Hostname string `json:"hostname"` + // CreatedAt is when the replica was first seen. + CreatedAt time.Time `json:"created_at"` + // Active determines whether the replica is online. + Active bool `json:"active"` + // RelayAddress is the accessible address to relay DERP connections. + RelayAddress string `json:"relay_address"` + // Error is the error. + Error string `json:"error"` +} diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index 62af6f2888373..e34bdaccfd342 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -3,8 +3,12 @@ package cli import ( "context" + "github.com/google/uuid" "github.com/spf13/cobra" + "cdr.dev/slog" + + "github.com/coder/coder/cli/config" "github.com/coder/coder/cli/deployment" "github.com/coder/coder/enterprise/coderd" @@ -14,14 +18,29 @@ import ( func server() *cobra.Command { dflags := deployment.Flags() - cmd := agpl.Server(dflags, func(ctx context.Context, options *agplcoderd.Options) (*agplcoderd.API, error) { + cmd := agpl.Server(dflags, func(ctx context.Context, cfg config.Root, options *agplcoderd.Options) (*agplcoderd.API, error) { + replicaIDRaw, err := cfg.ReplicaID().Read() + if err != nil { + replicaIDRaw = uuid.NewString() + } + replicaID, err := uuid.Parse(replicaIDRaw) + if err != nil { + options.Logger.Warn(ctx, "failed to parse replica id", slog.Error(err), slog.F("replica_id", replicaIDRaw)) + replicaID = uuid.New() + } o := &coderd.Options{ AuditLogging: dflags.AuditLogging.Value, BrowserOnly: dflags.BrowserOnly.Value, SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, - RBACEnabled: true, - Options: options, + RBAC: true, + HighAvailability: dflags.HighAvailability.Value, + + ReplicaID: replicaID, + DERPServerRelayAddress: dflags.DerpServerRelayAddress.Value, + DERPServerRegionID: dflags.DerpServerRegionID.Value, + + Options: options, } api, err := coderd.New(ctx, o) if err != nil { diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index dfa9c25e4cf77..f18776ade2c61 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -11,6 +11,7 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/go-chi/chi/v5" + "github.com/google/uuid" "cdr.dev/slog" "github.com/coder/coder/coderd" @@ -24,6 +25,8 @@ import ( "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" "github.com/coder/coder/enterprise/highavailability" + "github.com/coder/coder/enterprise/highavailability/derpmesh" + "github.com/coder/coder/enterprise/highavailability/replica" "github.com/coder/coder/tailnet" ) @@ -43,6 +46,7 @@ func New(ctx context.Context, options *Options) (*API, error) { Options: options, cancelEntitlementsLoop: cancelFunc, } + oauthConfigs := &httpmw.OAuth2Configs{ Github: options.GithubOAuth2Config, OIDC: options.OIDCConfig, @@ 
-113,7 +117,27 @@ func New(ctx context.Context, options *Options) (*API, error) { }) } - err := api.updateEntitlements(ctx) + // If high availability is disabled and multiple replicas appear, show an error. + // If high availability is enabled and the built-in DERP is but the DERP relay isn't set, show an error. + // We need to block meshing if high availability is disabled, because the meshing code would just work. + // SetAddresses([]string{}) + + api.AGPL.RootHandler.Route("/replicas", func(r chi.Router) { + + }) + + var err error + api.replica, err = replica.New(ctx, options.Logger, options.Database, options.Pubsub, replica.Options{ + ID: options.ReplicaID, + RelayAddress: options.DERPServerRelayAddress, + RegionID: int32(options.DERPServerRegionID), + }) + if err != nil { + return nil, xerrors.Errorf("initialize replica: %w", err) + } + api.derpMesh = derpmesh.New(options.Logger, api.DERPServer) + + err = api.updateEntitlements(ctx) if err != nil { return nil, xerrors.Errorf("update entitlements: %w", err) } @@ -125,12 +149,18 @@ func New(ctx context.Context, options *Options) (*API, error) { type Options struct { *coderd.Options - RBACEnabled bool + RBAC bool AuditLogging bool // Whether to block non-browser connections. BrowserOnly bool SCIMAPIKey []byte UserWorkspaceQuota int + HighAvailability bool + + // Used for high availability. + DERPServerRelayAddress string + DERPServerRegionID int + ReplicaID uuid.UUID EntitlementsUpdateInterval time.Duration Keys map[string]ed25519.PublicKey @@ -140,6 +170,11 @@ type API struct { AGPL *coderd.API *Options + // Detects multiple Coder replicas running at the same time. + replica *replica.Server + // Meshes DERP connections from multiple replicas. + derpMesh *derpmesh.Mesh + cancelEntitlementsLoop func() entitlementsMu sync.RWMutex entitlements codersdk.Entitlements @@ -147,6 +182,8 @@ type API struct { func (api *API) Close() error { api.cancelEntitlementsLoop() + _ = api.replica.Close() + _ = api.derpMesh.Close() return api.AGPL.Close() } @@ -155,11 +192,12 @@ func (api *API) updateEntitlements(ctx context.Context) error { defer api.entitlementsMu.Unlock() entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, api.Keys, map[string]bool{ - codersdk.FeatureAuditLog: api.AuditLogging, - codersdk.FeatureBrowserOnly: api.BrowserOnly, - codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, - codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0, - codersdk.FeatureTemplateRBAC: api.RBACEnabled, + codersdk.FeatureAuditLog: api.AuditLogging, + codersdk.FeatureBrowserOnly: api.BrowserOnly, + codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, + codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0, + codersdk.FeatureHighAvailability: api.HighAvailability, + codersdk.FeatureTemplateRBAC: api.RBAC, }) if err != nil { return err @@ -210,13 +248,23 @@ func (api *API) updateEntitlements(ctx context.Context) error { if enabled { haCoordinator, err := highavailability.NewCoordinator(api.Logger, api.Pubsub) if err != nil { - api.Logger.Error(ctx, "unable to setup HA tailnet coordinator", slog.Error(err)) + api.Logger.Error(ctx, "unable to set up high availability coordinator", slog.Error(err)) // If we try to setup the HA coordinator and it fails, nothing // is actually changing. 
changed = false } else { coordinator = haCoordinator } + + api.replica.SetCallback(func() { + addresses := make([]string, 0) + for _, replica := range api.replica.Regional() { + addresses = append(addresses, replica.RelayAddress) + } + api.derpMesh.SetAddresses(addresses) + }) + } else { + api.derpMesh.SetAddresses([]string{}) } // Recheck changed in case the HA coordinator failed to set up. diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index bc6b0375df638..c5ec2391d97bf 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -9,6 +9,7 @@ import ( "time" "github.com/golang-jwt/jwt/v4" + "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -62,10 +63,14 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ - RBACEnabled: true, + RBAC: true, AuditLogging: options.AuditLogging, BrowserOnly: options.BrowserOnly, SCIMAPIKey: options.SCIMAPIKey, + DERPServerRelayAddress: oop.AccessURL.String(), + DERPServerRegionID: 1, + HighAvailability: true, + ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, EntitlementsUpdateInterval: options.EntitlementsUpdateInterval, diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go new file mode 100644 index 0000000000000..ddb2b8b672186 --- /dev/null +++ b/enterprise/coderd/replicas.go @@ -0,0 +1 @@ +package coderd diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go new file mode 100644 index 0000000000000..1a5a3ed5f4eee --- /dev/null +++ b/enterprise/coderd/replicas_test.go @@ -0,0 +1,38 @@ +package coderd_test + +import ( + "context" + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/codersdk" + "github.com/coder/coder/enterprise/coderd/coderdenttest" +) + +func TestReplicas(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + _ = coderdtest.CreateFirstUser(t, firstClient) + + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + secondClient.SessionToken = firstClient.SessionToken + + user, err := secondClient.User(context.Background(), codersdk.Me) + require.NoError(t, err) + fmt.Printf("%+v\n", user) +} diff --git a/enterprise/highavailability/derpmesh/derpmesh.go b/enterprise/highavailability/derpmesh/derpmesh.go index 610fd749132cc..94341079cd43f 100644 --- a/enterprise/highavailability/derpmesh/derpmesh.go +++ b/enterprise/highavailability/derpmesh/derpmesh.go @@ -14,6 +14,7 @@ import ( "cdr.dev/slog" ) +// New constructs a new mesh for DERP servers. 
func New(logger slog.Logger, server *derp.Server) *Mesh { return &Mesh{ logger: logger, diff --git a/enterprise/highavailability/replica/replica.go b/enterprise/highavailability/replica/replica.go index ca0c450651e64..6855b32852e3e 100644 --- a/enterprise/highavailability/replica/replica.go +++ b/enterprise/highavailability/replica/replica.go @@ -28,11 +28,8 @@ type Options struct { ID uuid.UUID UpdateInterval time.Duration PeerTimeout time.Duration - // Mesh will dial active replicas with the same region ID to ensure - // they are reachable. If not, an error will be updated on the replica. - Mesh bool - RelayAddress string - RegionID int32 + RelayAddress string + RegionID int32 } // New registers the replica with the database and periodically updates to ensure From fdb3557f7fe4599c77ccd06689712cc6856a6319 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 00:35:43 +0000 Subject: [PATCH 16/79] Rename to replicasync --- .vscode/settings.json | 1 + enterprise/coderd/coderd.go | 12 ++++----- .../replica.go => replicasync/replicasync.go} | 25 +++++++++--------- .../replicasync_test.go} | 26 +++++++++---------- 4 files changed, 33 insertions(+), 31 deletions(-) rename enterprise/highavailability/{replica/replica.go => replicasync/replicasync.go} (93%) rename enterprise/highavailability/{replica/replica_test.go => replicasync/replicasync_test.go} (80%) diff --git a/.vscode/settings.json b/.vscode/settings.json index e9a32e850c980..f556563596bc0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -85,6 +85,7 @@ "ptytest", "quickstart", "reconfig", + "replicasync", "retrier", "rpty", "sdkproto", diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index f18776ade2c61..19812ea1f8b42 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -26,7 +26,7 @@ import ( "github.com/coder/coder/enterprise/coderd/license" "github.com/coder/coder/enterprise/highavailability" "github.com/coder/coder/enterprise/highavailability/derpmesh" - "github.com/coder/coder/enterprise/highavailability/replica" + "github.com/coder/coder/enterprise/highavailability/replicasync" "github.com/coder/coder/tailnet" ) @@ -127,7 +127,7 @@ func New(ctx context.Context, options *Options) (*API, error) { }) var err error - api.replica, err = replica.New(ctx, options.Logger, options.Database, options.Pubsub, replica.Options{ + api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ ID: options.ReplicaID, RelayAddress: options.DERPServerRelayAddress, RegionID: int32(options.DERPServerRegionID), @@ -171,7 +171,7 @@ type API struct { *Options // Detects multiple Coder replicas running at the same time. - replica *replica.Server + replicaManager *replicasync.Manager // Meshes DERP connections from multiple replicas. 
derpMesh *derpmesh.Mesh @@ -182,7 +182,7 @@ type API struct { func (api *API) Close() error { api.cancelEntitlementsLoop() - _ = api.replica.Close() + _ = api.replicaManager.Close() _ = api.derpMesh.Close() return api.AGPL.Close() } @@ -256,9 +256,9 @@ func (api *API) updateEntitlements(ctx context.Context) error { coordinator = haCoordinator } - api.replica.SetCallback(func() { + api.replicaManager.SetCallback(func() { addresses := make([]string, 0) - for _, replica := range api.replica.Regional() { + for _, replica := range api.replicaManager.Regional() { addresses = append(addresses, replica.RelayAddress) } api.derpMesh.SetAddresses(addresses) diff --git a/enterprise/highavailability/replica/replica.go b/enterprise/highavailability/replicasync/replicasync.go similarity index 93% rename from enterprise/highavailability/replica/replica.go rename to enterprise/highavailability/replicasync/replicasync.go index 6855b32852e3e..c632f8df2462b 100644 --- a/enterprise/highavailability/replica/replica.go +++ b/enterprise/highavailability/replicasync/replicasync.go @@ -1,4 +1,4 @@ -package replica +package replicasync import ( "context" @@ -34,7 +34,7 @@ type Options struct { // New registers the replica with the database and periodically updates to ensure // it's healthy. It contacts all other alive replicas to ensure they are reachable. -func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Server, error) { +func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Manager, error) { if options.ID == uuid.Nil { panic("An ID must be provided!") } @@ -88,7 +88,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return nil, xerrors.Errorf("publish new replica: %w", err) } ctx, cancelFunc := context.WithCancel(ctx) - server := &Server{ + server := &Manager{ options: &options, db: db, pubsub: pubsub, @@ -110,7 +110,8 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return server, nil } -type Server struct { +// Manager keeps the replica up to date and in sync with other replicas. +type Manager struct { options *Options db database.Store pubsub database.Pubsub @@ -128,7 +129,7 @@ type Server struct { } // loop runs the replica update sequence on an update interval. -func (s *Server) loop(ctx context.Context) { +func (s *Manager) loop(ctx context.Context) { defer s.closeWait.Done() ticker := time.NewTicker(s.options.UpdateInterval) defer ticker.Stop() @@ -146,7 +147,7 @@ func (s *Server) loop(ctx context.Context) { } // subscribe listens for new replica information! -func (s *Server) subscribe(ctx context.Context) error { +func (s *Manager) subscribe(ctx context.Context) error { needsUpdate := false updating := false updateMutex := sync.Mutex{} @@ -199,7 +200,7 @@ func (s *Server) subscribe(ctx context.Context) error { return nil } -func (s *Server) run(ctx context.Context) error { +func (s *Manager) run(ctx context.Context) error { s.closeMutex.Lock() s.closeWait.Add(1) s.closeMutex.Unlock() @@ -291,21 +292,21 @@ func (s *Server) run(ctx context.Context) error { } // Self represents the current replica. -func (s *Server) Self() database.Replica { +func (s *Manager) Self() database.Replica { s.mutex.Lock() defer s.mutex.Unlock() return s.self } // All returns every replica, including itself. 
-func (s *Server) All() []database.Replica { +func (s *Manager) All() []database.Replica { s.mutex.Lock() defer s.mutex.Unlock() return append(s.peers, s.self) } // Regional returns all replicas in the same region excluding itself. -func (s *Server) Regional() []database.Replica { +func (s *Manager) Regional() []database.Replica { s.mutex.Lock() defer s.mutex.Unlock() replicas := make([]database.Replica, 0) @@ -320,7 +321,7 @@ func (s *Server) Regional() []database.Replica { // SetCallback sets a function to execute whenever new peers // are refreshed or updated. -func (s *Server) SetCallback(callback func()) { +func (s *Manager) SetCallback(callback func()) { s.mutex.Lock() defer s.mutex.Unlock() s.callback = callback @@ -328,7 +329,7 @@ func (s *Server) SetCallback(callback func()) { go callback() } -func (s *Server) Close() error { +func (s *Manager) Close() error { s.closeMutex.Lock() select { case <-s.closed: diff --git a/enterprise/highavailability/replica/replica_test.go b/enterprise/highavailability/replicasync/replicasync_test.go similarity index 80% rename from enterprise/highavailability/replica/replica_test.go rename to enterprise/highavailability/replicasync/replicasync_test.go index a5bda874ea166..f4d800650f939 100644 --- a/enterprise/highavailability/replica/replica_test.go +++ b/enterprise/highavailability/replicasync/replicasync_test.go @@ -1,4 +1,4 @@ -package replica_test +package replicasync_test import ( "context" @@ -17,7 +17,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" - "github.com/coder/coder/enterprise/highavailability/replica" + "github.com/coder/coder/enterprise/highavailability/replicasync" "github.com/coder/coder/testutil" ) @@ -32,12 +32,12 @@ func TestReplica(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) id := uuid.New() - cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { assert.Equal(t, []byte(id.String()), message) }) require.NoError(t, err) defer cancel() - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: id, }) require.NoError(t, err) @@ -54,12 +54,12 @@ func TestReplica(t *testing.T) { ID: id, }) require.NoError(t, err) - cancel, err := pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { assert.Equal(t, []byte(id.String()), message) }) require.NoError(t, err) defer cancel() - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: id, }) require.NoError(t, err) @@ -84,7 +84,7 @@ func TestReplica(t *testing.T) { RelayAddress: srv.URL, }) require.NoError(t, err) - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: uuid.New(), }) require.NoError(t, err) @@ -97,7 +97,7 @@ func TestReplica(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) var count atomic.Int32 - cancel, err 
:= pubsub.Subscribe(replica.PubsubEvent, func(ctx context.Context, message []byte) { + cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { count.Add(1) }) require.NoError(t, err) @@ -112,7 +112,7 @@ func TestReplica(t *testing.T) { RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: uuid.New(), PeerTimeout: 1 * time.Millisecond, }) @@ -130,7 +130,7 @@ func TestReplica(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) id := uuid.New() - server, err := replica.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ ID: id, }) require.NoError(t, err) @@ -145,9 +145,9 @@ func TestReplica(t *testing.T) { }) require.NoError(t, err) // Publish multiple times to ensure it can handle that case. - err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + err = pubsub.Publish(replicasync.PubsubEvent, []byte(peer.ID.String())) require.NoError(t, err) - err = pubsub.Publish(replica.PubsubEvent, []byte(peer.ID.String())) + err = pubsub.Publish(replicasync.PubsubEvent, []byte(peer.ID.String())) require.NoError(t, err) require.Eventually(t, func() bool { return len(server.Regional()) == 1 @@ -168,7 +168,7 @@ func TestReplica(t *testing.T) { count := 20 wg.Add(count) for i := 0; i < count; i++ { - server, err := replica.New(context.Background(), logger, db, pubsub, replica.Options{ + server, err := replicasync.New(context.Background(), logger, db, pubsub, replicasync.Options{ ID: uuid.New(), RelayAddress: srv.URL, }) From 9124b0045cacf0b518a7299f9a8eb48842d2ecc7 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 03:44:47 +0000 Subject: [PATCH 17/79] Denest packages for replicas --- enterprise/coderd/coderd.go | 22 ++++++------------- .../derpmesh/derpmesh.go | 0 .../derpmesh/derpmesh_test.go | 2 +- .../replicasync/replicasync.go | 0 .../replicasync/replicasync_test.go | 2 +- .../coordinator.go | 2 +- .../coordinator_test.go | 14 ++++++------ 7 files changed, 17 insertions(+), 25 deletions(-) rename enterprise/{highavailability => }/derpmesh/derpmesh.go (100%) rename enterprise/{highavailability => }/derpmesh/derpmesh_test.go (98%) rename enterprise/{highavailability => }/replicasync/replicasync.go (100%) rename enterprise/{highavailability => }/replicasync/replicasync_test.go (98%) rename enterprise/{highavailability => tailnet}/coordinator.go (99%) rename enterprise/{highavailability => tailnet}/coordinator_test.go (92%) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 19812ea1f8b42..342b992c8076f 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -24,10 +24,10 @@ import ( "github.com/coder/coder/enterprise/audit" "github.com/coder/coder/enterprise/audit/backends" "github.com/coder/coder/enterprise/coderd/license" - "github.com/coder/coder/enterprise/highavailability" - "github.com/coder/coder/enterprise/highavailability/derpmesh" - "github.com/coder/coder/enterprise/highavailability/replicasync" - "github.com/coder/coder/tailnet" + "github.com/coder/coder/enterprise/derpmesh" + "github.com/coder/coder/enterprise/replicasync" + "github.com/coder/coder/enterprise/tailnet" + agpltailnet 
"github.com/coder/coder/tailnet" ) // New constructs an Enterprise coderd API instance. @@ -117,15 +117,6 @@ func New(ctx context.Context, options *Options) (*API, error) { }) } - // If high availability is disabled and multiple replicas appear, show an error. - // If high availability is enabled and the built-in DERP is but the DERP relay isn't set, show an error. - // We need to block meshing if high availability is disabled, because the meshing code would just work. - // SetAddresses([]string{}) - - api.AGPL.RootHandler.Route("/replicas", func(r chi.Router) { - - }) - var err error api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ ID: options.ReplicaID, @@ -244,9 +235,9 @@ func (api *API) updateEntitlements(ctx context.Context) error { } if changed, enabled := featureChanged(codersdk.FeatureHighAvailability); changed { - coordinator := tailnet.NewCoordinator() + coordinator := agpltailnet.NewCoordinator() if enabled { - haCoordinator, err := highavailability.NewCoordinator(api.Logger, api.Pubsub) + haCoordinator, err := tailnet.NewCoordinator(api.Logger, api.Pubsub) if err != nil { api.Logger.Error(ctx, "unable to set up high availability coordinator", slog.Error(err)) // If we try to setup the HA coordinator and it fails, nothing @@ -265,6 +256,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { }) } else { api.derpMesh.SetAddresses([]string{}) + api.replicaManager.SetCallback(func() {}) } // Recheck changed in case the HA coordinator failed to set up. diff --git a/enterprise/highavailability/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go similarity index 100% rename from enterprise/highavailability/derpmesh/derpmesh.go rename to enterprise/derpmesh/derpmesh.go diff --git a/enterprise/highavailability/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go similarity index 98% rename from enterprise/highavailability/derpmesh/derpmesh_test.go rename to enterprise/derpmesh/derpmesh_test.go index 6e1154fc3d6a8..313c33da99bad 100644 --- a/enterprise/highavailability/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/enterprise/highavailability/derpmesh" + "github.com/coder/coder/enterprise/derpmesh" "github.com/coder/coder/tailnet" ) diff --git a/enterprise/highavailability/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go similarity index 100% rename from enterprise/highavailability/replicasync/replicasync.go rename to enterprise/replicasync/replicasync.go diff --git a/enterprise/highavailability/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go similarity index 98% rename from enterprise/highavailability/replicasync/replicasync_test.go rename to enterprise/replicasync/replicasync_test.go index f4d800650f939..5ce774ea5f29a 100644 --- a/enterprise/highavailability/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -17,7 +17,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbtestutil" - "github.com/coder/coder/enterprise/highavailability/replicasync" + "github.com/coder/coder/enterprise/replicasync" "github.com/coder/coder/testutil" ) diff --git a/enterprise/highavailability/coordinator.go b/enterprise/tailnet/coordinator.go similarity index 99% rename from enterprise/highavailability/coordinator.go rename to 
enterprise/tailnet/coordinator.go index 7c41e47d44f1d..0643f7a259719 100644 --- a/enterprise/highavailability/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -1,4 +1,4 @@ -package highavailability +package tailnet import ( "bytes" diff --git a/enterprise/highavailability/coordinator_test.go b/enterprise/tailnet/coordinator_test.go similarity index 92% rename from enterprise/highavailability/coordinator_test.go rename to enterprise/tailnet/coordinator_test.go index 1e86c08f1b1ed..83fac250b2916 100644 --- a/enterprise/highavailability/coordinator_test.go +++ b/enterprise/tailnet/coordinator_test.go @@ -1,4 +1,4 @@ -package highavailability_test +package tailnet_test import ( "net" @@ -11,7 +11,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" - "github.com/coder/coder/enterprise/highavailability" + "github.com/coder/coder/enterprise/tailnet" agpl "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" ) @@ -20,7 +20,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Parallel() t.Run("ClientWithoutAgent", func(t *testing.T) { t.Parallel() - coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := tailnet.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -48,7 +48,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithoutClients", func(t *testing.T) { t.Parallel() - coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := tailnet.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -76,7 +76,7 @@ func TestCoordinatorSingle(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - coordinator, err := highavailability.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) + coordinator, err := tailnet.NewCoordinator(slogtest.Make(t, nil), database.NewPubsubInMemory()) require.NoError(t, err) defer coordinator.Close() @@ -169,11 +169,11 @@ func TestCoordinatorHA(t *testing.T) { pubsub := database.NewPubsubInMemory() - coordinator1, err := highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) + coordinator1, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator1.Close() - coordinator2, err := highavailability.NewCoordinator(slogtest.Make(t, nil), pubsub) + coordinator2, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator2.Close() From d5555f6938978c38906aaf4ae6518c290ff0b983 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 17:53:54 +0000 Subject: [PATCH 18/79] Add test for multiple replicas --- coderd/coderd.go | 1 + .../coderd/coderdenttest/coderdenttest.go | 11 ++++---- enterprise/coderd/license/license.go | 11 +++++--- enterprise/coderd/license/license_test.go | 2 +- enterprise/coderd/replicas_test.go | 26 ++++++++++++++----- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 57b78520d1b50..1b4e674de11cb 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -124,6 +124,7 @@ func New(options *Options) *API { } if options.DERPServer == nil { options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) + options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { options.Auditor = audit.NewNop() 
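The mesh key set in the hunk above is what lets each replica's embedded DERP server accept the other replicas as mesh peers; the relay address a replica advertises is simply an HTTP endpoint serving that DERP server. A minimal sketch of that wiring, assuming the same tailscale derp/derphttp packages already used in this series (the package, function name, and route below are illustrative, not part of the patch):

package derprelay

import (
	"net/http"

	"cdr.dev/slog"
	"tailscale.com/derp"
	"tailscale.com/derp/derphttp"
	"tailscale.com/types/key"

	"github.com/coder/coder/tailnet"
)

// newRelayHandler is a hypothetical helper: it builds the embedded DERP
// server for one replica and exposes it over HTTP. Whatever URL serves this
// handler is what the replica would advertise as its DERP relay address
// (--derp-server-relay-address). The mesh key must be identical on every
// replica for the servers to mesh with one another.
func newRelayHandler(logger slog.Logger, meshKey string) http.Handler {
	d := derp.NewServer(key.NewNode(), tailnet.Logger(logger))
	d.SetMeshKey(meshKey)
	mux := http.NewServeMux()
	mux.Handle("/derp", derphttp.Handler(d))
	return mux
}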
diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index c5ec2391d97bf..57440ac37082e 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -63,11 +63,12 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ - RBAC: true, - AuditLogging: options.AuditLogging, - BrowserOnly: options.BrowserOnly, - SCIMAPIKey: options.SCIMAPIKey, - DERPServerRelayAddress: oop.AccessURL.String(), + RBAC: true, + AuditLogging: options.AuditLogging, + BrowserOnly: options.BrowserOnly, + SCIMAPIKey: options.SCIMAPIKey, + // TODO: Kyle change this before merge! + DERPServerRelayAddress: oop.AccessURL.String() + "/derp", DERPServerRegionID: 1, HighAvailability: true, ReplicaID: uuid.New(), diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index d7643683a6d2f..43f8b53094c7c 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -153,9 +153,6 @@ func Entitlements( case codersdk.EntitlementNotEntitled: entitlements.Warnings = append(entitlements.Warnings, fmt.Sprintf("%s is enabled but your license is not entitled to this feature.", niceName)) - // Disable the feature and add a warning... - feature.Enabled = false - entitlements.Features[featureName] = feature case codersdk.EntitlementGracePeriod: entitlements.Warnings = append(entitlements.Warnings, fmt.Sprintf("%s is enabled but your license for this feature is expired.", niceName)) @@ -164,6 +161,14 @@ func Entitlements( } } + for _, featureName := range codersdk.FeatureNames { + feature := entitlements.Features[featureName] + if feature.Entitlement == codersdk.EntitlementNotEntitled { + feature.Enabled = false + entitlements.Features[featureName] = feature + } + } + return entitlements, nil } diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index e1fbdc6d3d9fa..f1318e26bae47 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -31,7 +31,7 @@ func TestEntitlements(t *testing.T) { t.Run("Defaults", func(t *testing.T) { t.Parallel() db := databasefake.New() - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) require.NoError(t, err) require.False(t, entitlements.HasLicense) require.False(t, entitlements.Trial) diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 1a5a3ed5f4eee..52836a720f623 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -2,14 +2,16 @@ package coderd_test import ( "context" - "fmt" "testing" + "time" "github.com/stretchr/testify/require" + "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database/dbtestutil" - "github.com/coder/coder/codersdk" "github.com/coder/coder/enterprise/coderd/coderdenttest" ) @@ -18,11 +20,15 @@ func TestReplicas(t *testing.T) { db, pubsub := dbtestutil.NewDB(t) firstClient := coderdenttest.New(t, &coderdenttest.Options{ Options: &coderdtest.Options{ - Database: db, - Pubsub: pubsub, + IncludeProvisionerDaemon: true, + 
Database: db, + Pubsub: pubsub, }, }) - _ = coderdtest.CreateFirstUser(t, firstClient) + firstUser := coderdtest.CreateFirstUser(t, firstClient) + coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ + HighAvailability: true, + }) secondClient := coderdenttest.New(t, &coderdenttest.Options{ Options: &coderdtest.Options{ @@ -32,7 +38,13 @@ func TestReplicas(t *testing.T) { }) secondClient.SessionToken = firstClient.SessionToken - user, err := secondClient.User(context.Background(), codersdk.Me) + agentID := setupWorkspaceAgent(t, firstClient, firstUser) + conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) require.NoError(t, err) - fmt.Printf("%+v\n", user) + require.Eventually(t, func() bool { + _, err = conn.Ping() + return err == nil + }, 10*time.Second, 250*time.Millisecond) + + _ = conn.Close() } From 8dfc261c7bb6bfa440fa62389b647414fdc57ddb Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 19:26:36 +0000 Subject: [PATCH 19/79] Fix coordination test --- coderd/coderd.go | 5 ++++- enterprise/coderd/workspaceagents_test.go | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 1b4e674de11cb..6b4b335161c32 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -558,7 +558,10 @@ func (api *API) Close() error { api.websocketWaitMutex.Unlock() api.metricsCache.Close() - + coordinator := api.TailnetCoordinator.Load() + if coordinator != nil { + _ = (*coordinator).Close() + } return api.workspaceAgentCache.Close() } diff --git a/enterprise/coderd/workspaceagents_test.go b/enterprise/coderd/workspaceagents_test.go index 3bb40b75b00f8..24e24e3f5f540 100644 --- a/enterprise/coderd/workspaceagents_test.go +++ b/enterprise/coderd/workspaceagents_test.go @@ -89,9 +89,9 @@ func setupWorkspaceAgent(t *testing.T, client *codersdk.Client, user codersdk.Cr CoordinatorDialer: agentClient.ListenWorkspaceAgentTailnet, Logger: slogtest.Make(t, nil).Named("agent"), }) - defer func() { + t.Cleanup(func() { _ = agentCloser.Close() - }() + }) resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) return resources[0].Agents[0].ID } From ff5968bd9c5386dea16ed68ed9684870cf6bb0bd Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 19:46:40 +0000 Subject: [PATCH 20/79] Add HA to the helm chart --- helm/templates/coder.yaml | 12 ++++++++---- helm/values.yaml | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/helm/templates/coder.yaml b/helm/templates/coder.yaml index 45f3f6e29a32e..1165251fc885b 100644 --- a/helm/templates/coder.yaml +++ b/helm/templates/coder.yaml @@ -14,10 +14,7 @@ metadata: {{- include "coder.labels" . | nindent 4 }} annotations: {{ toYaml .Values.coder.annotations | nindent 4}} spec: - # NOTE: this is currently not used as coder v2 does not support high - # availability yet. - # replicas: {{ .Values.coder.replicaCount }} - replicas: 1 + replicas: {{ .Values.coder.replicaCount }} selector: matchLabels: {{- include "coder.selectorLabels" . | nindent 6 }} @@ -38,6 +35,13 @@ spec: env: - name: CODER_ADDRESS value: "0.0.0.0:{{ include "coder.port" . }}" + # Used for inter-pod communication with high-availability. + - name: KUBE_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: CODER_DERP_SERVER_RELAY_ADDRESS + value: "{{ include "coder.portName" . }}://$(KUBE_POD_IP):{{ include "coder.port" . }}" {{- include "coder.tlsEnv" . 
| nindent 12 }} {{- with .Values.coder.env -}} {{ toYaml . | nindent 12 }} diff --git a/helm/values.yaml b/helm/values.yaml index cfba214ee6028..3beebdd3fc3b9 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -1,9 +1,9 @@ # coder -- Primary configuration for `coder server`. coder: - # NOTE: this is currently not used as coder v2 does not support high - # availability yet. - # # coder.replicaCount -- The number of Kubernetes deployment replicas. - # replicaCount: 1 + # coder.replicaCount -- The number of Kubernetes deployment replicas. + # This should only be increased if High Availability is enabled. + # This is an Enterprise feature. Contact sales@coder.com. + replicaCount: 1 # coder.image -- The image to use for Coder. image: From 557b390f62d2eee07e0131238c1b25a4e130078d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 19:49:28 +0000 Subject: [PATCH 21/79] Rename function pointer --- codersdk/replicas.go | 22 ---- enterprise/replicasync/replicasync.go | 148 +++++++++++++------------- 2 files changed, 74 insertions(+), 96 deletions(-) delete mode 100644 codersdk/replicas.go diff --git a/codersdk/replicas.go b/codersdk/replicas.go deleted file mode 100644 index 341b460792ddd..0000000000000 --- a/codersdk/replicas.go +++ /dev/null @@ -1,22 +0,0 @@ -package codersdk - -import ( - "time" - - "github.com/google/uuid" -) - -type Replica struct { - // ID is the unique identifier for the replica. - ID uuid.UUID `json:"id"` - // Hostname is the hostname of the replica. - Hostname string `json:"hostname"` - // CreatedAt is when the replica was first seen. - CreatedAt time.Time `json:"created_at"` - // Active determines whether the replica is online. - Active bool `json:"active"` - // RelayAddress is the accessible address to relay DERP connections. - RelayAddress string `json:"relay_address"` - // Error is the error. - Error string `json:"error"` -} diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index c632f8df2462b..4d6038a694940 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -129,9 +129,9 @@ type Manager struct { } // loop runs the replica update sequence on an update interval. -func (s *Manager) loop(ctx context.Context) { - defer s.closeWait.Done() - ticker := time.NewTicker(s.options.UpdateInterval) +func (m *Manager) loop(ctx context.Context) { + defer m.closeWait.Done() + ticker := time.NewTicker(m.options.UpdateInterval) defer ticker.Stop() for { select { @@ -139,15 +139,15 @@ func (s *Manager) loop(ctx context.Context) { return case <-ticker.C: } - err := s.run(ctx) + err := m.run(ctx) if err != nil && !errors.Is(err, context.Canceled) { - s.logger.Warn(ctx, "run replica update loop", slog.Error(err)) + m.logger.Warn(ctx, "run replica update loop", slog.Error(err)) } } } // subscribe listens for new replica information! -func (s *Manager) subscribe(ctx context.Context) error { +func (m *Manager) subscribe(ctx context.Context) error { needsUpdate := false updating := false updateMutex := sync.Mutex{} @@ -158,9 +158,9 @@ func (s *Manager) subscribe(ctx context.Context) error { // it will reprocess afterwards. 
var update func() update = func() { - err := s.run(ctx) + err := m.run(ctx) if err != nil && !errors.Is(err, context.Canceled) { - s.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) + m.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) } updateMutex.Lock() if needsUpdate { @@ -172,7 +172,7 @@ func (s *Manager) subscribe(ctx context.Context) error { updating = false updateMutex.Unlock() } - cancelFunc, err := s.pubsub.Subscribe(PubsubEvent, func(ctx context.Context, message []byte) { + cancelFunc, err := m.pubsub.Subscribe(PubsubEvent, func(ctx context.Context, message []byte) { updateMutex.Lock() defer updateMutex.Unlock() id, err := uuid.Parse(string(message)) @@ -180,7 +180,7 @@ func (s *Manager) subscribe(ctx context.Context) error { return } // Don't process updates for ourself! - if id == s.options.ID { + if id == m.options.ID { return } if updating { @@ -200,46 +200,46 @@ func (s *Manager) subscribe(ctx context.Context) error { return nil } -func (s *Manager) run(ctx context.Context) error { - s.closeMutex.Lock() - s.closeWait.Add(1) - s.closeMutex.Unlock() +func (m *Manager) run(ctx context.Context) error { + m.closeMutex.Lock() + m.closeWait.Add(1) + m.closeMutex.Unlock() go func() { - s.closeWait.Done() + m.closeWait.Done() }() // Expect replicas to update once every three times the interval... // If they don't, assume death! - replicas, err := s.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*s.options.UpdateInterval)) + replicas, err := m.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*m.options.UpdateInterval)) if err != nil { return xerrors.Errorf("get replicas: %w", err) } - s.mutex.Lock() - s.peers = make([]database.Replica, 0, len(replicas)) + m.mutex.Lock() + m.peers = make([]database.Replica, 0, len(replicas)) for _, replica := range replicas { - if replica.ID == s.options.ID { + if replica.ID == m.options.ID { continue } - s.peers = append(s.peers, replica) + m.peers = append(m.peers, replica) } - s.mutex.Unlock() + m.mutex.Unlock() var wg sync.WaitGroup var mu sync.Mutex failed := make([]string, 0) - for _, peer := range s.Regional() { + for _, peer := range m.Regional() { wg.Add(1) peer := peer go func() { defer wg.Done() req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) if err != nil { - s.logger.Error(ctx, "create http request for relay probe", + m.logger.Error(ctx, "create http request for relay probe", slog.F("relay_address", peer.RelayAddress), slog.Error(err)) return } client := http.Client{ - Timeout: s.options.PeerTimeout, + Timeout: m.options.PeerTimeout, } res, err := client.Do(req) if err != nil { @@ -260,58 +260,58 @@ func (s *Manager) run(ctx context.Context) error { } } - replica, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: s.self.ID, + replica, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: m.self.ID, UpdatedAt: database.Now(), - StartedAt: s.self.StartedAt, - StoppedAt: s.self.StoppedAt, - RelayAddress: s.self.RelayAddress, - RegionID: s.self.RegionID, - Hostname: s.self.Hostname, - Version: s.self.Version, + StartedAt: m.self.StartedAt, + StoppedAt: m.self.StoppedAt, + RelayAddress: m.self.RelayAddress, + RegionID: m.self.RegionID, + Hostname: m.self.Hostname, + Version: m.self.Version, Error: replicaError, }) if err != nil { return xerrors.Errorf("update replica: %w", err) } - s.mutex.Lock() - if s.self.Error.String != replica.Error.String { + m.mutex.Lock() + if m.self.Error.String != replica.Error.String { // Publish an update 
occurred! - err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { - s.mutex.Unlock() + m.mutex.Unlock() return xerrors.Errorf("publish replica update: %w", err) } } - s.self = replica - if s.callback != nil { - go s.callback() + m.self = replica + if m.callback != nil { + go m.callback() } - s.mutex.Unlock() + m.mutex.Unlock() return nil } // Self represents the current replica. -func (s *Manager) Self() database.Replica { - s.mutex.Lock() - defer s.mutex.Unlock() - return s.self +func (m *Manager) Self() database.Replica { + m.mutex.Lock() + defer m.mutex.Unlock() + return m.self } // All returns every replica, including itself. -func (s *Manager) All() []database.Replica { - s.mutex.Lock() - defer s.mutex.Unlock() - return append(s.peers, s.self) +func (m *Manager) All() []database.Replica { + m.mutex.Lock() + defer m.mutex.Unlock() + return append(m.peers, m.self) } // Regional returns all replicas in the same region excluding itself. -func (s *Manager) Regional() []database.Replica { - s.mutex.Lock() - defer s.mutex.Unlock() +func (m *Manager) Regional() []database.Replica { + m.mutex.Lock() + defer m.mutex.Unlock() replicas := make([]database.Replica, 0) - for _, replica := range s.peers { - if replica.RegionID != s.self.RegionID { + for _, replica := range m.peers { + if replica.RegionID != m.self.RegionID { continue } replicas = append(replicas, replica) @@ -321,47 +321,47 @@ func (s *Manager) Regional() []database.Replica { // SetCallback sets a function to execute whenever new peers // are refreshed or updated. -func (s *Manager) SetCallback(callback func()) { - s.mutex.Lock() - defer s.mutex.Unlock() - s.callback = callback +func (m *Manager) SetCallback(callback func()) { + m.mutex.Lock() + defer m.mutex.Unlock() + m.callback = callback // Instantly call the callback to inform replicas! 
go callback() } -func (s *Manager) Close() error { - s.closeMutex.Lock() +func (m *Manager) Close() error { + m.closeMutex.Lock() select { - case <-s.closed: - s.closeMutex.Unlock() + case <-m.closed: + m.closeMutex.Unlock() return nil default: } - close(s.closed) - s.closeCancel() - s.closeWait.Wait() - s.closeMutex.Unlock() + close(m.closed) + m.closeCancel() + m.closeWait.Wait() + m.closeMutex.Unlock() ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second) defer cancelFunc() - _, err := s.db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: s.self.ID, + _, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ + ID: m.self.ID, UpdatedAt: database.Now(), - StartedAt: s.self.StartedAt, + StartedAt: m.self.StartedAt, StoppedAt: sql.NullTime{ Time: database.Now(), Valid: true, }, - RelayAddress: s.self.RelayAddress, - RegionID: s.self.RegionID, - Hostname: s.self.Hostname, - Version: s.self.Version, - Error: s.self.Error, + RelayAddress: m.self.RelayAddress, + RegionID: m.self.RegionID, + Hostname: m.self.Hostname, + Version: m.self.Version, + Error: m.self.Error, }) if err != nil { return xerrors.Errorf("update replica: %w", err) } - err = s.pubsub.Publish(PubsubEvent, []byte(s.self.ID.String())) + err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { return xerrors.Errorf("publish replica update: %w", err) } From 186a5e2623d3aa57c7816a0fd6d9ff917e531298 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 20:27:32 +0000 Subject: [PATCH 22/79] Add warnings for HA --- .vscode/settings.json | 2 + cli/deployment/flags.go | 8 --- codersdk/deployment.go | 26 +++++++ codersdk/flags.go | 1 - enterprise/cli/server.go | 30 +++++--- enterprise/coderd/coderd.go | 5 +- .../coderd/coderdenttest/coderdenttest.go | 12 ++-- enterprise/coderd/license/license.go | 23 ++++++ enterprise/coderd/license/license_test.go | 72 ++++++++++++++++--- enterprise/derpmesh/derpmesh.go | 13 ++++ 10 files changed, 156 insertions(+), 36 deletions(-) create mode 100644 codersdk/deployment.go diff --git a/.vscode/settings.json b/.vscode/settings.json index f556563596bc0..2e6ff3d23704c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -19,6 +19,7 @@ "derphttp", "derpmap", "devel", + "dflags", "drpc", "drpcconn", "drpcmux", @@ -88,6 +89,7 @@ "replicasync", "retrier", "rpty", + "SCIM", "sdkproto", "sdktrace", "Signup", diff --git a/cli/deployment/flags.go b/cli/deployment/flags.go index 35ae248a0a722..8c6608b552586 100644 --- a/cli/deployment/flags.go +++ b/cli/deployment/flags.go @@ -130,14 +130,6 @@ func Flags() *codersdk.DeploymentFlags { Description: "The bind address to serve pprof.", Default: "127.0.0.1:6060", }, - HighAvailability: &codersdk.BoolFlag{ - Name: "High Availability", - Flag: "high-availability", - EnvVar: "CODER_HIGH_AVAILABILITY", - Description: "Specifies whether high availability is enabled.", - Default: true, - Enterprise: true, - }, CacheDir: &codersdk.StringFlag{ Name: "Cache Directory", Flag: "cache-dir", diff --git a/codersdk/deployment.go b/codersdk/deployment.go new file mode 100644 index 0000000000000..a1227b09e3f63 --- /dev/null +++ b/codersdk/deployment.go @@ -0,0 +1,26 @@ +package codersdk + +import ( + "time" + + "github.com/google/uuid" +) + +type DeploymentInfo struct { + Replicas []Replica `json:"replicas"` +} + +type Replica struct { + // ID is the unique identifier for the replica. + ID uuid.UUID `json:"id"` + // Hostname is the hostname of the replica. 
+ Hostname string `json:"hostname"` + // CreatedAt is when the replica was first seen. + CreatedAt time.Time `json:"created_at"` + // Active determines whether the replica is online. + Active bool `json:"active"` + // RelayAddress is the accessible address to relay DERP connections. + RelayAddress string `json:"relay_address"` + // Error is the error. + Error string `json:"error"` +} diff --git a/codersdk/flags.go b/codersdk/flags.go index 2dd1323a1fddc..09ca65b1ea813 100644 --- a/codersdk/flags.go +++ b/codersdk/flags.go @@ -60,7 +60,6 @@ type DeploymentFlags struct { Verbose *BoolFlag `json:"verbose" typescript:",notnull"` AuditLogging *BoolFlag `json:"audit_logging" typescript:",notnull"` BrowserOnly *BoolFlag `json:"browser_only" typescript:",notnull"` - HighAvailability *BoolFlag `json:"high_availability" typescript:",notnull"` SCIMAuthHeader *StringFlag `json:"scim_auth_header" typescript:",notnull"` UserWorkspaceQuota *IntFlag `json:"user_workspace_quota" typescript:",notnull"` } diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index e34bdaccfd342..cc44985e0a4d4 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -2,9 +2,11 @@ package cli import ( "context" + "net/url" "github.com/google/uuid" "github.com/spf13/cobra" + "golang.org/x/xerrors" "cdr.dev/slog" @@ -20,22 +22,35 @@ func server() *cobra.Command { dflags := deployment.Flags() cmd := agpl.Server(dflags, func(ctx context.Context, cfg config.Root, options *agplcoderd.Options) (*agplcoderd.API, error) { replicaIDRaw, err := cfg.ReplicaID().Read() + generatedReplicaID := false if err != nil { replicaIDRaw = uuid.NewString() + generatedReplicaID = true } replicaID, err := uuid.Parse(replicaIDRaw) if err != nil { options.Logger.Warn(ctx, "failed to parse replica id", slog.Error(err), slog.F("replica_id", replicaIDRaw)) replicaID = uuid.New() + generatedReplicaID = true + } + if generatedReplicaID { + // Make sure we save it to be reused later! + _ = cfg.ReplicaID().Write(replicaID.String()) + } + + if dflags.DerpServerRelayAddress.Value != "" { + _, err := url.Parse(dflags.DerpServerRelayAddress.Value) + if err != nil { + return nil, xerrors.Errorf("derp-server-relay-address must be a valid HTTP URL: %w", err) + } } - o := &coderd.Options{ - AuditLogging: dflags.AuditLogging.Value, - BrowserOnly: dflags.BrowserOnly.Value, - SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), - UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, - RBAC: true, - HighAvailability: dflags.HighAvailability.Value, + o := &coderd.Options{ + AuditLogging: dflags.AuditLogging.Value, + BrowserOnly: dflags.BrowserOnly.Value, + SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), + UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, + RBAC: true, ReplicaID: replicaID, DERPServerRelayAddress: dflags.DerpServerRelayAddress.Value, DERPServerRegionID: dflags.DerpServerRegionID.Value, @@ -50,6 +65,5 @@ func server() *cobra.Command { }) deployment.AttachFlags(cmd.Flags(), dflags, true) - return cmd } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 342b992c8076f..b06e843b658a5 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -146,7 +146,6 @@ type Options struct { BrowserOnly bool SCIMAPIKey []byte UserWorkspaceQuota int - HighAvailability bool // Used for high availability. 
DERPServerRelayAddress string @@ -182,12 +181,12 @@ func (api *API) updateEntitlements(ctx context.Context) error { api.entitlementsMu.Lock() defer api.entitlementsMu.Unlock() - entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, api.Keys, map[string]bool{ + entitlements, err := license.Entitlements(ctx, api.Database, api.Logger, len(api.replicaManager.All()), api.Keys, map[string]bool{ codersdk.FeatureAuditLog: api.AuditLogging, codersdk.FeatureBrowserOnly: api.BrowserOnly, codersdk.FeatureSCIM: len(api.SCIMAPIKey) != 0, codersdk.FeatureWorkspaceQuota: api.UserWorkspaceQuota != 0, - codersdk.FeatureHighAvailability: api.HighAvailability, + codersdk.FeatureHighAvailability: api.DERPServerRelayAddress != "", codersdk.FeatureTemplateRBAC: api.RBAC, }) if err != nil { diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 57440ac37082e..24f0bffd5017f 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -63,14 +63,12 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c } srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ - RBAC: true, - AuditLogging: options.AuditLogging, - BrowserOnly: options.BrowserOnly, - SCIMAPIKey: options.SCIMAPIKey, - // TODO: Kyle change this before merge! - DERPServerRelayAddress: oop.AccessURL.String() + "/derp", + RBAC: true, + AuditLogging: options.AuditLogging, + BrowserOnly: options.BrowserOnly, + SCIMAPIKey: options.SCIMAPIKey, + DERPServerRelayAddress: oop.AccessURL.String(), DERPServerRegionID: 1, - HighAvailability: true, ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index 43f8b53094c7c..633b1a5056cab 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -21,6 +21,7 @@ func Entitlements( ctx context.Context, db database.Store, logger slog.Logger, + replicaCount int, keys map[string]ed25519.PublicKey, enablements map[string]bool, ) (codersdk.Entitlements, error) { @@ -144,6 +145,10 @@ func Entitlements( if featureName == codersdk.FeatureUserLimit { continue } + // High availability has it's own warnings based on replica count! + if featureName == codersdk.FeatureHighAvailability { + continue + } feature := entitlements.Features[featureName] if !feature.Enabled { continue @@ -161,6 +166,24 @@ func Entitlements( } } + if replicaCount > 1 { + feature := entitlements.Features[codersdk.FeatureHighAvailability] + + switch feature.Entitlement { + case codersdk.EntitlementNotEntitled: + if entitlements.HasLicense { + entitlements.Warnings = append(entitlements.Warnings, + "You have multiple replicas but your license is not entitled to high availability.") + } else { + entitlements.Warnings = append(entitlements.Warnings, + "You have multiple replicas but high availability is an Enterprise feature. 
Contact sales to get a license.") + } + case codersdk.EntitlementGracePeriod: + entitlements.Warnings = append(entitlements.Warnings, + "You have multiple replicas but your license for high availability is expired.") + } + } + for _, featureName := range codersdk.FeatureNames { feature := entitlements.Features[featureName] if feature.Entitlement == codersdk.EntitlementNotEntitled { diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index f1318e26bae47..5b50bdb97cfe2 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -31,7 +31,7 @@ func TestEntitlements(t *testing.T) { t.Run("Defaults", func(t *testing.T) { t.Parallel() db := databasefake.New() - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.False(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -47,7 +47,7 @@ func TestEntitlements(t *testing.T) { JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{}), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -71,7 +71,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -96,7 +96,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -104,6 +104,9 @@ func TestEntitlements(t *testing.T) { if featureName == codersdk.FeatureUserLimit { continue } + if featureName == codersdk.FeatureHighAvailability { + continue + } niceName := strings.Title(strings.ReplaceAll(featureName, "_", " ")) require.Equal(t, codersdk.EntitlementGracePeriod, entitlements.Features[featureName].Entitlement) require.Contains(t, entitlements.Warnings, fmt.Sprintf("%s is enabled but your license for this feature is expired.", niceName)) @@ -116,7 +119,7 @@ func TestEntitlements(t *testing.T) { JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{}), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -124,6 +127,9 @@ func TestEntitlements(t *testing.T) { if featureName == codersdk.FeatureUserLimit { continue } + if 
featureName == codersdk.FeatureHighAvailability { + continue + } niceName := strings.Title(strings.ReplaceAll(featureName, "_", " ")) // Ensures features that are not entitled are properly disabled. require.False(t, entitlements.Features[featureName].Enabled) @@ -142,7 +148,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.Contains(t, entitlements.Warnings, "Your deployment has 2 active users but is only licensed for 1.") @@ -164,7 +170,7 @@ func TestEntitlements(t *testing.T) { }), Exp: time.Now().Add(time.Hour), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.Empty(t, entitlements.Warnings) @@ -187,7 +193,7 @@ func TestEntitlements(t *testing.T) { }), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, map[string]bool{}) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, map[string]bool{}) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -202,7 +208,7 @@ func TestEntitlements(t *testing.T) { AllFeatures: true, }), }) - entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, coderdenttest.Keys, all) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 1, coderdenttest.Keys, all) require.NoError(t, err) require.True(t, entitlements.HasLicense) require.False(t, entitlements.Trial) @@ -214,4 +220,52 @@ func TestEntitlements(t *testing.T) { require.Equal(t, codersdk.EntitlementEntitled, entitlements.Features[featureName].Entitlement) } }) + + t.Run("MultipleReplicasNoLicense", func(t *testing.T) { + t.Parallel() + db := databasefake.New() + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, all) + require.NoError(t, err) + require.False(t, entitlements.HasLicense) + require.Len(t, entitlements.Warnings, 1) + require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature. 
Contact sales to get a license.", entitlements.Warnings[0]) + }) + + t.Run("MultipleReplicasNotEntitled", func(t *testing.T) { + t.Parallel() + db := databasefake.New() + db.InsertLicense(context.Background(), database.InsertLicenseParams{ + Exp: time.Now().Add(time.Hour), + JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ + AuditLog: true, + }), + }) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, map[string]bool{ + codersdk.FeatureHighAvailability: true, + }) + require.NoError(t, err) + require.True(t, entitlements.HasLicense) + require.Len(t, entitlements.Warnings, 1) + require.Equal(t, "You have multiple replicas but your license is not entitled to high availability.", entitlements.Warnings[0]) + }) + + t.Run("MultipleReplicasGrace", func(t *testing.T) { + t.Parallel() + db := databasefake.New() + db.InsertLicense(context.Background(), database.InsertLicenseParams{ + JWT: coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{ + HighAvailability: true, + GraceAt: time.Now().Add(-time.Hour), + ExpiresAt: time.Now().Add(time.Hour), + }), + Exp: time.Now().Add(time.Hour), + }) + entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, map[string]bool{ + codersdk.FeatureHighAvailability: true, + }) + require.NoError(t, err) + require.True(t, entitlements.HasLicense) + require.Len(t, entitlements.Warnings, 1) + require.Equal(t, "You have multiple replicas but your license for high availability is expired.", entitlements.Warnings[0]) + }) } diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 94341079cd43f..3ce22c1bd9a11 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -2,6 +2,7 @@ package derpmesh import ( "context" + "net/url" "sync" "golang.org/x/xerrors" @@ -40,6 +41,18 @@ type Mesh struct { func (m *Mesh) SetAddresses(addresses []string) { total := make(map[string]struct{}, 0) for _, address := range addresses { + addressURL, err := url.Parse(address) + if err != nil { + m.logger.Error(m.ctx, "invalid address", slog.F("address", err), slog.Error(err)) + continue + } + derpURL, err := addressURL.Parse("/derp") + if err != nil { + m.logger.Error(m.ctx, "parse derp", slog.F("address", err), slog.Error(err)) + continue + } + address = derpURL.String() + total[address] = struct{}{} added, err := m.addAddress(address) if err != nil { From de5b13b380795544cb38cd697c07cd0c63a39e51 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Thu, 13 Oct 2022 21:35:41 +0000 Subject: [PATCH 23/79] Add the ability to block endpoints --- .../coderd/coderdenttest/coderdenttest.go | 2 +- enterprise/coderd/replicas_test.go | 80 ++++++++++++------- tailnet/conn.go | 20 +++-- 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 24f0bffd5017f..ea172c43116e4 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -68,7 +68,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c BrowserOnly: options.BrowserOnly, SCIMAPIKey: options.SCIMAPIKey, DERPServerRelayAddress: oop.AccessURL.String(), - DERPServerRegionID: 1, + DERPServerRegionID: oop.DERPMap.RegionIDs()[0], ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, diff --git a/enterprise/coderd/replicas_test.go 
b/enterprise/coderd/replicas_test.go index 52836a720f623..01c6be90199f0 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -3,7 +3,6 @@ package coderd_test import ( "context" "testing" - "time" "github.com/stretchr/testify/require" @@ -13,38 +12,63 @@ import ( "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/enterprise/coderd/coderdenttest" + "github.com/coder/coder/testutil" ) func TestReplicas(t *testing.T) { t.Parallel() - db, pubsub := dbtestutil.NewDB(t) - firstClient := coderdenttest.New(t, &coderdenttest.Options{ - Options: &coderdtest.Options{ - IncludeProvisionerDaemon: true, - Database: db, - Pubsub: pubsub, - }, + t.Run("WarningsWithoutLicense", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + Database: db, + Pubsub: pubsub, + }, + }) + _ = coderdtest.CreateFirstUser(t, firstClient) + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + secondClient.SessionToken = firstClient.SessionToken + ents, err := secondClient.Entitlements(context.Background()) + require.NoError(t, err) + require.Len(t, ents.Warnings, 1) }) - firstUser := coderdtest.CreateFirstUser(t, firstClient) - coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ - HighAvailability: true, - }) - - secondClient := coderdenttest.New(t, &coderdenttest.Options{ - Options: &coderdtest.Options{ - Database: db, - Pubsub: pubsub, - }, - }) - secondClient.SessionToken = firstClient.SessionToken + t.Run("ConnectAcrossMultiple", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + Database: db, + Pubsub: pubsub, + }, + }) + firstUser := coderdtest.CreateFirstUser(t, firstClient) + coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ + HighAvailability: true, + }) - agentID := setupWorkspaceAgent(t, firstClient, firstUser) - conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) - require.NoError(t, err) - require.Eventually(t, func() bool { - _, err = conn.Ping() - return err == nil - }, 10*time.Second, 250*time.Millisecond) + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + }, + }) + secondClient.SessionToken = firstClient.SessionToken - _ = conn.Close() + agentID := setupWorkspaceAgent(t, firstClient, firstUser) + conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) + require.NoError(t, err) + require.Eventually(t, func() bool { + _, err = conn.Ping() + return err == nil + }, testutil.WaitShort, testutil.IntervalFast) + _ = conn.Close() + }) } diff --git a/tailnet/conn.go b/tailnet/conn.go index 1b454d6346b97..19a0cd50f49e6 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -48,7 +48,10 @@ type Options struct { Addresses []netip.Prefix DERPMap *tailcfg.DERPMap - Logger slog.Logger + // BlockEndpoints specifies whether P2P endpoints are blocked. + // If so, only DERPs can establish connections. 
+ BlockEndpoints bool + Logger slog.Logger } // NewConn constructs a new Wireguard server that will accept connections from the addresses provided. @@ -175,6 +178,7 @@ func NewConn(options *Options) (*Conn, error) { wireguardEngine.SetFilter(filter.New(netMap.PacketFilter, localIPs, logIPs, nil, Logger(options.Logger.Named("packet-filter")))) dialContext, dialCancel := context.WithCancel(context.Background()) server := &Conn{ + blockEndpoints: options.BlockEndpoints, dialContext: dialContext, dialCancel: dialCancel, closed: make(chan struct{}), @@ -240,11 +244,12 @@ func IP() netip.Addr { // Conn is an actively listening Wireguard connection. type Conn struct { - dialContext context.Context - dialCancel context.CancelFunc - mutex sync.Mutex - closed chan struct{} - logger slog.Logger + dialContext context.Context + dialCancel context.CancelFunc + mutex sync.Mutex + closed chan struct{} + logger slog.Logger + blockEndpoints bool dialer *tsdial.Dialer tunDevice *tstun.Wrapper @@ -429,6 +434,9 @@ func (c *Conn) sendNode() { PreferredDERP: c.lastPreferredDERP, DERPLatency: c.lastDERPLatency, } + if c.blockEndpoints { + node.Endpoints = nil + } nodeCallback := c.nodeCallback if nodeCallback == nil { return From 9a50ac496ef1f0329f15add7ab4e58652f03b184 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 15:37:38 +0000 Subject: [PATCH 24/79] Add flag to disable P2P connections --- cli/agent_test.go | 8 +++----- cli/configssh_test.go | 3 +-- cli/portforward.go | 3 +-- cli/speedtest.go | 4 +++- cli/ssh.go | 4 +--- coderd/activitybump_test.go | 4 +++- coderd/coderd.go | 2 +- coderd/templates_test.go | 4 +++- coderd/workspaceagents_test.go | 6 ++++-- codersdk/workspaceagents.go | 23 +++++++++++++++-------- enterprise/coderd/coderd.go | 4 +++- enterprise/coderd/replicas_test.go | 9 ++++++--- enterprise/coderd/workspaceagents_test.go | 5 ++--- enterprise/derpmesh/derpmesh.go | 9 +++++++-- tailnet/conn.go | 3 +++ 15 files changed, 56 insertions(+), 35 deletions(-) diff --git a/cli/agent_test.go b/cli/agent_test.go index dd0cb1d789349..8a90bb4cada3b 100644 --- a/cli/agent_test.go +++ b/cli/agent_test.go @@ -7,8 +7,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "cdr.dev/slog" - "github.com/coder/coder/cli/clitest" "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/provisioner/echo" @@ -67,7 +65,7 @@ func TestWorkspaceAgent(t *testing.T) { if assert.NotEmpty(t, workspace.LatestBuild.Resources) && assert.NotEmpty(t, resources[0].Agents) { assert.NotEmpty(t, resources[0].Agents[0].Version) } - dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID) + dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { @@ -128,7 +126,7 @@ func TestWorkspaceAgent(t *testing.T) { if assert.NotEmpty(t, resources) && assert.NotEmpty(t, resources[0].Agents) { assert.NotEmpty(t, resources[0].Agents[0].Version) } - dialer, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID) + dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { @@ -189,7 +187,7 @@ func TestWorkspaceAgent(t *testing.T) { if assert.NotEmpty(t, resources) && assert.NotEmpty(t, resources[0].Agents) { assert.NotEmpty(t, resources[0].Agents[0].Version) } - dialer, err := client.DialWorkspaceAgentTailnet(ctx, 
slog.Logger{}, resources[0].Agents[0].ID) + dialer, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { diff --git a/cli/configssh_test.go b/cli/configssh_test.go index 3e1512a0c3471..4553cbe431221 100644 --- a/cli/configssh_test.go +++ b/cli/configssh_test.go @@ -19,7 +19,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/agent" @@ -115,7 +114,7 @@ func TestConfigSSH(t *testing.T) { _ = agentCloser.Close() }() resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) - agentConn, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, resources[0].Agents[0].ID) + agentConn, err := client.DialWorkspaceAgent(context.Background(), resources[0].Agents[0].ID, nil) require.NoError(t, err) defer agentConn.Close() diff --git a/cli/portforward.go b/cli/portforward.go index 476809d601558..9cd3bc317c3b4 100644 --- a/cli/portforward.go +++ b/cli/portforward.go @@ -16,7 +16,6 @@ import ( "github.com/spf13/cobra" "golang.org/x/xerrors" - "cdr.dev/slog" "github.com/coder/coder/agent" "github.com/coder/coder/cli/cliflag" "github.com/coder/coder/cli/cliui" @@ -96,7 +95,7 @@ func portForward() *cobra.Command { return xerrors.Errorf("await agent: %w", err) } - conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, workspaceAgent.ID) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil) if err != nil { return err } diff --git a/cli/speedtest.go b/cli/speedtest.go index 357048f63ea34..cbb226b341342 100644 --- a/cli/speedtest.go +++ b/cli/speedtest.go @@ -55,7 +55,9 @@ func speedtest() *cobra.Command { if cliflag.IsSetBool(cmd, varVerbose) { logger = logger.Leveled(slog.LevelDebug) } - conn, err := client.DialWorkspaceAgentTailnet(ctx, logger, workspaceAgent.ID) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: logger, + }) if err != nil { return err } diff --git a/cli/ssh.go b/cli/ssh.go index ef8538764e3ac..b4d4f6420da78 100644 --- a/cli/ssh.go +++ b/cli/ssh.go @@ -20,8 +20,6 @@ import ( "golang.org/x/term" "golang.org/x/xerrors" - "cdr.dev/slog" - "github.com/coder/coder/cli/cliflag" "github.com/coder/coder/cli/cliui" "github.com/coder/coder/coderd/autobuild/notify" @@ -86,7 +84,7 @@ func ssh() *cobra.Command { return xerrors.Errorf("await agent: %w", err) } - conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, workspaceAgent.ID) + conn, err := client.DialWorkspaceAgent(ctx, workspaceAgent.ID, nil) if err != nil { return err } diff --git a/coderd/activitybump_test.go b/coderd/activitybump_test.go index b12c8bc170a29..746bef0c9994e 100644 --- a/coderd/activitybump_test.go +++ b/coderd/activitybump_test.go @@ -74,7 +74,9 @@ func TestWorkspaceActivityBump(t *testing.T) { client, workspace, assertBumped := setupActivityTest(t) resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) - conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil), resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, nil), + }) require.NoError(t, err) defer conn.Close() diff --git a/coderd/coderd.go b/coderd/coderd.go index 6b4b335161c32..da1fc0572ccc6 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -123,7 +123,7 @@ func New(options *Options) *API { 
options.TailnetCoordinator = tailnet.NewCoordinator() } if options.DERPServer == nil { - options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger)) + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp").Leveled(slog.LevelDebug))) options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { diff --git a/coderd/templates_test.go b/coderd/templates_test.go index bf547c4d0eb9a..861ad6f459035 100644 --- a/coderd/templates_test.go +++ b/coderd/templates_test.go @@ -626,7 +626,9 @@ func TestTemplateDAUs(t *testing.T) { require.NoError(t, err) assert.Zero(t, workspaces[0].LastUsedAt) - conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil).Named("tailnet"), resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, nil).Named("tailnet"), + }) require.NoError(t, err) defer func() { _ = conn.Close() diff --git a/coderd/workspaceagents_test.go b/coderd/workspaceagents_test.go index 6bd569dde9f71..c5f3d9f16c0d8 100644 --- a/coderd/workspaceagents_test.go +++ b/coderd/workspaceagents_test.go @@ -123,7 +123,7 @@ func TestWorkspaceAgentListen(t *testing.T) { defer cancel() resources := coderdtest.AwaitWorkspaceAgents(t, client, workspace.ID) - conn, err := client.DialWorkspaceAgentTailnet(ctx, slog.Logger{}, resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, nil) require.NoError(t, err) defer func() { _ = conn.Close() @@ -253,7 +253,9 @@ func TestWorkspaceAgentTailnet(t *testing.T) { ctx, cancelFunc := context.WithCancel(context.Background()) defer cancelFunc() - conn, err := client.DialWorkspaceAgentTailnet(ctx, slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug), resources[0].Agents[0].ID) + conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{ + Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug), + }) require.NoError(t, err) defer conn.Close() sshClient, err := conn.SSHClient() diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 81f82b08d3efa..97d225c3eebb3 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -331,7 +331,13 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil } -func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logger, agentID uuid.UUID) (*AgentConn, error) { +type DialWorkspaceAgentOptions struct { + Logger slog.Logger + // BlockEndpoints forced a direct connection through DERP. 
+ BlockEndpoints bool +} + +func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (*AgentConn, error) { res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil) if err != nil { return nil, err @@ -348,9 +354,10 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg ip := tailnet.IP() conn, err := tailnet.NewConn(&tailnet.Options{ - Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)}, - DERPMap: connInfo.DERPMap, - Logger: logger, + Addresses: []netip.Prefix{netip.PrefixFrom(ip, 128)}, + DERPMap: connInfo.DERPMap, + Logger: options.Logger, + BlockEndpoints: options.BlockEndpoints, }) if err != nil { return nil, xerrors.Errorf("create tailnet: %w", err) @@ -378,7 +385,7 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg defer close(closed) isFirst := true for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); { - logger.Debug(ctx, "connecting") + options.Logger.Debug(ctx, "connecting") // nolint:bodyclose ws, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{ HTTPClient: httpClient, @@ -397,21 +404,21 @@ func (c *Client) DialWorkspaceAgentTailnet(ctx context.Context, logger slog.Logg if errors.Is(err, context.Canceled) { return } - logger.Debug(ctx, "failed to dial", slog.Error(err)) + options.Logger.Debug(ctx, "failed to dial", slog.Error(err)) continue } sendNode, errChan := tailnet.ServeCoordinator(websocket.NetConn(ctx, ws, websocket.MessageBinary), func(node []*tailnet.Node) error { return conn.UpdateNodes(node) }) conn.SetNodeCallback(sendNode) - logger.Debug(ctx, "serving coordinator") + options.Logger.Debug(ctx, "serving coordinator") err = <-errChan if errors.Is(err, context.Canceled) { _ = ws.Close(websocket.StatusGoingAway, "") return } if err != nil { - logger.Debug(ctx, "error serving coordinator", slog.Error(err)) + options.Logger.Debug(ctx, "error serving coordinator", slog.Error(err)) _ = ws.Close(websocket.StatusGoingAway, "") continue } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index b06e843b658a5..252da9ac6f01a 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -3,6 +3,7 @@ package coderd import ( "context" "crypto/ed25519" + "fmt" "net/http" "sync" "time" @@ -126,7 +127,7 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } - api.derpMesh = derpmesh.New(options.Logger, api.DERPServer) + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer) err = api.updateEntitlements(ctx) if err != nil { @@ -246,6 +247,7 @@ func (api *API) updateEntitlements(ctx context.Context) error { coordinator = haCoordinator } + fmt.Printf("HA enabled\n") api.replicaManager.SetCallback(func() { addresses := make([]string, 0) for _, replica := range api.replicaManager.Regional() { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 01c6be90199f0..1d60b24e6e81a 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -11,6 +11,7 @@ import ( "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/codersdk" "github.com/coder/coder/enterprise/coderd/coderdenttest" "github.com/coder/coder/testutil" ) @@ -61,14 +62,16 @@ func TestReplicas(t *testing.T) { }, }) 
secondClient.SessionToken = firstClient.SessionToken - agentID := setupWorkspaceAgent(t, firstClient, firstUser) - conn, err := secondClient.DialWorkspaceAgentTailnet(context.Background(), slogtest.Make(t, nil).Leveled(slog.LevelDebug), agentID) + conn, err := secondClient.DialWorkspaceAgent(context.Background(), agentID, &codersdk.DialWorkspaceAgentOptions{ + BlockEndpoints: true, + Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), + }) require.NoError(t, err) require.Eventually(t, func() bool { _, err = conn.Ping() return err == nil - }, testutil.WaitShort, testutil.IntervalFast) + }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() }) } diff --git a/enterprise/coderd/workspaceagents_test.go b/enterprise/coderd/workspaceagents_test.go index 24e24e3f5f540..a5250b3b81b44 100644 --- a/enterprise/coderd/workspaceagents_test.go +++ b/enterprise/coderd/workspaceagents_test.go @@ -8,7 +8,6 @@ import ( "github.com/google/uuid" "github.com/stretchr/testify/require" - "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/agent" "github.com/coder/coder/coderd/coderdtest" @@ -33,7 +32,7 @@ func TestBlockNonBrowser(t *testing.T) { BrowserOnly: true, }) id := setupWorkspaceAgent(t, client, user) - _, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, id) + _, err := client.DialWorkspaceAgent(context.Background(), id, nil) var apiErr *codersdk.Error require.ErrorAs(t, err, &apiErr) require.Equal(t, http.StatusConflict, apiErr.StatusCode()) @@ -50,7 +49,7 @@ func TestBlockNonBrowser(t *testing.T) { BrowserOnly: false, }) id := setupWorkspaceAgent(t, client, user) - conn, err := client.DialWorkspaceAgentTailnet(context.Background(), slog.Logger{}, id) + conn, err := client.DialWorkspaceAgent(context.Background(), id, nil) require.NoError(t, err) _ = conn.Close() }) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 3ce22c1bd9a11..dbdf7bc1b1f3a 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -2,6 +2,7 @@ package derpmesh import ( "context" + "net" "net/url" "sync" @@ -88,11 +89,15 @@ func (m *Mesh) addAddress(address string) (bool, error) { if isActive { return false, nil } - client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger)) + client, err := derphttp.NewClient(m.server.PrivateKey(), address, tailnet.Logger(m.logger.Named("client"))) if err != nil { return false, xerrors.Errorf("create derp client: %w", err) } client.MeshKey = m.server.MeshKey() + client.SetURLDialer(func(ctx context.Context, network, addr string) (net.Conn, error) { + var dialer net.Dialer + return dialer.DialContext(ctx, network, addr) + }) ctx, cancelFunc := context.WithCancel(m.ctx) closed := make(chan struct{}) closeFunc := func() { @@ -103,7 +108,7 @@ func (m *Mesh) addAddress(address string) (bool, error) { m.active[address] = closeFunc go func() { defer close(closed) - client.RunWatchConnectionLoop(ctx, m.server.PublicKey(), tailnet.Logger(m.logger), func(np key.NodePublic) { + client.RunWatchConnectionLoop(ctx, m.server.PublicKey(), tailnet.Logger(m.logger.Named("loop")), func(np key.NodePublic) { m.server.AddPacketForwarder(np, client) }, func(np key.NodePublic) { m.server.RemovePacketForwarder(np, client) diff --git a/tailnet/conn.go b/tailnet/conn.go index 19a0cd50f49e6..e41ed60a527f3 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -344,6 +344,9 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { // reason. TODO: @kylecarbs debug this! 
KeepAlive: ok && peerStatus.Active, } + if c.blockEndpoints { + peerNode.Endpoints = nil + } c.peerMap[node.ID] = peerNode } c.netMap.Peers = make([]*tailcfg.Node, 0, len(c.peerMap)) From 6fa941f958ab91be3b30ae2679f58bfdd33ec9b2 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 16:16:08 +0000 Subject: [PATCH 25/79] Wow, I made the tests pass --- agent/agent_test.go | 2 +- cli/agent_test.go | 6 +++--- cli/portforward.go | 2 +- cli/speedtest.go | 2 +- coderd/workspaceagents_test.go | 2 +- codersdk/agentconn.go | 4 +++- enterprise/coderd/coderd.go | 2 -- enterprise/coderd/replicas_test.go | 5 ++++- 8 files changed, 14 insertions(+), 11 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 06a33598b755f..e1269d6003922 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -465,7 +465,7 @@ func TestAgent(t *testing.T) { conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0) require.Eventually(t, func() bool { - _, err := conn.Ping() + _, err := conn.Ping(context.Background()) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) conn1, err := conn.DialContext(context.Background(), l.Addr().Network(), l.Addr().String()) diff --git a/cli/agent_test.go b/cli/agent_test.go index 8a90bb4cada3b..f487ebfc005ed 100644 --- a/cli/agent_test.go +++ b/cli/agent_test.go @@ -69,7 +69,7 @@ func TestWorkspaceAgent(t *testing.T) { require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { - _, err := dialer.Ping() + _, err := dialer.Ping(ctx) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) cancelFunc() @@ -130,7 +130,7 @@ func TestWorkspaceAgent(t *testing.T) { require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { - _, err := dialer.Ping() + _, err := dialer.Ping(ctx) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) cancelFunc() @@ -191,7 +191,7 @@ func TestWorkspaceAgent(t *testing.T) { require.NoError(t, err) defer dialer.Close() require.Eventually(t, func() bool { - _, err := dialer.Ping() + _, err := dialer.Ping(ctx) return err == nil }, testutil.WaitMedium, testutil.IntervalFast) cancelFunc() diff --git a/cli/portforward.go b/cli/portforward.go index 9cd3bc317c3b4..5a6f4391dd897 100644 --- a/cli/portforward.go +++ b/cli/portforward.go @@ -155,7 +155,7 @@ func portForward() *cobra.Command { case <-ticker.C: } - _, err = conn.Ping() + _, err = conn.Ping(ctx) if err != nil { continue } diff --git a/cli/speedtest.go b/cli/speedtest.go index cbb226b341342..f6c06641ec26f 100644 --- a/cli/speedtest.go +++ b/cli/speedtest.go @@ -70,7 +70,7 @@ func speedtest() *cobra.Command { return ctx.Err() case <-ticker.C: } - dur, err := conn.Ping() + dur, err := conn.Ping(ctx) if err != nil { continue } diff --git a/coderd/workspaceagents_test.go b/coderd/workspaceagents_test.go index c5f3d9f16c0d8..e8dd772095736 100644 --- a/coderd/workspaceagents_test.go +++ b/coderd/workspaceagents_test.go @@ -129,7 +129,7 @@ func TestWorkspaceAgentListen(t *testing.T) { _ = conn.Close() }() require.Eventually(t, func() bool { - _, err := conn.Ping() + _, err := conn.Ping(ctx) return err == nil }, testutil.WaitLong, testutil.IntervalFast) }) diff --git a/codersdk/agentconn.go b/codersdk/agentconn.go index b11c440ce3a65..e75edf1ca6bb0 100644 --- a/codersdk/agentconn.go +++ b/codersdk/agentconn.go @@ -132,7 +132,7 @@ type AgentConn struct { CloseFunc func() } -func (c *AgentConn) Ping() (time.Duration, error) { +func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) { errCh := 
make(chan error, 1) durCh := make(chan time.Duration, 1) c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { @@ -145,6 +145,8 @@ func (c *AgentConn) Ping() (time.Duration, error) { select { case err := <-errCh: return 0, err + case <-ctx.Done(): + return 0, ctx.Err() case dur := <-durCh: return dur, nil } diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 252da9ac6f01a..21bc6f497ee1f 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -3,7 +3,6 @@ package coderd import ( "context" "crypto/ed25519" - "fmt" "net/http" "sync" "time" @@ -247,7 +246,6 @@ func (api *API) updateEntitlements(ctx context.Context) error { coordinator = haCoordinator } - fmt.Printf("HA enabled\n") api.replicaManager.SetCallback(func() { addresses := make([]string, 0) for _, replica := range api.replicaManager.Regional() { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 1d60b24e6e81a..0da4a05dbbb60 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -3,6 +3,7 @@ package coderd_test import ( "context" "testing" + "time" "github.com/stretchr/testify/require" @@ -69,7 +70,9 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - _, err = conn.Ping() + ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + defer cancelFunc() + _, err = conn.Ping(ctx) return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() From abff96b103bcc4d6a72697154d95477bd9b69aed Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 16:54:21 +0000 Subject: [PATCH 26/79] Add replicas endpoint --- coderd/rbac/object.go | 4 ++ codersdk/deployment.go | 26 ------------ codersdk/replicas.go | 42 +++++++++++++++++++ codersdk/workspaceagents.go | 4 ++ enterprise/cli/features_test.go | 2 +- enterprise/coderd/coderd.go | 4 ++ enterprise/coderd/coderd_test.go | 2 +- .../coderdenttest/coderdenttest_test.go | 4 ++ enterprise/coderd/replicas.go | 35 ++++++++++++++++ enterprise/coderd/replicas_test.go | 5 +++ enterprise/replicasync/replicasync.go | 8 ++++ enterprise/replicasync/replicasync_test.go | 36 ++++++++++------ site/src/api/typesGenerated.ts | 11 +++++ 13 files changed, 143 insertions(+), 40 deletions(-) delete mode 100644 codersdk/deployment.go create mode 100644 codersdk/replicas.go diff --git a/coderd/rbac/object.go b/coderd/rbac/object.go index 5492e4397d5f7..1a8861c984ce9 100644 --- a/coderd/rbac/object.go +++ b/coderd/rbac/object.go @@ -146,6 +146,10 @@ var ( ResourceDeploymentFlags = Object{ Type: "deployment_flags", } + + ResourceReplicas = Object{ + Type: "replicas", + } ) // Object is used to create objects for authz checks when you have none in diff --git a/codersdk/deployment.go b/codersdk/deployment.go deleted file mode 100644 index a1227b09e3f63..0000000000000 --- a/codersdk/deployment.go +++ /dev/null @@ -1,26 +0,0 @@ -package codersdk - -import ( - "time" - - "github.com/google/uuid" -) - -type DeploymentInfo struct { - Replicas []Replica `json:"replicas"` -} - -type Replica struct { - // ID is the unique identifier for the replica. - ID uuid.UUID `json:"id"` - // Hostname is the hostname of the replica. - Hostname string `json:"hostname"` - // CreatedAt is when the replica was first seen. - CreatedAt time.Time `json:"created_at"` - // Active determines whether the replica is online. - Active bool `json:"active"` - // RelayAddress is the accessible address to relay DERP connections. 
- RelayAddress string `json:"relay_address"` - // Error is the error. - Error string `json:"error"` -} diff --git a/codersdk/replicas.go b/codersdk/replicas.go new file mode 100644 index 0000000000000..8e698fd3e6345 --- /dev/null +++ b/codersdk/replicas.go @@ -0,0 +1,42 @@ +package codersdk + +import ( + "context" + "encoding/json" + "net/http" + "time" + + "github.com/google/uuid" + "golang.org/x/xerrors" +) + +type Replica struct { + // ID is the unique identifier for the replica. + ID uuid.UUID `json:"id"` + // Hostname is the hostname of the replica. + Hostname string `json:"hostname"` + // CreatedAt is when the replica was first seen. + CreatedAt time.Time `json:"created_at"` + // RelayAddress is the accessible address to relay DERP connections. + RelayAddress string `json:"relay_address"` + // RegionID is the region of the replica. + RegionID int32 `json:"region_id"` + // Error is the error. + Error string `json:"error"` +} + +// Replicas fetches the list of replicas. +func (c *Client) Replicas(ctx context.Context) ([]Replica, error) { + res, err := c.Request(ctx, http.MethodGet, "/api/v2/replicas", nil) + if err != nil { + return nil, xerrors.Errorf("execute request: %w", err) + } + defer res.Body.Close() + + if res.StatusCode != http.StatusOK { + return nil, readBodyAsError(res) + } + + var replicas []Replica + return replicas, json.NewDecoder(res.Body).Decode(&replicas) +} diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 97d225c3eebb3..c86b399e189ab 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -331,6 +331,7 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil } +// @typescript-ignore DialWorkspaceAgentOptions type DialWorkspaceAgentOptions struct { Logger slog.Logger // BlockEndpoints forced a direct connection through DERP. 
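
// A small usage sketch of the client API introduced above: dial a workspace
// agent while forcing traffic over DERP (BlockEndpoints), then list the
// registered replicas. The deployment URL, session token, and agent ID are
// placeholders, and error handling is trimmed to keep the sketch short.
package main

import (
	"context"
	"fmt"
	"net/url"

	"github.com/google/uuid"

	"github.com/coder/coder/codersdk"
)

func main() {
	ctx := context.Background()

	serverURL, _ := url.Parse("https://coder.example.com") // placeholder deployment URL
	client := codersdk.New(serverURL)
	client.SessionToken = "session-token-here" // placeholder token

	agentID := uuid.MustParse("00000000-0000-0000-0000-000000000000") // placeholder agent ID
	conn, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{
		// Skip P2P endpoints entirely; the connection relays through DERP.
		BlockEndpoints: true,
	})
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	if latency, err := conn.Ping(ctx); err == nil {
		fmt.Println("agent reachable via DERP, latency:", latency)
	}

	// Each replica reports its relay address and any error it hit while
	// probing its regional peers.
	replicas, err := client.Replicas(ctx)
	if err != nil {
		panic(err)
	}
	for _, replica := range replicas {
		fmt.Println(replica.Hostname, replica.RelayAddress, replica.Error)
	}
}
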
@@ -338,6 +339,9 @@ type DialWorkspaceAgentOptions struct { } func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (*AgentConn, error) { + if options == nil { + options = &DialWorkspaceAgentOptions{} + } res, err := c.Request(ctx, http.MethodGet, fmt.Sprintf("/api/v2/workspaceagents/%s/connection", agentID), nil) if err != nil { return nil, err diff --git a/enterprise/cli/features_test.go b/enterprise/cli/features_test.go index 1a59e095c3594..78b94a6509526 100644 --- a/enterprise/cli/features_test.go +++ b/enterprise/cli/features_test.go @@ -57,7 +57,7 @@ func TestFeaturesList(t *testing.T) { var entitlements codersdk.Entitlements err := json.Unmarshal(buf.Bytes(), &entitlements) require.NoError(t, err, "unmarshal JSON output") - assert.Len(t, entitlements.Features, 6) + assert.Len(t, entitlements.Features, 7) assert.Empty(t, entitlements.Warnings) assert.Equal(t, codersdk.EntitlementNotEntitled, entitlements.Features[codersdk.FeatureUserLimit].Entitlement) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 21bc6f497ee1f..1634f82d45366 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -59,6 +59,10 @@ func New(ctx context.Context, options *Options) (*API, error) { api.AGPL.APIHandler.Group(func(r chi.Router) { r.Get("/entitlements", api.serveEntitlements) + r.Route("/replicas", func(r chi.Router) { + r.Use(apiKeyMiddleware) + r.Get("/", api.replicas) + }) r.Route("/licenses", func(r chi.Router) { r.Use(apiKeyMiddleware) r.Post("/", api.postLicense) diff --git a/enterprise/coderd/coderd_test.go b/enterprise/coderd/coderd_test.go index 40421450415a5..7b51845ff3986 100644 --- a/enterprise/coderd/coderd_test.go +++ b/enterprise/coderd/coderd_test.go @@ -85,7 +85,7 @@ func TestEntitlements(t *testing.T) { assert.False(t, res.HasLicense) al = res.Features[codersdk.FeatureAuditLog] assert.Equal(t, codersdk.EntitlementNotEntitled, al.Entitlement) - assert.True(t, al.Enabled) + assert.False(t, al.Enabled) }) t.Run("Pubsub", func(t *testing.T) { t.Parallel() diff --git a/enterprise/coderd/coderdenttest/coderdenttest_test.go b/enterprise/coderd/coderdenttest/coderdenttest_test.go index ef7657ee5301c..0c4e4b3568bf3 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest_test.go +++ b/enterprise/coderd/coderdenttest/coderdenttest_test.go @@ -58,6 +58,10 @@ func TestAuthorizeAllEndpoints(t *testing.T) { AssertAction: rbac.ActionRead, AssertObject: rbac.ResourceLicense, } + assertRoute["GET:/api/v2/replicas"] = coderdtest.RouteCheck{ + AssertAction: rbac.ActionRead, + AssertObject: rbac.ResourceReplicas, + } assertRoute["DELETE:/api/v2/licenses/{id}"] = coderdtest.RouteCheck{ AssertAction: rbac.ActionDelete, AssertObject: rbac.ResourceLicense, diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index ddb2b8b672186..f8cb64fe553dc 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -1 +1,36 @@ package coderd + +import ( + "net/http" + + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/httpapi" + "github.com/coder/coder/coderd/rbac" + "github.com/coder/coder/codersdk" +) + +// replicas returns the number of replicas that are active in Coder. 
+func (api *API) replicas(rw http.ResponseWriter, r *http.Request) { + if !api.AGPL.Authorize(r, rbac.ActionRead, rbac.ResourceReplicas) { + httpapi.ResourceNotFound(rw) + return + } + + replicas := api.replicaManager.All() + res := make([]codersdk.Replica, 0, len(replicas)) + for _, replica := range replicas { + res = append(res, convertReplica(replica)) + } + httpapi.Write(r.Context(), rw, http.StatusOK, res) +} + +func convertReplica(replica database.Replica) codersdk.Replica { + return codersdk.Replica{ + ID: replica.ID, + Hostname: replica.Hostname, + CreatedAt: replica.CreatedAt, + RelayAddress: replica.RelayAddress, + RegionID: replica.RegionID, + Error: replica.Error.String, + } +} diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 0da4a05dbbb60..e51f9cc330dc8 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -63,6 +63,10 @@ func TestReplicas(t *testing.T) { }, }) secondClient.SessionToken = firstClient.SessionToken + replicas, err := secondClient.Replicas(context.Background()) + require.NoError(t, err) + require.Len(t, replicas, 2) + agentID := setupWorkspaceAgent(t, firstClient, firstUser) conn, err := secondClient.DialWorkspaceAgent(context.Background(), agentID, &codersdk.DialWorkspaceAgentOptions{ BlockEndpoints: true, @@ -76,5 +80,6 @@ func TestReplicas(t *testing.T) { return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() + }) } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 4d6038a694940..8b8327038e088 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -101,6 +101,14 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if err != nil { return nil, xerrors.Errorf("run replica: %w", err) } + peers := server.Regional() + if len(peers) > 0 { + self := server.Self() + if self.RelayAddress == "" { + return nil, xerrors.Errorf("a relay address must be specified when running multiple replicas in the same region") + } + } + err = server.subscribe(ctx) if err != nil { return nil, xerrors.Errorf("subscribe: %w", err) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 5ce774ea5f29a..ccacbeb310c23 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -5,7 +5,6 @@ import ( "net/http" "net/http/httptest" "sync" - "sync/atomic" "testing" "time" @@ -66,6 +65,25 @@ func TestReplica(t *testing.T) { _ = server.Close() require.NoError(t, err) }) + t.Run("ErrorsWithoutRelayAddress", func(t *testing.T) { + // Ensures that the replica reports a successful status for + // accessing all of its peers. + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + }) + require.NoError(t, err) + _, err = replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ + ID: uuid.New(), + }) + require.Error(t, err) + require.Equal(t, "a relay address must be specified when running multiple replicas in the same region", err.Error()) + }) t.Run("ConnectsToPeerReplica", func(t *testing.T) { // Ensures that the replica reports a successful status for // accessing all of its peers. 
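The validation added to replicasync.New above means that once a second replica appears in a region, every replica there must advertise a relay address. A hedged configuration sketch, assuming ctx, logger, db, and pubsub come from server startup and using placeholder region and relay values:

// Sketch only: registering a replica with a relay address so that
// replicasync.New succeeds when peer replicas already exist in the region.
// The relay URL must be reachable by those peers.
server, err := replicasync.New(ctx, logger, db, pubsub, replicasync.Options{
    ID:           uuid.New(),
    RegionID:     999, // placeholder region
    RelayAddress: "https://replica-2.internal.example.com",
})
if err != nil {
    return xerrors.Errorf("start replica sync: %w", err)
}
defer server.Close()
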
@@ -85,7 +103,8 @@ func TestReplica(t *testing.T) { }) require.NoError(t, err) server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + ID: uuid.New(), + RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) require.Len(t, server.Regional(), 1) @@ -96,12 +115,6 @@ func TestReplica(t *testing.T) { t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) - var count atomic.Int32 - cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { - count.Add(1) - }) - require.NoError(t, err) - defer cancel() peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ ID: uuid.New(), CreatedAt: database.Now(), @@ -113,16 +126,15 @@ func TestReplica(t *testing.T) { }) require.NoError(t, err) server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), - PeerTimeout: 1 * time.Millisecond, + ID: uuid.New(), + PeerTimeout: 1 * time.Millisecond, + RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) require.True(t, server.Self().Error.Valid) require.Contains(t, server.Self().Error.String, "Failed to dial peers") - // Once for the initial creation of a replica, and another time for the error. - require.Equal(t, int32(2), count.Load()) _ = server.Close() }) t.Run("RefreshOnPublish", func(t *testing.T) { diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 72abae519b469..2289d2100be92 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -268,6 +268,7 @@ export interface DeploymentFlags { readonly derp_server_region_code: StringFlag readonly derp_server_region_name: StringFlag readonly derp_server_stun_address: StringArrayFlag + readonly derp_server_relay_address: StringFlag readonly derp_config_url: StringFlag readonly derp_config_path: StringFlag readonly prom_enabled: BoolFlag @@ -522,6 +523,16 @@ export interface PutExtendWorkspaceRequest { readonly deadline: string } +// From codersdk/replicas.go +export interface Replica { + readonly id: string + readonly hostname: string + readonly created_at: string + readonly relay_address: string + readonly region_id: number + readonly error: string +} + // From codersdk/error.go export interface Response { readonly message: string From d6ce2167a243349472929136e0d1fab032c97ee0 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 17:23:59 +0000 Subject: [PATCH 27/79] Ensure close kills replica --- cli/root.go | 6 ++-- cli/server.go | 9 ++++-- coderd/coderd.go | 2 +- enterprise/cli/server.go | 31 +++---------------- enterprise/coderd/coderd.go | 11 +++++-- .../coderd/coderdenttest/coderdenttest.go | 2 -- enterprise/coderd/license/license.go | 2 +- enterprise/coderd/license/license_test.go | 2 +- enterprise/coderd/replicas_test.go | 7 ++++- 9 files changed, 32 insertions(+), 40 deletions(-) diff --git a/cli/root.go b/cli/root.go index e29aa534da0a8..91d4551916cc0 100644 --- a/cli/root.go +++ b/cli/root.go @@ -4,6 +4,7 @@ import ( "context" "flag" "fmt" + "io" "net/http" "net/url" "os" @@ -100,8 +101,9 @@ func Core() []*cobra.Command { } func AGPL() []*cobra.Command { - all := append(Core(), Server(deployment.Flags(), func(_ context.Context, _ config.Root, o *coderd.Options) (*coderd.API, error) { - return coderd.New(o), nil + all := 
append(Core(), Server(deployment.Flags(), func(_ context.Context, o *coderd.Options) (*coderd.API, io.Closer, error) { + api := coderd.New(o) + return api, api, nil })) return all } diff --git a/cli/server.go b/cli/server.go index fc5f131da3d7b..3a94716be064d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -67,7 +67,7 @@ import ( ) // nolint:gocyclo -func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, config.Root, *coderd.Options) (*coderd.API, error)) *cobra.Command { +func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *coderd.Options) (*coderd.API, io.Closer, error)) *cobra.Command { root := &cobra.Command{ Use: "server", Short: "Start a Coder server", @@ -463,11 +463,14 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, confi ), dflags.PromAddress.Value, "prometheus")() } - coderAPI, err := newAPI(ctx, config, options) + // We use a separate closer so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, closer, err := newAPI(ctx, options) if err != nil { return err } - defer coderAPI.Close() + defer closer.Close() client := codersdk.New(localURL) if dflags.TLSEnable.Value { diff --git a/coderd/coderd.go b/coderd/coderd.go index da1fc0572ccc6..bb16553e47c66 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -123,7 +123,7 @@ func New(options *Options) *API { options.TailnetCoordinator = tailnet.NewCoordinator() } if options.DERPServer == nil { - options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp").Leveled(slog.LevelDebug))) + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index cc44985e0a4d4..f3e99c1613ab8 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -2,15 +2,12 @@ package cli import ( "context" + "io" "net/url" - "github.com/google/uuid" "github.com/spf13/cobra" "golang.org/x/xerrors" - "cdr.dev/slog" - - "github.com/coder/coder/cli/config" "github.com/coder/coder/cli/deployment" "github.com/coder/coder/enterprise/coderd" @@ -20,28 +17,11 @@ import ( func server() *cobra.Command { dflags := deployment.Flags() - cmd := agpl.Server(dflags, func(ctx context.Context, cfg config.Root, options *agplcoderd.Options) (*agplcoderd.API, error) { - replicaIDRaw, err := cfg.ReplicaID().Read() - generatedReplicaID := false - if err != nil { - replicaIDRaw = uuid.NewString() - generatedReplicaID = true - } - replicaID, err := uuid.Parse(replicaIDRaw) - if err != nil { - options.Logger.Warn(ctx, "failed to parse replica id", slog.Error(err), slog.F("replica_id", replicaIDRaw)) - replicaID = uuid.New() - generatedReplicaID = true - } - if generatedReplicaID { - // Make sure we save it to be reused later! 
- _ = cfg.ReplicaID().Write(replicaID.String()) - } - + cmd := agpl.Server(dflags, func(ctx context.Context, options *agplcoderd.Options) (*agplcoderd.API, io.Closer, error) { if dflags.DerpServerRelayAddress.Value != "" { _, err := url.Parse(dflags.DerpServerRelayAddress.Value) if err != nil { - return nil, xerrors.Errorf("derp-server-relay-address must be a valid HTTP URL: %w", err) + return nil, nil, xerrors.Errorf("derp-server-relay-address must be a valid HTTP URL: %w", err) } } @@ -51,7 +31,6 @@ func server() *cobra.Command { SCIMAPIKey: []byte(dflags.SCIMAuthHeader.Value), UserWorkspaceQuota: dflags.UserWorkspaceQuota.Value, RBAC: true, - ReplicaID: replicaID, DERPServerRelayAddress: dflags.DerpServerRelayAddress.Value, DERPServerRegionID: dflags.DerpServerRegionID.Value, @@ -59,9 +38,9 @@ func server() *cobra.Command { } api, err := coderd.New(ctx, o) if err != nil { - return nil, err + return nil, nil, err } - return api.AGPL, nil + return api.AGPL, api, nil }) deployment.AttachFlags(cmd.Flags(), dflags, true) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 1634f82d45366..a25b432a16a7c 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -123,7 +123,8 @@ func New(ctx context.Context, options *Options) (*API, error) { var err error api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ - ID: options.ReplicaID, + // Create a new replica ID for each Coder instance! + ID: uuid.New(), RelayAddress: options.DERPServerRelayAddress, RegionID: int32(options.DERPServerRegionID), }) @@ -154,7 +155,6 @@ type Options struct { // Used for high availability. DERPServerRelayAddress string DERPServerRegionID int - ReplicaID uuid.UUID EntitlementsUpdateInterval time.Duration Keys map[string]ed25519.PublicKey @@ -256,10 +256,15 @@ func (api *API) updateEntitlements(ctx context.Context) error { addresses = append(addresses, replica.RelayAddress) } api.derpMesh.SetAddresses(addresses) + _ = api.updateEntitlements(ctx) }) } else { api.derpMesh.SetAddresses([]string{}) - api.replicaManager.SetCallback(func() {}) + api.replicaManager.SetCallback(func() { + // If the amount of replicas change, so should our entitlements. + // This is to display a warning in the UI if the user is unlicensed. + _ = api.updateEntitlements(ctx) + }) } // Recheck changed in case the HA coordinator failed to set up. 
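Because the replica callback above re-runs updateEntitlements whenever the replica set changes, the high-availability warning surfaces while an unlicensed second replica is registered and clears once that replica shuts down. A short sketch of checking for it, assuming an authenticated codersdk client:

// Sketch only: print any license warnings reported by the deployment.
// With two unlicensed replicas registered this includes the
// high-availability warning; after one replica closes, the callback
// refreshes entitlements and the warning disappears.
ents, err := client.Entitlements(ctx)
if err != nil {
    return xerrors.Errorf("fetch entitlements: %w", err)
}
for _, warning := range ents.Warnings {
    fmt.Println("license warning:", warning)
}
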
diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index ea172c43116e4..fd1080a3ff30f 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -9,7 +9,6 @@ import ( "time" "github.com/golang-jwt/jwt/v4" - "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -69,7 +68,6 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c SCIMAPIKey: options.SCIMAPIKey, DERPServerRelayAddress: oop.AccessURL.String(), DERPServerRegionID: oop.DERPMap.RegionIDs()[0], - ReplicaID: uuid.New(), UserWorkspaceQuota: options.UserWorkspaceQuota, Options: oop, EntitlementsUpdateInterval: options.EntitlementsUpdateInterval, diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index 633b1a5056cab..f168f7472c80c 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -176,7 +176,7 @@ func Entitlements( "You have multiple replicas but your license is not entitled to high availability.") } else { entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but high availability is an Enterprise feature. Contact sales to get a license.") + "You have multiple replicas but high availability is an Enterprise feature.") } case codersdk.EntitlementGracePeriod: entitlements.Warnings = append(entitlements.Warnings, diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 5b50bdb97cfe2..4d0f09913037d 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -228,7 +228,7 @@ func TestEntitlements(t *testing.T) { require.NoError(t, err) require.False(t, entitlements.HasLicense) require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature. 
Contact sales to get a license.", entitlements.Warnings[0]) + require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature.", entitlements.Warnings[0]) }) t.Run("MultipleReplicasNotEntitled", func(t *testing.T) { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index e51f9cc330dc8..3d41e83deb964 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -30,7 +30,7 @@ func TestReplicas(t *testing.T) { }, }) _ = coderdtest.CreateFirstUser(t, firstClient) - secondClient := coderdenttest.New(t, &coderdenttest.Options{ + secondClient, _, secondAPI := coderdenttest.NewWithAPI(t, &coderdenttest.Options{ Options: &coderdtest.Options{ Database: db, Pubsub: pubsub, @@ -40,6 +40,11 @@ func TestReplicas(t *testing.T) { ents, err := secondClient.Entitlements(context.Background()) require.NoError(t, err) require.Len(t, ents.Warnings, 1) + _ = secondAPI.Close() + + ents, err = firstClient.Entitlements(context.Background()) + require.NoError(t, err) + require.Len(t, ents.Warnings, 0) }) t.Run("ConnectAcrossMultiple", func(t *testing.T) { t.Parallel() From d7cc0ff9bb7255b150a84b4bb683ac90f87d4089 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Fri, 14 Oct 2022 22:36:51 +0000 Subject: [PATCH 28/79] Update sql --- cli/server.go | 10 +- ...icas.down.sql => 000061_replicas.down.sql} | 0 ...replicas.up.sql => 000061_replicas.up.sql} | 0 codersdk/features.go | 1 + enterprise/coderd/coderd.go | 2 +- enterprise/coderd/license/license.go | 11 ++- enterprise/coderd/license/license_test.go | 10 +- enterprise/derpmesh/derpmesh.go | 22 +++-- enterprise/derpmesh/derpmesh_test.go | 99 +++++++++++++++++-- go.mod | 2 +- go.sum | 4 +- 11 files changed, 124 insertions(+), 37 deletions(-) rename coderd/database/migrations/{000059_replicas.down.sql => 000061_replicas.down.sql} (100%) rename coderd/database/migrations/{000059_replicas.up.sql => 000061_replicas.up.sql} (100%) diff --git a/cli/server.go b/cli/server.go index 3a94716be064d..1ab1a6228f356 100644 --- a/cli/server.go +++ b/cli/server.go @@ -165,9 +165,10 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code } defer listener.Close() + var tlsConfig *tls.Config if dflags.TLSEnable.Value { - listener, err = configureServerTLS( - listener, dflags.TLSMinVersion.Value, + tlsConfig, err = configureTLS( + dflags.TLSMinVersion.Value, dflags.TLSClientAuth.Value, dflags.TLSCertFiles.Value, dflags.TLSKeyFiles.Value, @@ -176,6 +177,7 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code if err != nil { return xerrors.Errorf("configure tls: %w", err) } + listener = tls.NewListener(listener, tlsConfig) } tcpAddr, valid := listener.Addr().(*net.TCPAddr) @@ -888,7 +890,7 @@ func loadCertificates(tlsCertFiles, tlsKeyFiles []string) ([]tls.Certificate, er return certs, nil } -func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles []string, tlsClientCAFile string) (net.Listener, error) { +func configureTLS(tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles []string, tlsClientCAFile string) (*tls.Config, error) { tlsConfig := &tls.Config{ MinVersion: tls.VersionTLS12, } @@ -958,7 +960,7 @@ func configureServerTLS(listener net.Listener, tlsMinVersion, tlsClientAuth stri tlsConfig.ClientCAs = caPool } - return tls.NewListener(listener, tlsConfig), nil + return tlsConfig, nil } func configureGithubOAuth2(accessURL *url.URL, clientID, clientSecret string, 
allowSignups bool, allowOrgs []string, rawTeams []string, enterpriseBaseURL string) (*coderd.GithubOAuth2Config, error) { diff --git a/coderd/database/migrations/000059_replicas.down.sql b/coderd/database/migrations/000061_replicas.down.sql similarity index 100% rename from coderd/database/migrations/000059_replicas.down.sql rename to coderd/database/migrations/000061_replicas.down.sql diff --git a/coderd/database/migrations/000059_replicas.up.sql b/coderd/database/migrations/000061_replicas.up.sql similarity index 100% rename from coderd/database/migrations/000059_replicas.up.sql rename to coderd/database/migrations/000061_replicas.up.sql diff --git a/codersdk/features.go b/codersdk/features.go index 799307e8fe898..862411de62872 100644 --- a/codersdk/features.go +++ b/codersdk/features.go @@ -44,6 +44,7 @@ type Feature struct { type Entitlements struct { Features map[string]Feature `json:"features"` Warnings []string `json:"warnings"` + Errors []string `json:"errors"` HasLicense bool `json:"has_license"` Experimental bool `json:"experimental"` Trial bool `json:"trial"` diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 294ff0eef1c71..612e710395722 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -137,7 +137,7 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } - api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer) + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, nil) err = api.updateEntitlements(ctx) if err != nil { diff --git a/enterprise/coderd/license/license.go b/enterprise/coderd/license/license.go index f168f7472c80c..c5bb689db65a9 100644 --- a/enterprise/coderd/license/license.go +++ b/enterprise/coderd/license/license.go @@ -30,6 +30,7 @@ func Entitlements( entitlements := codersdk.Entitlements{ Features: map[string]codersdk.Feature{}, Warnings: []string{}, + Errors: []string{}, } for _, featureName := range codersdk.FeatureNames { entitlements.Features[featureName] = codersdk.Feature{ @@ -172,15 +173,15 @@ func Entitlements( switch feature.Entitlement { case codersdk.EntitlementNotEntitled: if entitlements.HasLicense { - entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but your license is not entitled to high availability.") + entitlements.Errors = append(entitlements.Warnings, + "You have multiple replicas but your license is not entitled to high availability. You will be unable to connect to workspaces.") } else { - entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but high availability is an Enterprise feature.") + entitlements.Errors = append(entitlements.Warnings, + "You have multiple replicas but high availability is an Enterprise feature. You will be unable to connect to workspaces.") } case codersdk.EntitlementGracePeriod: entitlements.Warnings = append(entitlements.Warnings, - "You have multiple replicas but your license for high availability is expired.") + "You have multiple replicas but your license for high availability is expired. 
Reduce to one replica or workspace connections will stop working.") } } diff --git a/enterprise/coderd/license/license_test.go b/enterprise/coderd/license/license_test.go index 4d0f09913037d..6def291e3e24c 100644 --- a/enterprise/coderd/license/license_test.go +++ b/enterprise/coderd/license/license_test.go @@ -227,8 +227,8 @@ func TestEntitlements(t *testing.T) { entitlements, err := license.Entitlements(context.Background(), db, slog.Logger{}, 2, coderdenttest.Keys, all) require.NoError(t, err) require.False(t, entitlements.HasLicense) - require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature.", entitlements.Warnings[0]) + require.Len(t, entitlements.Errors, 1) + require.Equal(t, "You have multiple replicas but high availability is an Enterprise feature. You will be unable to connect to workspaces.", entitlements.Errors[0]) }) t.Run("MultipleReplicasNotEntitled", func(t *testing.T) { @@ -245,8 +245,8 @@ func TestEntitlements(t *testing.T) { }) require.NoError(t, err) require.True(t, entitlements.HasLicense) - require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but your license is not entitled to high availability.", entitlements.Warnings[0]) + require.Len(t, entitlements.Errors, 1) + require.Equal(t, "You have multiple replicas but your license is not entitled to high availability. You will be unable to connect to workspaces.", entitlements.Errors[0]) }) t.Run("MultipleReplicasGrace", func(t *testing.T) { @@ -266,6 +266,6 @@ func TestEntitlements(t *testing.T) { require.NoError(t, err) require.True(t, entitlements.HasLicense) require.Len(t, entitlements.Warnings, 1) - require.Equal(t, "You have multiple replicas but your license for high availability is expired.", entitlements.Warnings[0]) + require.Equal(t, "You have multiple replicas but your license for high availability is expired. Reduce to one replica or workspace connections will stop working.", entitlements.Warnings[0]) }) } diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index dbdf7bc1b1f3a..8f51343017593 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -2,6 +2,7 @@ package derpmesh import ( "context" + "crypto/tls" "net" "net/url" "sync" @@ -17,20 +18,22 @@ import ( ) // New constructs a new mesh for DERP servers. 
-func New(logger slog.Logger, server *derp.Server) *Mesh { +func New(logger slog.Logger, server *derp.Server, tlsConfig *tls.Config) *Mesh { return &Mesh{ - logger: logger, - server: server, - ctx: context.Background(), - closed: make(chan struct{}), - active: make(map[string]context.CancelFunc), + logger: logger, + server: server, + tlsConfig: tlsConfig, + ctx: context.Background(), + closed: make(chan struct{}), + active: make(map[string]context.CancelFunc), } } type Mesh struct { - logger slog.Logger - server *derp.Server - ctx context.Context + logger slog.Logger + server *derp.Server + ctx context.Context + tlsConfig *tls.Config mutex sync.Mutex closed chan struct{} @@ -93,6 +96,7 @@ func (m *Mesh) addAddress(address string) (bool, error) { if err != nil { return false, xerrors.Errorf("create derp client: %w", err) } + client.TLSConfig = m.tlsConfig client.MeshKey = m.server.MeshKey() client.SetURLDialer(func(ctx context.Context, network, addr string) (net.Conn, error) { var dialer net.Dialer diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 313c33da99bad..139e42566ffb1 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -1,11 +1,22 @@ package derpmesh_test import ( + "bytes" "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" "errors" "io" + "math/big" + "net" "net/http/httptest" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -29,12 +40,41 @@ func TestDERPMesh(t *testing.T) { t.Run("ExchangeMessages", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - firstServer, firstServerURL := startDERP(t) + firstServer, firstServerURL, firstTLSName := startDERP(t) defer firstServer.Close() - secondServer, secondServerURL := startDERP(t) - firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer) + secondServer, secondServerURL, secondTLSName := startDERP(t) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) firstMesh.SetAddresses([]string{secondServerURL}) - secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) + secondMesh.SetAddresses([]string{firstServerURL}) + defer firstMesh.Close() + defer secondMesh.Close() + + first := key.NewNode() + second := key.NewNode() + firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) + require.NoError(t, err) + err = secondClient.Connect(context.Background()) + require.NoError(t, err) + + sent := []byte("hello world") + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + + got := recvData(t, secondClient) + require.Equal(t, sent, got) + }) + t.Run("ExchangeMessages", func(t *testing.T) { + // This tests messages passing through multiple DERP servers. 
+ t.Parallel() + firstServer, firstServerURL, firstTLSName := startDERP(t) + defer firstServer.Close() + secondServer, secondServerURL, secondTLSName := startDERP(t) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) + firstMesh.SetAddresses([]string{secondServerURL}) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) secondMesh.SetAddresses([]string{firstServerURL}) defer firstMesh.Close() defer secondMesh.Close() @@ -58,8 +98,8 @@ func TestDERPMesh(t *testing.T) { t.Run("RemoveAddress", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - server, serverURL := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server) + server, serverURL, tlsName := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsName) mesh.SetAddresses([]string{"http://fake.com"}) // This should trigger a removal... mesh.SetAddresses([]string{}) @@ -84,8 +124,8 @@ func TestDERPMesh(t *testing.T) { meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { - server, url := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server) + server, url, tlsName := startDERP(t) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server, tlsName) t.Cleanup(func() { _ = server.Close() _ = mesh.Close() @@ -132,15 +172,54 @@ func recvData(t *testing.T, client *derphttp.Client) []byte { } } -func startDERP(t *testing.T) (*derp.Server, string) { +func startDERP(t *testing.T) (*derp.Server, string, *tls.Config) { logf := tailnet.Logger(slogtest.Make(t, nil)) d := derp.NewServer(key.NewNode(), logf) d.SetMeshKey("some-key") server := httptest.NewUnstartedServer(derphttp.Handler(d)) + commonName := "something.org" + server.TLS = &tls.Config{ + Certificates: []tls.Certificate{generateTLSCertificate(t, commonName)}, + } server.Start() t.Cleanup(func() { _ = d.Close() }) t.Cleanup(server.Close) - return d, server.URL + return d, server.URL, server.TLS +} + +func generateTLSCertificate(t testing.TB, commonName string) tls.Certificate { + privateKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + template := x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{ + Organization: []string{"Acme Co"}, + CommonName: commonName, + }, + DNSNames: []string{commonName}, + NotBefore: time.Now(), + NotAfter: time.Now().Add(time.Hour * 24 * 180), + + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &privateKey.PublicKey, privateKey) + require.NoError(t, err) + var certFile bytes.Buffer + require.NoError(t, err) + _, err = certFile.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})) + require.NoError(t, err) + privateKeyBytes, err := x509.MarshalPKCS8PrivateKey(privateKey) + require.NoError(t, err) + var keyFile bytes.Buffer + err = pem.Encode(&keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: privateKeyBytes}) + require.NoError(t, err) + cert, err := tls.X509KeyPair(certFile.Bytes(), keyFile.Bytes()) + require.NoError(t, err) + 
return cert } diff --git a/go.mod b/go.mod index 9834e27e5f39c..b33a438eb3d08 100644 --- a/go.mod +++ b/go.mod @@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20220926024748-50f068456c6c +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 // Switch to our fork that imports fixes from http://github.com/tailscale/ssh. // See: https://github.com/coder/coder/issues/3371 diff --git a/go.sum b/go.sum index 13fdc5724f6b6..5852582c26c4a 100644 --- a/go.sum +++ b/go.sum @@ -351,8 +351,8 @@ github.com/coder/retry v1.3.0 h1:5lAAwt/2Cm6lVmnfBY7sOMXcBOwcwJhmV5QGSELIVWY= github.com/coder/retry v1.3.0/go.mod h1:tXuRgZgWjUnU5LZPT4lJh4ew2elUhexhlnXzrJWdyFY= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHuiMNJ0qlhoD3pGN3JY9gxSko= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914= -github.com/coder/tailscale v1.1.1-0.20220926024748-50f068456c6c h1:xa6lr5Pj87Is26tgpzwBsEGKL7aVz7/fRGgY9QIbf3E= -github.com/coder/tailscale v1.1.1-0.20220926024748-50f068456c6c/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= +github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 h1:FgWWdu0fnFEpUNjW0vOaCuOxOZ/GQzn6oo7p5IMlSA0= +github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE= github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU= github.com/containerd/aufs v0.0.0-20210316121734-20793ff83c97/go.mod h1:kL5kd6KM5TzQjR79jljyi4olc1Vrx6XBlcyj3gNv2PU= From 9914840133605ac75f90357ae3801355cd90c91d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 00:23:22 +0000 Subject: [PATCH 29/79] Add database latency to high availability --- coderd/database/databasefake/databasefake.go | 22 ++++--- coderd/database/db.go | 9 +++ coderd/database/dump.sql | 1 + .../migrations/000061_replicas.up.sql | 2 + coderd/database/models.go | 21 +++--- coderd/database/queries.sql.go | 57 +++++++++------- coderd/database/queries/replicas.sql | 9 +-- codersdk/replicas.go | 2 + enterprise/coderd/replicas.go | 13 ++-- enterprise/replicasync/replicasync.go | 64 ++++++++++-------- site/src/api/typesGenerated.ts | 1 + .../LicenseBanner/LicenseBanner.tsx | 6 +- .../LicenseBannerView.stories.tsx | 10 +++ .../LicenseBanner/LicenseBannerView.tsx | 66 +++++++++++-------- site/src/testHelpers/entities.ts | 3 + .../entitlements/entitlementsXService.ts | 1 + 16 files changed, 180 insertions(+), 107 deletions(-) diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index e58fb990271ca..b4724a9afe0aa 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -113,6 +113,10 @@ type data struct { lastLicenseID int32 } +func (q *fakeQuerier) Ping(_ context.Context) (time.Duration, error) { + return 0, nil +} + // InTx doesn't rollback data properly for in-memory yet. 
func (q *fakeQuerier) InTx(fn func(database.Store) error) error { q.mutex.Lock() @@ -3170,14 +3174,15 @@ func (q *fakeQuerier) InsertReplica(_ context.Context, arg database.InsertReplic defer q.mutex.Unlock() replica := database.Replica{ - ID: arg.ID, - CreatedAt: arg.CreatedAt, - StartedAt: arg.StartedAt, - UpdatedAt: arg.UpdatedAt, - Hostname: arg.Hostname, - RegionID: arg.RegionID, - RelayAddress: arg.RelayAddress, - Version: arg.Version, + ID: arg.ID, + CreatedAt: arg.CreatedAt, + StartedAt: arg.StartedAt, + UpdatedAt: arg.UpdatedAt, + Hostname: arg.Hostname, + RegionID: arg.RegionID, + RelayAddress: arg.RelayAddress, + Version: arg.Version, + DatabaseLatency: arg.DatabaseLatency, } q.replicas = append(q.replicas, replica) return replica, nil @@ -3199,6 +3204,7 @@ func (q *fakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplic replica.RegionID = arg.RegionID replica.Version = arg.Version replica.Error = arg.Error + replica.DatabaseLatency = arg.DatabaseLatency q.replicas[index] = replica return replica, nil } diff --git a/coderd/database/db.go b/coderd/database/db.go index 4cbbdb399f193..020000888f8eb 100644 --- a/coderd/database/db.go +++ b/coderd/database/db.go @@ -12,6 +12,7 @@ import ( "context" "database/sql" "errors" + "time" "github.com/jmoiron/sqlx" "golang.org/x/xerrors" @@ -24,6 +25,7 @@ type Store interface { // customQuerier contains custom queries that are not generated. customQuerier + Ping(ctx context.Context) (time.Duration, error) InTx(func(Store) error) error } @@ -58,6 +60,13 @@ type sqlQuerier struct { db DBTX } +// Ping returns the time it takes to ping the database. +func (q *sqlQuerier) Ping(ctx context.Context) (time.Duration, error) { + start := time.Now() + err := q.sdb.PingContext(ctx) + return time.Since(start), err +} + // InTx performs database operations inside a transaction. func (q *sqlQuerier) InTx(function func(Store) error) error { if _, ok := q.db.(*sqlx.Tx); ok { diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index ca301ac8504b7..1e0a18c1dafef 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -293,6 +293,7 @@ CREATE TABLE replicas ( hostname text NOT NULL, region_id integer NOT NULL, relay_address text NOT NULL, + database_latency integer NOT NULL, version text NOT NULL, error text ); diff --git a/coderd/database/migrations/000061_replicas.up.sql b/coderd/database/migrations/000061_replicas.up.sql index a07587f35a234..b1d1a1ab13ee0 100644 --- a/coderd/database/migrations/000061_replicas.up.sql +++ b/coderd/database/migrations/000061_replicas.up.sql @@ -17,6 +17,8 @@ CREATE TABLE IF NOT EXISTS replicas ( region_id integer NOT NULL, -- An address that should be accessible to other replicas. relay_address text NOT NULL, + -- The latency of the replica to the database in microseconds. + database_latency int NOT NULL, -- Version is the Coder version of the replica. 
version text NOT NULL, error text diff --git a/coderd/database/models.go b/coderd/database/models.go index 55867a164bd98..b4601ecadeb78 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -540,16 +540,17 @@ type ProvisionerJobLog struct { } type Replica struct { - ID uuid.UUID `db:"id" json:"id"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - Hostname string `db:"hostname" json:"hostname"` - RegionID int32 `db:"region_id" json:"region_id"` - RelayAddress string `db:"relay_address" json:"relay_address"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` } type SiteConfig struct { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 8577903ecc0a2..241474e7e66bd 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2580,7 +2580,7 @@ func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt } const getReplicaByID = `-- name: GetReplicaByID :one -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE id = $1 +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE id = $1 ` func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { @@ -2595,6 +2595,7 @@ func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ) @@ -2602,7 +2603,7 @@ func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, } const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL +SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL ` func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) { @@ -2623,6 +2624,7 @@ func (q *sqlQuerier) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ); err != nil { @@ -2648,20 +2650,21 @@ INSERT INTO replicas ( hostname, region_id, relay_address, - version - -) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error + version, + database_latency +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) 
RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error ` type InsertReplicaParams struct { - ID uuid.UUID `db:"id" json:"id"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - Hostname string `db:"hostname" json:"hostname"` - RegionID int32 `db:"region_id" json:"region_id"` - RelayAddress string `db:"relay_address" json:"relay_address"` - Version string `db:"version" json:"version"` + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + Version string `db:"version" json:"version"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` } func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) (Replica, error) { @@ -2674,6 +2677,7 @@ func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) arg.RegionID, arg.RelayAddress, arg.Version, + arg.DatabaseLatency, ) var i Replica err := row.Scan( @@ -2685,6 +2689,7 @@ func (q *sqlQuerier) InsertReplica(ctx context.Context, arg InsertReplicaParams) &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ) @@ -2700,20 +2705,22 @@ UPDATE replicas SET region_id = $6, hostname = $7, version = $8, - error = $9 -WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, version, error + error = $9, + database_latency = $10 +WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error ` type UpdateReplicaParams struct { - ID uuid.UUID `db:"id" json:"id"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - RelayAddress string `db:"relay_address" json:"relay_address"` - RegionID int32 `db:"region_id" json:"region_id"` - Hostname string `db:"hostname" json:"hostname"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + RelayAddress string `db:"relay_address" json:"relay_address"` + RegionID int32 `db:"region_id" json:"region_id"` + Hostname string `db:"hostname" json:"hostname"` + Version string `db:"version" json:"version"` + Error sql.NullString `db:"error" json:"error"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` } func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { @@ -2727,6 +2734,7 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) arg.Hostname, arg.Version, arg.Error, + arg.DatabaseLatency, ) var i Replica err := row.Scan( @@ -2738,6 +2746,7 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) &i.Hostname, &i.RegionID, &i.RelayAddress, + &i.DatabaseLatency, &i.Version, &i.Error, ) diff --git 
a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql index a7aa5b0aa1dee..5a62527fac107 100644 --- a/coderd/database/queries/replicas.sql +++ b/coderd/database/queries/replicas.sql @@ -13,9 +13,9 @@ INSERT INTO replicas ( hostname, region_id, relay_address, - version - -) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) RETURNING *; + version, + database_latency +) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING *; -- name: UpdateReplica :one UPDATE replicas SET @@ -26,7 +26,8 @@ UPDATE replicas SET region_id = $6, hostname = $7, version = $8, - error = $9 + error = $9, + database_latency = $10 WHERE id = $1 RETURNING *; -- name: DeleteReplicasUpdatedBefore :exec diff --git a/codersdk/replicas.go b/codersdk/replicas.go index 8e698fd3e6345..e74af021ee9a3 100644 --- a/codersdk/replicas.go +++ b/codersdk/replicas.go @@ -23,6 +23,8 @@ type Replica struct { RegionID int32 `json:"region_id"` // Error is the error. Error string `json:"error"` + // DatabaseLatency is the latency in microseconds to the database. + DatabaseLatency int32 `json:"database_latency"` } // Replicas fetches the list of replicas. diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index f8cb64fe553dc..c07c37243d0ca 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -26,11 +26,12 @@ func (api *API) replicas(rw http.ResponseWriter, r *http.Request) { func convertReplica(replica database.Replica) codersdk.Replica { return codersdk.Replica{ - ID: replica.ID, - Hostname: replica.Hostname, - CreatedAt: replica.CreatedAt, - RelayAddress: replica.RelayAddress, - RegionID: replica.RegionID, - Error: replica.Error.String, + ID: replica.ID, + Hostname: replica.Hostname, + CreatedAt: replica.CreatedAt, + RelayAddress: replica.RelayAddress, + RegionID: replica.RegionID, + Error: replica.Error.String, + DatabaseLatency: replica.DatabaseLatency, } } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 8b8327038e088..75ba041aaa6e1 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -48,6 +48,10 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if err != nil { return nil, xerrors.Errorf("get hostname: %w", err) } + databaseLatency, err := db.Ping(ctx) + if err != nil { + return nil, xerrors.Errorf("ping database: %w", err) + } var replica database.Replica _, err = db.GetReplicaByID(ctx, options.ID) if err != nil { @@ -55,29 +59,31 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return nil, xerrors.Errorf("get replica: %w", err) } replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ - ID: options.ID, - CreatedAt: database.Now(), - StartedAt: database.Now(), - UpdatedAt: database.Now(), - Hostname: hostname, - RegionID: options.RegionID, - RelayAddress: options.RelayAddress, - Version: buildinfo.Version(), + ID: options.ID, + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: hostname, + RegionID: options.RegionID, + RelayAddress: options.RelayAddress, + Version: buildinfo.Version(), + DatabaseLatency: int32(databaseLatency.Microseconds()), }) if err != nil { return nil, xerrors.Errorf("insert replica: %w", err) } } else { replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: options.ID, - UpdatedAt: database.Now(), - StartedAt: database.Now(), - StoppedAt: sql.NullTime{}, - RelayAddress: options.RelayAddress, - RegionID: 
options.RegionID, - Hostname: hostname, - Version: buildinfo.Version(), - Error: sql.NullString{}, + ID: options.ID, + UpdatedAt: database.Now(), + StartedAt: database.Now(), + StoppedAt: sql.NullTime{}, + RelayAddress: options.RelayAddress, + RegionID: options.RegionID, + Hostname: hostname, + Version: buildinfo.Version(), + Error: sql.NullString{}, + DatabaseLatency: int32(databaseLatency.Microseconds()), }) if err != nil { return nil, xerrors.Errorf("update replica: %w", err) @@ -268,16 +274,22 @@ func (m *Manager) run(ctx context.Context) error { } } + databaseLatency, err := m.db.Ping(ctx) + if err != nil { + return xerrors.Errorf("ping database: %w", err) + } + replica, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: m.self.ID, - UpdatedAt: database.Now(), - StartedAt: m.self.StartedAt, - StoppedAt: m.self.StoppedAt, - RelayAddress: m.self.RelayAddress, - RegionID: m.self.RegionID, - Hostname: m.self.Hostname, - Version: m.self.Version, - Error: replicaError, + ID: m.self.ID, + UpdatedAt: database.Now(), + StartedAt: m.self.StartedAt, + StoppedAt: m.self.StoppedAt, + RelayAddress: m.self.RelayAddress, + RegionID: m.self.RegionID, + Hostname: m.self.Hostname, + Version: m.self.Version, + Error: replicaError, + DatabaseLatency: int32(databaseLatency.Microseconds()), }) if err != nil { return xerrors.Errorf("update replica: %w", err) diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 7a8af9278b1eb..92db958074a68 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -338,6 +338,7 @@ export interface DurationFlag { export interface Entitlements { readonly features: Record readonly warnings: string[] + readonly errors: string[] readonly has_license: boolean readonly experimental: boolean readonly trial: boolean diff --git a/site/src/components/LicenseBanner/LicenseBanner.tsx b/site/src/components/LicenseBanner/LicenseBanner.tsx index 8532bfca2ecbe..7ecfc2a2a2fac 100644 --- a/site/src/components/LicenseBanner/LicenseBanner.tsx +++ b/site/src/components/LicenseBanner/LicenseBanner.tsx @@ -8,15 +8,15 @@ export const LicenseBanner: React.FC = () => { const [entitlementsState, entitlementsSend] = useActor( xServices.entitlementsXService, ) - const { warnings } = entitlementsState.context.entitlements + const { errors, warnings } = entitlementsState.context.entitlements /** Gets license data on app mount because LicenseBanner is mounted in App */ useEffect(() => { entitlementsSend("GET_ENTITLEMENTS") }, [entitlementsSend]) - if (warnings.length > 0) { - return + if (errors.length > 0 || warnings.length > 0) { + return } else { return null } diff --git a/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx b/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx index c37653eff7bd5..c7ee69c261e38 100644 --- a/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx +++ b/site/src/components/LicenseBanner/LicenseBannerView.stories.tsx @@ -12,13 +12,23 @@ const Template: Story = (args) => ( export const OneWarning = Template.bind({}) OneWarning.args = { + errors: [], warnings: ["You have exceeded the number of seats in your license."], } export const TwoWarnings = Template.bind({}) TwoWarnings.args = { + errors: [], warnings: [ "You have exceeded the number of seats in your license.", "You are flying too close to the sun.", ], } + +export const OneError = Template.bind({}) +OneError.args = { + errors: [ + "You have multiple replicas but high availability is an Enterprise feature. 
You will be unable to connect to workspaces.", + ], + warnings: [], +} diff --git a/site/src/components/LicenseBanner/LicenseBannerView.tsx b/site/src/components/LicenseBanner/LicenseBannerView.tsx index 49276b1f0d5ed..792bc191a0a2a 100644 --- a/site/src/components/LicenseBanner/LicenseBannerView.tsx +++ b/site/src/components/LicenseBanner/LicenseBannerView.tsx @@ -2,47 +2,56 @@ import { makeStyles } from "@material-ui/core/styles" import { Expander } from "components/Expander/Expander" import { Pill } from "components/Pill/Pill" import { useState } from "react" +import { colors } from "theme/colors" export const Language = { licenseIssue: "License Issue", licenseIssues: (num: number): string => `${num} License Issues`, - upgrade: "Contact us to upgrade your license.", + upgrade: "Contact sales@coder.com.", exceeded: "It looks like you've exceeded some limits of your license.", lessDetails: "Less", moreDetails: "More", } export interface LicenseBannerViewProps { + errors: string[] warnings: string[] } export const LicenseBannerView: React.FC = ({ + errors, warnings, }) => { const styles = useStyles() const [showDetails, setShowDetails] = useState(false) - if (warnings.length === 1) { + const isError = errors.length > 0 + const messages = [...errors, ...warnings] + const type = isError ? "error" : "warning" + + if (messages.length === 1) { return ( -
- - {warnings[0]} -   - - {Language.upgrade} - +
+ +
+ {messages[0]} +   + + {Language.upgrade} + +
) } else { return ( -
-
-
- - {Language.exceeded} +
+ +
+
    - {warnings.map((warning) => ( -
  • - {warning} + {messages.map((message) => ( +
  • + {message}
  • ))}
@@ -67,14 +76,18 @@ const useStyles = makeStyles((theme) => ({ container: { padding: theme.spacing(1.5), backgroundColor: theme.palette.warning.main, + display: "flex", + alignItems: "center", + + "&.error": { + backgroundColor: colors.red[12], + }, }, flex: { - display: "flex", + display: "column", }, leftContent: { marginRight: theme.spacing(1), - }, - text: { marginLeft: theme.spacing(1), }, link: { @@ -83,9 +96,10 @@ const useStyles = makeStyles((theme) => ({ fontWeight: "bold", }, list: { - margin: theme.spacing(1.5), + padding: theme.spacing(1), + margin: 0, }, listItem: { - margin: theme.spacing(1), + margin: theme.spacing(0.5), }, })) diff --git a/site/src/testHelpers/entities.ts b/site/src/testHelpers/entities.ts index 6e26a4fee5944..7080d2a8d6002 100644 --- a/site/src/testHelpers/entities.ts +++ b/site/src/testHelpers/entities.ts @@ -816,6 +816,7 @@ export const makeMockApiError = ({ }) export const MockEntitlements: TypesGen.Entitlements = { + errors: [], warnings: [], has_license: false, features: {}, @@ -824,6 +825,7 @@ export const MockEntitlements: TypesGen.Entitlements = { } export const MockEntitlementsWithWarnings: TypesGen.Entitlements = { + errors: [], warnings: ["You are over your active user limit.", "And another thing."], has_license: true, experimental: false, @@ -847,6 +849,7 @@ export const MockEntitlementsWithWarnings: TypesGen.Entitlements = { } export const MockEntitlementsWithAuditLog: TypesGen.Entitlements = { + errors: [], warnings: [], has_license: true, experimental: false, diff --git a/site/src/xServices/entitlements/entitlementsXService.ts b/site/src/xServices/entitlements/entitlementsXService.ts index 83ed44d12052d..a1e8bb0d9b895 100644 --- a/site/src/xServices/entitlements/entitlementsXService.ts +++ b/site/src/xServices/entitlements/entitlementsXService.ts @@ -20,6 +20,7 @@ export type EntitlementsEvent = | { type: "HIDE_MOCK_BANNER" } const emptyEntitlements = { + errors: [], warnings: [], features: {}, has_license: false, From c1aa3d230740ab4e2388c14781131b0f76686b33 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 00:36:24 +0000 Subject: [PATCH 30/79] Pipe TLS to DERP mesh --- cli/server.go | 3 +++ coderd/coderd.go | 3 +++ enterprise/coderd/coderd.go | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cli/server.go b/cli/server.go index 1ab1a6228f356..de15b7c63c84b 100644 --- a/cli/server.go +++ b/cli/server.go @@ -322,6 +322,9 @@ func Server(dflags *codersdk.DeploymentFlags, newAPI func(context.Context, *code Experimental: ExperimentalEnabled(cmd), DeploymentFlags: dflags, } + if tlsConfig != nil { + options.TLSCertificates = tlsConfig.Certificates + } if dflags.OAuth2GithubClientSecret.Value != "" { options.GithubOAuth2Config, err = configureGithubOAuth2(accessURLParsed, diff --git a/coderd/coderd.go b/coderd/coderd.go index df5c85f030d09..735190373a4f3 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -1,6 +1,7 @@ package coderd import ( + "crypto/tls" "crypto/x509" "io" "net/http" @@ -76,6 +77,8 @@ type Options struct { TracerProvider trace.TracerProvider AutoImportTemplates []AutoImportTemplate + // TLSCertificates is used to mesh DERP servers securely. 
+ TLSCertificates []tls.Certificate TailnetCoordinator tailnet.Coordinator DERPServer *derp.Server DERPMap *tailcfg.DERPMap diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 612e710395722..8a92c0b1c641a 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -3,6 +3,7 @@ package coderd import ( "context" "crypto/ed25519" + "crypto/tls" "net/http" "sync" "time" @@ -137,7 +138,11 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } - api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, nil) + // nolint:gosec + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ + Certificates: options.TLSCertificates, + ServerName: options.AccessURL.Host, + }) err = api.updateEntitlements(ctx) if err != nil { From 0cc4263715d02cbf83d7ef8c853d85fa427cb8ad Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 03:41:53 +0000 Subject: [PATCH 31/79] Fix DERP mesh with TLS --- enterprise/coderd/coderd.go | 17 ++++++- enterprise/derpmesh/derpmesh_test.go | 72 +++++++++++----------------- enterprise/tailnet/coordinator.go | 6 ++- go.mod | 2 +- go.sum | 4 +- site/src/api/typesGenerated.ts | 1 + 6 files changed, 53 insertions(+), 49 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 8a92c0b1c641a..803d13b44b7c4 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -4,6 +4,7 @@ import ( "context" "crypto/ed25519" "crypto/tls" + "crypto/x509" "net/http" "sync" "time" @@ -138,10 +139,22 @@ func New(ctx context.Context, options *Options) (*API, error) { if err != nil { return nil, xerrors.Errorf("initialize replica: %w", err) } + + rootCA := x509.NewCertPool() + for _, certificate := range options.TLSCertificates { + for _, certificatePart := range certificate.Certificate { + certificate, err := x509.ParseCertificate(certificatePart) + if err != nil { + return nil, xerrors.Errorf("parse certificate %s: %w", certificate.Subject.CommonName, err) + } + rootCA.AddCert(certificate) + } + } + // nolint:gosec api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ - Certificates: options.TLSCertificates, - ServerName: options.AccessURL.Host, + ServerName: options.AccessURL.Host, + RootCAs: rootCA, }) err = api.updateEntitlements(ctx) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 139e42566ffb1..353e51dd2983f 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -37,44 +37,27 @@ func TestMain(m *testing.M) { func TestDERPMesh(t *testing.T) { t.Parallel() - t.Run("ExchangeMessages", func(t *testing.T) { - // This tests messages passing through multiple DERP servers. 
- t.Parallel() - firstServer, firstServerURL, firstTLSName := startDERP(t) - defer firstServer.Close() - secondServer, secondServerURL, secondTLSName := startDERP(t) - firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) - firstMesh.SetAddresses([]string{secondServerURL}) - secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) - secondMesh.SetAddresses([]string{firstServerURL}) - defer firstMesh.Close() - defer secondMesh.Close() - - first := key.NewNode() - second := key.NewNode() - firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) - require.NoError(t, err) - secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) - require.NoError(t, err) - err = secondClient.Connect(context.Background()) - require.NoError(t, err) - - sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + commonName := "something.org" + rawCert := generateTLSCertificate(t, commonName) + certificate, err := x509.ParseCertificate(rawCert.Certificate[0]) + require.NoError(t, err) + pool := x509.NewCertPool() + pool.AddCert(certificate) + tlsConfig := &tls.Config{ + ServerName: commonName, + RootCAs: pool, + Certificates: []tls.Certificate{rawCert}, + } - got := recvData(t, secondClient) - require.Equal(t, sent, got) - }) t.Run("ExchangeMessages", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - firstServer, firstServerURL, firstTLSName := startDERP(t) + firstServer, firstServerURL := startDERP(t, tlsConfig) defer firstServer.Close() - secondServer, secondServerURL, secondTLSName := startDERP(t) - firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, firstTLSName) + secondServer, secondServerURL := startDERP(t, tlsConfig) + firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, tlsConfig) firstMesh.SetAddresses([]string{secondServerURL}) - secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, secondTLSName) + secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, tlsConfig) secondMesh.SetAddresses([]string{firstServerURL}) defer firstMesh.Close() defer secondMesh.Close() @@ -83,8 +66,10 @@ func TestDERPMesh(t *testing.T) { second := key.NewNode() firstClient, err := derphttp.NewClient(first, secondServerURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + firstClient.TLSConfig = tlsConfig secondClient, err := derphttp.NewClient(second, firstServerURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) @@ -98,8 +83,8 @@ func TestDERPMesh(t *testing.T) { t.Run("RemoveAddress", func(t *testing.T) { // This tests messages passing through multiple DERP servers. t.Parallel() - server, serverURL, tlsName := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsName) + server, serverURL := startDERP(t, tlsConfig) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsConfig) mesh.SetAddresses([]string{"http://fake.com"}) // This should trigger a removal... 
mesh.SetAddresses([]string{}) @@ -109,8 +94,10 @@ func TestDERPMesh(t *testing.T) { second := key.NewNode() firstClient, err := derphttp.NewClient(first, serverURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + firstClient.TLSConfig = tlsConfig secondClient, err := derphttp.NewClient(second, serverURL, tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) sent := []byte("hello world") @@ -124,8 +111,8 @@ func TestDERPMesh(t *testing.T) { meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { - server, url, tlsName := startDERP(t) - mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server, tlsName) + server, url := startDERP(t, tlsConfig) + mesh := derpmesh.New(slogtest.Make(t, nil).Named("mesh").Leveled(slog.LevelDebug), server, tlsConfig) t.Cleanup(func() { _ = server.Close() _ = mesh.Close() @@ -141,8 +128,10 @@ func TestDERPMesh(t *testing.T) { second := key.NewNode() firstClient, err := derphttp.NewClient(first, serverURLs[9], tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + firstClient.TLSConfig = tlsConfig secondClient, err := derphttp.NewClient(second, serverURLs[16], tailnet.Logger(slogtest.Make(t, nil))) require.NoError(t, err) + secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) @@ -172,21 +161,18 @@ func recvData(t *testing.T, client *derphttp.Client) []byte { } } -func startDERP(t *testing.T) (*derp.Server, string, *tls.Config) { +func startDERP(t *testing.T, tlsConfig *tls.Config) (*derp.Server, string) { logf := tailnet.Logger(slogtest.Make(t, nil)) d := derp.NewServer(key.NewNode(), logf) d.SetMeshKey("some-key") server := httptest.NewUnstartedServer(derphttp.Handler(d)) - commonName := "something.org" - server.TLS = &tls.Config{ - Certificates: []tls.Certificate{generateTLSCertificate(t, commonName)}, - } - server.Start() + server.TLS = tlsConfig + server.StartTLS() t.Cleanup(func() { _ = d.Close() }) t.Cleanup(server.Close) - return d, server.URL, server.TLS + return d, server.URL } func generateTLSCertificate(t testing.TB, commonName string) tls.Certificate { diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 0643f7a259719..206dc68d6319c 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -294,7 +294,11 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( func (c *haCoordinator) Close() error { c.mutex.Lock() defer c.mutex.Unlock() - + select { + case <-c.close: + return nil + default: + } close(c.close) wg := sync.WaitGroup{} diff --git a/go.mod b/go.mod index b33a438eb3d08..195a09ae2b8fd 100644 --- a/go.mod +++ b/go.mod @@ -40,7 +40,7 @@ replace github.com/tcnksm/go-httpstat => github.com/kylecarbs/go-httpstat v0.0.0 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20221015033036-5861cbbf7bf5 // Switch to our fork that imports fixes from http://github.com/tailscale/ssh. 
// See: https://github.com/coder/coder/issues/3371 diff --git a/go.sum b/go.sum index 5852582c26c4a..b80c0d4173a5f 100644 --- a/go.sum +++ b/go.sum @@ -351,8 +351,8 @@ github.com/coder/retry v1.3.0 h1:5lAAwt/2Cm6lVmnfBY7sOMXcBOwcwJhmV5QGSELIVWY= github.com/coder/retry v1.3.0/go.mod h1:tXuRgZgWjUnU5LZPT4lJh4ew2elUhexhlnXzrJWdyFY= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338 h1:tN5GKFT68YLVzJoA8AHuiMNJ0qlhoD3pGN3JY9gxSko= github.com/coder/ssh v0.0.0-20220811105153-fcea99919338/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914= -github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630 h1:FgWWdu0fnFEpUNjW0vOaCuOxOZ/GQzn6oo7p5IMlSA0= -github.com/coder/tailscale v1.1.1-0.20221014173742-9f1da7795630/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= +github.com/coder/tailscale v1.1.1-0.20221015033036-5861cbbf7bf5 h1:WVH6e/qK3Wpl0wbmpORD2oQ1qLJborF3fsFHyO1ps0Y= +github.com/coder/tailscale v1.1.1-0.20221015033036-5861cbbf7bf5/go.mod h1:5amxy08qijEa8bcTW2SeIy4MIqcmd7LMsuOxqOlj2Ak= github.com/containerd/aufs v0.0.0-20200908144142-dab0cbea06f4/go.mod h1:nukgQABAEopAHvB6j7cnP5zJ+/3aVcE7hCYqvIwAHyE= github.com/containerd/aufs v0.0.0-20201003224125-76a6863f2989/go.mod h1:AkGGQs9NM2vtYHaUen+NljV0/baGCAPELGm2q9ZXpWU= github.com/containerd/aufs v0.0.0-20210316121734-20793ff83c97/go.mod h1:kL5kd6KM5TzQjR79jljyi4olc1Vrx6XBlcyj3gNv2PU= diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 92db958074a68..11b4c64e34786 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -538,6 +538,7 @@ export interface Replica { readonly relay_address: string readonly region_id: number readonly error: string + readonly database_latency: number } // From codersdk/error.go From f9177e40ecea7499f4c7d972b9a66f49fbe2689a Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 03:59:20 +0000 Subject: [PATCH 32/79] Add tests for TLS --- coderd/coderdtest/coderdtest.go | 14 ++++- codersdk/workspaceagents.go | 6 ++- enterprise/coderd/coderd.go | 3 +- .../coderd/coderdenttest/coderdenttest.go | 13 ++++- enterprise/coderd/replicas_test.go | 47 +++++++++++++++- enterprise/coderd/workspaceagents_test.go | 9 ++++ enterprise/derpmesh/derpmesh_test.go | 47 +--------------- testutil/certificate.go | 53 +++++++++++++++++++ 8 files changed, 138 insertions(+), 54 deletions(-) create mode 100644 testutil/certificate.go diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 59b414cad8903..cbbcd7aaa493a 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -7,6 +7,7 @@ import ( "crypto/rand" "crypto/rsa" "crypto/sha256" + "crypto/tls" "crypto/x509" "crypto/x509/pkix" "encoding/base64" @@ -75,6 +76,7 @@ type Options struct { AutobuildTicker <-chan time.Time AutobuildStats chan<- executor.Stats Auditor audit.Auditor + TLSCertificates []tls.Certificate // IncludeProvisionerDaemon when true means to start an in-memory provisionerD IncludeProvisionerDaemon bool @@ -158,7 +160,14 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance srv.Config.BaseContext = func(_ net.Listener) context.Context { return ctx } - srv.Start() + if options.TLSCertificates != nil { + srv.TLS = &tls.Config{ + Certificates: options.TLSCertificates, + } + srv.StartTLS() + } else { + srv.Start() + } t.Cleanup(srv.Close) tcpAddr, ok := srv.Listener.Addr().(*net.TCPAddr) @@ -201,6 +210,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance APIRateLimit: 
options.APIRateLimit, Authorizer: options.Authorizer, Telemetry: telemetry.NewNoop(), + TLSCertificates: options.TLSCertificates, DERPMap: &tailcfg.DERPMap{ Regions: map[int]*tailcfg.DERPRegion{ 1: { @@ -215,7 +225,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance DERPPort: derpPort, STUNPort: stunAddr.Port, InsecureForTests: true, - ForceHTTP: true, + ForceHTTP: options.TLSCertificates == nil, }}, }, }, diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index c86b399e189ab..c86944ae2b629 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -315,7 +315,8 @@ func (c *Client) ListenWorkspaceAgentTailnet(ctx context.Context) (net.Conn, err Value: c.SessionToken, }}) httpClient := &http.Client{ - Jar: jar, + Jar: jar, + Transport: c.HTTPClient.Transport, } // nolint:bodyclose conn, res, err := websocket.Dial(ctx, coordinateURL.String(), &websocket.DialOptions{ @@ -380,7 +381,8 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti Value: c.SessionToken, }}) httpClient := &http.Client{ - Jar: jar, + Jar: jar, + Transport: c.HTTPClient.Transport, } ctx, cancelFunc := context.WithCancel(ctx) closed := make(chan struct{}) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 803d13b44b7c4..469205cfa2e96 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -150,10 +150,9 @@ func New(ctx context.Context, options *Options) (*API, error) { rootCA.AddCert(certificate) } } - // nolint:gosec api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ - ServerName: options.AccessURL.Host, + ServerName: options.AccessURL.Hostname(), RootCAs: rootCA, }) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index a503a22ce459c..02eff4e2acf2e 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -4,7 +4,9 @@ import ( "context" "crypto/ed25519" "crypto/rand" + "crypto/tls" "io" + "net/http" "testing" "time" @@ -85,7 +87,16 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c _ = provisionerCloser.Close() _ = coderAPI.Close() }) - return codersdk.New(coderAPI.AccessURL), provisionerCloser, coderAPI + client := codersdk.New(coderAPI.AccessURL) + client.HTTPClient = &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + //nolint:gosec + InsecureSkipVerify: true, + }, + }, + } + return client, provisionerCloser, coderAPI } type LicenseOptions struct { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 0272fb018f3d6..b66bcaef9f976 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -2,6 +2,7 @@ package coderd_test import ( "context" + "crypto/tls" "testing" "time" @@ -19,7 +20,7 @@ import ( func TestReplicas(t *testing.T) { t.Parallel() - t.Run("WarningsWithoutLicense", func(t *testing.T) { + t.Run("ErrorWithoutLicense", func(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) firstClient := coderdenttest.New(t, &coderdenttest.Options{ @@ -39,7 +40,7 @@ func TestReplicas(t *testing.T) { secondClient.SessionToken = firstClient.SessionToken ents, err := secondClient.Entitlements(context.Background()) require.NoError(t, err) - require.Len(t, ents.Warnings, 1) + require.Len(t, ents.Errors, 1) _ = secondAPI.Close() ents, err = 
firstClient.Entitlements(context.Background()) @@ -85,6 +86,48 @@ func TestReplicas(t *testing.T) { return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() + }) + t.Run("ConnectAcrossMultipleTLS", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + certificates := []tls.Certificate{testutil.GenerateTLSCertificate(t, "localhost")} + firstClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + Database: db, + Pubsub: pubsub, + TLSCertificates: certificates, + }, + }) + firstUser := coderdtest.CreateFirstUser(t, firstClient) + coderdenttest.AddLicense(t, firstClient, coderdenttest.LicenseOptions{ + HighAvailability: true, + }) + + secondClient := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + Database: db, + Pubsub: pubsub, + TLSCertificates: certificates, + }, + }) + secondClient.SessionToken = firstClient.SessionToken + replicas, err := secondClient.Replicas(context.Background()) + require.NoError(t, err) + require.Len(t, replicas, 2) + _, agent := setupWorkspaceAgent(t, firstClient, firstUser, 0) + conn, err := secondClient.DialWorkspaceAgent(context.Background(), agent.ID, &codersdk.DialWorkspaceAgentOptions{ + BlockEndpoints: true, + Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), + }) + require.NoError(t, err) + require.Eventually(t, func() bool { + ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + defer cancelFunc() + _, err = conn.Ping(ctx) + return err == nil + }, testutil.WaitLong, testutil.IntervalFast) + _ = conn.Close() }) } diff --git a/enterprise/coderd/workspaceagents_test.go b/enterprise/coderd/workspaceagents_test.go index 097bab354ba74..18285bcb94317 100644 --- a/enterprise/coderd/workspaceagents_test.go +++ b/enterprise/coderd/workspaceagents_test.go @@ -2,6 +2,7 @@ package coderd_test import ( "context" + "crypto/tls" "fmt" "net/http" "testing" @@ -108,6 +109,14 @@ func setupWorkspaceAgent(t *testing.T, client *codersdk.Client, user codersdk.Cr workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) agentClient := codersdk.New(client.URL) + agentClient.HTTPClient = &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + //nolint:gosec + InsecureSkipVerify: true, + }, + }, + } agentClient.SessionToken = authToken agentCloser := agent.New(agent.Options{ FetchMetadata: agentClient.WorkspaceAgentMetadata, diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 353e51dd2983f..fcf410ac0e574 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -1,22 +1,13 @@ package derpmesh_test import ( - "bytes" "context" - "crypto/ecdsa" - "crypto/elliptic" - "crypto/rand" "crypto/tls" "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" "errors" "io" - "math/big" - "net" "net/http/httptest" "testing" - "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -29,6 +20,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/enterprise/derpmesh" "github.com/coder/coder/tailnet" + "github.com/coder/coder/testutil" ) func TestMain(m *testing.M) { @@ -38,7 +30,7 @@ func TestMain(m *testing.M) { func TestDERPMesh(t *testing.T) { t.Parallel() commonName := "something.org" - rawCert := generateTLSCertificate(t, commonName) + rawCert := testutil.GenerateTLSCertificate(t, 
commonName) certificate, err := x509.ParseCertificate(rawCert.Certificate[0]) require.NoError(t, err) pool := x509.NewCertPool() @@ -174,38 +166,3 @@ func startDERP(t *testing.T, tlsConfig *tls.Config) (*derp.Server, string) { t.Cleanup(server.Close) return d, server.URL } - -func generateTLSCertificate(t testing.TB, commonName string) tls.Certificate { - privateKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - require.NoError(t, err) - template := x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{ - Organization: []string{"Acme Co"}, - CommonName: commonName, - }, - DNSNames: []string{commonName}, - NotBefore: time.Now(), - NotAfter: time.Now().Add(time.Hour * 24 * 180), - - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, - } - - derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &privateKey.PublicKey, privateKey) - require.NoError(t, err) - var certFile bytes.Buffer - require.NoError(t, err) - _, err = certFile.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})) - require.NoError(t, err) - privateKeyBytes, err := x509.MarshalPKCS8PrivateKey(privateKey) - require.NoError(t, err) - var keyFile bytes.Buffer - err = pem.Encode(&keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: privateKeyBytes}) - require.NoError(t, err) - cert, err := tls.X509KeyPair(certFile.Bytes(), keyFile.Bytes()) - require.NoError(t, err) - return cert -} diff --git a/testutil/certificate.go b/testutil/certificate.go new file mode 100644 index 0000000000000..1edc975746958 --- /dev/null +++ b/testutil/certificate.go @@ -0,0 +1,53 @@ +package testutil + +import ( + "bytes" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "net" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func GenerateTLSCertificate(t testing.TB, commonName string) tls.Certificate { + privateKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + template := x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{ + Organization: []string{"Acme Co"}, + CommonName: commonName, + }, + DNSNames: []string{commonName}, + NotBefore: time.Now(), + NotAfter: time.Now().Add(time.Hour * 24 * 180), + + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + IPAddresses: []net.IP{net.ParseIP("127.0.0.1")}, + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &privateKey.PublicKey, privateKey) + require.NoError(t, err) + var certFile bytes.Buffer + require.NoError(t, err) + _, err = certFile.Write(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})) + require.NoError(t, err) + privateKeyBytes, err := x509.MarshalPKCS8PrivateKey(privateKey) + require.NoError(t, err) + var keyFile bytes.Buffer + err = pem.Encode(&keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: privateKeyBytes}) + require.NoError(t, err) + cert, err := tls.X509KeyPair(certFile.Bytes(), keyFile.Bytes()) + require.NoError(t, err) + return cert +} From ee59d88a087408885a8297afb8eda41b0bd73a54 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:09:16 +0000 Subject: [PATCH 33/79] Fix replica sync TLS --- enterprise/coderd/coderd.go | 38 
++++++++++++++++----------- enterprise/coderd/replicas_test.go | 6 +++++ enterprise/replicasync/replicasync.go | 5 ++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 469205cfa2e96..6bbbcb16f33cf 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -129,32 +129,38 @@ func New(ctx context.Context, options *Options) (*API, error) { }) } - var err error - api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ - // Create a new replica ID for each Coder instance! - ID: uuid.New(), - RelayAddress: options.DERPServerRelayAddress, - RegionID: int32(options.DERPServerRegionID), - }) - if err != nil { - return nil, xerrors.Errorf("initialize replica: %w", err) - } - - rootCA := x509.NewCertPool() + meshRootCA := x509.NewCertPool() for _, certificate := range options.TLSCertificates { for _, certificatePart := range certificate.Certificate { certificate, err := x509.ParseCertificate(certificatePart) if err != nil { return nil, xerrors.Errorf("parse certificate %s: %w", certificate.Subject.CommonName, err) } - rootCA.AddCert(certificate) + meshRootCA.AddCert(certificate) } } - // nolint:gosec - api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, &tls.Config{ + // This TLS configuration spoofs access from the access URL hostname + // assuming that the certificates provided will cover that hostname. + // + // Replica sync and DERP meshing require accessing replicas via their + // internal IP addresses, and if TLS is configured we use the same + // certificates. + meshTLSConfig := &tls.Config{ ServerName: options.AccessURL.Hostname(), - RootCAs: rootCA, + RootCAs: meshRootCA, + } + var err error + api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ + // Create a new replica ID for each Coder instance! 
+ ID: uuid.New(), + RelayAddress: options.DERPServerRelayAddress, + RegionID: int32(options.DERPServerRegionID), + TLSConfig: meshTLSConfig, }) + if err != nil { + return nil, xerrors.Errorf("initialize replica: %w", err) + } + api.derpMesh = derpmesh.New(options.Logger.Named("derpmesh"), api.DERPServer, meshTLSConfig) err = api.updateEntitlements(ctx) if err != nil { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index b66bcaef9f976..63bae9ebce9e6 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -129,5 +129,11 @@ func TestReplicas(t *testing.T) { return err == nil }, testutil.WaitLong, testutil.IntervalFast) _ = conn.Close() + replicas, err = secondClient.Replicas(context.Background()) + require.NoError(t, err) + require.Len(t, replicas, 2) + for _, replica := range replicas { + require.Empty(t, replica.Error) + } }) } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 75ba041aaa6e1..758a11a84e842 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -2,6 +2,7 @@ package replicasync import ( "context" + "crypto/tls" "database/sql" "errors" "fmt" @@ -30,6 +31,7 @@ type Options struct { PeerTimeout time.Duration RelayAddress string RegionID int32 + TLSConfig *tls.Config } // New registers the replica with the database and periodically updates to ensure @@ -254,6 +256,9 @@ func (m *Manager) run(ctx context.Context) error { } client := http.Client{ Timeout: m.options.PeerTimeout, + Transport: &http.Transport{ + TLSClientConfig: m.options.TLSConfig, + }, } res, err := client.Do(req) if err != nil { From 8641e58790b067a70ae29fb4e67e89d952b1fff3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:28:50 +0000 Subject: [PATCH 34/79] Fix RootCA for replica meshing --- enterprise/coderd/coderd.go | 5 ++- enterprise/replicasync/replicasync_test.go | 44 ++++++++++++++++++++++ helm/templates/service.yaml | 1 + 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 6bbbcb16f33cf..f836f786463d2 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -146,8 +146,9 @@ func New(ctx context.Context, options *Options) (*API, error) { // internal IP addresses, and if TLS is configured we use the same // certificates. meshTLSConfig := &tls.Config{ - ServerName: options.AccessURL.Hostname(), - RootCAs: meshRootCA, + Certificates: options.TLSCertificates, + RootCAs: meshRootCA, + ServerName: options.AccessURL.Hostname(), } var err error api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index ccacbeb310c23..faba7345183ff 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -2,6 +2,8 @@ package replicasync_test import ( "context" + "crypto/tls" + "crypto/x509" "net/http" "net/http/httptest" "sync" @@ -112,6 +114,48 @@ func TestReplica(t *testing.T) { require.False(t, server.Self().Error.Valid) _ = server.Close() }) + t.Run("ConnectsToPeerReplicaTLS", func(t *testing.T) { + // Ensures that the replica reports a successful status for + // accessing all of its peers. 
+ t.Parallel() + rawCert := testutil.GenerateTLSCertificate(t, "hello.org") + certificate, err := x509.ParseCertificate(rawCert.Certificate[0]) + require.NoError(t, err) + pool := x509.NewCertPool() + pool.AddCert(certificate) + // nolint:gosec + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{rawCert}, + ServerName: "hello.org", + RootCAs: pool, + } + srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + srv.TLS = tlsConfig + srv.StartTLS() + defer srv.Close() + db, pubsub := dbtestutil.NewDB(t) + peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: "something", + RelayAddress: srv.URL, + }) + require.NoError(t, err) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ + ID: uuid.New(), + RelayAddress: "http://169.254.169.254", + TLSConfig: tlsConfig, + }) + require.NoError(t, err) + require.Len(t, server.Regional(), 1) + require.Equal(t, peer.ID, server.Regional()[0].ID) + require.False(t, server.Self().Error.Valid) + _ = server.Close() + }) t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { t.Parallel() db, pubsub := dbtestutil.NewDB(t) diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml index 28fe0e9f9aa8c..b9a7e9a2f0886 100644 --- a/helm/templates/service.yaml +++ b/helm/templates/service.yaml @@ -10,6 +10,7 @@ metadata: {{- toYaml .Values.coder.service.annotations | nindent 4 }} spec: type: {{ .Values.coder.service.type }} + sessionAffinity: ClientIP ports: - name: {{ include "coder.portName" . | quote }} port: {{ include "coder.servicePort" . }} From 3dfb796c29ae142be30399ffb4b1dd279f567ca0 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:51:04 +0000 Subject: [PATCH 35/79] Remove ID from replicasync --- enterprise/coderd/coderd.go | 6 +- enterprise/coderd/replicas_test.go | 2 +- enterprise/replicasync/replicasync.go | 65 ++++++++-------------- enterprise/replicasync/replicasync_test.go | 53 ++++-------------- 4 files changed, 35 insertions(+), 91 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index f836f786463d2..371ac12fe21b8 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -13,7 +13,6 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/go-chi/chi/v5" - "github.com/google/uuid" "cdr.dev/slog" "github.com/coder/coder/coderd" @@ -146,14 +145,13 @@ func New(ctx context.Context, options *Options) (*API, error) { // internal IP addresses, and if TLS is configured we use the same // certificates. meshTLSConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, Certificates: options.TLSCertificates, RootCAs: meshRootCA, ServerName: options.AccessURL.Hostname(), } var err error - api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, replicasync.Options{ - // Create a new replica ID for each Coder instance! 
- ID: uuid.New(), + api.replicaManager, err = replicasync.New(ctx, options.Logger, options.Database, options.Pubsub, &replicasync.Options{ RelayAddress: options.DERPServerRelayAddress, RegionID: int32(options.DERPServerRegionID), TLSConfig: meshTLSConfig, diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 63bae9ebce9e6..9d6970823befb 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -123,7 +123,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalMedium) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 758a11a84e842..82e7d74273eeb 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -26,7 +26,6 @@ var ( ) type Options struct { - ID uuid.UUID UpdateInterval time.Duration PeerTimeout time.Duration RelayAddress string @@ -36,9 +35,9 @@ type Options struct { // New registers the replica with the database and periodically updates to ensure // it's healthy. It contacts all other alive replicas to ensure they are reachable. -func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options Options) (*Manager, error) { - if options.ID == uuid.Nil { - panic("An ID must be provided!") +func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub database.Pubsub, options *Options) (*Manager, error) { + if options == nil { + options = &Options{} } if options.PeerTimeout == 0 { options.PeerTimeout = 3 * time.Second @@ -54,50 +53,29 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if err != nil { return nil, xerrors.Errorf("ping database: %w", err) } - var replica database.Replica - _, err = db.GetReplicaByID(ctx, options.ID) + id := uuid.New() + replica, err := db.InsertReplica(ctx, database.InsertReplicaParams{ + ID: id, + CreatedAt: database.Now(), + StartedAt: database.Now(), + UpdatedAt: database.Now(), + Hostname: hostname, + RegionID: options.RegionID, + RelayAddress: options.RelayAddress, + Version: buildinfo.Version(), + DatabaseLatency: int32(databaseLatency.Microseconds()), + }) if err != nil { - if !errors.Is(err, sql.ErrNoRows) { - return nil, xerrors.Errorf("get replica: %w", err) - } - replica, err = db.InsertReplica(ctx, database.InsertReplicaParams{ - ID: options.ID, - CreatedAt: database.Now(), - StartedAt: database.Now(), - UpdatedAt: database.Now(), - Hostname: hostname, - RegionID: options.RegionID, - RelayAddress: options.RelayAddress, - Version: buildinfo.Version(), - DatabaseLatency: int32(databaseLatency.Microseconds()), - }) - if err != nil { - return nil, xerrors.Errorf("insert replica: %w", err) - } - } else { - replica, err = db.UpdateReplica(ctx, database.UpdateReplicaParams{ - ID: options.ID, - UpdatedAt: database.Now(), - StartedAt: database.Now(), - StoppedAt: sql.NullTime{}, - RelayAddress: options.RelayAddress, - RegionID: options.RegionID, - Hostname: hostname, - Version: buildinfo.Version(), - Error: sql.NullString{}, - DatabaseLatency: int32(databaseLatency.Microseconds()), - }) - if err != nil { - return nil, xerrors.Errorf("update replica: %w", err) - } + return nil, xerrors.Errorf("insert replica: %w", err) } - err = pubsub.Publish(PubsubEvent, 
[]byte(options.ID.String())) + err = pubsub.Publish(PubsubEvent, []byte(id.String())) if err != nil { return nil, xerrors.Errorf("publish new replica: %w", err) } ctx, cancelFunc := context.WithCancel(ctx) server := &Manager{ - options: &options, + id: id, + options: options, db: db, pubsub: pubsub, self: replica, @@ -128,6 +106,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data // Manager keeps the replica up to date and in sync with other replicas. type Manager struct { + id uuid.UUID options *Options db database.Store pubsub database.Pubsub @@ -196,7 +175,7 @@ func (m *Manager) subscribe(ctx context.Context) error { return } // Don't process updates for ourself! - if id == m.options.ID { + if id == m.id { return } if updating { @@ -233,7 +212,7 @@ func (m *Manager) run(ctx context.Context) error { m.mutex.Lock() m.peers = make([]database.Replica, 0, len(replicas)) for _, replica := range replicas { - if replica.ID == m.options.ID { + if replica.ID == m.id { continue } m.peers = append(m.peers, replica) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index faba7345183ff..0b42f44791df4 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -11,7 +11,6 @@ import ( "time" "github.com/google/uuid" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/goleak" @@ -32,38 +31,15 @@ func TestReplica(t *testing.T) { // This ensures that a new replica is created on New. t.Parallel() db, pubsub := dbtestutil.NewDB(t) - id := uuid.New() + closeChan := make(chan struct{}, 1) cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { - assert.Equal(t, []byte(id.String()), message) + closeChan <- struct{}{} }) require.NoError(t, err) defer cancel() - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: id, - }) - require.NoError(t, err) - _ = server.Close() - require.NoError(t, err) - }) - t.Run("UpdatesOnNew", func(t *testing.T) { - // This ensures that a replica is updated when it initially connects - // and immediately publishes it's existence! 
- t.Parallel() - db, pubsub := dbtestutil.NewDB(t) - id := uuid.New() - _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ - ID: id, - }) - require.NoError(t, err) - cancel, err := pubsub.Subscribe(replicasync.PubsubEvent, func(ctx context.Context, message []byte) { - assert.Equal(t, []byte(id.String()), message) - }) - require.NoError(t, err) - defer cancel() - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: id, - }) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, nil) require.NoError(t, err) + <-closeChan _ = server.Close() require.NoError(t, err) }) @@ -80,9 +56,7 @@ func TestReplica(t *testing.T) { Hostname: "something", }) require.NoError(t, err) - _, err = replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), - }) + _, err = replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, nil) require.Error(t, err) require.Equal(t, "a relay address must be specified when running multiple replicas in the same region", err.Error()) }) @@ -104,8 +78,7 @@ func TestReplica(t *testing.T) { RelayAddress: srv.URL, }) require.NoError(t, err) - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) @@ -145,8 +118,7 @@ func TestReplica(t *testing.T) { RelayAddress: srv.URL, }) require.NoError(t, err) - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ RelayAddress: "http://169.254.169.254", TLSConfig: tlsConfig, }) @@ -169,8 +141,7 @@ func TestReplica(t *testing.T) { RelayAddress: "http://169.254.169.254", }) require.NoError(t, err) - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ PeerTimeout: 1 * time.Millisecond, RelayAddress: "http://169.254.169.254", }) @@ -185,10 +156,7 @@ func TestReplica(t *testing.T) { // Refresh when a new replica appears! 
t.Parallel() db, pubsub := dbtestutil.NewDB(t) - id := uuid.New() - server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, replicasync.Options{ - ID: id, - }) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, nil) require.NoError(t, err) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) @@ -224,8 +192,7 @@ func TestReplica(t *testing.T) { count := 20 wg.Add(count) for i := 0; i < count; i++ { - server, err := replicasync.New(context.Background(), logger, db, pubsub, replicasync.Options{ - ID: uuid.New(), + server, err := replicasync.New(context.Background(), logger, db, pubsub, &replicasync.Options{ RelayAddress: srv.URL, }) require.NoError(t, err) From ec2c1f13403216460e6ab3104c3c5fe5b5355667 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 04:56:58 +0000 Subject: [PATCH 36/79] Fix getting certificates for meshing --- cli/server.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cli/server.go b/cli/server.go index de15b7c63c84b..0704aca10b07e 100644 --- a/cli/server.go +++ b/cli/server.go @@ -929,6 +929,7 @@ func configureTLS(tlsMinVersion, tlsClientAuth string, tlsCertFiles, tlsKeyFiles if err != nil { return nil, xerrors.Errorf("load certificates: %w", err) } + tlsConfig.Certificates = certs tlsConfig.GetCertificate = func(hi *tls.ClientHelloInfo) (*tls.Certificate, error) { // If there's only one certificate, return it. if len(certs) == 1 { From 590f0f896ae74d9765d7014150936bbc8ed6278d Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 05:22:04 +0000 Subject: [PATCH 37/79] Remove excessive locking --- enterprise/replicasync/replicasync.go | 22 +++++++++--------- enterprise/tailnet/coordinator.go | 32 +++++++++------------------ tailnet/coordinator.go | 24 +++++++++++--------- 3 files changed, 33 insertions(+), 45 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 82e7d74273eeb..b635a84991e24 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -143,9 +143,11 @@ func (m *Manager) loop(ctx context.Context) { // subscribe listens for new replica information! func (m *Manager) subscribe(ctx context.Context) error { - needsUpdate := false - updating := false - updateMutex := sync.Mutex{} + var ( + needsUpdate = false + updating = false + updateMutex = sync.Mutex{} + ) // This loop will continually update nodes as updates are processed. // The intent is to always be up to date without spamming the run @@ -199,9 +201,7 @@ func (m *Manager) run(ctx context.Context) error { m.closeMutex.Lock() m.closeWait.Add(1) m.closeMutex.Unlock() - go func() { - m.closeWait.Done() - }() + defer m.closeWait.Done() // Expect replicas to update once every three times the interval... // If they don't, assume death! 
replicas, err := m.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*m.options.UpdateInterval)) @@ -224,8 +224,7 @@ func (m *Manager) run(ctx context.Context) error { failed := make([]string, 0) for _, peer := range m.Regional() { wg.Add(1) - peer := peer - go func() { + go func(peer database.Replica) { defer wg.Done() req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) if err != nil { @@ -247,7 +246,7 @@ func (m *Manager) run(ctx context.Context) error { return } _ = res.Body.Close() - }() + }(peer) } wg.Wait() replicaError := sql.NullString{} @@ -279,11 +278,11 @@ func (m *Manager) run(ctx context.Context) error { return xerrors.Errorf("update replica: %w", err) } m.mutex.Lock() + defer m.mutex.Unlock() if m.self.Error.String != replica.Error.String { // Publish an update occurred! err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { - m.mutex.Unlock() return xerrors.Errorf("publish replica update: %w", err) } } @@ -291,7 +290,6 @@ func (m *Manager) run(ctx context.Context) error { if m.callback != nil { go m.callback() } - m.mutex.Unlock() return nil } @@ -306,7 +304,7 @@ func (m *Manager) Self() database.Replica { func (m *Manager) All() []database.Replica { m.mutex.Lock() defer m.mutex.Unlock() - return append(m.peers, m.self) + return append(m.peers[:], m.self) } // Regional returns all replicas in the same region excluding itself. diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 206dc68d6319c..1ccf56f50da11 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -69,19 +69,19 @@ func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. node, ok := c.nodes[agent] + c.mutex.Unlock() if ok { data, err := json.Marshal([]*agpl.Node{node}) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal node: %w", err) } _, err = conn.Write(data) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("write nodes: %w", err) } } + c.mutex.Lock() connectionSockets, ok := c.agentToConnectionSockets[agent] if !ok { connectionSockets = map[uuid.UUID]net.Conn{} @@ -129,28 +129,17 @@ func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *js } c.mutex.Lock() - defer c.mutex.Unlock() - // Update the node of this client in our in-memory map. If an agent entirely // shuts down and reconnects, it needs to be aware of all clients attempting // to establish connections. c.nodes[id] = &node - // Write the new node from this client to the actively connected agent. - err = c.writeNodeToAgent(agent, &node) - if err != nil { - return xerrors.Errorf("write node to agent: %w", err) - } - - return nil -} - -func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error { agentSocket, ok := c.agentSockets[agent] + c.mutex.Unlock() if !ok { // If we don't own the agent locally, send it over pubsub to a node that // owns the agent. - err := c.publishNodesToAgent(agent, []*agpl.Node{node}) + err := c.publishNodesToAgent(agent, []*agpl.Node{&node}) if err != nil { return xerrors.Errorf("publish node to agent") } @@ -159,7 +148,7 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error // Write the new node from this client to the actively // connected agent. 
- data, err := json.Marshal([]*agpl.Node{node}) + data, err := json.Marshal([]*agpl.Node{&node}) if err != nil { return xerrors.Errorf("marshal nodes: %w", err) } @@ -171,14 +160,13 @@ func (c *haCoordinator) writeNodeToAgent(agent uuid.UUID, node *agpl.Node) error } return xerrors.Errorf("write json: %w", err) } + return nil } // ServeAgent accepts a WebSocket connection to an agent that listens to // incoming connections and publishes node updates. func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { - c.mutex.Lock() - // Tell clients on other instances to send a callmemaybe to us. err := c.publishAgentHello(id) if err != nil { @@ -203,6 +191,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { // If an old agent socket is connected, we close it // to avoid any leaks. This shouldn't ever occur because // we expect one agent to be running. + c.mutex.Lock() oldAgentSocket, ok := c.agentSockets[id] if ok { _ = oldAgentSocket.Close() @@ -234,6 +223,8 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { + c.mutex.Lock() + defer c.mutex.Unlock() sockets, ok := c.agentToConnectionSockets[agentID] if !ok { return nil @@ -279,12 +270,11 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( for _, connectionSocket := range connectionSockets { connectionSocket := connectionSocket go func() { + defer wg.Done() _ = connectionSocket.SetWriteDeadline(time.Now().Add(5 * time.Second)) _, _ = connectionSocket.Write(data) - wg.Done() }() } - wg.Wait() return &node, nil } @@ -428,9 +418,7 @@ func (c *haCoordinator) runPubsub() error { return } - c.mutex.Lock() nodes := c.nodesSubscribedToAgent(agentUUID) - c.mutex.Unlock() if len(nodes) > 0 { err := c.publishNodesToAgent(agentUUID, nodes) if err != nil { diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 96de8d295162e..23531af1260f5 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -127,25 +127,26 @@ type coordinator struct { } // Node returns an in-memory node by ID. +// If the node does not exist, nil is returned. func (c *coordinator) Node(id uuid.UUID) *Node { c.mutex.Lock() defer c.mutex.Unlock() - node := c.nodes[id] - return node + return c.nodes[id] } // ServeClient accepts a WebSocket connection that wants to connect to an agent // with the specified ID. func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) error { c.mutex.Lock() - if c.closed { + c.mutex.Unlock() return xerrors.New("coordinator is closed") } // When a new connection is requested, we update it with the latest // node of the agent. This allows the connection to establish. node, ok := c.nodes[agent] + c.mutex.Unlock() if ok { data, err := json.Marshal([]*Node{node}) if err != nil { @@ -158,6 +159,7 @@ func (c *coordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID) return xerrors.Errorf("write nodes: %w", err) } } + c.mutex.Lock() connectionSockets, ok := c.agentToConnectionSockets[agent] if !ok { connectionSockets = map[uuid.UUID]net.Conn{} @@ -203,7 +205,6 @@ func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json } c.mutex.Lock() - // Update the node of this client in our in-memory map. If an agent entirely // shuts down and reconnects, it needs to be aware of all clients attempting // to establish connections. 
@@ -237,12 +238,13 @@ func (c *coordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *json // listens to incoming connections and publishes node updates. func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { c.mutex.Lock() - if c.closed { + c.mutex.Unlock() return xerrors.New("coordinator is closed") } sockets, ok := c.agentToConnectionSockets[id] + c.mutex.Unlock() if ok { // Publish all nodes that want to connect to the // desired agent ID. @@ -269,6 +271,7 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { // If an old agent socket is connected, we close it // to avoid any leaks. This shouldn't ever occur because // we expect one agent to be running. + c.mutex.Lock() oldAgentSocket, ok := c.agentSockets[id] if ok { _ = oldAgentSocket.Close() @@ -302,17 +305,15 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder } c.mutex.Lock() - c.nodes[id] = &node connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { c.mutex.Unlock() return nil } - + c.mutex.Unlock() data, err := json.Marshal([]*Node{&node}) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal nodes: %w", err) } @@ -328,7 +329,6 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder }() } - c.mutex.Unlock() wg.Wait() return nil } @@ -337,9 +337,11 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder // coordinator from accepting new connections. func (c *coordinator) Close() error { c.mutex.Lock() - defer c.mutex.Unlock() - + if c.closed { + return nil + } c.closed = true + c.mutex.Unlock() wg := sync.WaitGroup{} From d8580d107a16c4802262b9f8919175512c8d6ec6 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 05:24:35 +0000 Subject: [PATCH 38/79] Fix linting --- coderd/coderdtest/coderdtest.go | 1 + enterprise/coderd/replicas_test.go | 3 +-- enterprise/derpmesh/derpmesh_test.go | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index cbbcd7aaa493a..4141e33cf8648 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -163,6 +163,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance if options.TLSCertificates != nil { srv.TLS = &tls.Config{ Certificates: options.TLSCertificates, + MinVersion: tls.VersionTLS12, } srv.StartTLS() } else { diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index 9d6970823befb..f9f6e138bd3cc 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -4,7 +4,6 @@ import ( "context" "crypto/tls" "testing" - "time" "github.com/stretchr/testify/require" @@ -80,7 +79,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), 3*time.Second) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalSlow) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index fcf410ac0e574..d1131d59da25b 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -36,6 +36,7 @@ func TestDERPMesh(t *testing.T) { pool := x509.NewCertPool() pool.AddCert(certificate) tlsConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, ServerName: commonName, RootCAs: pool, Certificates: 
[]tls.Certificate{rawCert}, From ae956fbc00df6bed796e3624b515849002b8bb21 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 05:44:04 +0000 Subject: [PATCH 39/79] Store mesh key in the database --- coderd/coderd.go | 1 - coderd/database/databasefake/databasefake.go | 20 +++++++++++++++-- coderd/database/querier.go | 2 ++ coderd/database/queries.sql.go | 20 +++++++++++++++++ coderd/database/queries/siteconfig.sql | 6 +++++ enterprise/cli/server.go | 23 ++++++++++++++++++++ 6 files changed, 69 insertions(+), 3 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 735190373a4f3..2fe0a5dc0d08e 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -127,7 +127,6 @@ func New(options *Options) *API { } if options.DERPServer == nil { options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) - options.DERPServer.SetMeshKey("todo-kyle-change-this") } if options.Auditor == nil { options.Auditor = audit.NewNop() diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index b4724a9afe0aa..e860c69dd11cb 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -110,10 +110,11 @@ type data struct { replicas []database.Replica deploymentID string + derpMeshKey string lastLicenseID int32 } -func (q *fakeQuerier) Ping(_ context.Context) (time.Duration, error) { +func (*fakeQuerier) Ping(_ context.Context) (time.Duration, error) { return 0, nil } @@ -2890,6 +2891,21 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) { return q.deploymentID, nil } +func (q *fakeQuerier) InsertDERPMeshKey(_ context.Context, id string) error { + q.mutex.Lock() + defer q.mutex.Unlock() + + q.derpMeshKey = id + return nil +} + +func (q *fakeQuerier) GetDERPMeshKey(_ context.Context) (string, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + return q.derpMeshKey, nil +} + func (q *fakeQuerier) InsertLicense( _ context.Context, arg database.InsertLicenseParams, ) (database.License, error) { @@ -3156,7 +3172,7 @@ func (q *fakeQuerier) DeleteGroupByID(_ context.Context, id uuid.UUID) error { return sql.ErrNoRows } -func (q *fakeQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, before time.Time) error { +func (q *fakeQuerier) DeleteReplicasUpdatedBefore(_ context.Context, before time.Time) error { q.mutex.Lock() defer q.mutex.Unlock() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 957de26c89e05..7f2f0d942bb10 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -39,6 +39,7 @@ type sqlcQuerier interface { // This function returns roles for authorization purposes. Implied member roles // are included. GetAuthorizationUserRoles(ctx context.Context, userID uuid.UUID) (GetAuthorizationUserRolesRow, error) + GetDERPMeshKey(ctx context.Context) (string, error) GetDeploymentID(ctx context.Context) (string, error) GetFileByHashAndCreator(ctx context.Context, arg GetFileByHashAndCreatorParams) (File, error) GetFileByID(ctx context.Context, id uuid.UUID) (File, error) @@ -125,6 +126,7 @@ type sqlcQuerier interface { // every member of the org. 
InsertAllUsersGroup(ctx context.Context, organizationID uuid.UUID) (Group, error) InsertAuditLog(ctx context.Context, arg InsertAuditLogParams) (AuditLog, error) + InsertDERPMeshKey(ctx context.Context, value string) error InsertDeploymentID(ctx context.Context, value string) error InsertFile(ctx context.Context, arg InsertFileParams) (File, error) InsertGitSSHKey(ctx context.Context, arg InsertGitSSHKeyParams) (GitSSHKey, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 241474e7e66bd..c40b93426ddee 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2753,6 +2753,17 @@ func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) return i, err } +const getDERPMeshKey = `-- name: GetDERPMeshKey :one +SELECT value FROM site_configs WHERE key = 'derp_mesh_key' +` + +func (q *sqlQuerier) GetDERPMeshKey(ctx context.Context) (string, error) { + row := q.db.QueryRowContext(ctx, getDERPMeshKey) + var value string + err := row.Scan(&value) + return value, err +} + const getDeploymentID = `-- name: GetDeploymentID :one SELECT value FROM site_configs WHERE key = 'deployment_id' ` @@ -2764,6 +2775,15 @@ func (q *sqlQuerier) GetDeploymentID(ctx context.Context) (string, error) { return value, err } +const insertDERPMeshKey = `-- name: InsertDERPMeshKey :exec +INSERT INTO site_configs (key, value) VALUES ('derp_mesh_key', $1) +` + +func (q *sqlQuerier) InsertDERPMeshKey(ctx context.Context, value string) error { + _, err := q.db.ExecContext(ctx, insertDERPMeshKey, value) + return err +} + const insertDeploymentID = `-- name: InsertDeploymentID :exec INSERT INTO site_configs (key, value) VALUES ('deployment_id', $1) ` diff --git a/coderd/database/queries/siteconfig.sql b/coderd/database/queries/siteconfig.sql index 9d3936e23886d..b975d2f68cc3c 100644 --- a/coderd/database/queries/siteconfig.sql +++ b/coderd/database/queries/siteconfig.sql @@ -3,3 +3,9 @@ INSERT INTO site_configs (key, value) VALUES ('deployment_id', $1); -- name: GetDeploymentID :one SELECT value FROM site_configs WHERE key = 'deployment_id'; + +-- name: InsertDERPMeshKey :exec +INSERT INTO site_configs (key, value) VALUES ('derp_mesh_key', $1); + +-- name: GetDERPMeshKey :one +SELECT value FROM site_configs WHERE key = 'derp_mesh_key'; diff --git a/enterprise/cli/server.go b/enterprise/cli/server.go index f3e99c1613ab8..a65b8e8faa6e0 100644 --- a/enterprise/cli/server.go +++ b/enterprise/cli/server.go @@ -2,14 +2,20 @@ package cli import ( "context" + "database/sql" + "errors" "io" "net/url" "github.com/spf13/cobra" "golang.org/x/xerrors" + "tailscale.com/derp" + "tailscale.com/types/key" "github.com/coder/coder/cli/deployment" + "github.com/coder/coder/cryptorand" "github.com/coder/coder/enterprise/coderd" + "github.com/coder/coder/tailnet" agpl "github.com/coder/coder/cli" agplcoderd "github.com/coder/coder/coderd" @@ -25,6 +31,23 @@ func server() *cobra.Command { } } + options.DERPServer = derp.NewServer(key.NewNode(), tailnet.Logger(options.Logger.Named("derp"))) + meshKey, err := options.Database.GetDERPMeshKey(ctx) + if err != nil { + if !errors.Is(err, sql.ErrNoRows) { + return nil, nil, xerrors.Errorf("get mesh key: %w", err) + } + meshKey, err = cryptorand.String(32) + if err != nil { + return nil, nil, xerrors.Errorf("generate mesh key: %w", err) + } + err = options.Database.InsertDERPMeshKey(ctx, meshKey) + if err != nil { + return nil, nil, xerrors.Errorf("insert mesh key: %w", err) + } + } + options.DERPServer.SetMeshKey(meshKey) + 
o := &coderd.Options{ AuditLogging: dflags.AuditLogging.Value, BrowserOnly: dflags.BrowserOnly.Value, From d703e2d08aeb82ac80e5adea594ec0951a1e80a6 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:37:32 +0000 Subject: [PATCH 40/79] Fix replica key for tests --- coderd/coderdtest/coderdtest.go | 7 +++++++ enterprise/coderd/replicas_test.go | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 4141e33cf8648..c0361dfa8e2b8 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -37,8 +37,10 @@ import ( "golang.org/x/xerrors" "google.golang.org/api/idtoken" "google.golang.org/api/option" + "tailscale.com/derp" "tailscale.com/net/stun/stuntest" "tailscale.com/tailcfg" + "tailscale.com/types/key" "tailscale.com/types/nettype" "cdr.dev/slog" @@ -59,6 +61,7 @@ import ( "github.com/coder/coder/provisionerd" "github.com/coder/coder/provisionersdk" "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" ) @@ -184,6 +187,9 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance stunAddr, stunCleanup := stuntest.ServeWithPacketListener(t, nettype.Std{}) t.Cleanup(stunCleanup) + derpServer := derp.NewServer(key.NewNode(), tailnet.Logger(slogtest.Make(t, nil).Named("derp"))) + derpServer.SetMeshKey("test-key") + // match default with cli default if options.SSHKeygenAlgorithm == "" { options.SSHKeygenAlgorithm = gitsshkey.AlgorithmEd25519 @@ -208,6 +214,7 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance OIDCConfig: options.OIDCConfig, GoogleTokenValidator: options.GoogleTokenValidator, SSHKeygenAlgorithm: options.SSHKeygenAlgorithm, + DERPServer: derpServer, APIRateLimit: options.APIRateLimit, Authorizer: options.Authorizer, Telemetry: telemetry.NewNoop(), diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index f9f6e138bd3cc..fae418ab87261 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -79,7 +79,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalSlow) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.WaitShort) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil @@ -122,7 +122,7 @@ func TestReplicas(t *testing.T) { }) require.NoError(t, err) require.Eventually(t, func() bool { - ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalMedium) + ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.IntervalSlow) defer cancelFunc() _, err = conn.Ping(ctx) return err == nil From 9bb021c0e69f07efb4fb499eecad1359575432fc Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:38:08 +0000 Subject: [PATCH 41/79] Fix types gen --- site/src/api/api.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/site/src/api/api.ts b/site/src/api/api.ts index 2e60a88b8469c..fb12571fd91ae 100644 --- a/site/src/api/api.ts +++ b/site/src/api/api.ts @@ -28,6 +28,7 @@ export const defaultEntitlements = (): TypesGen.Entitlements => { return { features: features, has_license: false, + errors: [], warnings: [], experimental: false, trial: false, From 76c9e2c959bb7b260ebed3274ce525e5cac813a0 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:44:56 +0000 Subject: [PATCH 
42/79] Fix unlocking unlocked --- tailnet/coordinator.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 23531af1260f5..9d722ddeee117 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -263,7 +263,6 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } _, err = conn.Write(data) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("write nodes: %w", err) } } From 09e87b0aa06da43125ed9cc767c2fd7877657f9f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:57:46 +0000 Subject: [PATCH 43/79] Fix race in tests --- tailnet/coordinator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 9d722ddeee117..491c0db885224 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -310,7 +310,6 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder c.mutex.Unlock() return nil } - c.mutex.Unlock() data, err := json.Marshal([]*Node{&node}) if err != nil { return xerrors.Errorf("marshal nodes: %w", err) @@ -328,6 +327,7 @@ func (c *coordinator) handleNextAgentMessage(id uuid.UUID, decoder *json.Decoder }() } + c.mutex.Unlock() wg.Wait() return nil } From 18c0464e7f106d81c3dbdb84f2f7ad48d75fc5e3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 01:58:05 -0500 Subject: [PATCH 44/79] Update enterprise/derpmesh/derpmesh.go Co-authored-by: Colin Adler --- enterprise/derpmesh/derpmesh.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 8f51343017593..059eac5a107e7 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -82,7 +82,7 @@ func (m *Mesh) SetAddresses(addresses []string) { m.mutex.Unlock() } -// addAddress begins meshing with a new address. +// addAddress begins meshing with a new address. It returns false if the address is already being meshed with. // It's expected that this is a full HTTP address with a path. // e.g. 
http://127.0.0.1:8080/derp func (m *Mesh) addAddress(address string) (bool, error) { From 6f25b2d44b66b12c5c65c20c0095588c3b51347b Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 06:58:54 +0000 Subject: [PATCH 45/79] Rename to syncReplicas --- enterprise/replicasync/replicasync.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index b635a84991e24..46123953298de 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -73,7 +73,7 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data return nil, xerrors.Errorf("publish new replica: %w", err) } ctx, cancelFunc := context.WithCancel(ctx) - server := &Manager{ + manager := &Manager{ id: id, options: options, db: db, @@ -83,25 +83,25 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data closed: make(chan struct{}), closeCancel: cancelFunc, } - err = server.run(ctx) + err = manager.syncReplicas(ctx) if err != nil { return nil, xerrors.Errorf("run replica: %w", err) } - peers := server.Regional() + peers := manager.Regional() if len(peers) > 0 { - self := server.Self() + self := manager.Self() if self.RelayAddress == "" { return nil, xerrors.Errorf("a relay address must be specified when running multiple replicas in the same region") } } - err = server.subscribe(ctx) + err = manager.subscribe(ctx) if err != nil { return nil, xerrors.Errorf("subscribe: %w", err) } - server.closeWait.Add(1) - go server.loop(ctx) - return server, nil + manager.closeWait.Add(1) + go manager.loop(ctx) + return manager, nil } // Manager keeps the replica up to date and in sync with other replicas. @@ -134,7 +134,7 @@ func (m *Manager) loop(ctx context.Context) { return case <-ticker.C: } - err := m.run(ctx) + err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { m.logger.Warn(ctx, "run replica update loop", slog.Error(err)) } @@ -155,7 +155,7 @@ func (m *Manager) subscribe(ctx context.Context) error { // it will reprocess afterwards. 
var update func() update = func() { - err := m.run(ctx) + err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { m.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) } @@ -197,7 +197,7 @@ func (m *Manager) subscribe(ctx context.Context) error { return nil } -func (m *Manager) run(ctx context.Context) error { +func (m *Manager) syncReplicas(ctx context.Context) error { m.closeMutex.Lock() m.closeWait.Add(1) m.closeMutex.Unlock() From 1e85039d346e5e143d5a3ae462681f11d5cff093 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 19:52:19 +0000 Subject: [PATCH 46/79] Reuse http client --- enterprise/replicasync/replicasync.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 46123953298de..317f6dc274bdc 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -219,6 +219,13 @@ func (m *Manager) syncReplicas(ctx context.Context) error { } m.mutex.Unlock() + client := http.Client{ + Timeout: m.options.PeerTimeout, + Transport: &http.Transport{ + TLSClientConfig: m.options.TLSConfig, + }, + } + defer client.CloseIdleConnections() var wg sync.WaitGroup var mu sync.Mutex failed := make([]string, 0) @@ -232,12 +239,6 @@ func (m *Manager) syncReplicas(ctx context.Context) error { slog.F("relay_address", peer.RelayAddress), slog.Error(err)) return } - client := http.Client{ - Timeout: m.options.PeerTimeout, - Transport: &http.Transport{ - TLSClientConfig: m.options.TLSConfig, - }, - } res, err := client.Do(req) if err != nil { mu.Lock() From ae0aa5f226bdc2e0e692e09fcee05e6a45d0247e Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 20:55:11 +0000 Subject: [PATCH 47/79] Delete old replicas on a CRON --- coderd/database/databasefake/databasefake.go | 12 ------ coderd/database/querier.go | 1 - coderd/database/queries.sql.go | 23 ------------ coderd/database/queries/replicas.sql | 3 -- enterprise/replicasync/replicasync.go | 39 +++++++++++++++----- enterprise/replicasync/replicasync_test.go | 18 +++++++++ 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/coderd/database/databasefake/databasefake.go b/coderd/database/databasefake/databasefake.go index e860c69dd11cb..d95499147066b 100644 --- a/coderd/database/databasefake/databasefake.go +++ b/coderd/database/databasefake/databasefake.go @@ -3238,15 +3238,3 @@ func (q *fakeQuerier) GetReplicasUpdatedAfter(_ context.Context, updatedAt time. 
} return replicas, nil } - -func (q *fakeQuerier) GetReplicaByID(_ context.Context, id uuid.UUID) (database.Replica, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - for _, replica := range q.replicas { - if replica.ID == id { - return replica, nil - } - } - return database.Replica{}, sql.ErrNoRows -} diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 7f2f0d942bb10..8d1ea946ff5b6 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -69,7 +69,6 @@ type sqlcQuerier interface { GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) GetProvisionerLogsByIDBetween(ctx context.Context, arg GetProvisionerLogsByIDBetweenParams) ([]ProvisionerJobLog, error) - GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) GetTemplateByID(ctx context.Context, id uuid.UUID) (Template, error) GetTemplateByOrganizationAndName(ctx context.Context, arg GetTemplateByOrganizationAndNameParams) (Template, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index c40b93426ddee..ff72247ad1e0f 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2579,29 +2579,6 @@ func (q *sqlQuerier) DeleteReplicasUpdatedBefore(ctx context.Context, updatedAt return err } -const getReplicaByID = `-- name: GetReplicaByID :one -SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE id = $1 -` - -func (q *sqlQuerier) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) { - row := q.db.QueryRowContext(ctx, getReplicaByID, id) - var i Replica - err := row.Scan( - &i.ID, - &i.CreatedAt, - &i.StartedAt, - &i.StoppedAt, - &i.UpdatedAt, - &i.Hostname, - &i.RegionID, - &i.RelayAddress, - &i.DatabaseLatency, - &i.Version, - &i.Error, - ) - return i, err -} - const getReplicasUpdatedAfter = `-- name: GetReplicasUpdatedAfter :many SELECT id, created_at, started_at, stopped_at, updated_at, hostname, region_id, relay_address, database_latency, version, error FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL ` diff --git a/coderd/database/queries/replicas.sql b/coderd/database/queries/replicas.sql index 5a62527fac107..e87c1f46432f2 100644 --- a/coderd/database/queries/replicas.sql +++ b/coderd/database/queries/replicas.sql @@ -1,9 +1,6 @@ -- name: GetReplicasUpdatedAfter :many SELECT * FROM replicas WHERE updated_at > $1 AND stopped_at IS NULL; --- name: GetReplicaByID :one -SELECT * FROM replicas WHERE id = $1; - -- name: InsertReplica :one INSERT INTO replicas ( id, diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 317f6dc274bdc..d6cd846d6c96f 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -26,11 +26,12 @@ var ( ) type Options struct { - UpdateInterval time.Duration - PeerTimeout time.Duration - RelayAddress string - RegionID int32 - TLSConfig *tls.Config + CleanupInterval time.Duration + UpdateInterval time.Duration + PeerTimeout time.Duration + RelayAddress string + RegionID int32 + TLSConfig *tls.Config } // New registers the replica with the database and periodically updates to ensure @@ -45,6 +46,11 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, pubsub data if 
options.UpdateInterval == 0 { options.UpdateInterval = 5 * time.Second } + if options.CleanupInterval == 0 { + // The cleanup interval can be quite long, because it's + // primary purpose is to clean up dead replicas. + options.CleanupInterval = 30 * time.Minute + } hostname, err := os.Hostname() if err != nil { return nil, xerrors.Errorf("get hostname: %w", err) @@ -123,16 +129,31 @@ type Manager struct { callback func() } +// updateInterval is used to determine a replicas state. +// If the replica was updated > the time, it's considered healthy. +// If the replica was updated < the time, it's considered stale. +func (m *Manager) updateInterval() time.Time { + return database.Now().Add(-3 * m.options.UpdateInterval) +} + // loop runs the replica update sequence on an update interval. func (m *Manager) loop(ctx context.Context) { defer m.closeWait.Done() - ticker := time.NewTicker(m.options.UpdateInterval) - defer ticker.Stop() + updateTicker := time.NewTicker(m.options.UpdateInterval) + defer updateTicker.Stop() + deleteTicker := time.NewTicker(m.options.CleanupInterval) + defer deleteTicker.Stop() for { select { case <-ctx.Done(): return - case <-ticker.C: + case <-deleteTicker.C: + err := m.db.DeleteReplicasUpdatedBefore(ctx, m.updateInterval()) + if err != nil { + m.logger.Warn(ctx, "delete old replicas", slog.Error(err)) + } + continue + case <-updateTicker.C: } err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { @@ -204,7 +225,7 @@ func (m *Manager) syncReplicas(ctx context.Context) error { defer m.closeWait.Done() // Expect replicas to update once every three times the interval... // If they don't, assume death! - replicas, err := m.db.GetReplicasUpdatedAfter(ctx, database.Now().Add(-3*m.options.UpdateInterval)) + replicas, err := m.db.GetReplicasUpdatedAfter(ctx, m.updateInterval()) if err != nil { return xerrors.Errorf("get replicas: %w", err) } diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 0b42f44791df4..79acf86865839 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -178,6 +178,24 @@ func TestReplica(t *testing.T) { }, testutil.WaitShort, testutil.IntervalFast) _ = server.Close() }) + t.Run("DeletesOld", func(t *testing.T) { + t.Parallel() + db, pubsub := dbtestutil.NewDB(t) + _, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ + ID: uuid.New(), + UpdatedAt: database.Now().Add(-time.Hour), + }) + require.NoError(t, err) + server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ + RelayAddress: "google.com", + CleanupInterval: time.Millisecond, + }) + require.NoError(t, err) + defer server.Close() + require.Eventually(t, func() bool { + return len(server.Regional()) == 0 + }, testutil.WaitShort, testutil.IntervalFast) + }) t.Run("TwentyConcurrent", func(t *testing.T) { // Ensures that twenty concurrent replicas can spawn and all // discover each other in parallel! 
From bd7fb1314256227c578d659911f3f7ba0d37743f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 23:18:43 +0000 Subject: [PATCH 48/79] Fix race condition in connection tests --- enterprise/coderd/coderd.go | 4 ++-- enterprise/derpmesh/derpmesh.go | 12 +++++++++--- enterprise/derpmesh/derpmesh_test.go | 10 +++++----- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/enterprise/coderd/coderd.go b/enterprise/coderd/coderd.go index 371ac12fe21b8..1250e6ae129da 100644 --- a/enterprise/coderd/coderd.go +++ b/enterprise/coderd/coderd.go @@ -283,11 +283,11 @@ func (api *API) updateEntitlements(ctx context.Context) error { for _, replica := range api.replicaManager.Regional() { addresses = append(addresses, replica.RelayAddress) } - api.derpMesh.SetAddresses(addresses) + api.derpMesh.SetAddresses(addresses, false) _ = api.updateEntitlements(ctx) }) } else { - api.derpMesh.SetAddresses([]string{}) + api.derpMesh.SetAddresses([]string{}, false) api.replicaManager.SetCallback(func() { // If the amount of replicas change, so should our entitlements. // This is to display a warning in the UI if the user is unlicensed. diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 059eac5a107e7..530c799908fca 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -42,7 +42,10 @@ type Mesh struct { // SetAddresses performs a diff of the incoming addresses and adds // or removes DERP clients from the mesh. -func (m *Mesh) SetAddresses(addresses []string) { +// +// Connect is only used for testing to ensure DERPs are meshed before +// exchanging messages. +func (m *Mesh) SetAddresses(addresses []string, connect bool) { total := make(map[string]struct{}, 0) for _, address := range addresses { addressURL, err := url.Parse(address) @@ -58,7 +61,7 @@ func (m *Mesh) SetAddresses(addresses []string) { address = derpURL.String() total[address] = struct{}{} - added, err := m.addAddress(address) + added, err := m.addAddress(address, connect) if err != nil { m.logger.Error(m.ctx, "failed to add address", slog.F("address", address), slog.Error(err)) continue @@ -85,7 +88,7 @@ func (m *Mesh) SetAddresses(addresses []string) { // addAddress begins meshing with a new address. It returns false if the address is already being meshed with. // It's expected that this is a full HTTP address with a path. // e.g. 
http://127.0.0.1:8080/derp -func (m *Mesh) addAddress(address string) (bool, error) { +func (m *Mesh) addAddress(address string, connect bool) (bool, error) { m.mutex.Lock() defer m.mutex.Unlock() _, isActive := m.active[address] @@ -102,6 +105,9 @@ func (m *Mesh) addAddress(address string) (bool, error) { var dialer net.Dialer return dialer.DialContext(ctx, network, addr) }) + if connect { + _ = client.Connect(m.ctx) + } ctx, cancelFunc := context.WithCancel(m.ctx) closed := make(chan struct{}) closeFunc := func() { diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index d1131d59da25b..84875f106c7f2 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -49,9 +49,9 @@ func TestDERPMesh(t *testing.T) { defer firstServer.Close() secondServer, secondServerURL := startDERP(t, tlsConfig) firstMesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), firstServer, tlsConfig) - firstMesh.SetAddresses([]string{secondServerURL}) + firstMesh.SetAddresses([]string{secondServerURL}, true) secondMesh := derpmesh.New(slogtest.Make(t, nil).Named("second").Leveled(slog.LevelDebug), secondServer, tlsConfig) - secondMesh.SetAddresses([]string{firstServerURL}) + secondMesh.SetAddresses([]string{firstServerURL}, true) defer firstMesh.Close() defer secondMesh.Close() @@ -78,9 +78,9 @@ func TestDERPMesh(t *testing.T) { t.Parallel() server, serverURL := startDERP(t, tlsConfig) mesh := derpmesh.New(slogtest.Make(t, nil).Named("first").Leveled(slog.LevelDebug), server, tlsConfig) - mesh.SetAddresses([]string{"http://fake.com"}) + mesh.SetAddresses([]string{"http://fake.com"}, false) // This should trigger a removal... - mesh.SetAddresses([]string{}) + mesh.SetAddresses([]string{}, false) defer mesh.Close() first := key.NewNode() @@ -114,7 +114,7 @@ func TestDERPMesh(t *testing.T) { meshes = append(meshes, mesh) } for _, mesh := range meshes { - mesh.SetAddresses(serverURLs) + mesh.SetAddresses(serverURLs, true) } first := key.NewNode() From bb5b347ada043f7ed0a30eb9f6754c8ae25416d4 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 23:26:50 +0000 Subject: [PATCH 49/79] Fix linting --- enterprise/derpmesh/derpmesh.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 530c799908fca..5de7799aa74eb 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -45,6 +45,7 @@ type Mesh struct { // // Connect is only used for testing to ensure DERPs are meshed before // exchanging messages. +// nolint:revive func (m *Mesh) SetAddresses(addresses []string, connect bool) { total := make(map[string]struct{}, 0) for _, address := range addresses { @@ -88,6 +89,7 @@ func (m *Mesh) SetAddresses(addresses []string, connect bool) { // addAddress begins meshing with a new address. It returns false if the address is already being meshed with. // It's expected that this is a full HTTP address with a path. // e.g. 
http://127.0.0.1:8080/derp +// nolint:revive func (m *Mesh) addAddress(address string, connect bool) (bool, error) { m.mutex.Lock() defer m.mutex.Unlock() From 76e0511efef918d121b7834c11bedf3cea0a4771 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sat, 15 Oct 2022 23:38:15 +0000 Subject: [PATCH 50/79] Fix nil type --- coderd/database/dump.sql | 2 +- .../migrations/000061_replicas.up.sql | 2 +- coderd/database/models.go | 22 +++++++++---------- coderd/database/queries.sql.go | 20 ++++++++--------- enterprise/coderd/replicas.go | 2 +- enterprise/replicasync/replicasync.go | 9 +++----- enterprise/replicasync/replicasync_test.go | 8 +++---- 7 files changed, 31 insertions(+), 34 deletions(-) diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 1e0a18c1dafef..8e31a990a8925 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -295,7 +295,7 @@ CREATE TABLE replicas ( relay_address text NOT NULL, database_latency integer NOT NULL, version text NOT NULL, - error text + error text DEFAULT ''::text NOT NULL ); CREATE TABLE site_configs ( diff --git a/coderd/database/migrations/000061_replicas.up.sql b/coderd/database/migrations/000061_replicas.up.sql index b1d1a1ab13ee0..1400662e30582 100644 --- a/coderd/database/migrations/000061_replicas.up.sql +++ b/coderd/database/migrations/000061_replicas.up.sql @@ -21,7 +21,7 @@ CREATE TABLE IF NOT EXISTS replicas ( database_latency int NOT NULL, -- Version is the Coder version of the replica. version text NOT NULL, - error text + error text NOT NULL DEFAULT '' ); -- Associates a provisioner daemon with a replica. diff --git a/coderd/database/models.go b/coderd/database/models.go index b4601ecadeb78..53e074984ac11 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -540,17 +540,17 @@ type ProvisionerJobLog struct { } type Replica struct { - ID uuid.UUID `db:"id" json:"id"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - StartedAt time.Time `db:"started_at" json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - Hostname string `db:"hostname" json:"hostname"` - RegionID int32 `db:"region_id" json:"region_id"` - RelayAddress string `db:"relay_address" json:"relay_address"` - DatabaseLatency int32 `db:"database_latency" json:"database_latency"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` + ID uuid.UUID `db:"id" json:"id"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + Hostname string `db:"hostname" json:"hostname"` + RegionID int32 `db:"region_id" json:"region_id"` + RelayAddress string `db:"relay_address" json:"relay_address"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + Version string `db:"version" json:"version"` + Error string `db:"error" json:"error"` } type SiteConfig struct { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 1c10bc259c72c..aa76ddfec52a3 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -2698,16 +2698,16 @@ WHERE id = $1 RETURNING id, created_at, started_at, stopped_at, updated_at, host ` type UpdateReplicaParams struct { - ID uuid.UUID `db:"id" json:"id"` - UpdatedAt time.Time `db:"updated_at" json:"updated_at"` - StartedAt time.Time `db:"started_at" 
json:"started_at"` - StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` - RelayAddress string `db:"relay_address" json:"relay_address"` - RegionID int32 `db:"region_id" json:"region_id"` - Hostname string `db:"hostname" json:"hostname"` - Version string `db:"version" json:"version"` - Error sql.NullString `db:"error" json:"error"` - DatabaseLatency int32 `db:"database_latency" json:"database_latency"` + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + StartedAt time.Time `db:"started_at" json:"started_at"` + StoppedAt sql.NullTime `db:"stopped_at" json:"stopped_at"` + RelayAddress string `db:"relay_address" json:"relay_address"` + RegionID int32 `db:"region_id" json:"region_id"` + Hostname string `db:"hostname" json:"hostname"` + Version string `db:"version" json:"version"` + Error string `db:"error" json:"error"` + DatabaseLatency int32 `db:"database_latency" json:"database_latency"` } func (q *sqlQuerier) UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) { diff --git a/enterprise/coderd/replicas.go b/enterprise/coderd/replicas.go index c07c37243d0ca..906597f257f04 100644 --- a/enterprise/coderd/replicas.go +++ b/enterprise/coderd/replicas.go @@ -31,7 +31,7 @@ func convertReplica(replica database.Replica) codersdk.Replica { CreatedAt: replica.CreatedAt, RelayAddress: replica.RelayAddress, RegionID: replica.RegionID, - Error: replica.Error.String, + Error: replica.Error, DatabaseLatency: replica.DatabaseLatency, } } diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index d6cd846d6c96f..aa8eba46613ff 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -271,12 +271,9 @@ func (m *Manager) syncReplicas(ctx context.Context) error { }(peer) } wg.Wait() - replicaError := sql.NullString{} + replicaError := "" if len(failed) > 0 { - replicaError = sql.NullString{ - Valid: true, - String: fmt.Sprintf("Failed to dial peers: %s", strings.Join(failed, ", ")), - } + replicaError = fmt.Sprintf("Failed to dial peers: %s", strings.Join(failed, ", ")) } databaseLatency, err := m.db.Ping(ctx) @@ -301,7 +298,7 @@ func (m *Manager) syncReplicas(ctx context.Context) error { } m.mutex.Lock() defer m.mutex.Unlock() - if m.self.Error.String != replica.Error.String { + if m.self.Error != replica.Error { // Publish an update occurred! 
err = m.pubsub.Publish(PubsubEvent, []byte(m.self.ID.String())) if err != nil { diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 79acf86865839..40e087a7616ce 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -84,7 +84,7 @@ func TestReplica(t *testing.T) { require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) - require.False(t, server.Self().Error.Valid) + require.Empty(t, server.Self().Error) _ = server.Close() }) t.Run("ConnectsToPeerReplicaTLS", func(t *testing.T) { @@ -125,7 +125,7 @@ func TestReplica(t *testing.T) { require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) - require.False(t, server.Self().Error.Valid) + require.Empty(t, server.Self().Error) _ = server.Close() }) t.Run("ConnectsToFakePeerWithError", func(t *testing.T) { @@ -148,8 +148,8 @@ func TestReplica(t *testing.T) { require.NoError(t, err) require.Len(t, server.Regional(), 1) require.Equal(t, peer.ID, server.Regional()[0].ID) - require.True(t, server.Self().Error.Valid) - require.Contains(t, server.Self().Error.String, "Failed to dial peers") + require.NotEmpty(t, server.Self().Error) + require.Contains(t, server.Self().Error, "Failed to dial peers") _ = server.Close() }) t.Run("RefreshOnPublish", func(t *testing.T) { From 1ff5f7d81cad45cf6c6346f6763df3dd8ad1bdb5 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 00:51:49 +0000 Subject: [PATCH 51/79] Move pubsub to in-memory for twenty test --- enterprise/replicasync/replicasync_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 40e087a7616ce..2e3d1deafc68c 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -16,6 +16,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/databasefake" "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/enterprise/replicasync" "github.com/coder/coder/testutil" @@ -200,7 +201,8 @@ func TestReplica(t *testing.T) { // Ensures that twenty concurrent replicas can spawn and all // discover each other in parallel! t.Parallel() - db, pubsub := dbtestutil.NewDB(t) + db := databasefake.New() + pubsub := database.NewPubsubInMemory() logger := slogtest.Make(t, nil) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) From b732184e0a52d7c121d5b4a1d696c7721510d57f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 00:54:19 +0000 Subject: [PATCH 52/79] Add comment for configuration tweaking --- enterprise/replicasync/replicasync_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index 2e3d1deafc68c..b6eb45bb9d316 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -201,6 +201,9 @@ func TestReplica(t *testing.T) { // Ensures that twenty concurrent replicas can spawn and all // discover each other in parallel! t.Parallel() + // This doesn't use the database fake because creating + // this many PostgreSQL connections takes some + // configuration tweaking. 
db := databasefake.New() pubsub := database.NewPubsubInMemory() logger := slogtest.Make(t, nil) From 38465ac4f9306bac1669335c4155b2440838d361 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 01:12:32 +0000 Subject: [PATCH 53/79] Fix leak with transport --- coderd/wsconncache/wsconncache_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/coderd/wsconncache/wsconncache_test.go b/coderd/wsconncache/wsconncache_test.go index 003d3cddb8b7a..d4345ce9d5f05 100644 --- a/coderd/wsconncache/wsconncache_test.go +++ b/coderd/wsconncache/wsconncache_test.go @@ -128,7 +128,9 @@ func TestCache(t *testing.T) { return } defer release() - proxy.Transport = conn.HTTPTransport() + transport := conn.HTTPTransport() + defer transport.CloseIdleConnections() + proxy.Transport = transport res := httptest.NewRecorder() proxy.ServeHTTP(res, req) resp := res.Result() From 72555e2d8cd81cd7dd02498f13f59071d6ef1705 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 01:17:47 +0000 Subject: [PATCH 54/79] Fix close leak in derpmesh --- enterprise/derpmesh/derpmesh.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/enterprise/derpmesh/derpmesh.go b/enterprise/derpmesh/derpmesh.go index 5de7799aa74eb..3982542167073 100644 --- a/enterprise/derpmesh/derpmesh.go +++ b/enterprise/derpmesh/derpmesh.go @@ -93,6 +93,9 @@ func (m *Mesh) SetAddresses(addresses []string, connect bool) { func (m *Mesh) addAddress(address string, connect bool) (bool, error) { m.mutex.Lock() defer m.mutex.Unlock() + if m.isClosed() { + return false, nil + } _, isActive := m.active[address] if isActive { return false, nil @@ -142,10 +145,8 @@ func (m *Mesh) removeAddress(address string) bool { func (m *Mesh) Close() error { m.mutex.Lock() defer m.mutex.Unlock() - select { - case <-m.closed: + if m.isClosed() { return nil - default: } close(m.closed) for _, cancelFunc := range m.active { @@ -153,3 +154,12 @@ func (m *Mesh) Close() error { } return nil } + +func (m *Mesh) isClosed() bool { + select { + case <-m.closed: + return true + default: + } + return false +} From e54072a53a80d78c54ea82897301f8120d35ad15 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 01:30:09 +0000 Subject: [PATCH 55/79] Fix race when creating server --- coderd/coderdtest/coderdtest.go | 8 +++++--- enterprise/coderd/coderdenttest/coderdenttest.go | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index de49afaa1c269..4a7c3e38b69e1 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -170,9 +170,6 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance Certificates: options.TLSCertificates, MinVersion: tls.VersionTLS12, } - srv.StartTLS() - } else { - srv.Start() } t.Cleanup(srv.Close) @@ -266,6 +263,11 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c // We set the handler after server creation for the access URL. 
coderAPI := coderd.New(newOptions) srv.Config.Handler = coderAPI.RootHandler + if newOptions.TLSCertificates != nil { + srv.StartTLS() + } else { + srv.Start() + } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 02eff4e2acf2e..fbffa683bea8b 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -77,6 +77,11 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c }) assert.NoError(t, err) srv.Config.Handler = coderAPI.AGPL.RootHandler + if oop.TLSCertificates != nil { + srv.StartTLS() + } else { + srv.Start() + } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = coderdtest.NewProvisionerDaemon(t, coderAPI.AGPL) From 27d5f40619e47cc5729dce114fba34b771daf612 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 02:48:52 +0000 Subject: [PATCH 56/79] Remove handler update --- coderd/coderdtest/coderdtest.go | 8 +++----- enterprise/coderd/coderdenttest/coderdenttest.go | 5 ----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 4a7c3e38b69e1..de49afaa1c269 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -170,6 +170,9 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance Certificates: options.TLSCertificates, MinVersion: tls.VersionTLS12, } + srv.StartTLS() + } else { + srv.Start() } t.Cleanup(srv.Close) @@ -263,11 +266,6 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c // We set the handler after server creation for the access URL. 
coderAPI := coderd.New(newOptions) srv.Config.Handler = coderAPI.RootHandler - if newOptions.TLSCertificates != nil { - srv.StartTLS() - } else { - srv.Start() - } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index fbffa683bea8b..02eff4e2acf2e 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -77,11 +77,6 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c }) assert.NoError(t, err) srv.Config.Handler = coderAPI.AGPL.RootHandler - if oop.TLSCertificates != nil { - srv.StartTLS() - } else { - srv.Start() - } var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = coderdtest.NewProvisionerDaemon(t, coderAPI.AGPL) From 4d0b1d86854a41a4616578c5655e00d65f2cfffa Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 03:23:53 +0000 Subject: [PATCH 57/79] Skip test on Windows --- enterprise/derpmesh/derpmesh_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 84875f106c7f2..1c1d658bb03c2 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -7,6 +7,7 @@ import ( "errors" "io" "net/http/httptest" + "runtime" "testing" "github.com/stretchr/testify/assert" @@ -101,6 +102,9 @@ func TestDERPMesh(t *testing.T) { }) t.Run("TwentyMeshes", func(t *testing.T) { t.Parallel() + if runtime.GOOS == "windows" { + t.Skip("This test is races on Windows... I think because it's too slow.") + } meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { From 129f5ba6511615e0a8920164ce56b90efb41e214 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 03:39:45 +0000 Subject: [PATCH 58/79] Fix DERP mesh test --- enterprise/derpmesh/derpmesh_test.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 1c1d658bb03c2..7ca844c57e6fc 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -7,8 +7,8 @@ import ( "errors" "io" "net/http/httptest" - "runtime" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -102,9 +102,6 @@ func TestDERPMesh(t *testing.T) { }) t.Run("TwentyMeshes", func(t *testing.T) { t.Parallel() - if runtime.GOOS == "windows" { - t.Skip("This test is races on Windows... 
I think because it's too slow.") - } meshes := make([]*derpmesh.Mesh, 0, 20) serverURLs := make([]string, 0, 20) for i := 0; i < 20; i++ { @@ -132,12 +129,28 @@ func TestDERPMesh(t *testing.T) { err = secondClient.Connect(context.Background()) require.NoError(t, err) + closed := make(chan struct{}) + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + go func() { + defer close(closed) + ticker := time.NewTicker(time.Second) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + } + }() got := recvData(t, secondClient) require.Equal(t, sent, got) + cancelFunc() + <-closed }) } From 4e5d30e6267ca70ea318ca2ffe84fea369725617 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 04:47:04 +0000 Subject: [PATCH 59/79] Wrap HTTP handler replacement in mutex --- coderd/coderdtest/coderdtest.go | 115 ++++++++++-------- .../coderd/coderdenttest/coderdenttest.go | 4 +- 2 files changed, 66 insertions(+), 53 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index de49afaa1c269..31785ff7e7950 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -24,6 +24,7 @@ import ( "regexp" "strconv" "strings" + "sync" "testing" "time" @@ -127,7 +128,7 @@ func newWithCloser(t *testing.T, options *Options) (*codersdk.Client, io.Closer) return client, closer } -func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.CancelFunc, *coderd.Options) { +func NewOptions(t *testing.T, options *Options) (func(http.Handler), context.CancelFunc, *coderd.Options) { if options == nil { options = &Options{} } @@ -161,7 +162,15 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() - srv := httptest.NewUnstartedServer(nil) + var mutex sync.RWMutex + var handler http.Handler + srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mutex.RLock() + defer mutex.RUnlock() + if handler != nil { + handler.ServeHTTP(w, r) + } + })) srv.Config.BaseContext = func(_ net.Listener) context.Context { return ctx } @@ -204,55 +213,59 @@ func NewOptions(t *testing.T, options *Options) (*httptest.Server, context.Cance require.NoError(t, err) } - return srv, cancelFunc, &coderd.Options{ - AgentConnectionUpdateFrequency: 150 * time.Millisecond, - // Force a long disconnection timeout to ensure - // agents are not marked as disconnected during slow tests. 
- AgentInactiveDisconnectTimeout: testutil.WaitShort, - AccessURL: serverURL, - AppHostname: options.AppHostname, - AppHostnameRegex: appHostnameRegex, - Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), - CacheDir: t.TempDir(), - Database: options.Database, - Pubsub: options.Pubsub, - - Auditor: options.Auditor, - AWSCertificates: options.AWSCertificates, - AzureCertificates: options.AzureCertificates, - GithubOAuth2Config: options.GithubOAuth2Config, - OIDCConfig: options.OIDCConfig, - GoogleTokenValidator: options.GoogleTokenValidator, - SSHKeygenAlgorithm: options.SSHKeygenAlgorithm, - DERPServer: derpServer, - APIRateLimit: options.APIRateLimit, - Authorizer: options.Authorizer, - Telemetry: telemetry.NewNoop(), - TLSCertificates: options.TLSCertificates, - DERPMap: &tailcfg.DERPMap{ - Regions: map[int]*tailcfg.DERPRegion{ - 1: { - EmbeddedRelay: true, - RegionID: 1, - RegionCode: "coder", - RegionName: "Coder", - Nodes: []*tailcfg.DERPNode{{ - Name: "1a", - RegionID: 1, - IPv4: "127.0.0.1", - DERPPort: derpPort, - STUNPort: stunAddr.Port, - InsecureForTests: true, - ForceHTTP: options.TLSCertificates == nil, - }}, + return func(h http.Handler) { + mutex.Lock() + handler = h + mutex.Unlock() + }, cancelFunc, &coderd.Options{ + AgentConnectionUpdateFrequency: 150 * time.Millisecond, + // Force a long disconnection timeout to ensure + // agents are not marked as disconnected during slow tests. + AgentInactiveDisconnectTimeout: testutil.WaitShort, + AccessURL: serverURL, + AppHostname: options.AppHostname, + AppHostnameRegex: appHostnameRegex, + Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), + CacheDir: t.TempDir(), + Database: options.Database, + Pubsub: options.Pubsub, + + Auditor: options.Auditor, + AWSCertificates: options.AWSCertificates, + AzureCertificates: options.AzureCertificates, + GithubOAuth2Config: options.GithubOAuth2Config, + OIDCConfig: options.OIDCConfig, + GoogleTokenValidator: options.GoogleTokenValidator, + SSHKeygenAlgorithm: options.SSHKeygenAlgorithm, + DERPServer: derpServer, + APIRateLimit: options.APIRateLimit, + Authorizer: options.Authorizer, + Telemetry: telemetry.NewNoop(), + TLSCertificates: options.TLSCertificates, + DERPMap: &tailcfg.DERPMap{ + Regions: map[int]*tailcfg.DERPRegion{ + 1: { + EmbeddedRelay: true, + RegionID: 1, + RegionCode: "coder", + RegionName: "Coder", + Nodes: []*tailcfg.DERPNode{{ + Name: "1a", + RegionID: 1, + IPv4: "127.0.0.1", + DERPPort: derpPort, + STUNPort: stunAddr.Port, + InsecureForTests: true, + ForceHTTP: options.TLSCertificates == nil, + }}, + }, }, }, - }, - AutoImportTemplates: options.AutoImportTemplates, - MetricsCacheRefreshInterval: options.MetricsCacheRefreshInterval, - AgentStatsRefreshInterval: options.AgentStatsRefreshInterval, - DeploymentFlags: options.DeploymentFlags, - } + AutoImportTemplates: options.AutoImportTemplates, + MetricsCacheRefreshInterval: options.MetricsCacheRefreshInterval, + AgentStatsRefreshInterval: options.AgentStatsRefreshInterval, + DeploymentFlags: options.DeploymentFlags, + } } // NewWithAPI constructs an in-memory API instance and returns a client to talk to it. @@ -262,10 +275,10 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c if options == nil { options = &Options{} } - srv, cancelFunc, newOptions := NewOptions(t, options) + setHandler, cancelFunc, newOptions := NewOptions(t, options) // We set the handler after server creation for the access URL. 
coderAPI := coderd.New(newOptions) - srv.Config.Handler = coderAPI.RootHandler + setHandler(coderAPI.APIHandler) var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) diff --git a/enterprise/coderd/coderdenttest/coderdenttest.go b/enterprise/coderd/coderdenttest/coderdenttest.go index 02eff4e2acf2e..a8595b5bc6ede 100644 --- a/enterprise/coderd/coderdenttest/coderdenttest.go +++ b/enterprise/coderd/coderdenttest/coderdenttest.go @@ -62,7 +62,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c if options.Options == nil { options.Options = &coderdtest.Options{} } - srv, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) + setHandler, cancelFunc, oop := coderdtest.NewOptions(t, options.Options) coderAPI, err := coderd.New(context.Background(), &coderd.Options{ RBAC: true, AuditLogging: options.AuditLogging, @@ -76,7 +76,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c Keys: Keys, }) assert.NoError(t, err) - srv.Config.Handler = coderAPI.AGPL.RootHandler + setHandler(coderAPI.AGPL.RootHandler) var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = coderdtest.NewProvisionerDaemon(t, coderAPI.AGPL) From 0359a7e9a79debe4f4e28fdde81faa27094e153e Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 14:44:24 +0000 Subject: [PATCH 60/79] Fix error message for relay --- enterprise/replicasync/replicasync.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index aa8eba46613ff..4aeabd2a05742 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -178,7 +178,7 @@ func (m *Manager) subscribe(ctx context.Context) error { update = func() { err := m.syncReplicas(ctx) if err != nil && !errors.Is(err, context.Canceled) { - m.logger.Error(ctx, "run replica from subscribe", slog.Error(err)) + m.logger.Warn(ctx, "run replica from subscribe", slog.Error(err)) } updateMutex.Lock() if needsUpdate { @@ -256,7 +256,7 @@ func (m *Manager) syncReplicas(ctx context.Context) error { defer wg.Done() req, err := http.NewRequestWithContext(ctx, http.MethodGet, peer.RelayAddress, nil) if err != nil { - m.logger.Error(ctx, "create http request for relay probe", + m.logger.Warn(ctx, "create http request for relay probe", slog.F("relay_address", peer.RelayAddress), slog.Error(err)) return } From f364d1fe7523af406fb047828ba93fd6c2b139df Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 14:51:48 +0000 Subject: [PATCH 61/79] Fix API handler for normal tests --- coderd/coderdtest/coderdtest.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 31785ff7e7950..2a7184e2ca05d 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -215,8 +215,8 @@ func NewOptions(t *testing.T, options *Options) (func(http.Handler), context.Can return func(h http.Handler) { mutex.Lock() + defer mutex.Unlock() handler = h - mutex.Unlock() }, cancelFunc, &coderd.Options{ AgentConnectionUpdateFrequency: 150 * time.Millisecond, // Force a long disconnection timeout to ensure @@ -278,7 +278,7 @@ func NewWithAPI(t *testing.T, options *Options) (*codersdk.Client, io.Closer, *c setHandler, cancelFunc, newOptions := NewOptions(t, options) // We set the handler 
after server creation for the access URL. coderAPI := coderd.New(newOptions) - setHandler(coderAPI.APIHandler) + setHandler(coderAPI.RootHandler) var provisionerCloser io.Closer = nopcloser{} if options.IncludeProvisionerDaemon { provisionerCloser = NewProvisionerDaemon(t, coderAPI) From 423a47e1dd9aa227a3a48ada5ac343a732bb9d01 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 15:25:34 +0000 Subject: [PATCH 62/79] Fix speedtest --- agent/agent_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index e1269d6003922..e10eee7f111a0 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -483,9 +483,7 @@ func TestAgent(t *testing.T) { t.Run("Speedtest", func(t *testing.T) { t.Parallel() - if testing.Short() { - t.Skip("The minimum duration for a speedtest is hardcoded in Tailscale to 5s!") - } + t.Skip("This test is relatively flakey because of Tailscale's speedtest code...") derpMap := tailnettest.RunDERPAndSTUN(t) conn, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{ DERPMap: derpMap, From c3a77fe2d048b88d4e6e1f5d1ad8475735a18f18 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 16:21:13 +0000 Subject: [PATCH 63/79] Fix replica resend --- enterprise/derpmesh/derpmesh_test.go | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 7ca844c57e6fc..2878346d4ee43 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -94,11 +94,28 @@ func TestDERPMesh(t *testing.T) { secondClient.TLSConfig = tlsConfig err = secondClient.Connect(context.Background()) require.NoError(t, err) + + closed := make(chan struct{}) + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + go func() { + defer close(closed) + ticker := time.NewTicker(50 * time.Millisecond) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + } + }() got := recvData(t, secondClient) require.Equal(t, sent, got) + cancelFunc() + <-closed }) t.Run("TwentyMeshes", func(t *testing.T) { t.Parallel() @@ -135,7 +152,7 @@ func TestDERPMesh(t *testing.T) { sent := []byte("hello world") go func() { defer close(closed) - ticker := time.NewTicker(time.Second) + ticker := time.NewTicker(50 * time.Millisecond) for { select { case <-ctx.Done(): From 729f8a07acda2301d0fa60dc350b3838da956a04 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 16:34:38 +0000 Subject: [PATCH 64/79] Fix derpmesh send --- enterprise/derpmesh/derpmesh_test.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/enterprise/derpmesh/derpmesh_test.go b/enterprise/derpmesh/derpmesh_test.go index 2878346d4ee43..7fad141238442 100644 --- a/enterprise/derpmesh/derpmesh_test.go +++ b/enterprise/derpmesh/derpmesh_test.go @@ -67,12 +67,28 @@ func TestDERPMesh(t *testing.T) { err = secondClient.Connect(context.Background()) require.NoError(t, err) + closed := make(chan struct{}) + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() sent := []byte("hello world") - err = firstClient.Send(second.Public(), sent) - require.NoError(t, err) + go func() { + defer close(closed) + ticker := time.NewTicker(50 * time.Millisecond) + for { 
+ select { + case <-ctx.Done(): + return + case <-ticker.C: + } + err = firstClient.Send(second.Public(), sent) + require.NoError(t, err) + } + }() got := recvData(t, secondClient) require.Equal(t, sent, got) + cancelFunc() + <-closed }) t.Run("RemoveAddress", func(t *testing.T) { // This tests messages passing through multiple DERP servers. From ae0bc5df1e0538fc4e4a0155044bd37724dc2b23 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 21:17:12 +0000 Subject: [PATCH 65/79] Ping async --- codersdk/agentconn.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codersdk/agentconn.go b/codersdk/agentconn.go index e75edf1ca6bb0..ddfb9541a186a 100644 --- a/codersdk/agentconn.go +++ b/codersdk/agentconn.go @@ -135,7 +135,7 @@ type AgentConn struct { func (c *AgentConn) Ping(ctx context.Context) (time.Duration, error) { errCh := make(chan error, 1) durCh := make(chan time.Duration, 1) - c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { + go c.Conn.Ping(TailnetIP, tailcfg.PingDisco, func(pr *ipnstate.PingResult) { if pr.Err != "" { errCh <- xerrors.New(pr.Err) return From d7d50db6dd6cec7adefbf4153cc075a3588c4be3 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 21:23:26 +0000 Subject: [PATCH 66/79] Increase wait time of template version jobd --- coderd/coderdtest/coderdtest.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 2a7184e2ca05d..5cf307d842e90 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -497,7 +497,7 @@ func AwaitTemplateVersionJob(t *testing.T, client *codersdk.Client, version uuid var err error templateVersion, err = client.TemplateVersion(context.Background(), version) return assert.NoError(t, err) && templateVersion.Job.CompletedAt != nil - }, testutil.WaitShort, testutil.IntervalFast) + }, testutil.WaitMedium, testutil.IntervalFast) return templateVersion } From 77d23dc113a1fef2670ec9a898a94c9d6eb0ec18 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 22:24:18 +0000 Subject: [PATCH 67/79] Fix race when closing replica sync --- enterprise/replicasync/replicasync.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/enterprise/replicasync/replicasync.go b/enterprise/replicasync/replicasync.go index 4aeabd2a05742..0534c55246824 100644 --- a/enterprise/replicasync/replicasync.go +++ b/enterprise/replicasync/replicasync.go @@ -362,7 +362,8 @@ func (m *Manager) Close() error { m.closeCancel() m.closeWait.Wait() m.closeMutex.Unlock() - + m.mutex.Lock() + defer m.mutex.Unlock() ctx, cancelFunc := context.WithTimeout(context.Background(), 5*time.Second) defer cancelFunc() _, err := m.db.UpdateReplica(ctx, database.UpdateReplicaParams{ From 435bbbb364a47bffeada924e45772059a4250022 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Sun, 16 Oct 2022 22:57:10 +0000 Subject: [PATCH 68/79] Add name to client --- enterprise/coderd/replicas_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enterprise/coderd/replicas_test.go b/enterprise/coderd/replicas_test.go index fae418ab87261..7a3e130cf7770 100644 --- a/enterprise/coderd/replicas_test.go +++ b/enterprise/coderd/replicas_test.go @@ -118,7 +118,7 @@ func TestReplicas(t *testing.T) { _, agent := setupWorkspaceAgent(t, firstClient, firstUser, 0) conn, err := secondClient.DialWorkspaceAgent(context.Background(), agent.ID, &codersdk.DialWorkspaceAgentOptions{ BlockEndpoints: true, - Logger: 
slogtest.Make(t, nil).Leveled(slog.LevelDebug), + Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug), }) require.NoError(t, err) require.Eventually(t, func() bool { From 9b7c41afd462daf8f42d57d865b07e33d744f141 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 00:27:24 +0000 Subject: [PATCH 69/79] Log the derpmap being used --- agent/agent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/agent/agent.go b/agent/agent.go index 6d0a9a952f44b..f7c5598b7b710 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -170,6 +170,7 @@ func (a *agent) runTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) { if a.isClosed() { return } + a.logger.Debug(ctx, "running tailnet with derpmap", slog.F("derpmap", derpMap)) if a.network != nil { a.network.SetDERPMap(derpMap) return From 961540291ae4137848850dfd2c04bf78a3bd1079 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 00:30:43 +0000 Subject: [PATCH 70/79] Don't connect if DERP is empty --- tailnet/conn.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tailnet/conn.go b/tailnet/conn.go index e41ed60a527f3..7c572a55e1b66 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -344,9 +344,14 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { // reason. TODO: @kylecarbs debug this! KeepAlive: ok && peerStatus.Active, } + // If no preferred DERP is provided, don't set an IP! + if node.PreferredDERP == 0 { + peerNode.DERP = "" + } if c.blockEndpoints { peerNode.Endpoints = nil } + c.logger.Debug(context.Background(), "adding node", slog.F("node", peerNode)) c.peerMap[node.ID] = peerNode } c.netMap.Peers = make([]*tailcfg.Node, 0, len(c.peerMap)) From bcb97ac7087b105b78c087ac89203024e51192f5 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 01:30:29 +0000 Subject: [PATCH 71/79] Improve agent coordinator logging --- coderd/workspaceagents.go | 2 ++ tailnet/conn.go | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index b93369b0cb9cb..fb7f765cc7519 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -378,6 +378,7 @@ func (api *API) dialWorkspaceAgentTailnet(r *http.Request, agentID uuid.UUID) (* go func() { err := (*api.TailnetCoordinator.Load()).ServeClient(serverConn, uuid.New(), agentID) if err != nil { + api.Logger.Warn(r.Context(), "tailnet coordinator client error", slog.Error(err)) _ = conn.Close() } }() @@ -516,6 +517,7 @@ func (api *API) workspaceAgentCoordinate(rw http.ResponseWriter, r *http.Request defer close(closeChan) err := (*api.TailnetCoordinator.Load()).ServeAgent(wsNetConn, workspaceAgent.ID) if err != nil { + api.Logger.Warn(ctx, "tailnet coordinator agent error", slog.Error(err)) _ = conn.Close(websocket.StatusInternalError, err.Error()) return } diff --git a/tailnet/conn.go b/tailnet/conn.go index 7c572a55e1b66..2f2549718880d 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -328,6 +328,8 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { delete(c.peerMap, peer.ID) } for _, node := range nodes { + c.logger.Debug(context.Background(), "adding node", slog.F("node", node)) + peerStatus, ok := status.Peer[node.Key] peerNode := &tailcfg.Node{ ID: node.ID, @@ -351,7 +353,6 @@ func (c *Conn) UpdateNodes(nodes []*Node) error { if c.blockEndpoints { peerNode.Endpoints = nil } - c.logger.Debug(context.Background(), "adding node", slog.F("node", peerNode)) c.peerMap[node.ID] = peerNode } c.netMap.Peers = make([]*tailcfg.Node, 0, len(c.peerMap)) From 
e2f6a1939f819feb3a1785c31e90a44e239b6f35 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 01:35:07 +0000 Subject: [PATCH 72/79] Fix lock in coordinator --- enterprise/tailnet/coordinator.go | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 1ccf56f50da11..4bfae463e202a 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -178,12 +178,10 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { if len(nodes) > 0 { data, err := json.Marshal(nodes) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal json: %w", err) } _, err = conn.Write(data) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("write nodes: %w", err) } } @@ -250,17 +248,16 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( } c.mutex.Lock() - defer c.mutex.Unlock() - c.nodes[id] = &node - connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { + c.mutex.Unlock() return &node, nil } data, err := json.Marshal([]*agpl.Node{&node}) if err != nil { + c.mutex.Unlock() return nil, xerrors.Errorf("marshal nodes: %w", err) } @@ -275,6 +272,7 @@ func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( _, _ = connectionSocket.Write(data) }() } + c.mutex.Unlock() wg.Wait() return &node, nil } @@ -394,12 +392,12 @@ func (c *haCoordinator) runPubsub() error { } c.mutex.Lock() - defer c.mutex.Unlock() - agentSocket, ok := c.agentSockets[agentUUID] if !ok { + c.mutex.Unlock() return } + c.mutex.Unlock() // We get a single node over pubsub, so turn into an array. _, err = agentSocket.Write(nodeJSON) @@ -410,7 +408,6 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) return } - case "agenthello": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { @@ -426,7 +423,6 @@ func (c *haCoordinator) runPubsub() error { return } } - case "agentupdate": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { @@ -440,7 +436,6 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "handle agent update", slog.Error(err)) return } - default: c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) } From c855c9ba60d511153ea9d1f30f57f6e8ccca626b Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 01:53:50 +0000 Subject: [PATCH 73/79] Fix relay addr --- enterprise/replicasync/replicasync_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enterprise/replicasync/replicasync_test.go b/enterprise/replicasync/replicasync_test.go index b6eb45bb9d316..b7709c1f6f814 100644 --- a/enterprise/replicasync/replicasync_test.go +++ b/enterprise/replicasync/replicasync_test.go @@ -134,17 +134,17 @@ func TestReplica(t *testing.T) { db, pubsub := dbtestutil.NewDB(t) peer, err := db.InsertReplica(context.Background(), database.InsertReplicaParams{ ID: uuid.New(), - CreatedAt: database.Now(), - StartedAt: database.Now(), - UpdatedAt: database.Now(), + CreatedAt: database.Now().Add(time.Minute), + StartedAt: database.Now().Add(time.Minute), + UpdatedAt: database.Now().Add(time.Minute), Hostname: "something", - // Fake address to hit! - RelayAddress: "http://169.254.169.254", + // Fake address to dial! 
+ RelayAddress: "http://127.0.0.1:1", }) require.NoError(t, err) server, err := replicasync.New(context.Background(), slogtest.Make(t, nil), db, pubsub, &replicasync.Options{ PeerTimeout: 1 * time.Millisecond, - RelayAddress: "http://169.254.169.254", + RelayAddress: "http://127.0.0.1:1", }) require.NoError(t, err) require.Len(t, server.Regional(), 1) From a0e5cab653e6000e149404838fa90e8a27813ae6 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 02:04:41 +0000 Subject: [PATCH 74/79] Fix race when updating durations --- coderd/activitybump_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/activitybump_test.go b/coderd/activitybump_test.go index dec5ec42f6556..e498b98fa0c80 100644 --- a/coderd/activitybump_test.go +++ b/coderd/activitybump_test.go @@ -72,7 +72,7 @@ func TestWorkspaceActivityBump(t *testing.T) { "deadline %v never updated", firstDeadline, ) - require.WithinDuration(t, database.Now().Add(time.Hour), workspace.LatestBuild.Deadline.Time, time.Second) + require.WithinDuration(t, database.Now().Add(time.Hour), workspace.LatestBuild.Deadline.Time, 3*time.Second) } } From 9878fc51d4b6a464e87bcb5f59f6891e1e39d5d9 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 02:37:28 +0000 Subject: [PATCH 75/79] Fix client publish race --- enterprise/tailnet/coordinator.go | 60 +++++++++++++++++++++++--- enterprise/tailnet/coordinator_test.go | 11 ++--- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 4bfae463e202a..2f284cb00ff61 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -56,8 +56,8 @@ type haCoordinator struct { // Node returns an in-memory node by ID. 
func (c *haCoordinator) Node(id uuid.UUID) *agpl.Node { - c.mutex.RLock() - defer c.mutex.RUnlock() + c.mutex.Lock() + defer c.mutex.Unlock() node := c.nodes[id] return node } @@ -79,6 +79,11 @@ func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID if err != nil { return xerrors.Errorf("write nodes: %w", err) } + } else { + err := c.publishClientHello(agent) + if err != nil { + return xerrors.Errorf("publish client hello: %w", err) + } } c.mutex.Lock() @@ -205,7 +210,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { decoder := json.NewDecoder(conn) for { - node, err := c.hangleAgentUpdate(id, decoder) + node, err := c.handleAgentUpdate(id, decoder) if err != nil { if errors.Is(err, io.EOF) { return nil @@ -240,7 +245,17 @@ func (c *haCoordinator) nodesSubscribedToAgent(agentID uuid.UUID) []*agpl.Node { return nodes } -func (c *haCoordinator) hangleAgentUpdate(id uuid.UUID, decoder *json.Decoder) (*agpl.Node, error) { +func (c *haCoordinator) handleClientHello(id uuid.UUID) error { + c.mutex.Lock() + node, ok := c.nodes[id] + c.mutex.Unlock() + if !ok { + return nil + } + return c.publishAgentToNodes(id, node) +} + +func (c *haCoordinator) handleAgentUpdate(id uuid.UUID, decoder *json.Decoder) (*agpl.Node, error) { var node agpl.Node err := decoder.Decode(&node) if err != nil { @@ -343,6 +358,18 @@ func (c *haCoordinator) publishAgentHello(id uuid.UUID) error { return nil } +func (c *haCoordinator) publishClientHello(id uuid.UUID) error { + msg, err := c.formatClientHello(id) + if err != nil { + return xerrors.Errorf("format client hello: %w", err) + } + err = c.pubsub.Publish("wireguard_peers", msg) + if err != nil { + return xerrors.Errorf("publish client hello: %w", err) + } + return nil +} + func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error { msg, err := c.formatAgentUpdate(id, node) if err != nil { @@ -408,6 +435,18 @@ func (c *haCoordinator) runPubsub() error { c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) return } + case "clienthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } + + err = c.handleClientHello(agentUUID) + if err != nil { + c.log.Error(ctx, "handle agent request node", slog.Error(err)) + return + } case "agenthello": agentUUID, err := uuid.ParseBytes(agentID) if err != nil { @@ -431,7 +470,7 @@ func (c *haCoordinator) runPubsub() error { } decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) - _, err = c.hangleAgentUpdate(agentUUID, decoder) + _, err = c.handleAgentUpdate(agentUUID, decoder) if err != nil { c.log.Error(ctx, "handle agent update", slog.Error(err)) return @@ -478,6 +517,17 @@ func (c *haCoordinator) formatAgentHello(id uuid.UUID) ([]byte, error) { return buf.Bytes(), nil } +// format: |clienthello|| +func (c *haCoordinator) formatClientHello(id uuid.UUID) ([]byte, error) { + buf := bytes.Buffer{} + + buf.WriteString(c.id.String() + "|") + buf.WriteString("clienthello|") + buf.WriteString(id.String() + "|") + + return buf.Bytes(), nil +} + // format: |agentupdate|| func (c *haCoordinator) formatAgentUpdate(id uuid.UUID, node *agpl.Node) ([]byte, error) { buf := bytes.Buffer{} diff --git a/enterprise/tailnet/coordinator_test.go b/enterprise/tailnet/coordinator_test.go index 83fac250b2916..86cee94dbdf5b 100644 --- a/enterprise/tailnet/coordinator_test.go +++ b/enterprise/tailnet/coordinator_test.go @@ -11,6 +11,7 @@ import ( 
"cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbtestutil" "github.com/coder/coder/enterprise/tailnet" agpl "github.com/coder/coder/tailnet" "github.com/coder/coder/testutil" @@ -167,16 +168,12 @@ func TestCoordinatorHA(t *testing.T) { t.Run("AgentWithClient", func(t *testing.T) { t.Parallel() - pubsub := database.NewPubsubInMemory() + _, pubsub := dbtestutil.NewDB(t) coordinator1, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) require.NoError(t, err) defer coordinator1.Close() - coordinator2, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) - require.NoError(t, err) - defer coordinator2.Close() - agentWS, agentServerWS := net.Pipe() defer agentWS.Close() agentNodeChan := make(chan []*agpl.Node) @@ -196,6 +193,10 @@ func TestCoordinatorHA(t *testing.T) { return coordinator1.Node(agentID) != nil }, testutil.WaitShort, testutil.IntervalFast) + coordinator2, err := tailnet.NewCoordinator(slogtest.Make(t, nil), pubsub) + require.NoError(t, err) + defer coordinator2.Close() + clientWS, clientServerWS := net.Pipe() defer clientWS.Close() defer clientServerWS.Close() From 7a40bf801f89fc09d35fa1bc9716504f09595f2c Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 02:52:02 +0000 Subject: [PATCH 76/79] Run pubsub loop in a queue --- enterprise/tailnet/coordinator.go | 202 +++++++++++++++++------------- 1 file changed, 113 insertions(+), 89 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index 2f284cb00ff61..f001d4a9643dd 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -21,17 +21,19 @@ import ( // NewCoordinator creates a new high availability coordinator // that uses PostgreSQL pubsub to exchange handshakes. func NewCoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinator, error) { + ctx, cancelFunc := context.WithCancel(context.Background()) coord := &haCoordinator{ id: uuid.New(), log: logger, pubsub: pubsub, + closeFunc: cancelFunc, close: make(chan struct{}), nodes: map[uuid.UUID]*agpl.Node{}, agentSockets: map[uuid.UUID]net.Conn{}, agentToConnectionSockets: map[uuid.UUID]map[uuid.UUID]net.Conn{}, } - if err := coord.runPubsub(); err != nil { + if err := coord.runPubsub(ctx); err != nil { return nil, xerrors.Errorf("run coordinator pubsub: %w", err) } @@ -39,11 +41,12 @@ func NewCoordinator(logger slog.Logger, pubsub database.Pubsub) (agpl.Coordinato } type haCoordinator struct { - id uuid.UUID - log slog.Logger - mutex sync.RWMutex - pubsub database.Pubsub - close chan struct{} + id uuid.UUID + log slog.Logger + mutex sync.RWMutex + pubsub database.Pubsub + close chan struct{} + closeFunc context.CancelFunc // nodes maps agent and connection IDs their respective node. 
nodes map[uuid.UUID]*agpl.Node @@ -303,6 +306,7 @@ func (c *haCoordinator) Close() error { default: } close(c.close) + c.closeFunc() wg := sync.WaitGroup{} @@ -384,111 +388,131 @@ func (c *haCoordinator) publishAgentToNodes(id uuid.UUID, node *agpl.Node) error return nil } -func (c *haCoordinator) runPubsub() error { +func (c *haCoordinator) runPubsub(ctx context.Context) error { + messageQueue := make(chan []byte, 64) cancelSub, err := c.pubsub.Subscribe("wireguard_peers", func(ctx context.Context, message []byte) { - sp := bytes.Split(message, []byte("|")) - if len(sp) != 4 { - c.log.Error(ctx, "invalid wireguard peer message", slog.F("msg", string(message))) + select { + case messageQueue <- message: + case <-ctx.Done(): return } + }) + if err != nil { + return xerrors.Errorf("subscribe wireguard peers") + } + go func() { + for { + var message []byte + select { + case <-ctx.Done(): + return + case message = <-messageQueue: + } + c.handlePubsubMessage(ctx, message) + } + }() + + go func() { + defer cancelSub() + <-c.close + }() + + return nil +} + +func (c *haCoordinator) handlePubsubMessage(ctx context.Context, message []byte) { + sp := bytes.Split(message, []byte("|")) + if len(sp) != 4 { + c.log.Error(ctx, "invalid wireguard peer message", slog.F("msg", string(message))) + return + } + + var ( + coordinatorID = sp[0] + eventType = sp[1] + agentID = sp[2] + nodeJSON = sp[3] + ) - var ( - coordinatorID = sp[0] - eventType = sp[1] - agentID = sp[2] - nodeJSON = sp[3] - ) + sender, err := uuid.ParseBytes(coordinatorID) + if err != nil { + c.log.Error(ctx, "invalid sender id", slog.F("id", string(coordinatorID)), slog.F("msg", string(message))) + return + } - sender, err := uuid.ParseBytes(coordinatorID) + // We sent this message! + if sender == c.id { + return + } + + switch string(eventType) { + case "callmemaybe": + agentUUID, err := uuid.ParseBytes(agentID) if err != nil { - c.log.Error(ctx, "invalid sender id", slog.F("id", string(coordinatorID)), slog.F("msg", string(message))) + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) return } - // We sent this message! - if sender == c.id { + c.mutex.Lock() + agentSocket, ok := c.agentSockets[agentUUID] + if !ok { + c.mutex.Unlock() return } + c.mutex.Unlock() - switch string(eventType) { - case "callmemaybe": - agentUUID, err := uuid.ParseBytes(agentID) - if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) - return - } - - c.mutex.Lock() - agentSocket, ok := c.agentSockets[agentUUID] - if !ok { - c.mutex.Unlock() - return - } - c.mutex.Unlock() - - // We get a single node over pubsub, so turn into an array. - _, err = agentSocket.Write(nodeJSON) - if err != nil { - if errors.Is(err, io.EOF) { - return - } - c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) - return - } - case "clienthello": - agentUUID, err := uuid.ParseBytes(agentID) - if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + // We get a single node over pubsub, so turn into an array. 
+ _, err = agentSocket.Write(nodeJSON) + if err != nil { + if errors.Is(err, io.EOF) { return } + c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) + return + } + case "clienthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } - err = c.handleClientHello(agentUUID) - if err != nil { - c.log.Error(ctx, "handle agent request node", slog.Error(err)) - return - } - case "agenthello": - agentUUID, err := uuid.ParseBytes(agentID) - if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) - return - } + err = c.handleClientHello(agentUUID) + if err != nil { + c.log.Error(ctx, "handle agent request node", slog.Error(err)) + return + } + case "agenthello": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } - nodes := c.nodesSubscribedToAgent(agentUUID) - if len(nodes) > 0 { - err := c.publishNodesToAgent(agentUUID, nodes) - if err != nil { - c.log.Error(ctx, "publish nodes to agent", slog.Error(err)) - return - } - } - case "agentupdate": - agentUUID, err := uuid.ParseBytes(agentID) + nodes := c.nodesSubscribedToAgent(agentUUID) + if len(nodes) > 0 { + err := c.publishNodesToAgent(agentUUID, nodes) if err != nil { - c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + c.log.Error(ctx, "publish nodes to agent", slog.Error(err)) return } + } + case "agentupdate": + agentUUID, err := uuid.ParseBytes(agentID) + if err != nil { + c.log.Error(ctx, "invalid agent id", slog.F("id", string(agentID))) + return + } - decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) - _, err = c.handleAgentUpdate(agentUUID, decoder) - if err != nil { - c.log.Error(ctx, "handle agent update", slog.Error(err)) - return - } - default: - c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) + decoder := json.NewDecoder(bytes.NewReader(nodeJSON)) + _, err = c.handleAgentUpdate(agentUUID, decoder) + if err != nil { + c.log.Error(ctx, "handle agent update", slog.Error(err)) + return } - }) - if err != nil { - return xerrors.Errorf("subscribe wireguard peers") + default: + c.log.Error(ctx, "unknown peer event", slog.F("name", string(eventType))) } - - go func() { - defer cancelSub() - <-c.close - }() - - return nil } // format: |callmemaybe|| From 08b9681baac814f970b455432d845a6028a80779 Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 03:03:16 +0000 Subject: [PATCH 77/79] Store agent nodes in order --- enterprise/tailnet/coordinator.go | 7 +++++++ tailnet/conn.go | 1 + tailnet/coordinator.go | 2 ++ 3 files changed, 10 insertions(+) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index f001d4a9643dd..da3845f70b4c3 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -266,6 +266,13 @@ func (c *haCoordinator) handleAgentUpdate(id uuid.UUID, decoder *json.Decoder) ( } c.mutex.Lock() + oldNode := c.nodes[id] + if oldNode != nil { + if oldNode.AsOf.After(node.AsOf) { + c.mutex.Unlock() + return oldNode, nil + } + } c.nodes[id] = &node connectionSockets, ok := c.agentToConnectionSockets[id] if !ok { diff --git a/tailnet/conn.go b/tailnet/conn.go index 2f2549718880d..e3af3786ec92f 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -435,6 +435,7 @@ func (c *Conn) sendNode() { } node := &Node{ ID: c.netMap.SelfNode.ID, + AsOf: c.lastStatus, Key: c.netMap.SelfNode.Key, 
Addresses: c.netMap.SelfNode.Addresses, AllowedIPs: c.netMap.SelfNode.AllowedIPs, diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 491c0db885224..52c1fa1e66ec4 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -37,6 +37,8 @@ type Coordinator interface { type Node struct { // ID is used to identify the connection. ID tailcfg.NodeID `json:"id"` + // AsOf is the time the node was created. + AsOf time.Time `json:"as_of"` // Key is the Wireguard public key of the node. Key key.NodePublic `json:"key"` // DiscoKey is used for discovery messages over DERP to establish peer-to-peer connections. From 79991a939139c64d0b66582d32aeb7078befb43c Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 03:18:19 +0000 Subject: [PATCH 78/79] Fix coordinator locking --- tailnet/coordinator.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 52c1fa1e66ec4..4216bbc624d48 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -246,7 +246,6 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } sockets, ok := c.agentToConnectionSockets[id] - c.mutex.Unlock() if ok { // Publish all nodes that want to connect to the // desired agent ID. @@ -258,21 +257,21 @@ func (c *coordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { } nodes = append(nodes, node) } + c.mutex.Unlock() data, err := json.Marshal(nodes) if err != nil { - c.mutex.Unlock() return xerrors.Errorf("marshal json: %w", err) } _, err = conn.Write(data) if err != nil { return xerrors.Errorf("write nodes: %w", err) } + c.mutex.Lock() } // If an old agent socket is connected, we close it // to avoid any leaks. This shouldn't ever occur because // we expect one agent to be running. - c.mutex.Lock() oldAgentSocket, ok := c.agentSockets[id] if ok { _ = oldAgentSocket.Close() From 020171b65e4a19faebc97f2983cf740c8d53755f Mon Sep 17 00:00:00 2001 From: Kyle Carberry Date: Mon, 17 Oct 2022 04:06:01 +0000 Subject: [PATCH 79/79] Check for closed pipe --- enterprise/tailnet/coordinator.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/enterprise/tailnet/coordinator.go b/enterprise/tailnet/coordinator.go index da3845f70b4c3..5749d9ef47c7a 100644 --- a/enterprise/tailnet/coordinator.go +++ b/enterprise/tailnet/coordinator.go @@ -121,7 +121,7 @@ func (c *haCoordinator) ServeClient(conn net.Conn, id uuid.UUID, agent uuid.UUID for { err := c.handleNextClientMessage(id, agent, decoder) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return nil } return xerrors.Errorf("handle next client message: %w", err) @@ -163,7 +163,7 @@ func (c *haCoordinator) handleNextClientMessage(id, agent uuid.UUID, decoder *js _, err = agentSocket.Write(data) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return nil } return xerrors.Errorf("write json: %w", err) @@ -215,7 +215,7 @@ func (c *haCoordinator) ServeAgent(conn net.Conn, id uuid.UUID) error { for { node, err := c.handleAgentUpdate(id, decoder) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return nil } return xerrors.Errorf("handle next agent message: %w", err) @@ -471,7 +471,7 @@ func (c *haCoordinator) handlePubsubMessage(ctx context.Context, message []byte) // We get a single node over pubsub, so turn into an array. 
_, err = agentSocket.Write(nodeJSON) if err != nil { - if errors.Is(err, io.EOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrClosedPipe) { return } c.log.Error(ctx, "send callmemaybe to agent", slog.Error(err)) pFad - Phonifier reborn
