
Commit e01bbcf

chore: add support for peer updates to tailnet.configMaps
1 parent 0ec3722 commit e01bbcf

4 files changed: +719 -21 lines changed


go.mod

Lines changed: 2 additions & 0 deletions
@@ -206,6 +206,8 @@ require (
 
 require go.uber.org/mock v0.4.0
 
+require github.com/benbjohnson/clock v1.3.5 // indirect
+
 require (
 	cloud.google.com/go/compute v1.23.3 // indirect
 	cloud.google.com/go/logging v1.8.1 // indirect
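
The new require backs the clock.Clock field this commit adds to configMaps (commented "// for testing"): production code constructs the real clock with clock.New(), while tests can substitute a mock and step time by hand to drive the lost-peer timers deterministically. A minimal, illustrative sketch of the benbjohnson/clock mock, not part of this commit:

package main

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
)

func main() {
	// clock.NewMock() satisfies the same clock.Clock interface as clock.New(),
	// but time only advances when the caller says so.
	mock := clock.NewMock()

	fired := false
	mock.AfterFunc(15*time.Minute, func() { fired = true })

	mock.Add(14 * time.Minute)
	fmt.Println(fired) // false: the 15-minute timer has not expired yet
	mock.Add(2 * time.Minute)
	fmt.Println(fired) // true: Add ran the due timer while advancing
}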

go.sum

Lines changed: 2 additions & 0 deletions
@@ -123,6 +123,8 @@ github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiE
 github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
 github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
 github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
+github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz5o=
+github.com/benbjohnson/clock v1.3.5/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bep/godartsass v1.2.0 h1:E2VvQrxAHAFwbjyOIExAMmogTItSKodoKuijNrGm5yU=

tailnet/configmaps.go

Lines changed: 215 additions & 10 deletions
@@ -3,11 +3,15 @@ package tailnet
 import (
 	"context"
 	"errors"
+	"fmt"
 	"net/netip"
 	"sync"
+	"time"
 
+	"github.com/benbjohnson/clock"
 	"github.com/google/uuid"
 	"go4.org/netipx"
+	"tailscale.com/ipn/ipnstate"
 	"tailscale.com/net/dns"
 	"tailscale.com/tailcfg"
 	"tailscale.com/types/ipproto"
@@ -23,10 +27,13 @@ import (
 	"github.com/coder/coder/v2/tailnet/proto"
 )
 
+const lostTimeout = 15 * time.Minute
+
 // engineConfigurable is the subset of wgengine.Engine that we use for configuration.
 //
 // This allows us to test configuration code without faking the whole interface.
 type engineConfigurable interface {
+	UpdateStatus(*ipnstate.StatusBuilder)
 	SetNetworkMap(*netmap.NetworkMap)
 	Reconfig(*wgcfg.Config, *router.Config, *dns.Config, *tailcfg.Debug) error
 	SetDERPMap(*tailcfg.DERPMap)
@@ -49,12 +56,16 @@ type configMaps struct {
 	closing bool
 	phase   phase
 
-	engine    engineConfigurable
-	static    netmap.NetworkMap
-	peers     map[uuid.UUID]*peerLifecycle
-	addresses []netip.Prefix
-	derpMap   *proto.DERPMap
-	logger    slog.Logger
+	engine         engineConfigurable
+	static         netmap.NetworkMap
+	peers          map[uuid.UUID]*peerLifecycle
+	addresses      []netip.Prefix
+	derpMap        *proto.DERPMap
+	logger         slog.Logger
+	blockEndpoints bool
+
+	// for testing
+	clock clock.Clock
 }
 
 func newConfigMaps(logger slog.Logger, engine engineConfigurable, nodeID tailcfg.NodeID, nodeKey key.NodePrivate, discoKey key.DiscoPublic, addresses []netip.Prefix) *configMaps {
@@ -101,6 +112,7 @@ func newConfigMaps(logger slog.Logger, engine engineConfigurable, nodeID tailcfg
 		},
 		peers:     make(map[uuid.UUID]*peerLifecycle),
 		addresses: addresses,
+		clock:     clock.New(),
 	}
 	go c.configLoop()
 	return c
@@ -164,6 +176,9 @@ func (c *configMaps) configLoop() {
 func (c *configMaps) close() {
 	c.L.Lock()
 	defer c.L.Unlock()
+	for _, lc := range c.peers {
+		lc.resetTimer()
+	}
 	c.closing = true
 	c.Broadcast()
 	for c.phase != closed {
@@ -247,11 +262,201 @@ func (c *configMaps) filterLocked() *filter.Filter {
 	)
 }
 
+func (c *configMaps) updatePeers(updates []*proto.CoordinateResponse_PeerUpdate) {
+	status := c.status()
+	c.L.Lock()
+	defer c.L.Unlock()
+
+	// Update all the lastHandshake values here. That way we don't have to
+	// worry about them being up-to-date when handling updates below, and it covers
+	// all peers, not just the ones we got updates about.
+	for _, lc := range c.peers {
+		if peerStatus, ok := status.Peer[lc.node.Key]; ok {
+			lc.lastHandshake = peerStatus.LastHandshake
+		}
+	}
+
+	for _, update := range updates {
+		if dirty := c.updatePeerLocked(update, status); dirty {
+			c.netmapDirty = true
+		}
+	}
+	if c.netmapDirty {
+		c.Broadcast()
+	}
+}
+
+func (c *configMaps) status() *ipnstate.Status {
+	sb := &ipnstate.StatusBuilder{WantPeers: true}
+	c.engine.UpdateStatus(sb)
+	return sb.Status()
+}
+
+func (c *configMaps) updatePeerLocked(update *proto.CoordinateResponse_PeerUpdate, status *ipnstate.Status) (dirty bool) {
+	id, err := uuid.FromBytes(update.Id)
+	if err != nil {
+		c.logger.Critical(context.Background(), "received update with bad id", slog.F("id", update.Id))
+		return false
+	}
+	logger := c.logger.With(slog.F("peer_id", id))
+	lc, ok := c.peers[id]
+	var node *tailcfg.Node
+	if update.Kind == proto.CoordinateResponse_PeerUpdate_NODE {
+		// If no preferred DERP is provided, we can't reach the node.
+		if update.Node.PreferredDerp == 0 {
+			logger.Warn(context.Background(), "no preferred DERP, peer update", slog.F("node_proto", update.Node))
+			return false
+		}
+		node, err = c.protoNodeToTailcfg(update.Node)
+		if err != nil {
+			logger.Critical(context.Background(), "failed to convert proto node to tailcfg", slog.F("node_proto", update.Node))
+			return false
+		}
+		logger = logger.With(slog.F("key_id", node.Key.ShortString()), slog.F("node", node))
+		peerStatus, ok := status.Peer[node.Key]
+		// Starting KeepAlive messages at the initialization of a connection
+		// causes a race condition. If we send the handshake before the peer has
+		// our node, we'll have to wait for 5 seconds before trying again.
+		// Ideally, the first handshake starts when the user first initiates a
+		// connection to the peer. After a successful connection we enable
+		// keep alives to persist the connection and keep it from becoming idle.
+		// SSH connections don't send packets while idle, so we use keep alives
+		// to avoid random hangs while we set up the connection again after
+		// inactivity.
+		node.KeepAlive = ok && peerStatus.Active
+		if c.blockEndpoints {
+			node.Endpoints = nil
+		}
+	}
+	switch {
+	case !ok && update.Kind == proto.CoordinateResponse_PeerUpdate_NODE:
+		// new!
+		var lastHandshake time.Time
+		if ps, ok := status.Peer[node.Key]; ok {
+			lastHandshake = ps.LastHandshake
+		}
+		c.peers[id] = &peerLifecycle{
+			peerID:        id,
+			node:          node,
+			lastHandshake: lastHandshake,
+			lost:          false,
+		}
+		logger.Debug(context.Background(), "adding new peer")
+		return true
+	case ok && update.Kind == proto.CoordinateResponse_PeerUpdate_NODE:
+		// update
+		node.Created = lc.node.Created
+		dirty = !lc.node.Equal(node)
+		lc.node = node
+		lc.lost = false
+		lc.resetTimer()
+		logger.Debug(context.Background(), "node update to existing peer", slog.F("dirty", dirty))
+		return dirty
+	case !ok:
+		// disconnected or lost, but we don't have the node. No op
+		logger.Debug(context.Background(), "skipping update for peer we don't recognize")
+		return false
+	case update.Kind == proto.CoordinateResponse_PeerUpdate_DISCONNECTED:
+		lc.resetTimer()
+		delete(c.peers, id)
+		logger.Debug(context.Background(), "disconnected peer")
+		return true
+	case update.Kind == proto.CoordinateResponse_PeerUpdate_LOST:
+		lc.lost = true
+		lc.setLostTimer(c)
+		logger.Debug(context.Background(), "marked peer lost")
+		// marking a node lost doesn't change anything right now, so dirty=false
+		return false
+	default:
+		logger.Warn(context.Background(), "unknown peer update", slog.F("kind", update.Kind))
+		return false
+	}
+}
+
+func (c *configMaps) peerLostTimeout(id uuid.UUID) {
+	logger := c.logger.With(slog.F("peer_id", id))
+	logger.Debug(context.Background(),
+		"peer lost timeout")
+
+	// First do a status update to see if the peer did a handshake while we were
+	// waiting
+	status := c.status()
+	c.L.Lock()
+	defer c.L.Unlock()
+
+	lc, ok := c.peers[id]
+	if !ok {
+		logger.Debug(context.Background(),
+			"timeout triggered for peer that is removed from the map")
+		return
+	}
+	if peerStatus, ok := status.Peer[lc.node.Key]; ok {
+		lc.lastHandshake = peerStatus.LastHandshake
+	}
+	logger = logger.With(slog.F("key_id", lc.node.Key.ShortString()))
+	if !lc.lost {
+		logger.Debug(context.Background(),
+			"timeout triggered for peer that is no longer lost")
+		return
+	}
+	since := c.clock.Since(lc.lastHandshake)
+	if since >= lostTimeout {
+		logger.Info(
+			context.Background(), "removing lost peer")
+		delete(c.peers, id)
+		c.netmapDirty = true
+		c.Broadcast()
+		return
+	}
+	logger.Debug(context.Background(),
+		"timeout triggered for peer but it had handshake in meantime")
+	lc.setLostTimer(c)
+}
+
+func (c *configMaps) protoNodeToTailcfg(p *proto.Node) (*tailcfg.Node, error) {
+	node, err := ProtoToNode(p)
+	if err != nil {
+		return nil, err
+	}
+	return &tailcfg.Node{
+		ID:         tailcfg.NodeID(p.GetId()),
+		Created:    c.clock.Now(),
+		Key:        node.Key,
+		DiscoKey:   node.DiscoKey,
+		Addresses:  node.Addresses,
+		AllowedIPs: node.AllowedIPs,
+		Endpoints:  node.Endpoints,
+		DERP:       fmt.Sprintf("%s:%d", tailcfg.DerpMagicIP, node.PreferredDERP),
+		Hostinfo:   (&tailcfg.Hostinfo{}).View(),
+	}, nil
+}
+
 type peerLifecycle struct {
-	node *tailcfg.Node
-	// TODO: implement timers to track lost peers
-	// lastHandshake time.Time
-	// timer time.Timer
+	peerID        uuid.UUID
+	node          *tailcfg.Node
+	lost          bool
+	lastHandshake time.Time
+	timer         *clock.Timer
+}
+
+func (l *peerLifecycle) resetTimer() {
+	if l.timer != nil {
+		l.timer.Stop()
+		l.timer = nil
+	}
+}
+
+func (l *peerLifecycle) setLostTimer(c *configMaps) {
+	if l.timer != nil {
+		l.timer.Stop()
+	}
+	ttl := lostTimeout - c.clock.Since(l.lastHandshake)
+	if ttl <= 0 {
+		ttl = time.Nanosecond
+	}
+	l.timer = c.clock.AfterFunc(ttl, func() {
+		c.peerLostTimeout(l.peerID)
+	})
 }
 
 // prefixesDifferent returns true if the two slices contain different prefixes
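
Taken together, updatePeers, setLostTimer, and peerLostTimeout give lost peers a grace period: a NODE update adds or refreshes the peer and clears any pending timer, a LOST update keeps the peer but arms a timer for lostTimeout minus the time since its last handshake, and when the timer fires the peer is removed (and the netmap marked dirty) only if no newer handshake was observed in the meantime. A rough, in-package sketch of how a test might drive that flow; the fakeEngine double, the slogtest logger, and the abbreviated proto.Node are assumptions for illustration and are not part of this commit:

func TestConfigMaps_lostPeer_sketch(t *testing.T) {
	// Assumed setup: fakeEngine is a hypothetical test double satisfying
	// engineConfigurable; key material and node fields are abbreviated.
	logger := slogtest.Make(t, nil)
	mClock := clock.NewMock()
	uut := newConfigMaps(logger, fakeEngine, 1, key.NewNode(), key.NewDisco().Public(), nil)
	uut.clock = mClock

	peerID := uuid.New()

	// NODE: adds the peer and marks the netmap dirty.
	uut.updatePeers([]*proto.CoordinateResponse_PeerUpdate{{
		Id:   peerID[:],
		Kind: proto.CoordinateResponse_PeerUpdate_NODE,
		Node: &proto.Node{Id: 5, PreferredDerp: 1 /* keys, addresses, ... */},
	}})

	// LOST: the peer stays in the netmap, but a timer is armed for
	// lostTimeout minus the time since its last handshake.
	uut.updatePeers([]*proto.CoordinateResponse_PeerUpdate{{
		Id:   peerID[:],
		Kind: proto.CoordinateResponse_PeerUpdate_LOST,
	}})

	// Advancing the mock clock past the remaining TTL fires peerLostTimeout,
	// which drops the peer only if its last handshake is older than lostTimeout.
	mClock.Add(lostTimeout + time.Second)
}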

0 commit comments