clustering: delay startup until after the HTTP server is up #3909

Merged: 3 commits, May 18, 2023

6 changes: 6 additions & 0 deletions cmd/internal/flowmode/cmd_run.go
@@ -297,6 +297,12 @@ func (fr *flowRun) Run(configFile string) error {
 		}()
 	}
 
+	// Start the Clusterer's Node implementation.
+	err = clusterer.Start(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to start the clusterer: %w", err)
+	}
+
 	// Perform the initial reload. This is done after starting the HTTP server so
 	// that /metric and pprof endpoints are available while the Flow controller
 	// is loading.
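For orientation, here is a minimal, self-contained sketch of the ordering this change establishes in Run: the HTTP server starts listening first, then the clusterer starts, then the initial configuration load runs. Everything in it (fakeClusterer, the /-/ready handler, the stubbed config load, the listen address) is an illustrative assumption, not the actual flowmode code.

package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"time"
)

// fakeClusterer stands in for the real Clusterer; joining peers can block or
// fail, which is why this PR moves the call to after the HTTP server is up.
type fakeClusterer struct{}

func (fakeClusterer) Start(ctx context.Context) error {
	select {
	case <-time.After(100 * time.Millisecond): // pretend to join a cluster
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	ctx := context.Background()

	// 1. Start the HTTP server first so metrics and pprof-style endpoints
	//    are reachable while the remaining startup steps run.
	mux := http.NewServeMux()
	mux.HandleFunc("/-/ready", func(w http.ResponseWriter, _ *http.Request) {
		fmt.Fprintln(w, "ready")
	})
	srv := &http.Server{Addr: "127.0.0.1:12345", Handler: mux}
	go func() {
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.Fatal(err)
		}
	}()

	// 2. Only now start the clusterer, mirroring the new call ordering.
	if err := (fakeClusterer{}).Start(ctx); err != nil {
		log.Fatalf("failed to start the clusterer: %v", err)
	}

	// 3. Perform the initial config load (stubbed out here).
	log.Println("initial config load would run here")
}

The point of the ordering is visible even in this stub: if Start blocks while joining peers, the HTTP endpoints are already serving.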
70 changes: 42 additions & 28 deletions pkg/cluster/cluster.go
@@ -177,39 +177,53 @@ func New(log log.Logger, reg prometheus.Registerer, clusterEnabled bool, listenA
 		return nil, err
 	}
 
-	// Attempt to start the Node by connecting to the peers in gossipConfig.
-	// If we cannot connect to any peers, fall back to bootstrapping a new
-	// cluster by ourselves.
-	err = gossipNode.Start()
-	if err != nil {
-		level.Debug(log).Log("msg", "failed to connect to peers; bootstrapping a new cluster")
-		gossipConfig.JoinPeers = nil
-		err = gossipNode.Start()
+	return &Clusterer{Node: gossipNode}, nil
+}
+
+// Start starts the node.
+// For the localNode implementation, this is a no-op.
+// For the gossipNode implementation, Start will attempt to connect to the
+// configured list of peers; if this fails it will fall back to bootstrapping a
+// new cluster of its own.
+func (c *Clusterer) Start(ctx context.Context) error {
+	switch node := c.Node.(type) {
+	case *localNode:
+		return nil // no-op, always ready
+	case *GossipNode:
+		err := node.Start() // TODO(@tpaschalis) Should we backoff and retry before moving on to the fallback here?
 		if err != nil {
-			return nil, err
+			level.Debug(node.log).Log("msg", "failed to connect to peers; bootstrapping a new cluster")
+			node.cfg.JoinPeers = nil
+			err = node.Start()
+			if err != nil {
+				return err
+			}
 		}
-	}
 
-	// Nodes initially join the cluster in the Viewer state. We can move to the
-	// Participant state to signal that we wish to participate in reading or
-	// writing data.
-	err = gossipNode.ChangeState(context.Background(), peer.StateParticipant)
-	if err != nil {
-		return nil, err
-	}
-
-	res := &Clusterer{Node: gossipNode}
-
-	gossipNode.Observe(ckit.FuncObserver(func(peers []peer.Peer) (reregister bool) {
-		names := make([]string, len(peers))
-		for i, p := range peers {
-			names[i] = p.Name
+		// We now have either joined or started a new cluster.
+		// Nodes initially join in the Viewer state. We can move to the
+		// Participant state to signal that we wish to participate in reading
+		// or writing data.
+		ctx, ccl := context.WithTimeout(ctx, 5*time.Second)
+		defer ccl()
+		err = node.ChangeState(ctx, peer.StateParticipant)
+		if err != nil {
+			return err
 		}
-		level.Info(log).Log("msg", "peers changed", "new_peers", strings.Join(names, ","))
-		return true
-	}))
 
-	return res, nil
+		node.Observe(ckit.FuncObserver(func(peers []peer.Peer) (reregister bool) {
+			names := make([]string, len(peers))
+			for i, p := range peers {
+				names[i] = p.Name
+			}
+			level.Info(node.log).Log("msg", "peers changed", "new_peers", strings.Join(names, ","))
+			return true
+		}))
+		return nil
+	default:
+		msg := fmt.Sprintf("node type: %T", c.Node)
+		panic("cluster: unreachable:" + msg)
+	}
 }
 
 func deadlineDuration(ctx context.Context) (d time.Duration, ok bool) {
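To make the new control flow easier to read outside the diff, the sketch below reproduces the same dispatch-and-fallback pattern in a runnable, self-contained form. The types and methods here (node, localNode, gossipNode, start, changeState) are hypothetical stand-ins rather than the real pkg/cluster, ckit, or peer APIs; only the shape of the logic is taken from the change. The 5-second context.WithTimeout mirrors the bound the PR places around ChangeState so a slow or unreachable cluster cannot stall startup indefinitely.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// node, localNode, and gossipNode are simplified stand-ins for the Node
// implementations dispatched on in Clusterer.Start.
type node interface{}

type localNode struct{}

type gossipNode struct{ joinPeers []string }

// start pretends to join the configured peers, failing when none respond.
func (g *gossipNode) start() error {
	if len(g.joinPeers) == 0 {
		return nil // bootstrapping a brand-new cluster always succeeds here
	}
	return errors.New("no peers reachable")
}

// changeState stands in for moving the node from Viewer to Participant.
func (g *gossipNode) changeState(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func startNode(ctx context.Context, n node) error {
	switch n := n.(type) {
	case localNode:
		return nil // nothing to start when clustering is disabled
	case *gossipNode:
		if err := n.start(); err != nil {
			// Fall back to bootstrapping a new cluster by ourselves.
			n.joinPeers = nil
			if err := n.start(); err != nil {
				return err
			}
		}
		// Bound the state change so startup cannot hang indefinitely.
		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		return n.changeState(ctx)
	default:
		panic(fmt.Sprintf("unreachable node type: %T", n))
	}
}

func main() {
	fmt.Println(startNode(context.Background(), &gossipNode{joinPeers: []string{"peer-a:12345"}}))
	fmt.Println(startNode(context.Background(), localNode{}))
}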