Skip to content

Commit

Permalink
Retry cluster join on "too many learners" error
Browse files Browse the repository at this point in the history
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
  • Loading branch information
brandond committed Apr 25, 2023
1 parent 87f0dc5 commit 1a727d5
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions pkg/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,10 +419,19 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e
 for {
 	select {
 	case <-time.After(30 * time.Second):
-		logrus.Infof("Waiting for agent to become ready before joining ETCD cluster")
+		logrus.Infof("Waiting for agent to become ready before joining etcd cluster")
 	case <-e.config.Runtime.AgentReady:
-		if err := e.join(ctx, clientAccessInfo); err != nil {
-			logrus.Fatalf("ETCD join failed: %v", err)
+		if err := wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
+			if err := e.join(ctx, clientAccessInfo); err != nil {
+				if errors.Is(err, rpctypes.ErrTooManyLearners) {
+					logrus.Infof("Waiting for other members to finish joining the etcd cluster")
+					return false, nil
+				}
+				return false, err
+			}
+			return true, nil
+		}); err != nil {
+			logrus.Fatalf("etcd cluster join failed: %v", err)
 		}
 		return
 	case <-ctx.Done():
Expand Down

0 comments on commit 1a727d5

Please sign in to comment.