Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: configure Raft Pre-Vote to reduce disruptive rejoining servers #9352

Merged
merged 4 commits into from
Mar 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions embed/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,13 @@ type Config struct {
ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"`
ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"`
ExperimentalEnableV2V3 string `json:"experimental-enable-v2v3"`

// ExperimentalPreVote is true to enable Raft Pre-Vote.
// If enabled, Raft runs an additional election phase
// to check whether it would get enough votes to win
// an election, thus minimizing disruptions.
// TODO: change to "pre-vote" and enable by default in 3.5.
ExperimentalPreVote bool `json:"experimental-pre-vote"`
}

// configYAML holds the config suitable for yaml parsing
Expand Down Expand Up @@ -293,6 +300,7 @@ func NewConfig() *Config {
EnableV2: DefaultEnableV2,
HostWhitelist: defaultHostWhitelist,
AuthToken: "simple",
ExperimentalPreVote: false, // TODO: enable by default in v3.5
}
cfg.InitialCluster = cfg.InitialClusterFromName(cfg.Name)
return cfg
Expand Down
1 change: 1 addition & 0 deletions embed/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
AuthToken: cfg.AuthToken,
InitialCorruptCheck: cfg.ExperimentalInitialCorruptCheck,
CorruptCheckTime: cfg.ExperimentalCorruptCheckTime,
PreVote: cfg.ExperimentalPreVote,
Debug: cfg.Debug,
}

Expand Down
1 change: 1 addition & 0 deletions etcdmain/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ func newConfig() *config {
// experimental
fs.BoolVar(&cfg.ec.ExperimentalInitialCorruptCheck, "experimental-initial-corrupt-check", cfg.ec.ExperimentalInitialCorruptCheck, "Enable to check data corruption before serving any client/peer traffic.")
fs.DurationVar(&cfg.ec.ExperimentalCorruptCheckTime, "experimental-corrupt-check-time", cfg.ec.ExperimentalCorruptCheckTime, "Duration of time between cluster corruption check passes.")
fs.BoolVar(&cfg.ec.ExperimentalPreVote, "experimental-pre-vote", cfg.ec.ExperimentalPreVote, "Enable to run an additional Raft election phase.")

// ignored
for _, f := range cfg.ignored {
Expand Down
2 changes: 2 additions & 0 deletions etcdmain/help.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,5 +197,7 @@ experimental flags:
duration of time between cluster corruption check passes.
--experimental-enable-v2v3 ''
serve v2 requests through the v3 backend under a given prefix.
--experimental-pre-vote 'false'
enable to run an additional Raft election phase.
`
)
3 changes: 3 additions & 0 deletions etcdserver/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ type ServerConfig struct {
InitialCorruptCheck bool
CorruptCheckTime time.Duration

// PreVote is true to enable Raft Pre-Vote.
PreVote bool

Debug bool
}

Expand Down
3 changes: 3 additions & 0 deletions etcdserver/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id
MaxSizePerMsg: maxSizePerMsg,
MaxInflightMsgs: maxInflightMsgs,
CheckQuorum: true,
PreVote: cfg.PreVote,
}

n = raft.StartNode(c, peers)
Expand Down Expand Up @@ -445,6 +446,7 @@ func restartNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *member
MaxSizePerMsg: maxSizePerMsg,
MaxInflightMsgs: maxInflightMsgs,
CheckQuorum: true,
PreVote: cfg.PreVote,
}

n := raft.RestartNode(c)
Expand Down Expand Up @@ -501,6 +503,7 @@ func restartAsStandaloneNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types
MaxSizePerMsg: maxSizePerMsg,
MaxInflightMsgs: maxInflightMsgs,
CheckQuorum: true,
PreVote: cfg.PreVote,
}
n := raft.RestartNode(c)
raftStatus = n.Status
Expand Down
6 changes: 6 additions & 0 deletions etcdserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,12 @@ type EtcdServer struct {
// NewServer creates a new EtcdServer from the supplied configuration. The
// configuration is considered static for the lifetime of the EtcdServer.
func NewServer(cfg ServerConfig) (srv *EtcdServer, err error) {
if cfg.PreVote {
plog.Info("Raft Pre-Vote is enabled")
} else {
plog.Info("Raft Pre-Vote is disabled")
}

st := v2store.New(StoreClusterPrefix, StoreKeysPrefix)

var (
Expand Down
9 changes: 8 additions & 1 deletion raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -817,8 +817,15 @@ func (r *raft) Step(m pb.Message) error {
// nodes that have been removed from the cluster's configuration: a
// removed node will send MsgVotes (or MsgPreVotes) which will be ignored,
// but it will not receive MsgApp or MsgHeartbeat, so it will not create
// disruptive term increases
// disruptive term increases, by notifying leader of this node's activeness.
// The above comments also true for Pre-Vote
//
// When follower gets isolated, it soon starts an election ending
// up with a higher term than leader, although it won't receive enough
// votes to win the election. When it regains connectivity, this response
// with "pb.MsgAppResp" of higher term would force leader to step down.
// However, this disruption is inevitable to free this stuck node with
// fresh election. This can be prevented with Pre-Vote phase.
r.send(pb.Message{To: m.From, Type: pb.MsgAppResp})
} else if m.Type == pb.MsgPreVote {
// Before Pre-Vote enable, there may have candidate with higher term,
Expand Down
201 changes: 201 additions & 0 deletions raft/raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1993,6 +1993,207 @@ func TestNonPromotableVoterWithCheckQuorum(t *testing.T) {
}
}

// TestDisruptiveFollower tests isolated follower,
// with slow network incoming from leader, election times out
// to become a candidate with an increased term. Then, the
// candiate's response to late leader heartbeat forces the leader
// to step down.
func TestDisruptiveFollower(t *testing.T) {
n1 := newTestRaft(1, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
n2 := newTestRaft(2, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
n3 := newTestRaft(3, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())

n1.checkQuorum = true
n2.checkQuorum = true
n3.checkQuorum = true

n1.becomeFollower(1, None)
n2.becomeFollower(1, None)
n3.becomeFollower(1, None)

nt := newNetwork(n1, n2, n3)

nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgHup})

// check state
// n1.state == StateLeader
// n2.state == StateFollower
// n3.state == StateFollower
if n1.state != StateLeader {
t.Fatalf("node 1 state: %s, want %s", n1.state, StateLeader)
}
if n2.state != StateFollower {
t.Fatalf("node 2 state: %s, want %s", n2.state, StateFollower)
}
if n3.state != StateFollower {
t.Fatalf("node 3 state: %s, want %s", n3.state, StateFollower)
}

// etcd server "advanceTicksForElection" on restart;
// this is to expedite campaign trigger when given larger
// election timeouts (e.g. multi-datacenter deploy)
// Or leader messages are being delayed while ticks elapse
setRandomizedElectionTimeout(n3, n3.electionTimeout+2)
for i := 0; i < n3.randomizedElectionTimeout-1; i++ {
n3.tick()
}

// ideally, before last election tick elapses,
// the follower n3 receives "pb.MsgApp" or "pb.MsgHeartbeat"
// from leader n1, and then resets its "electionElapsed"
// however, last tick may elapse before receiving any
// messages from leader, thus triggering campaign
n3.tick()

// n1 is still leader yet
// while its heartbeat to candidate n3 is being delayed

// check state
// n1.state == StateLeader
// n2.state == StateFollower
// n3.state == StateCandidate
if n1.state != StateLeader {
t.Fatalf("node 1 state: %s, want %s", n1.state, StateLeader)
}
if n2.state != StateFollower {
t.Fatalf("node 2 state: %s, want %s", n2.state, StateFollower)
}
if n3.state != StateCandidate {
t.Fatalf("node 3 state: %s, want %s", n3.state, StateCandidate)
}
// check term
// n1.Term == 2
// n2.Term == 2
// n3.Term == 3
if n1.Term != 2 {
t.Fatalf("node 1 term: %d, want %d", n1.Term, 2)
}
if n2.Term != 2 {
t.Fatalf("node 2 term: %d, want %d", n2.Term, 2)
}
if n3.Term != 3 {
t.Fatalf("node 3 term: %d, want %d", n3.Term, 3)
}

// while outgoing vote requests are still queued in n3,
// leader heartbeat finally arrives at candidate n3
// however, due to delayed network from leader, leader
// heartbeat was sent with lower term than candidate's
nt.send(pb.Message{From: 1, To: 3, Term: n1.Term, Type: pb.MsgHeartbeat})

// then candidate n3 responds with "pb.MsgAppResp" of higher term
// and leader steps down from a message with higher term
// this is to disrupt the current leader, so that candidate
// with higher term can be freed with following election

// check state
// n1.state == StateFollower
// n2.state == StateFollower
// n3.state == StateCandidate
if n1.state != StateFollower {
t.Fatalf("node 1 state: %s, want %s", n1.state, StateFollower)
}
if n2.state != StateFollower {
t.Fatalf("node 2 state: %s, want %s", n2.state, StateFollower)
}
if n3.state != StateCandidate {
t.Fatalf("node 3 state: %s, want %s", n3.state, StateCandidate)
}
// check term
// n1.Term == 3
// n2.Term == 2
// n3.Term == 3
if n1.Term != 3 {
t.Fatalf("node 1 term: %d, want %d", n1.Term, 3)
}
if n2.Term != 2 {
t.Fatalf("node 2 term: %d, want %d", n2.Term, 2)
}
if n3.Term != 3 {
t.Fatalf("node 3 term: %d, want %d", n3.Term, 3)
}
}

// TestDisruptiveFollowerPreVote tests isolated follower,
// with slow network incoming from leader, election times out
// to become a pre-candidate with less log than current leader.
// Then pre-vote phase prevents this isolated node from forcing
// current leader to step down, thus less disruptions.
func TestDisruptiveFollowerPreVote(t *testing.T) {
n1 := newTestRaft(1, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
n2 := newTestRaft(2, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
n3 := newTestRaft(3, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())

n1.checkQuorum = true
n2.checkQuorum = true
n3.checkQuorum = true

n1.becomeFollower(1, None)
n2.becomeFollower(1, None)
n3.becomeFollower(1, None)

nt := newNetwork(n1, n2, n3)

nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgHup})

// check state
// n1.state == StateLeader
// n2.state == StateFollower
// n3.state == StateFollower
if n1.state != StateLeader {
t.Fatalf("node 1 state: %s, want %s", n1.state, StateLeader)
}
if n2.state != StateFollower {
t.Fatalf("node 2 state: %s, want %s", n2.state, StateFollower)
}
if n3.state != StateFollower {
t.Fatalf("node 3 state: %s, want %s", n3.state, StateFollower)
}

nt.isolate(3)
nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgProp, Entries: []pb.Entry{{Data: []byte("somedata")}}})
nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgProp, Entries: []pb.Entry{{Data: []byte("somedata")}}})
nt.send(pb.Message{From: 1, To: 1, Type: pb.MsgProp, Entries: []pb.Entry{{Data: []byte("somedata")}}})
n1.preVote = true
n2.preVote = true
n3.preVote = true
nt.recover()
nt.send(pb.Message{From: 3, To: 3, Type: pb.MsgHup})

// check state
// n1.state == StateLeader
// n2.state == StateFollower
// n3.state == StatePreCandidate
if n1.state != StateLeader {
t.Fatalf("node 1 state: %s, want %s", n1.state, StateLeader)
}
if n2.state != StateFollower {
t.Fatalf("node 2 state: %s, want %s", n2.state, StateFollower)
}
if n3.state != StatePreCandidate {
t.Fatalf("node 3 state: %s, want %s", n3.state, StatePreCandidate)
}
// check term
// n1.Term == 2
// n2.Term == 2
// n3.Term == 2
if n1.Term != 2 {
t.Fatalf("node 1 term: %d, want %d", n1.Term, 2)
}
if n2.Term != 2 {
t.Fatalf("node 2 term: %d, want %d", n2.Term, 2)
}
if n3.Term != 2 {
t.Fatalf("node 2 term: %d, want %d", n3.Term, 2)
}

// delayed leader heartbeat does not force current leader to step down
nt.send(pb.Message{From: 1, To: 3, Term: n1.Term, Type: pb.MsgHeartbeat})
if n1.state != StateLeader {
t.Fatalf("node 1 state: %s, want %s", n1.state, StateLeader)
}
}

func TestReadOnlyOptionSafe(t *testing.T) {
a := newTestRaft(1, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
b := newTestRaft(2, []uint64{1, 2, 3}, 10, 1, NewMemoryStorage())
Expand Down