Skip to content

Commit

Permalink
storage: add regression test for cockroachdb#21146
Browse files Browse the repository at this point in the history
This verifies the behavior of when the application of some split command
(part of the lhs's log) is delayed on some store and meanwhile the rhs
has rebalanced away and back, ending up with a larger ReplicaID than the
split thinks it will have.

Release note: None
  • Loading branch information
danhhz committed Aug 16, 2019
1 parent 7880622 commit 95f440f
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 0 deletions.
8 changes: 8 additions & 0 deletions pkg/server/testserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,14 @@ func (ts *TestServer) PGServer() *pgwire.Server {
return nil
}

// RaftTransport returns the RaftTransport used by the TestServer.
func (ts *TestServer) RaftTransport() *storage.RaftTransport {
if ts != nil {
return ts.raftTransport
}
return nil
}

// Start starts the TestServer by bootstrapping an in-memory store
// (defaults to maximum of 100M). The server is started, launching the
// node RPC server and all HTTP endpoints. Use the value of
Expand Down
110 changes: 110 additions & 0 deletions pkg/storage/client_split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/storage/storagebase"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/ts"
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
Expand Down Expand Up @@ -3191,3 +3192,112 @@ func TestStoreSplitDisappearingReplicas(t *testing.T) {
}
}
}

// Regression test for #21146. This verifies the behavior of when the
// application of some split command (part of the lhs's log) is delayed on some
// store and meanwhile the rhs has rebalanced away and back, ending up with a
// larger ReplicaID than the split thinks it will have.
func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()

blockPromoteCh := make(chan struct{})
knobs := base.TestingKnobs{Store: &storage.StoreTestingKnobs{
ReplicaAddStopAfterLearnerSnapshot: func() bool {
<-blockPromoteCh
return false
},
ReplicaAddSkipLearnerRollback: func() bool {
return true
},
}}
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)
db := sqlutils.MakeSQLRunner(tc.ServerConn(0))
db.Exec(t, `SET CLUSTER SETTING kv.learner_replicas.enabled = true`)

k := tc.ScratchRange(t)
desc := tc.LookupRangeOrFatal(t, k)

// First construct a range with a learner replica on the second node (index 1)
// and split it, ending up with an orphaned learner on each side of the split.
// After the learner is created, but before the split, block all incoming raft
// traffic to the learner on the lhs of the split (which is still on the
// second node).
g := ctxgroup.WithContext(ctx)
g.GoCtx(func(ctx context.Context) error {
_, err := tc.AddReplicas(k, tc.Target(1))
return err
})

store, _ := getFirstStoreReplica(t, tc.Server(1), k)
tc.Servers[1].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
rangeID: desc.RangeID,
RaftMessageHandler: store,
})

_, kRHS := k, k.Next()
descLHS, descRHS := tc.SplitRangeOrFatal(t, kRHS)

close(blockPromoteCh)
if err := g.Wait(); !testutils.IsError(err, `descriptor changed`) {
t.Fatalf(`expected "descriptor changed" error got: %+v`, err)
}

// Now repeatedly remove and re-add the learner on the rhs, so it has a
// different replicaID than the split trigger expects.
for i := 0; i < 5; i++ {
_, err := tc.RemoveReplicas(kRHS, tc.Target(1))
require.NoError(t, err)
_, err = tc.AddReplicas(kRHS, tc.Target(1))
if !testutils.IsError(err, `snapshot intersects existing range`) {
t.Fatalf(`expected snapshot intersects existing range" error got: %+v`, err)
}
}

// Normally AddReplicas will return the latest version of the RangeDescriptor,
// but because we're getting snapshot errors and using the
// ReplicaAddSkipLearnerRollback hook, we have to look it up again ourselves
// to find the current replicaID for the RHS learner.
descRHS = tc.LookupRangeOrFatal(t, kRHS)
learnerDescRHS, ok := descRHS.GetReplicaDescriptor(store.StoreID())
require.True(t, ok)

// Wait for there to be an in-memory, uninitialized learner replica with the
// latest ReplicaID. Note: it cannot become initialized at this point because
// it needs a snapshot to do that and (as can be seen in the error check
// above) snapshots will intersect the lhs replica (which doesn't know about
// the split because we've blocked its raft traffic, and so it still covers
// the pre-split keyspace).
testutils.SucceedsSoon(t, func() error {
repl, err := store.GetReplica(descRHS.RangeID)
if err != nil {
return err
}
status := repl.RaftStatus()
if status == nil {
return errors.New("raft group not initialized")
}
if replicaID := roachpb.ReplicaID(status.ID); replicaID != learnerDescRHS.ReplicaID {
return errors.Errorf("expected %d got %d", learnerDescRHS.ReplicaID, replicaID)
}
return nil
})

// Re-enable raft and wait for the lhs to catch up to the post-split
// descriptor. This used to panic with "raft group deleted".
tc.Servers[1].RaftTransport().Listen(store.StoreID(), store)
testutils.SucceedsSoon(t, func() error {
repl, err := store.GetReplica(descLHS.RangeID)
if err != nil {
return err
}
if desc := repl.Desc(); !descLHS.Equal(desc) {
return errors.Errorf("expected %s got %s", &descLHS, desc)
}
return nil
})
}
3 changes: 3 additions & 0 deletions pkg/storage/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,9 @@ func (r *Replica) ChangeReplicas(
// final voters and removes any undesirable replicas.
desc, err := r.finalizeChangeReplicas(ctx, desc, priority, reason, details, chgs)
if err != nil {
if fn := r.store.cfg.TestingKnobs.ReplicaAddSkipLearnerRollback; fn != nil && fn() {
return nil, err
}
// Don't leave a learner replica lying around if we didn't succeed in
// promoting it to a voter.
targets := chgs.Additions()
Expand Down
3 changes: 3 additions & 0 deletions pkg/storage/testing_knobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ type StoreTestingKnobs struct {
// acquiring snapshot quota or doing shouldAcceptSnapshotData checks. If an
// error is returned from the hook, it's sent as an ERROR SnapshotResponse.
ReceiveSnapshot func(*SnapshotRequest_Header) error
// ReplicaAddSkipRollback causes replica addition to skip the learner rollback
// that happens when promotion to a voter fails.
ReplicaAddSkipLearnerRollback func() bool
// ReplicaAddStopAfterLearnerSnapshot causes replica addition to return early
// if the func returns true. Specifically, after the learner txn is successful
// and after the LEARNER type snapshot, but before promoting it to a voter.
Expand Down

0 comments on commit 95f440f

Please sign in to comment.