Merge #39796
39796: storage: avoid (one) fatal error from splitPostApply r=ajwerner a=tbg

This is the next band-aid on top of #39658 and #39571.
The descriptor lookup I added sometimes fails because replicas can
apply a split trigger even though they are not a member of the range:

> F190821 15:14:28.241623 312191 storage/store.go:2172
> [n2,s2,r21/3:/{Table/54-Max}] replica descriptor of local store not
> found in right hand side of split

I saw this randomly in `make test PKG=./pkg/ccl/partitionccl`.

@danhhz could you take a stab at replicating this in the test you added
recently? It's probably too ambitious but it'd be nice to verify this is
actually fixed (I didn't). I haven't seen this fail on master (though I also
didn't look very hard), so maybe this is rare (i.e. we have time), but then I
just hit it while doing nothing related in particular.

Release note: None

Co-authored-by: Tobias Schottdorf <tobias.schottdorf@gmail.com>
craig[bot] and tbg committed Sep 6, 2019
2 parents a72c573 + 4cdf0f9 commit 4c2138b
Showing 6 changed files with 139 additions and 22 deletions.
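To make the fix described above concrete, here is a condensed, self-contained sketch of the tolerant right-hand-side lookup that replaces the fatal error. This is an illustration only: the types and function names below are simplified stand-ins, not the actual roachpb/storage code; see the pkg/storage/store.go hunk further down for the real change.

package main

import "fmt"

// replicaDescriptor and rangeDescriptor are simplified stand-ins for the
// real roachpb types; only the fields needed for the sketch are included.
type replicaDescriptor struct {
	StoreID   int
	ReplicaID int
}

type rangeDescriptor struct {
	Replicas []replicaDescriptor
}

// getReplicaDescriptor returns the replica belonging to the given store, if any.
func (d rangeDescriptor) getReplicaDescriptor(storeID int) (replicaDescriptor, bool) {
	for _, r := range d.Replicas {
		if r.StoreID == storeID {
			return r, true
		}
	}
	return replicaDescriptor{}, false
}

// rhsReplicaIDForSplit sketches the behavior this commit introduces in
// splitPostApply: if the local store is not a member of the split's right-hand
// side, fall back to replica ID zero (the "not yet known" ID) instead of
// fataling, and wind the in-memory replica ID back if it has already moved
// past what the split trigger expects.
func rhsReplicaIDForSplit(rightDesc rangeDescriptor, localStoreID, curReplicaID int) int {
	rd, found := rightDesc.getReplicaDescriptor(localStoreID)
	if !found {
		// Previously this path was a fatal error; now the absence is tolerated
		// and zero is used, which never initializes a raft group until the real
		// replica ID arrives via inbound raft traffic.
		rd.ReplicaID = 0
	}
	if curReplicaID > rd.ReplicaID {
		curReplicaID = rd.ReplicaID
	}
	return curReplicaID
}

func main() {
	// The local store (s2) is not part of the RHS at the time the split applies.
	rhs := rangeDescriptor{Replicas: []replicaDescriptor{
		{StoreID: 1, ReplicaID: 1},
		{StoreID: 3, ReplicaID: 2},
	}}
	fmt.Println(rhsReplicaIDForSplit(rhs, 2, 3)) // 0: wound back, raft group stays dormant
}
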
87 changes: 78 additions & 9 deletions pkg/storage/client_split_test.go
@@ -3195,22 +3195,71 @@ func TestStoreSplitDisappearingReplicas(t *testing.T) {
// Regression test for #21146. This verifies the behavior of when the
// application of some split command (part of the lhs's log) is delayed on some
// store and meanwhile the rhs has rebalanced away and back, ending up with a
// larger ReplicaID than the split thinks it will have.
// larger ReplicaID than the split thinks it will have. Additionally we remove
// the LHS replica on the same store before the split and re-add it after, so
// that when the connectivity restores the LHS will apply a split trigger while
// it is not a part of the descriptor.
//
// Or, in pictures (s3 looks like s1 throughout and is omitted):
//
// s1: [----r1@all-------------]
// s2: [----r1@all-------------]
// Remove s2:
// s1: [----r1@s1s3------------]
// s2: [----r1@all-------------] (outdated)
// Split r1:
// s1: [-r1@s1s3-|--r2@s1s3----]
// s2: [----r1@all-------------] (outdated)
// Add s2:
// s1: [-r1@all-|--r2@s1s3-----]
// s2: [----r1@all-------------] (outdated)
// Add learner to s2 on r2 (remains uninitialized due to LHS state blocking it):
// s1: [-r1@s1s3-|--r2@all-----]
// s2: [----r1@all-------------] (outdated), uninitialized replica r2/3
// Remove and re-add learner multiple times: r2/3 becomes r2/100
// (diagram looks the same except for replacing r2/3)
//
// When connectivity is restored, r1@s2 will start to catch up on the raft log
// after it learns of its new replicaID. It first processes the replication
// change that removes it and switches to a desc that doesn't contain itself as
// a replica. Next it sees the split trigger that once caused a crash because
// the store tried to look up itself and failed. This being handled correctly,
// the split trigger next has to look up the right hand side, which surprisingly
// has a higher replicaID than that seen in the split trigger. This too needs to
// be tolerated.
func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()

blockPromoteCh := make(chan struct{})
var skipLearnerSnaps int32
withoutLearnerSnap := func(fn func()) {
atomic.StoreInt32(&skipLearnerSnaps, 1)
fn()
atomic.StoreInt32(&skipLearnerSnaps, 0)
}
knobs := base.TestingKnobs{Store: &storage.StoreTestingKnobs{
ReplicaAddStopAfterLearnerSnapshot: func() bool {
<-blockPromoteCh
ReplicaSkipLearnerSnapshot: func() bool {
return atomic.LoadInt32(&skipLearnerSnaps) != 0
},
ReplicaAddStopAfterLearnerSnapshot: func(targets []roachpb.ReplicationTarget) bool {
if atomic.LoadInt32(&skipLearnerSnaps) != 0 {
return false
}
if len(targets) > 0 && targets[0].StoreID == 2 {
<-blockPromoteCh
}
return false
},
ReplicaAddSkipLearnerRollback: func() bool {
return true
},
// We rely on replicas remaining where they are even when they are removed
// from the range as this lets us set up a split trigger that will apply
// on a replica that is (at the time of the split trigger) not a member.
DisableReplicaGCQueue: true,
}}
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
@@ -3219,6 +3268,9 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
k := tc.ScratchRange(t)
desc := tc.LookupRangeOrFatal(t, k)

// Add a replica on n3 which we'll need to achieve quorum while we cut off n2 below.
tc.AddReplicasOrFatal(t, k, tc.Target(2))

// First construct a range with a learner replica on the second node (index 1)
// and split it, ending up with an orphaned learner on each side of the split.
// After the learner is created, but before the split, block all incoming raft
@@ -3237,23 +3289,40 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
})

_, kRHS := k, k.Next()
// Remove the LHS on the isolated store, split the range, and re-add it.
tc.RemoveReplicasOrFatal(t, k, tc.Target(1))
descLHS, descRHS := tc.SplitRangeOrFatal(t, kRHS)
withoutLearnerSnap(func() {
// NB: can't use AddReplicas since that waits for the target to be up
// to date, which it won't in this case.
//
// We avoid sending a snapshot because that snapshot would include the
// split trigger and we want that to be processed via the log.
d, err := tc.Servers[0].DB().AdminChangeReplicas(
ctx, descLHS.StartKey.AsRawKey(), descLHS, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(1)),
)
require.NoError(t, err)
descLHS = *d
})

close(blockPromoteCh)
if err := g.Wait(); !testutils.IsError(err, `descriptor changed`) {
t.Fatalf(`expected "descriptor changed" error got: %+v`, err)
}

// Now repeatedly remove and re-add the learner on the rhs, so it has a
// Now repeatedly re-add the learner on the rhs, so it has a
// different replicaID than the split trigger expects.
for i := 0; i < 5; i++ {
_, err := tc.RemoveReplicas(kRHS, tc.Target(1))
require.NoError(t, err)
_, err = tc.AddReplicas(kRHS, tc.Target(1))
add := func() {
_, err := tc.AddReplicas(kRHS, tc.Target(1))
if !testutils.IsError(err, `snapshot intersects existing range`) {
t.Fatalf(`expected snapshot intersects existing range" error got: %+v`, err)
}
}
for i := 0; i < 5; i++ {
add()
tc.RemoveReplicasOrFatal(t, kRHS, tc.Target(1))
}
add()

// Normally AddReplicas will return the latest version of the RangeDescriptor,
// but because we're getting snapshot errors and using the
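As an aside, the withoutLearnerSnap helper in the test above is just a scoped atomic flag consulted by a testing knob. The pattern in isolation looks roughly like the sketch below; the names are hypothetical stand-ins, not the test's actual API.

package main

import (
	"fmt"
	"sync/atomic"
)

// skipLearnerSnaps is the flag a (hypothetical) knob reads; while it is set,
// the knob reports that learner snapshots should be skipped.
var skipLearnerSnaps int32

// withoutLearnerSnap runs fn with the flag set and restores it afterwards,
// mirroring the helper added to client_split_test.go above.
func withoutLearnerSnap(fn func()) {
	atomic.StoreInt32(&skipLearnerSnaps, 1)
	defer atomic.StoreInt32(&skipLearnerSnaps, 0)
	fn()
}

// replicaSkipLearnerSnapshot has the shape of the knob the store consults.
func replicaSkipLearnerSnapshot() bool {
	return atomic.LoadInt32(&skipLearnerSnaps) != 0
}

func main() {
	fmt.Println(replicaSkipLearnerSnapshot()) // false
	withoutLearnerSnap(func() {
		fmt.Println(replicaSkipLearnerSnapshot()) // true: snapshots suppressed in this scope
	})
	fmt.Println(replicaSkipLearnerSnapshot()) // false again
}
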
3 changes: 3 additions & 0 deletions pkg/storage/raft_snapshot_queue.go
@@ -113,6 +113,9 @@ func (rq *raftSnapshotQueue) processRaftSnapshot(
// the replicate queue. Either way, no point in also sending it a snapshot of
// type RAFT.
if repDesc.GetType() == roachpb.LEARNER {
if fn := repl.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() {
return nil
}
snapType = SnapshotRequest_LEARNER
if index := repl.getAndGCSnapshotLogTruncationConstraints(timeutil.Now(), repDesc.StoreID); index > 0 {
// There is a snapshot being transferred. It's probably a LEARNER snap, so
8 changes: 6 additions & 2 deletions pkg/storage/replica_command.go
@@ -1211,6 +1211,10 @@ func (r *Replica) atomicReplicationChange(
return nil, errors.Errorf("programming error: cannot promote replica of type %s", rDesc.Type)
}

if fn := r.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() {
continue
}

// Note that raft snapshot queue will refuse to send a snapshot to a learner
// replica if its store is already sending a snapshot to that replica. That
// would race with this snapshot, except that we've put a (best effort) lock
@@ -1231,8 +1235,8 @@ func (r *Replica) atomicReplicationChange(
}
}

if len(chgs.Additions()) > 0 {
if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterLearnerSnapshot; fn != nil && fn() {
if adds := chgs.Additions(); len(adds) > 0 {
if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterLearnerSnapshot; fn != nil && fn(adds) {
return desc, nil
}
}
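The two hunks above both rely on the same nil-checked knob pattern, so production code only pays a nil comparison when no knob is installed. A minimal sketch, with simplified types rather than the real StoreTestingKnobs wiring:

package main

import "fmt"

// storeTestingKnobs is a simplified stand-in: a nil func means "knob unset".
type storeTestingKnobs struct {
	ReplicaSkipLearnerSnapshot func() bool
}

// sendLearnerSnapshot stands in for the snapshot-sending paths touched above;
// the knob, when present and returning true, short-circuits the send.
func sendLearnerSnapshot(knobs storeTestingKnobs) string {
	if fn := knobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() {
		return "skipped (testing knob)"
	}
	return "sent"
}

func main() {
	fmt.Println(sendLearnerSnapshot(storeTestingKnobs{})) // sent
	fmt.Println(sendLearnerSnapshot(storeTestingKnobs{
		ReplicaSkipLearnerSnapshot: func() bool { return true },
	})) // skipped (testing knob)
}
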
2 changes: 1 addition & 1 deletion pkg/storage/replica_learner_test.go
@@ -68,7 +68,7 @@ func (rtl *replicationTestKnobs) withStopAfterJointConfig(f func()) {

func makeReplicationTestKnobs() (base.TestingKnobs, *replicationTestKnobs) {
var k replicationTestKnobs
k.storeKnobs.ReplicaAddStopAfterLearnerSnapshot = func() bool {
k.storeKnobs.ReplicaAddStopAfterLearnerSnapshot = func(_ []roachpb.ReplicationTarget) bool {
return atomic.LoadInt64(&k.replicaAddStopAfterLearnerAtomic) > 0
}
k.storeKnobs.ReplicaAddStopAfterJointConfig = func() bool {
54 changes: 45 additions & 9 deletions pkg/storage/store.go
@@ -2138,6 +2138,17 @@ func splitPostApply(
//
// See:
// https://github.com/cockroachdb/cockroach/issues/21146#issuecomment-365757329
//
// TODO(tbg): this argument is flawed - it's possible for a tombstone
// to exist on the RHS:
// https://github.com/cockroachdb/cockroach/issues/40470
// Morally speaking, this means that we should throw away the data we
// moved from the LHS to the RHS (depending on the tombstone).
// Realistically speaking it will probably be easier to create the RHS
// anyway, even though there's a tombstone and it may just get gc'ed
// again. Note that for extra flavor, we may not even know whether the
// RHS is currently supposed to exist or not, lending more weight to the
// second approach.
tombstoneKey := keys.RaftTombstoneKey(rightRng.RangeID)
var tombstone roachpb.RaftTombstone
if ok, err := engine.MVCCGetProto(
@@ -2147,21 +2158,46 @@
} else if ok {
log.Fatalf(ctx, "split trigger found right-hand side with tombstone %+v: %v", tombstone, rightRng)
}
// NB: the safety argument above implies that we don't have to worry
// about restoring the existing minReplicaID if it's nonzero. No
// promises have been made, so none need to be kept.
rightRng.mu.minReplicaID = 0
rightDesc, ok := split.RightDesc.GetReplicaDescriptor(r.StoreID())
if !ok {
log.Fatalf(ctx, "replica descriptor of local store not found in right hand side of split")
// This is yet another special quirky case. The local store is not
// necessarily a member of the split; this can occur if this store
// wasn't a member at the time of the split, but is nevertheless
// catching up across the split. For example, add a learner, and
// while it is being caught up via a snapshot, remove the learner
// again, then execute a split, and re-add it. Upon being re-added
// the learner will likely catch up from where the snapshot left it,
// and it will see itself get removed, then we hit this branch when
// the split trigger is applied, and eventually there's a
// ChangeReplicas that re-adds the local store under a new
// replicaID.
//
// So our trigger will have a smaller replicaID for our RHS, which
// will blow up in initRaftMuLockedReplicaMuLocked. We still want
// to force the RHS to accept the descriptor, even though that
// rewinds the replicaID. To do that we want to change the existing
// replicaID, but we didn't find one -- zero is then the only reasonable
// choice. Note that this is also the replicaID a replica that is
// not reflected in its own descriptor will have, i.e. we're cooking
// up less of a special case here than you'd expect at first glance.
//
// Note that futzing with the replicaID is a high-risk operation as
// it is what the raft peer will believe itself to be identified by.
// Under no circumstances must we use a replicaID that belongs to
// someone else, or a byzantine situation will result. Zero is
// special-cased and will never init a raft group until the real
// ID is known from inbound raft traffic.
rightDesc.ReplicaID = 0 // for clarity only; it's already zero
}
// We also have to potentially wind back the replicaID to avoid an error
// below. We can't overwrite unconditionally since usually the replicaID is
// zero (and thus special) and in that case, moving it forward out-of-band
// results in an broken state.
if rightRng.mu.replicaID > rightDesc.ReplicaID {
rightRng.mu.replicaID = rightDesc.ReplicaID
}
// NB: the safety argument above implies that we don't have to worry
// about restoring the existing minReplicaID if it's nonzero. No
// promises have been made, so none need to be kept. So we clear this
// unconditionally, making sure that it doesn't block us from init'ing
// the RHS.
rightRng.mu.minReplicaID = 0
err := rightRng.initRaftMuLockedReplicaMuLocked(&split.RightDesc, r.store.Clock(), 0)
rightRng.mu.Unlock()
if err != nil {
7 changes: 6 additions & 1 deletion pkg/storage/testing_knobs.go
@@ -195,7 +195,12 @@ type StoreTestingKnobs struct {
// and after the LEARNER type snapshot, but before promoting it to a voter.
// This ensures the `*Replica` will be materialized on the Store when it
// returns.
ReplicaAddStopAfterLearnerSnapshot func() bool
ReplicaAddStopAfterLearnerSnapshot func([]roachpb.ReplicationTarget) bool
// ReplicaSkipLearnerSnapshot causes snapshots to never be sent to learners
// if the func returns true. Adding replicas proceeds as usual, though if
// the added replica has no prior state which can be caught up from the raft
// log, the result will be an voter that is unable to participate in quorum.
ReplicaSkipLearnerSnapshot func() bool
// ReplicaAddStopAfterJointConfig causes replica addition to return early if
// the func returns true. This happens before transitioning out of a joint
// configuration, after the joint configuration has been entered by means
