diff --git a/pkg/restore/range.go b/pkg/restore/range.go
index ac512de75..81881e78e 100644
--- a/pkg/restore/range.go
+++ b/pkg/restore/range.go
@@ -75,8 +75,10 @@ func SortRanges(ranges []rtree.Range, rewriteRules *RewriteRules) ([]rtree.Range
 
 // RegionInfo includes a region and the leader of the region.
 type RegionInfo struct {
-	Region *metapb.Region
-	Leader *metapb.Peer
+	Region       *metapb.Region
+	Leader       *metapb.Peer
+	PendingPeers []*metapb.Peer
+	DownPeers    []*metapb.Peer
 }
 
 // ContainsInterior returns whether the region contains the given key, and also
diff --git a/pkg/restore/split.go b/pkg/restore/split.go
index 87bc2183e..dfd93893b 100644
--- a/pkg/restore/split.go
+++ b/pkg/restore/split.go
@@ -185,12 +185,27 @@ SplitRegions:
 	return nil
 }
 
-func (rs *RegionSplitter) hasRegion(ctx context.Context, regionID uint64) (bool, error) {
+func (rs *RegionSplitter) hasHealthyRegion(ctx context.Context, regionID uint64) (bool, error) {
 	regionInfo, err := rs.client.GetRegionByID(ctx, regionID)
 	if err != nil {
 		return false, errors.Trace(err)
 	}
-	return regionInfo != nil, nil
+	// the region hasn't become ready yet.
+	if regionInfo == nil {
+		return false, nil
+	}
+
+	// check whether the region is healthy and report any unhealthy peers.
+	// TODO: the log may be too verbose. we should use Prometheus metrics once they are ready for BR.
+	for _, peer := range regionInfo.PendingPeers {
+		log.Debug("unhealthy region detected", logutil.Peer(peer), zap.String("type", "pending"))
+	}
+	for _, peer := range regionInfo.DownPeers {
+		log.Debug("unhealthy region detected", logutil.Peer(peer), zap.String("type", "down"))
+	}
+	// we ignore down peers because they are (normally) hard to fix in a reasonable time
+	// (otherwise, once a peer is down, we may get stuck waiting for the region to become ready).
+	return len(regionInfo.PendingPeers) == 0, nil
 }
 
 func (rs *RegionSplitter) isScatterRegionFinished(ctx context.Context, regionID uint64) (bool, error) {
@@ -218,7 +233,7 @@ func (rs *RegionSplitter) isScatterRegionFinished(ctx context.Context, regionID
 func (rs *RegionSplitter) waitForSplit(ctx context.Context, regionID uint64) {
 	interval := SplitCheckInterval
 	for i := 0; i < SplitCheckMaxRetryTimes; i++ {
-		ok, err := rs.hasRegion(ctx, regionID)
+		ok, err := rs.hasHealthyRegion(ctx, regionID)
 		if err != nil {
 			log.Warn("wait for split failed", zap.Error(err))
 			return
diff --git a/pkg/restore/split_client.go b/pkg/restore/split_client.go
index ed24fc398..add5ec02a 100755
--- a/pkg/restore/split_client.go
+++ b/pkg/restore/split_client.go
@@ -172,8 +172,10 @@ func (c *pdClient) GetRegionByID(ctx context.Context, regionID uint64) (*RegionI
 		return nil, nil
 	}
 	return &RegionInfo{
-		Region: region.Meta,
-		Leader: region.Leader,
+		Region:       region.Meta,
+		Leader:       region.Leader,
+		PendingPeers: region.PendingPeers,
+		DownPeers:    region.DownPeers,
 	}, nil
 }
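
Reviewer note: below is a minimal standalone sketch (not part of the patch) of the retry-until-healthy shape that waitForSplit follows once hasHealthyRegion is plugged in. The constant values, the backoff cap, and the callback signature are assumptions for illustration only; the real code polls rs.hasHealthyRegion with the SplitCheckInterval/SplitCheckMaxRetryTimes constants shown in the hunk above.

package main

import (
	"context"
	"fmt"
	"time"
)

const (
	splitCheckInterval      = 50 * time.Millisecond // assumed value
	splitCheckMaxRetryTimes = 64                    // assumed value
	splitMaxWaitInterval    = time.Second           // assumed backoff cap
)

// waitForHealthy polls check until it reports a healthy region,
// the check errors out, or the retry budget is exhausted.
func waitForHealthy(ctx context.Context, check func(context.Context) (bool, error)) {
	interval := splitCheckInterval
	for i := 0; i < splitCheckMaxRetryTimes; i++ {
		ok, err := check(ctx)
		if err != nil || ok {
			return
		}
		// back off between checks, capped at splitMaxWaitInterval.
		interval *= 2
		if interval > splitMaxWaitInterval {
			interval = splitMaxWaitInterval
		}
		time.Sleep(interval)
	}
}

func main() {
	calls := 0
	waitForHealthy(context.Background(), func(ctx context.Context) (bool, error) {
		calls++
		return calls >= 3, nil // pretend the region becomes healthy on the third check
	})
	fmt.Println("region became healthy after", calls, "checks")
}

Note the design consequence of the patch: because hasHealthyRegion returns false while any peer is still pending, this loop now also waits out pending peers after a split, not just the region's existence; down peers are deliberately excluded from the check so a dead store cannot stall the wait forever.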