Allow start many replication pollers in one shard #3790

Merged Jan 13, 2023 (7 commits; diff below shows changes from 3 commits)
19 changes: 13 additions & 6 deletions service/history/replication/poller_manager.go
@@ -31,12 +31,18 @@ import (
)

type (
pollerManager interface {
getSourceClusterShardIDs(sourceClusterName string) []int32
}

pollerManagerImpl struct {
currentShardId int32
clusterMetadata cluster.Metadata
}
)

var _ pollerManager = (*pollerManagerImpl)(nil)

func newPollerManager(
currentShardId int32,
clusterMetadata cluster.Metadata,
@@ -47,24 +53,25 @@ func newPollerManager(
}
}

func (p pollerManagerImpl) getPollingShardIDs(remoteClusterName string) []int32 {
func (p pollerManagerImpl) getSourceClusterShardIDs(sourceClusterName string) []int32 {
currentCluster := p.clusterMetadata.GetCurrentClusterName()
allClusters := p.clusterMetadata.GetAllClusterInfo()
currentClusterInfo, ok := allClusters[currentCluster]
if !ok {
panic("Cannot get current cluster info from cluster metadata cache")
}
remoteClusterInfo, ok := allClusters[remoteClusterName]
remoteClusterInfo, ok := allClusters[sourceClusterName]
if !ok {
panic(fmt.Sprintf("Cannot get remote cluster %s info from cluster metadata cache", remoteClusterName))
panic(fmt.Sprintf("Cannot get source cluster %s info from cluster metadata cache", sourceClusterName))
}
return generatePollingShardIDs(p.currentShardId, currentClusterInfo.ShardCount, remoteClusterInfo.ShardCount)
return generateShardIDs(p.currentShardId, currentClusterInfo.ShardCount, remoteClusterInfo.ShardCount)
}

func generatePollingShardIDs(localShardId int32, localShardCount int32, remoteShardCount int32) []int32 {
func generateShardIDs(localShardId int32, localShardCount int32, remoteShardCount int32) []int32 {
var pollingShards []int32
if remoteShardCount <= localShardCount {
if localShardId <= remoteShardCount {
if localShardId <= remoteShardCount || remoteShardCount == 0 {
// TODO: remove remoteShardCount == 0. This is due to current NDC/XDC functional test setup.
pollingShards = append(pollingShards, localShardId)
}
return pollingShards
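For context, not part of the diff: the hunk above truncates the rest of generateShardIDs, so the fan-out branch (a source cluster with more shards than the local one) is not visible. The sketch below is an illustrative assumption of how such a mapping can work: the visible branch handles a source cluster with fewer or equally many shards, and the assumed second branch lets a local shard poll every source shard congruent to it modulo the local shard count (the temporary remoteShardCount == 0 workaround is ignored). The names here are hypothetical; this is not the merged implementation.

package main

import "fmt"

// exampleShardIDs is an illustrative sketch with 1-based shard IDs; the mapping
// for the remoteShardCount > localShardCount case is an assumption, not the
// code merged in this PR.
func exampleShardIDs(localShardID, localShardCount, remoteShardCount int32) []int32 {
	var shards []int32
	if remoteShardCount <= localShardCount {
		// Source cluster has fewer (or equally many) shards: at most one source
		// shard maps onto this local shard.
		if localShardID <= remoteShardCount {
			shards = append(shards, localShardID)
		}
		return shards
	}
	// Source cluster has more shards: this local shard polls every source shard
	// whose ID is congruent to it modulo the local shard count.
	for id := localShardID; id <= remoteShardCount; id += localShardCount {
		shards = append(shards, id)
	}
	return shards
}

func main() {
	// Local shard 2 of 4, source cluster with 16 shards: polls 2, 6, 10, 14.
	fmt.Println(exampleShardIDs(2, 4, 16))
}

A mapping along these lines keeps every source shard assigned to exactly one local shard, which is why a single local shard may need to run several replication pollers, one per source shard.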
2 changes: 1 addition & 1 deletion service/history/replication/poller_manager_test.go
@@ -90,7 +90,7 @@ func TestGetPollingShardIds(t *testing.T) {
t.Errorf("The code did not panic")
}
}()
shardIDs := generatePollingShardIDs(tt.shardID, tt.localShardCount, tt.remoteShardCount)
shardIDs := generateShardIDs(tt.shardID, tt.localShardCount, tt.remoteShardCount)
assert.Equal(t, tt.expectedShardIDs, shardIDs)
})
}
38 changes: 21 additions & 17 deletions service/history/replication/task_processor.go
@@ -74,9 +74,10 @@ type (

// taskProcessorImpl is responsible for processing replication tasks for a shard.
taskProcessorImpl struct {
currentCluster string
status int32

sourceCluster string
status int32
pollingShardID int32
shard shard.Context
historyEngine shard.Engine
historySerializer serialization.Serializer
@@ -109,6 +110,7 @@ type (

// NewTaskProcessor creates a new replication task processor.
func NewTaskProcessor(
pollingShardID int32,
shard shard.Context,
historyEngine shard.Engine,
config *configs.Config,
@@ -117,24 +119,23 @@
replicationTaskExecutor TaskExecutor,
eventSerializer serialization.Serializer,
) TaskProcessor {
shardID := shard.GetShardID()
taskRetryPolicy := backoff.NewExponentialRetryPolicy(config.ReplicationTaskProcessorErrorRetryWait(shardID)).
WithBackoffCoefficient(config.ReplicationTaskProcessorErrorRetryBackoffCoefficient(shardID)).
WithMaximumInterval(config.ReplicationTaskProcessorErrorRetryMaxInterval(shardID)).
WithMaximumAttempts(config.ReplicationTaskProcessorErrorRetryMaxAttempts(shardID)).
WithExpirationInterval(config.ReplicationTaskProcessorErrorRetryExpiration(shardID))
taskRetryPolicy := backoff.NewExponentialRetryPolicy(config.ReplicationTaskProcessorErrorRetryWait(pollingShardID)).
Contributor: This config cannot be based on pollingShardID, right? Or why would it be?

Contributor: If you have 3 clusters and the shard counts are all different, then what would you specify in the config?

Contributor (Author): Yes, you are right. I am going to revert this.
WithBackoffCoefficient(config.ReplicationTaskProcessorErrorRetryBackoffCoefficient(pollingShardID)).
WithMaximumInterval(config.ReplicationTaskProcessorErrorRetryMaxInterval(pollingShardID)).
WithMaximumAttempts(config.ReplicationTaskProcessorErrorRetryMaxAttempts(pollingShardID)).
WithExpirationInterval(config.ReplicationTaskProcessorErrorRetryExpiration(pollingShardID))

// TODO: define separate set of configs for dlq retry
dlqRetryPolicy := backoff.NewExponentialRetryPolicy(config.ReplicationTaskProcessorErrorRetryWait(shardID)).
WithBackoffCoefficient(config.ReplicationTaskProcessorErrorRetryBackoffCoefficient(shardID)).
WithMaximumInterval(config.ReplicationTaskProcessorErrorRetryMaxInterval(shardID)).
WithMaximumAttempts(config.ReplicationTaskProcessorErrorRetryMaxAttempts(shardID)).
WithExpirationInterval(config.ReplicationTaskProcessorErrorRetryExpiration(shardID))
dlqRetryPolicy := backoff.NewExponentialRetryPolicy(config.ReplicationTaskProcessorErrorRetryWait(pollingShardID)).
WithBackoffCoefficient(config.ReplicationTaskProcessorErrorRetryBackoffCoefficient(pollingShardID)).
WithMaximumInterval(config.ReplicationTaskProcessorErrorRetryMaxInterval(pollingShardID)).
WithMaximumAttempts(config.ReplicationTaskProcessorErrorRetryMaxAttempts(pollingShardID)).
WithExpirationInterval(config.ReplicationTaskProcessorErrorRetryExpiration(pollingShardID))

return &taskProcessorImpl{
currentCluster: shard.GetClusterMetadata().GetCurrentClusterName(),
sourceCluster: replicationTaskFetcher.getSourceCluster(),
status: common.DaemonStatusInitialized,
pollingShardID: pollingShardID,
sourceCluster: replicationTaskFetcher.getSourceCluster(),
shard: shard,
historyEngine: historyEngine,
historySerializer: eventSerializer,
@@ -370,6 +371,7 @@ func (p *taskProcessorImpl) convertTaskToDLQTask(
switch replicationTask.TaskType {
case enumsspb.REPLICATION_TASK_TYPE_SYNC_ACTIVITY_TASK:
taskAttributes := replicationTask.GetSyncActivityTaskAttributes()
// TODO: GetShardID will break GetDLQReplicationMessages. We need to handle DLQ for cross-shard replication.
return &persistence.PutReplicationTaskToDLQRequest{
ShardID: p.shard.GetShardID(),
SourceClusterName: p.sourceCluster,
@@ -401,6 +403,7 @@ func (p *taskProcessorImpl) convertTaskToDLQTask(
// NOTE: last event vs next event, next event ID is exclusive
nextEventID := lastEvent.GetEventId() + 1

// TODO: GetShardID will break GetDLQReplicationMessages. We need to handle DLQ for cross-shard replication.

Contributor: Why will it break? Using the local shard ID for storing the DLQ task seems like the right behavior to me.

Contributor (Author): Yes, here it is OK. But when hydrating the replication task from the source cluster, it needs to calculate the shard ID instead of reusing this one. We will not hit this case in the short term. (See the illustrative sketch after this file's diff.)

return &persistence.PutReplicationTaskToDLQRequest{
ShardID: p.shard.GetShardID(),
SourceClusterName: p.sourceCluster,
@@ -429,6 +432,7 @@ func (p *taskProcessorImpl) convertTaskToDLQTask(
return nil, err
}

// TODO: GetShardID will break GetDLQReplicationMessages. We need to handle DLQ for cross-shard replication.
return &persistence.PutReplicationTaskToDLQRequest{
ShardID: p.shard.GetShardID(),
SourceClusterName: p.sourceCluster,
@@ -451,7 +455,7 @@ func (p *taskProcessorImpl) paginationFn(_ []byte) ([]interface{}, []byte, error
respChan := make(chan *replicationspb.ReplicationMessages, 1)
p.requestChan <- &replicationTaskRequest{
token: &replicationspb.ReplicationToken{
ShardId: p.shard.GetShardID(),
ShardId: p.pollingShardID,
LastProcessedMessageId: p.maxRxProcessedTaskID,
LastProcessedVisibilityTime: &p.maxRxProcessedTimestamp,
LastRetrievedMessageId: p.maxRxReceivedTaskID,
@@ -486,7 +490,7 @@ func (p *taskProcessorImpl) paginationFn(_ []byte) ([]interface{}, []byte, error
if resp.GetHasMore() {
p.rxTaskBackoff = time.Duration(0)
} else {
p.rxTaskBackoff = p.config.ReplicationTaskProcessorNoTaskRetryWait(p.shard.GetShardID())
p.rxTaskBackoff = p.config.ReplicationTaskProcessorNoTaskRetryWait(p.pollingShardID)
}
return tasks, nil, nil

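For context, not part of the diff: the DLQ review thread above notes that hydrating a replication task on the source cluster would require recomputing the owning shard there rather than reusing the local shard ID. The sketch below illustrates one common way to derive a workflow's owning shard, hashing the namespace ID and workflow ID modulo the shard count; the hash function and helper name are assumptions for illustration, not the server's actual routine.

package main

import (
	"fmt"
	"hash/fnv"
)

// exampleShardForWorkflow is an assumed illustration of shard ownership:
// hash(namespaceID + workflowID) mod shardCount, with 1-based shard IDs.
// It is not the helper the server uses.
func exampleShardForWorkflow(namespaceID, workflowID string, shardCount int32) int32 {
	h := fnv.New32a()
	_, _ = h.Write([]byte(namespaceID))
	_, _ = h.Write([]byte(workflowID))
	return int32(h.Sum32()%uint32(shardCount)) + 1
}

func main() {
	// The same workflow can land on different shard IDs when two clusters use
	// different shard counts, which is why the local shard ID cannot simply be
	// reused on the source cluster.
	fmt.Println(exampleShardForWorkflow("ns-id", "wf-id", 4))
	fmt.Println(exampleShardForWorkflow("ns-id", "wf-id", 16))
}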
72 changes: 41 additions & 31 deletions service/history/replication/task_processor_manager.go
@@ -26,6 +26,7 @@ package replication

import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
@@ -49,6 +50,10 @@ import (
wcache "go.temporal.io/server/service/history/workflow/cache"
)

const (
clusterCallbackKey = "%s-%d" // <cluster name>-<polling shard id>
)

type (
// taskProcessorManagerImpl is to manage replication task processors
taskProcessorManagerImpl struct {
@@ -62,6 +67,7 @@ type (
workflowCache wcache.Cache
resender xdc.NDCHistoryResender
taskExecutorProvider TaskExecutorProvider
taskPollerManager pollerManager
metricsHandler metrics.Handler
logger log.Logger

@@ -110,6 +116,7 @@ func NewTaskProcessorManager(
metricsHandler: shard.GetMetricsHandler(),
taskProcessors: make(map[string]TaskProcessor),
taskExecutorProvider: taskExecutorProvider,
taskPollerManager: newPollerManager(shard.GetShardID(), shard.GetClusterMetadata()),
minTxAckedTaskID: persistence.EmptyQueueMessageID,
shutdownChan: make(chan struct{}),
}
@@ -167,37 +174,40 @@ func (r *taskProcessorManagerImpl) handleClusterMetadataUpdate(
if clusterName == currentClusterName {
continue
}
// The metadata triggers a update when the following fields update: 1. Enabled 2. Initial Failover Version 3. Cluster address
// The callback covers three cases:
// Case 1: Remove a cluster Case 2: Add a new cluster Case 3: Refresh cluster metadata.

if processor, ok := r.taskProcessors[clusterName]; ok {
// Case 1 and Case 3
processor.Stop()
delete(r.taskProcessors, clusterName)
}

if clusterInfo := newClusterMetadata[clusterName]; clusterInfo != nil && clusterInfo.Enabled {
// Case 2 and Case 3
fetcher := r.replicationTaskFetcherFactory.GetOrCreateFetcher(clusterName)
replicationTaskProcessor := NewTaskProcessor(
r.shard,
r.engine,
r.config,
r.shard.GetMetricsHandler(),
fetcher,
r.taskExecutorProvider(TaskExecutorParams{
RemoteCluster: clusterName,
Shard: r.shard,
HistoryResender: r.resender,
HistoryEngine: r.engine,
DeleteManager: r.deleteMgr,
WorkflowCache: r.workflowCache,
}),
r.eventSerializer,
)
replicationTaskProcessor.Start()
r.taskProcessors[clusterName] = replicationTaskProcessor
pollingShardIds := r.taskPollerManager.getSourceClusterShardIDs(clusterName)
for _, pollingShardId := range pollingShardIds {
perShardTaskProcessorKey := fmt.Sprintf(clusterCallbackKey, clusterName, pollingShardId)
// The metadata triggers an update when the following fields update: 1. Enabled 2. Initial Failover Version 3. Cluster address
// The callback covers three cases:
// Case 1: Remove a cluster Case 2: Add a new cluster Case 3: Refresh cluster metadata.
if processor, ok := r.taskProcessors[perShardTaskProcessorKey]; ok {
// Case 1 and Case 3
processor.Stop()
delete(r.taskProcessors, perShardTaskProcessorKey)
}
if clusterInfo := newClusterMetadata[clusterName]; clusterInfo != nil && clusterInfo.Enabled {
Contributor: What if a cluster exists in both the old and the new cluster metadata? Stop and restart? What if a cluster exists only in the new cluster metadata? No-op?

Contributor (Author): If a cluster exists in both the old and the new metadata, it means the metadata was updated, so we stop the processor and restart it to load the new metadata. If a cluster exists only in the new metadata, it means the cluster was newly added, so we just start a new processor. (A sketch of this lifecycle follows this file's diff.)

// Case 2 and Case 3
fetcher := r.replicationTaskFetcherFactory.GetOrCreateFetcher(clusterName)
replicationTaskProcessor := NewTaskProcessor(
pollingShardId,
r.shard,
r.engine,
r.config,
r.shard.GetMetricsHandler(),
fetcher,
r.taskExecutorProvider(TaskExecutorParams{
RemoteCluster: clusterName,
Shard: r.shard,
HistoryResender: r.resender,
HistoryEngine: r.engine,
DeleteManager: r.deleteMgr,
WorkflowCache: r.workflowCache,
}),
r.eventSerializer,
)
replicationTaskProcessor.Start()
r.taskProcessors[perShardTaskProcessorKey] = replicationTaskProcessor
}
}
}
}
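For context, not part of the diff: a minimal sketch of the per-(cluster, polling shard) processor lifecycle discussed in the review thread above. Keys are built with the clusterCallbackKey format, any existing processor for that key is stopped on a metadata update, and a new one is started only if the cluster is still enabled. The types and the refreshProcessors helper are simplified assumptions for illustration, not the repository's real types.

package main

import "fmt"

// Simplified stand-ins for the real processor and cluster metadata types.
type processor interface {
	Start()
	Stop()
}

type fakeProcessor struct{ key string }

func (f *fakeProcessor) Start() { fmt.Println("start", f.key) }
func (f *fakeProcessor) Stop()  { fmt.Println("stop", f.key) }

type clusterInfo struct{ Enabled bool }

const clusterCallbackKey = "%s-%d" // <cluster name>-<polling shard id>

// refreshProcessors mirrors the shape of handleClusterMetadataUpdate: stop any
// existing per-(cluster, shard) processor, then start a fresh one per polling
// shard if the cluster is still enabled in the new metadata.
func refreshProcessors(
	processors map[string]processor,
	clusterName string,
	pollingShardIDs []int32,
	newMetadata map[string]*clusterInfo,
) {
	for _, shardID := range pollingShardIDs {
		key := fmt.Sprintf(clusterCallbackKey, clusterName, shardID)
		if p, ok := processors[key]; ok {
			// Removed or refreshed cluster: drop the old processor.
			p.Stop()
			delete(processors, key)
		}
		if info := newMetadata[clusterName]; info != nil && info.Enabled {
			// Added or refreshed cluster: start a new processor for this shard.
			p := &fakeProcessor{key: key}
			p.Start()
			processors[key] = p
		}
	}
}

func main() {
	processors := map[string]processor{}
	meta := map[string]*clusterInfo{"cluster-b": {Enabled: true}}
	refreshProcessors(processors, "cluster-b", []int32{2, 6, 10}, meta)
}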
1 change: 1 addition & 0 deletions service/history/replication/task_processor_test.go
@@ -148,6 +148,7 @@ func (s *taskProcessorSuite) SetupTest() {
metricsClient := metrics.NoopMetricsHandler

s.replicationTaskProcessor = NewTaskProcessor(
s.shardID,
s.mockShard,
s.mockEngine,
s.config,