From a7c0b903d153ace47c6bbc7f4a4b106c99a6da95 Mon Sep 17 00:00:00 2001 From: yux0 Date: Thu, 9 Feb 2023 16:27:28 -0800 Subject: [PATCH 1/4] Make frontend drain traffic time configurable --- service/frontend/service.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/service/frontend/service.go b/service/frontend/service.go index 63d6b4dd8c7..4d2bd91cba6 100644 --- a/service/frontend/service.go +++ b/service/frontend/service.go @@ -334,19 +334,18 @@ func (s *Service) Stop() { // initiate graceful shutdown: // 1. Fail rpc health check, this will cause client side load balancer to stop forwarding requests to this node - // 2. wait for failure detection time + // 2. wait for 10 seconds failure detection time // 3. stop taking new requests by returning InternalServiceError - // 4. Wait for a second + // 4. Wait for X second // 5. Stop everything forcefully and return - requestDrainTime := util.Min(time.Second, s.config.ShutdownDrainDuration()) - failureDetectionTime := util.Max(0, s.config.ShutdownDrainDuration()-requestDrainTime) + requestDrainTime := util.Max(time.Second, s.config.ShutdownDrainDuration()) logger.Info("ShutdownHandler: Updating gRPC health status to ShuttingDown") s.healthServer.Shutdown() logger.Info("ShutdownHandler: Waiting for others to discover I am unhealthy") - time.Sleep(failureDetectionTime) + time.Sleep(10 * time.Second) s.handler.Stop() s.operatorHandler.Stop() From d27e14be55dce5ba429c31383802132ac8b9efdc Mon Sep 17 00:00:00 2001 From: yux0 Date: Fri, 10 Feb 2023 09:02:13 -0800 Subject: [PATCH 2/4] add config for failure detection time --- common/dynamicconfig/constants.go | 2 ++ service/frontend/service.go | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index 5e5196894b8..d97af6af530 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -202,6 +202,8 @@ const ( FrontendThrottledLogRPS 
= "frontend.throttledLogRPS" // FrontendShutdownDrainDuration is the duration of traffic drain during shutdown FrontendShutdownDrainDuration = "frontend.shutdownDrainDuration" + // FrontendMembershipFailureDetectionDuration is the duration of membership failure detection + FrontendMembershipFailureDetectionDuration = "frontend.membershipFailureDetectionDuration" // FrontendMaxBadBinaries is the max number of bad binaries in namespace config FrontendMaxBadBinaries = "frontend.maxBadBinaries" // SendRawWorkflowHistory is whether to enable raw history retrieving diff --git a/service/frontend/service.go b/service/frontend/service.go index 4d2bd91cba6..eb6b0e21490 100644 --- a/service/frontend/service.go +++ b/service/frontend/service.go @@ -85,6 +85,7 @@ type Config struct { WorkerBuildIdSizeLimit dynamicconfig.IntPropertyFn DisallowQuery dynamicconfig.BoolPropertyFnWithNamespaceFilter ShutdownDrainDuration dynamicconfig.DurationPropertyFn + MembershipFailureDetectionDuration dynamicconfig.DurationPropertyFn MaxBadBinaries dynamicconfig.IntPropertyFnWithNamespaceFilter @@ -207,6 +208,7 @@ func NewConfig(dc *dynamicconfig.Collection, numHistoryShards int32, enableReadF BlobSizeLimitWarn: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.BlobSizeLimitWarn, 256*1024), ThrottledLogRPS: dc.GetIntProperty(dynamicconfig.FrontendThrottledLogRPS, 20), ShutdownDrainDuration: dc.GetDurationProperty(dynamicconfig.FrontendShutdownDrainDuration, 0*time.Second), + MembershipFailureDetectionDuration: dc.GetDurationProperty(dynamicconfig.FrontendMembershipFailureDetectionDuration, 10*time.Second), EnableNamespaceNotActiveAutoForwarding: dc.GetBoolPropertyFnWithNamespaceFilter(dynamicconfig.EnableNamespaceNotActiveAutoForwarding, true), SearchAttributesNumberOfKeysLimit: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.SearchAttributesNumberOfKeysLimit, 100), SearchAttributesSizeOfValueLimit: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.SearchAttributesSizeOfValueLimit, 
2*1024), @@ -334,18 +336,19 @@ func (s *Service) Stop() { // initiate graceful shutdown: // 1. Fail rpc health check, this will cause client side load balancer to stop forwarding requests to this node - // 2. wait for 10 seconds failure detection time + // 2. wait for failure detection time // 3. stop taking new requests by returning InternalServiceError // 4. Wait for X second // 5. Stop everything forcefully and return requestDrainTime := util.Max(time.Second, s.config.ShutdownDrainDuration()) + failureDetectionTime := util.Max(0, s.config.MembershipFailureDetectionDuration()) logger.Info("ShutdownHandler: Updating gRPC health status to ShuttingDown") s.healthServer.Shutdown() logger.Info("ShutdownHandler: Waiting for others to discover I am unhealthy") - time.Sleep(10 * time.Second) + time.Sleep(failureDetectionTime) s.handler.Stop() s.operatorHandler.Stop() From f087c9e523e80f6e8158e85a362e47dcc4668693 Mon Sep 17 00:00:00 2001 From: yux0 Date: Fri, 10 Feb 2023 11:33:07 -0800 Subject: [PATCH 3/4] Rename membership failure detection config to frontend.shutdownFailHealthcheckDuration --- common/dynamicconfig/constants.go | 4 ++-- service/frontend/service.go | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index d97af6af530..3b952ce8da5 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -202,8 +202,8 @@ const ( FrontendThrottledLogRPS = "frontend.throttledLogRPS" // FrontendShutdownDrainDuration is the duration of traffic drain during shutdown FrontendShutdownDrainDuration = "frontend.shutdownDrainDuration" - // FrontendMembershipFailureDetectionDuration is the duration of membership failure detection - FrontendMembershipFailureDetectionDuration = "frontend.membershipFailureDetectionDuration" + // FrontendShutdownFailHealthcheckDuration is the duration of shutdown failure detection + FrontendShutdownFailHealthcheckDuration = "frontend.shutdownFailHealthcheckDuration" // FrontendMaxBadBinaries is the max number of 
bad binaries in namespace config FrontendMaxBadBinaries = "frontend.maxBadBinaries" // SendRawWorkflowHistory is whether to enable raw history retrieving diff --git a/service/frontend/service.go b/service/frontend/service.go index eb6b0e21490..5a12f6143e3 100644 --- a/service/frontend/service.go +++ b/service/frontend/service.go @@ -85,7 +85,7 @@ type Config struct { WorkerBuildIdSizeLimit dynamicconfig.IntPropertyFn DisallowQuery dynamicconfig.BoolPropertyFnWithNamespaceFilter ShutdownDrainDuration dynamicconfig.DurationPropertyFn - MembershipFailureDetectionDuration dynamicconfig.DurationPropertyFn + ShutdownFailureDetectionDuration dynamicconfig.DurationPropertyFn MaxBadBinaries dynamicconfig.IntPropertyFnWithNamespaceFilter @@ -208,7 +208,7 @@ func NewConfig(dc *dynamicconfig.Collection, numHistoryShards int32, enableReadF BlobSizeLimitWarn: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.BlobSizeLimitWarn, 256*1024), ThrottledLogRPS: dc.GetIntProperty(dynamicconfig.FrontendThrottledLogRPS, 20), ShutdownDrainDuration: dc.GetDurationProperty(dynamicconfig.FrontendShutdownDrainDuration, 0*time.Second), - MembershipFailureDetectionDuration: dc.GetDurationProperty(dynamicconfig.FrontendMembershipFailureDetectionDuration, 10*time.Second), + ShutdownFailureDetectionDuration: dc.GetDurationProperty(dynamicconfig.FrontendShutdownFailHealthcheckDuration, 10*time.Second), EnableNamespaceNotActiveAutoForwarding: dc.GetBoolPropertyFnWithNamespaceFilter(dynamicconfig.EnableNamespaceNotActiveAutoForwarding, true), SearchAttributesNumberOfKeysLimit: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.SearchAttributesNumberOfKeysLimit, 100), SearchAttributesSizeOfValueLimit: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.SearchAttributesSizeOfValueLimit, 2*1024), @@ -342,7 +342,7 @@ func (s *Service) Stop() { // 5. 
Stop everything forcefully and return requestDrainTime := util.Max(time.Second, s.config.ShutdownDrainDuration()) - failureDetectionTime := util.Max(0, s.config.MembershipFailureDetectionDuration()) + failureDetectionTime := util.Max(0, s.config.ShutdownFailureDetectionDuration()) logger.Info("ShutdownHandler: Updating gRPC health status to ShuttingDown") s.healthServer.Shutdown() From 6d968f0294c960e352c0ad31d9be7f6e332eba7e Mon Sep 17 00:00:00 2001 From: yux0 Date: Fri, 10 Feb 2023 11:41:30 -0800 Subject: [PATCH 4/4] Rename Config.ShutdownFailureDetectionDuration to ShutdownFailHealthcheckDuration --- service/frontend/service.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/service/frontend/service.go b/service/frontend/service.go index 5a12f6143e3..42d4ddbd12e 100644 --- a/service/frontend/service.go +++ b/service/frontend/service.go @@ -85,7 +85,7 @@ type Config struct { WorkerBuildIdSizeLimit dynamicconfig.IntPropertyFn DisallowQuery dynamicconfig.BoolPropertyFnWithNamespaceFilter ShutdownDrainDuration dynamicconfig.DurationPropertyFn - ShutdownFailureDetectionDuration dynamicconfig.DurationPropertyFn + ShutdownFailHealthcheckDuration dynamicconfig.DurationPropertyFn MaxBadBinaries dynamicconfig.IntPropertyFnWithNamespaceFilter @@ -208,7 +208,7 @@ func NewConfig(dc *dynamicconfig.Collection, numHistoryShards int32, enableReadF BlobSizeLimitWarn: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.BlobSizeLimitWarn, 256*1024), ThrottledLogRPS: dc.GetIntProperty(dynamicconfig.FrontendThrottledLogRPS, 20), ShutdownDrainDuration: dc.GetDurationProperty(dynamicconfig.FrontendShutdownDrainDuration, 0*time.Second), - ShutdownFailureDetectionDuration: dc.GetDurationProperty(dynamicconfig.FrontendShutdownFailHealthcheckDuration, 10*time.Second), + ShutdownFailHealthcheckDuration: dc.GetDurationProperty(dynamicconfig.FrontendShutdownFailHealthcheckDuration, 10*time.Second), EnableNamespaceNotActiveAutoForwarding: dc.GetBoolPropertyFnWithNamespaceFilter(dynamicconfig.EnableNamespaceNotActiveAutoForwarding, true), 
SearchAttributesNumberOfKeysLimit: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.SearchAttributesNumberOfKeysLimit, 100), SearchAttributesSizeOfValueLimit: dc.GetIntPropertyFilteredByNamespace(dynamicconfig.SearchAttributesSizeOfValueLimit, 2*1024), @@ -342,7 +342,7 @@ func (s *Service) Stop() { // 5. Stop everything forcefully and return requestDrainTime := util.Max(time.Second, s.config.ShutdownDrainDuration()) - failureDetectionTime := util.Max(0, s.config.ShutdownFailureDetectionDuration()) + failureDetectionTime := util.Max(0, s.config.ShutdownFailHealthcheckDuration()) logger.Info("ShutdownHandler: Updating gRPC health status to ShuttingDown") s.healthServer.Shutdown()