From 11b14597f4cb619cea538a7035c534ba1d87e169 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Thu, 12 Sep 2019 14:49:07 -0400 Subject: [PATCH] upstream: add failure percentage-based outlier detection (#8130) Description: Add a new outlier detection mode which compares each host's rate of request failure to a configured fixed threshold. Risk Level: Low Testing: 2 new unit tests added. Docs Changes: New mode and config options described. Release Notes: white_check_mark Fixes #8105 Signed-off-by: James Forcier --- .../api/v2/cluster/outlier_detection.proto | 30 ++ .../v2alpha/outlier_detection_event.proto | 12 + .../cluster_manager/cluster_runtime.rst | 25 ++ .../cluster_manager/cluster_stats.rst | 4 + .../intro/arch_overview/upstream/outlier.rst | 27 ++ docs/root/intro/version_history.rst | 1 + .../common/upstream/outlier_detection_impl.cc | 116 ++++++- .../common/upstream/outlier_detection_impl.h | 18 +- .../upstream/outlier_detection_impl_test.cc | 296 +++++++++++++++++- 9 files changed, 508 insertions(+), 21 deletions(-) diff --git a/api/envoy/api/v2/cluster/outlier_detection.proto b/api/envoy/api/v2/cluster/outlier_detection.proto index 72cf038cb034..d457c8165f49 100644 --- a/api/envoy/api/v2/cluster/outlier_detection.proto +++ b/api/envoy/api/v2/cluster/outlier_detection.proto @@ -111,4 +111,34 @@ message OutlierDetection { // is set to true. google.protobuf.UInt32Value enforcing_local_origin_success_rate = 15 [(validate.rules).uint32.lte = 100]; + + // The failure percentage to use when determining failure percentage-based outlier detection. If + // the failure percentage of a given host is greater than or equal to this value, it will be + // ejected. Defaults to 85. + google.protobuf.UInt32Value failure_percentage_threshold = 16 [(validate.rules).uint32.lte = 100]; + + // The % chance that a host will be actually ejected when an outlier status is detected through + // failure percentage statistics. 
This setting can be used to disable ejection or to ramp it up + // slowly. Defaults to 0. + // + // [#next-major-version: setting this without setting failure_percentage_threshold should be + // invalid in v4.] + google.protobuf.UInt32Value enforcing_failure_percentage = 17 [(validate.rules).uint32.lte = 100]; + + // The % chance that a host will be actually ejected when an outlier status is detected through + // local-origin failure percentage statistics. This setting can be used to disable ejection or to + // ramp it up slowly. Defaults to 0. + google.protobuf.UInt32Value enforcing_failure_percentage_local_origin = 18 + [(validate.rules).uint32.lte = 100]; + + // The minimum number of hosts in a cluster in order to perform failure percentage-based ejection. + // If the total number of hosts in the cluster is less than this value, failure percentage-based + // ejection will not be performed. Defaults to 5. + google.protobuf.UInt32Value failure_percentage_minimum_hosts = 19; + + // The minimum number of total requests that must be collected in one interval (as defined by the + // interval duration above) to perform failure percentage-based ejection for this host. If the + // volume is lower than this setting, failure percentage-based ejection will not be performed for + // this host. Defaults to 50. 
+ google.protobuf.UInt32Value failure_percentage_request_volume = 20; } diff --git a/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto b/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto index 836eeec42837..1273f84d6df2 100644 --- a/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto +++ b/api/envoy/data/cluster/v2alpha/outlier_detection_event.proto @@ -39,6 +39,7 @@ message OutlierDetectionEvent { option (validate.required) = true; OutlierEjectSuccessRate eject_success_rate_event = 9; OutlierEjectConsecutive eject_consecutive_event = 10; + OutlierEjectFailurePercentage eject_failure_percentage_event = 11; } } @@ -75,6 +76,12 @@ enum OutlierEjectionType { // is set to *true*. // See :ref:`Cluster outlier detection ` documentation for SUCCESS_RATE_LOCAL_ORIGIN = 4; + // Runs over aggregated success rate statistics from every host in cluster and selects hosts for + // which ratio of failed replies is above configured value. + FAILURE_PERCENTAGE = 5; + // Runs over aggregated success rate statistics for local origin failures from every host in + // cluster and selects hosts for which ratio of failed replies is above configured value. + FAILURE_PERCENTAGE_LOCAL_ORIGIN = 6; } // Represents possible action applied to upstream host @@ -97,3 +104,8 @@ message OutlierEjectSuccessRate { message OutlierEjectConsecutive { } + +message OutlierEjectFailurePercentage { + // Host's success rate at the time of the ejection event on a 0-100 range. 
+ uint32 host_success_rate = 1 [(validate.rules).uint32.lte = 100]; +} diff --git a/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst b/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst index a64750cf6625..195d025c24bc 100644 --- a/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst +++ b/docs/root/configuration/upstream/cluster_manager/cluster_runtime.rst @@ -102,6 +102,31 @@ outlier_detection.success_rate_stdev_factor ` setting in outlier detection +outlier_detection.enforcing_failure_percentage + :ref:`enforcing_failure_percentage + ` + setting in outlier detection + +outlier_detection.enforcing_failure_percentage_local_origin + :ref:`enforcing_failure_percentage_local_origin + ` + setting in outlier detection + +outlier_detection.failure_percentage_request_volume + :ref:`failure_percentage_request_volume + ` + setting in outlier detection + +outlier_detection.failure_percentage_minimum_hosts + :ref:`failure_percentage_minimum_hosts + ` + setting in outlier detection + +outlier_detection.failure_percentage_threshold + :ref:`failure_percentage_threshold + ` + setting in outlier detection + Core ---- diff --git a/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst b/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst index f4f1890baf8d..57bd438109b6 100644 --- a/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst +++ b/docs/root/configuration/upstream/cluster_manager/cluster_stats.rst @@ -141,6 +141,10 @@ statistics will be rooted at *cluster..outlier_detection.* and contain the ejections_detected_consecutive_local_origin_failure, Counter, Number of detected consecutive local origin failure ejections (even if unenforced) ejections_enforced_local_origin_success_rate, Counter, Number of enforced success rate outlier ejections for locally originated failures ejections_detected_local_origin_success_rate, Counter, Number of detected success rate outlier ejections for 
locally originated failures (even if unenforced) + ejections_enforced_failure_percentage, Counter, Number of enforced failure percentage outlier ejections. Exact meaning of this counter depends on :ref:`outlier_detection.split_external_local_origin_errors` config item. Refer to :ref:`Outlier Detection documentation` for details. + ejections_detected_failure_percentage, Counter, Number of detected failure percentage outlier ejections (even if unenforced). Exact meaning of this counter depends on :ref:`outlier_detection.split_external_local_origin_errors` config item. Refer to :ref:`Outlier Detection documentation` for details. + ejections_enforced_local_origin_failure_percentage, Counter, Number of enforced failure percentage outlier ejections for locally originated failures + ejections_detected_local_origin_failure_percentage, Counter, Number of detected failure percentage outlier ejections for locally originated failures (even if unenforced) ejections_total, Counter, Deprecated. Number of ejections due to any outlier type (even if unenforced) ejections_consecutive_5xx, Counter, Deprecated. Number of consecutive 5xx ejections (even if unenforced) diff --git a/docs/root/intro/arch_overview/upstream/outlier.rst b/docs/root/intro/arch_overview/upstream/outlier.rst index 5b3c392b6a70..6743fba99147 100644 --- a/docs/root/intro/arch_overview/upstream/outlier.rst +++ b/docs/root/intro/arch_overview/upstream/outlier.rst @@ -145,6 +145,33 @@ Most configuration items, namely types of errors, but :ref:`outlier_detection.enforcing_success_rate` applies to externally originated errors only and :ref:`outlier_detection.enforcing_local_origin_success_rate` applies to locally originated errors only. +.. _arch_overview_outlier_detection_failure_percentage: + +Failure Percentage +^^^^^^^^^^^^^^^^^^ + +Failure Percentage based outlier ejection functions similarly to the success rate detection type, in +that it relies on success rate data from each host in a cluster. 
However, rather than compare those +values to the mean success rate of the cluster as a whole, they are compared to a flat +user-configured threshold. This threshold is configured via the +:ref:`outlier_detection.failure_percentage_threshold` +field. + +The other configuration fields for failure percentage based ejection are similar to the fields for +success rate ejection. Failure percentage based ejection also obeys +:ref:`outlier_detection.split_external_local_origin_errors`; +the enforcement percentages for externally- and locally-originated errors are controlled by +:ref:`outlier_detection.enforcing_failure_percentage` +and +:ref:`outlier_detection.enforcing_failure_percentage_local_origin`, +respectively. As with success rate detection, detection will not be performed for a host if its +request volume over the aggregation interval is less than the +:ref:`outlier_detection.failure_percentage_request_volume` +value. Detection also will not be performed for a cluster if the number of hosts with the minimum +required request volume in an interval is less than the +:ref:`outlier_detection.failure_percentage_minimum_hosts` +value. + .. _arch_overview_outlier_detection_grpc: gRPC diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 82d08ed8a443..11e887e225e0 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -61,6 +61,7 @@ Version history * tracing: added :ref:`max_path_tag_length ` to support customizing the length of the request path included in the extracted `http.url ` tag. * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. * upstream: added network filter chains to upstream connections, see :ref:`filters`. +* upstream: added new :ref:`failure-percentage based outlier detection` mode. 
* upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1. * upstream: added :ref:`fail_traffic_on_panic ` to allow failing all requests to a cluster during panic state. * zookeeper: parse responses and emit latency stats. diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 0d0efa147366..436a2ef28633 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -222,12 +222,22 @@ DetectorConfig::DetectorConfig(const envoy::api::v2::cluster::OutlierDetection& PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, success_rate_request_volume, 100))), success_rate_stdev_factor_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, success_rate_stdev_factor, 1900))), + failure_percentage_threshold_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, failure_percentage_threshold, 85))), + failure_percentage_minimum_hosts_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, failure_percentage_minimum_hosts, 5))), + failure_percentage_request_volume_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, failure_percentage_request_volume, 50))), enforcing_consecutive_5xx_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_5xx, 100))), enforcing_consecutive_gateway_failure_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_consecutive_gateway_failure, 0))), enforcing_success_rate_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_success_rate, 100))), + enforcing_failure_percentage_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_failure_percentage, 0))), + enforcing_failure_percentage_local_origin_(static_cast( + PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, enforcing_failure_percentage_local_origin, 0))), split_external_local_origin_errors_(config.split_external_local_origin_errors()), 
consecutive_local_origin_failure_(static_cast( PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, consecutive_local_origin_failure, 5))), @@ -355,6 +365,13 @@ bool DetectorImpl::enforceEjection(envoy::data::cluster::v2alpha::OutlierEjectio return runtime_.snapshot().featureEnabled( "outlier_detection.enforcing_local_origin_success_rate", config_.enforcingLocalOriginSuccessRate()); + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_failure_percentage", + config_.enforcingFailurePercentage()); + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: + return runtime_.snapshot().featureEnabled( + "outlier_detection.enforcing_failure_percentage_local_origin", + config_.enforcingFailurePercentageLocalOrigin()); default: // Checked by schema. NOT_REACHED_GCOVR_EXCL_LINE; @@ -382,6 +399,12 @@ void DetectorImpl::updateEnforcedEjectionStats( case envoy::data::cluster::v2alpha::OutlierEjectionType::SUCCESS_RATE_LOCAL_ORIGIN: stats_.ejections_enforced_local_origin_success_rate_.inc(); break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: + stats_.ejections_enforced_failure_percentage_.inc(); + break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: + stats_.ejections_enforced_local_origin_failure_percentage_.inc(); + break; default: // Checked by schema. 
NOT_REACHED_GCOVR_EXCL_LINE; @@ -406,6 +429,12 @@ void DetectorImpl::updateDetectedEjectionStats( case envoy::data::cluster::v2alpha::OutlierEjectionType::SUCCESS_RATE_LOCAL_ORIGIN: stats_.ejections_detected_local_origin_success_rate_.inc(); break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE: + stats_.ejections_detected_failure_percentage_.inc(); + break; + case envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN: + stats_.ejections_detected_local_origin_failure_percentage_.inc(); + break; default: // Checked by schema. NOT_REACHED_GCOVR_EXCL_LINE; @@ -556,32 +585,55 @@ void DetectorImpl::processSuccessRateEjections( "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( "outlier_detection.success_rate_request_volume", config_.successRateRequestVolume()); + uint64_t failure_percentage_minimum_hosts = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_minimum_hosts", + config_.failurePercentageMinimumHosts()); + uint64_t failure_percentage_request_volume = + runtime_.snapshot().getInteger("outlier_detection.failure_percentage_request_volume", + config_.failurePercentageRequestVolume()); + std::vector valid_success_rate_hosts; + std::vector valid_failure_percentage_hosts; double success_rate_sum = 0; // Reset the Detector's success rate mean and stdev. getSRNums(monitor_type) = {-1, -1}; // Exit early if there are not enough hosts. - if (host_monitors_.size() < success_rate_minimum_hosts) { + if (host_monitors_.size() < success_rate_minimum_hosts && + host_monitors_.size() < failure_percentage_minimum_hosts) { return; } // reserve upper bound of vector size to avoid reallocation. 
valid_success_rate_hosts.reserve(host_monitors_.size()); + valid_failure_percentage_hosts.reserve(host_monitors_.size()); for (const auto& host : host_monitors_) { // Don't do work if the host is already ejected. if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { - absl::optional host_success_rate = host.second->getSRMonitor(monitor_type) - .successRateAccumulator() - .getSuccessRate(success_rate_request_volume); - - if (host_success_rate) { - valid_success_rate_hosts.emplace_back( - HostSuccessRatePair(host.first, host_success_rate.value())); - success_rate_sum += host_success_rate.value(); - host.second->successRate(monitor_type, host_success_rate.value()); + absl::optional> host_success_rate_and_volume = + host.second->getSRMonitor(monitor_type) + .successRateAccumulator() + .getSuccessRateAndVolume(); + + if (!host_success_rate_and_volume) { + continue; + } + double success_rate = host_success_rate_and_volume.value().first; + double request_volume = host_success_rate_and_volume.value().second; + + if (request_volume >= + std::min(success_rate_request_volume, failure_percentage_request_volume)) { + host.second->successRate(monitor_type, success_rate); + } + + if (request_volume >= success_rate_request_volume) { + valid_success_rate_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate)); + success_rate_sum += success_rate; + } + if (request_volume >= failure_percentage_request_volume) { + valid_failure_percentage_hosts.emplace_back(HostSuccessRatePair(host.first, success_rate)); } } } @@ -607,6 +659,28 @@ void DetectorImpl::processSuccessRateEjections( } } } + + if (!valid_failure_percentage_hosts.empty() && + valid_failure_percentage_hosts.size() >= failure_percentage_minimum_hosts) { + const double failure_percentage_threshold = runtime_.snapshot().getInteger( + "outlier_detection.failure_percentage_threshold", config_.failurePercentageThreshold()); + + for (const auto& host_success_rate_pair : valid_failure_percentage_hosts) { 
+ if ((100.0 - host_success_rate_pair.success_rate_) >= failure_percentage_threshold) { + // We should eject. + + // The ejection type returned by the SuccessRateMonitor's getEjectionType() will be a + // SUCCESS_RATE type, so we need to figure it out for ourselves. + const envoy::data::cluster::v2alpha::OutlierEjectionType type = + (monitor_type == DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin) + ? envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE + : envoy::data::cluster::v2alpha::OutlierEjectionType:: + FAILURE_PERCENTAGE_LOCAL_ORIGIN; + updateDetectedEjectionStats(type); + ejectHost(host_success_rate_pair.host_, type); + } + } + } } void DetectorImpl::onIntervalTimer() { @@ -660,6 +734,15 @@ void EventLoggerImpl::logEject(const HostDescriptionConstSharedPtr& host, Detect detector.successRateEjectionThreshold(monitor_type)); event.mutable_eject_success_rate_event()->set_host_success_rate( host->outlierDetector().successRate(monitor_type)); + } else if ((type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE) || + (type == envoy::data::cluster::v2alpha::OutlierEjectionType:: + FAILURE_PERCENTAGE_LOCAL_ORIGIN)) { + const DetectorHostMonitor::SuccessRateMonitorType monitor_type = + (type == envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE) + ? 
DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin + : DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin; + event.mutable_eject_failure_percentage_event()->set_host_success_rate( + host->outlierDetector().successRate(monitor_type)); } else { event.mutable_eject_consecutive_event(); } @@ -707,14 +790,15 @@ SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() { return current_success_rate_bucket_.get(); } -absl::optional -SuccessRateAccumulator::getSuccessRate(uint64_t success_rate_request_volume) { - if (backup_success_rate_bucket_->total_request_counter_ < success_rate_request_volume) { - return {}; +absl::optional> SuccessRateAccumulator::getSuccessRateAndVolume() { + if (!backup_success_rate_bucket_->total_request_counter_) { + return absl::nullopt; } - return {backup_success_rate_bucket_->success_request_counter_ * 100.0 / - backup_success_rate_bucket_->total_request_counter_}; + double success_rate = backup_success_rate_bucket_->success_request_counter_ * 100.0 / + backup_success_rate_bucket_->total_request_counter_; + + return {{success_rate, backup_success_rate_bucket_->total_request_counter_}}; } } // namespace Outlier diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 2b703c2adba4..633cd52f16cb 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -92,7 +92,7 @@ class SuccessRateAccumulator { * @return a valid absl::optional with the success rate. If there were not enough * requests, an invalid absl::optional is returned. 
*/ - absl::optional getSuccessRate(uint64_t success_rate_request_volume); + absl::optional> getSuccessRateAndVolume(); private: std::unique_ptr current_success_rate_bucket_; @@ -214,13 +214,17 @@ class DetectorHostMonitorImpl : public DetectorHostMonitor { COUNTER(ejections_detected_consecutive_5xx) \ COUNTER(ejections_detected_consecutive_gateway_failure) \ COUNTER(ejections_detected_success_rate) \ + COUNTER(ejections_detected_failure_percentage) \ COUNTER(ejections_enforced_consecutive_5xx) \ COUNTER(ejections_enforced_consecutive_gateway_failure) \ COUNTER(ejections_enforced_success_rate) \ + COUNTER(ejections_enforced_failure_percentage) \ COUNTER(ejections_detected_consecutive_local_origin_failure) \ COUNTER(ejections_enforced_consecutive_local_origin_failure) \ COUNTER(ejections_detected_local_origin_success_rate) \ COUNTER(ejections_enforced_local_origin_success_rate) \ + COUNTER(ejections_detected_local_origin_failure_percentage) \ + COUNTER(ejections_enforced_local_origin_failure_percentage) \ COUNTER(ejections_enforced_total) \ COUNTER(ejections_overflow) \ COUNTER(ejections_success_rate) \ @@ -249,11 +253,18 @@ class DetectorConfig { uint64_t successRateMinimumHosts() const { return success_rate_minimum_hosts_; } uint64_t successRateRequestVolume() const { return success_rate_request_volume_; } uint64_t successRateStdevFactor() const { return success_rate_stdev_factor_; } + uint64_t failurePercentageThreshold() const { return failure_percentage_threshold_; } + uint64_t failurePercentageMinimumHosts() const { return failure_percentage_minimum_hosts_; } + uint64_t failurePercentageRequestVolume() const { return failure_percentage_request_volume_; } uint64_t enforcingConsecutive5xx() const { return enforcing_consecutive_5xx_; } uint64_t enforcingConsecutiveGatewayFailure() const { return enforcing_consecutive_gateway_failure_; } uint64_t enforcingSuccessRate() const { return enforcing_success_rate_; } + uint64_t enforcingFailurePercentage() const { return 
enforcing_failure_percentage_; } + uint64_t enforcingFailurePercentageLocalOrigin() const { + return enforcing_failure_percentage_local_origin_; + } bool splitExternalLocalOriginErrors() const { return split_external_local_origin_errors_; } uint64_t consecutiveLocalOriginFailure() const { return consecutive_local_origin_failure_; } uint64_t enforcingConsecutiveLocalOriginFailure() const { @@ -270,9 +281,14 @@ class DetectorConfig { const uint64_t success_rate_minimum_hosts_; const uint64_t success_rate_request_volume_; const uint64_t success_rate_stdev_factor_; + const uint64_t failure_percentage_threshold_; + const uint64_t failure_percentage_minimum_hosts_; + const uint64_t failure_percentage_request_volume_; const uint64_t enforcing_consecutive_5xx_; const uint64_t enforcing_consecutive_gateway_failure_; const uint64_t enforcing_success_rate_; + const uint64_t enforcing_failure_percentage_; + const uint64_t enforcing_failure_percentage_local_origin_; const bool split_external_local_origin_errors_; const uint64_t consecutive_local_origin_failure_; const uint64_t enforcing_consecutive_local_origin_failure_; diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 8c8c51062031..1df588abbc6f 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -129,6 +129,9 @@ enforcing_success_rate: 20 success_rate_minimum_hosts: 50 success_rate_request_volume: 200 success_rate_stdev_factor: 3000 +failure_percentage_minimum_hosts: 10 +failure_percentage_request_volume: 25 +failure_percentage_threshold: 70 )EOF"; envoy::api::v2::cluster::OutlierDetection outlier_detection; @@ -148,6 +151,11 @@ success_rate_stdev_factor: 3000 EXPECT_EQ(50UL, detector->config().successRateMinimumHosts()); EXPECT_EQ(200UL, detector->config().successRateRequestVolume()); EXPECT_EQ(3000UL, detector->config().successRateStdevFactor()); + EXPECT_EQ(0UL, 
detector->config().enforcingFailurePercentage()); + EXPECT_EQ(0UL, detector->config().enforcingFailurePercentageLocalOrigin()); + EXPECT_EQ(10UL, detector->config().failurePercentageMinimumHosts()); + EXPECT_EQ(25UL, detector->config().failurePercentageRequestVolume()); + EXPECT_EQ(70UL, detector->config().failurePercentageThreshold()); } TEST_F(OutlierDetectorImplTest, DestroyWithActive) { @@ -902,9 +910,11 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRateExternalOrigin) { time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); interval_timer_->invokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); - EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_EQ(-1, detector->successRateAverage( DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( @@ -1032,9 +1042,11 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRateLocalOrigin) { time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); interval_timer_->invokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. 
EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); - EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( - DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); EXPECT_EQ(-1, detector->successRateEjectionThreshold( @@ -1056,6 +1068,252 @@ TEST_F(OutlierDetectorImplTest, EmptySuccessRate) { interval_timer_->invokeCallback(); } +TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageExternalOrigin) { + EXPECT_CALL(cluster_.prioritySet(), addMemberUpdateCb(_)); + addHosts({ + "tcp://127.0.0.1:80", + "tcp://127.0.0.1:81", + "tcp://127.0.0.1:82", + "tcp://127.0.0.1:83", + "tcp://127.0.0.1:84", + }); + + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + std::shared_ptr detector(DetectorImpl::create( + cluster_, empty_outlier_detection_, dispatcher_, runtime_, time_system_, event_logger_)); + detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); + + // Turn off 5xx detection and SR detection to test failure percentage detection in isolation. + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) + .WillByDefault(Return(false)); + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_consecutive_gateway_failure", 100)) + .WillByDefault(Return(false)); + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_success_rate", 100)) + .WillByDefault(Return(false)); + // Now turn on failure percentage detection. + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_failure_percentage", 0)) + .WillByDefault(Return(true)); + // Expect non-enforcing logging to happen every time the consecutive_5xx_ counter + // gets saturated (every 5 times). 
+ EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[3]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_5XX, false)) + .Times(50); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[3]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_GATEWAY_FAILURE, + false)) + .Times(50); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_5XX, false)) + .Times(60); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_GATEWAY_FAILURE, + false)) + .Times(60); + + // Cause a failure percentage error on one host. First 3 hosts have perfect failure percentage; + // fourth host has failure percentage slightly below threshold; fifth has failure percentage + // slightly above threshold. + loadRq(hosts_, 50, 200); + loadRq(hosts_[3], 250, 503); + loadRq(hosts_[4], 300, 503); + + time_system_.setMonotonicTime(std::chrono::milliseconds(10000)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE, + true)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.success_rate_stdev_factor", 1900)) + .WillByDefault(Return(1900)); + interval_timer_->invokeCallback(); + EXPECT_FLOAT_EQ(100.0 * (50.0 / 300.0), + hosts_[3]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_FLOAT_EQ(100.0 * (50.0 / 350.0), + hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + // Make sure that local origin success rate monitor is not affected + EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( 
+ DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, + detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_FALSE(hosts_[3]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that doesn't bring the host back in. + time_system_.setMonotonicTime(std::chrono::milliseconds(19999)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that does bring the host back in. + time_system_.setMonotonicTime(std::chrono::milliseconds(50001)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL(*event_logger_, + logUneject(std::static_pointer_cast(hosts_[4]))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_FALSE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + + // Expect non-enforcing logging to happen every time the consecutive_5xx_ counter + // gets saturated (every 5 times). + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_5XX, false)) + .Times(5); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_GATEWAY_FAILURE, + false)) + .Times(5); + + // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. 
+ loadRq(hosts_, 25, 200); + loadRq(hosts_[4], 25, 503); + + time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + // The success rate should be *calculated* since the minimum request volume was met for failure + // percentage ejection, but the host should not be ejected. + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateAverage( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); +} + +TEST_F(OutlierDetectorImplTest, BasicFlowFailurePercentageLocalOrigin) { + EXPECT_CALL(cluster_.prioritySet(), addMemberUpdateCb(_)); + addHosts({ + "tcp://127.0.0.1:80", + "tcp://127.0.0.1:81", + "tcp://127.0.0.1:82", + "tcp://127.0.0.1:83", + "tcp://127.0.0.1:84", + }); + + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + std::shared_ptr detector(DetectorImpl::create( + cluster_, outlier_detection_split_, dispatcher_, runtime_, time_system_, event_logger_)); + detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); + + // Turn off local origin consecutive failure and SR enforcement to isolate failure percentage. + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_consecutive_local_origin_failure", 100)) + .WillByDefault(Return(false)); + ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_local_origin_success_rate", 100)) + .WillByDefault(Return(false)); + // Now turn on enforcement of local origin failure percentage detection. 
+ ON_CALL(runtime_.snapshot_, + featureEnabled("outlier_detection.enforcing_failure_percentage_local_origin", 0)) + .WillByDefault(Return(true)); + // Expect non-enforcing logging every time the consecutive local origin failure counter + // gets saturated (every 5 times), i.e. 40 times for the 200 failures loaded below. + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_LOCAL_ORIGIN_FAILURE, + false)) + .Times(40); + // Cause a failure percentage error on one host. The first 4 hosts see only successes; the 5th + // host gets 200 successes plus 200 failures, i.e. a 50% failure percentage. + loadRq(hosts_, 200, Result::LOCAL_ORIGIN_CONNECT_SUCCESS); + loadRq(hosts_[4], 200, Result::LOCAL_ORIGIN_CONNECT_FAILED); + + time_system_.setMonotonicTime(std::chrono::milliseconds(10000)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE_LOCAL_ORIGIN, + true)); + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::SUCCESS_RATE_LOCAL_ORIGIN, + false)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.failure_percentage_threshold", 85)) + .WillByDefault(Return(40)); + interval_timer_->invokeCallback(); + EXPECT_EQ(50, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(90, + detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(52, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + // Make sure that the external origin success rate monitor is not affected. 
+ EXPECT_EQ(-1, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateAverage( + 
DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that doesn't bring the host back in (clock moves from 10000 ms to 19999 ms). + time_system_.setMonotonicTime(std::chrono::milliseconds(19999)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_TRUE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, outlier_detection_ejections_active_.value()); + + // Interval that does bring the host back in (clock moves to 50001 ms): expect an uneject. + time_system_.setMonotonicTime(std::chrono::milliseconds(50001)); + EXPECT_CALL(checker_, check(hosts_[4])); + EXPECT_CALL(*event_logger_, + logUneject(std::static_pointer_cast(hosts_[4]))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + EXPECT_FALSE(hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + + // Expect non-enforcing logging every time the consecutive local origin failure counter + // gets saturated (every 5 times), i.e. 5 times for the 25 failures loaded below. + EXPECT_CALL( + *event_logger_, + logEject(std::static_pointer_cast(hosts_[4]), _, + envoy::data::cluster::v2alpha::OutlierEjectionType::CONSECUTIVE_LOCAL_ORIGIN_FAILURE, + false)) + .Times(5); + + // Give the 5th host 50 requests and the other 4 only 25 each. Should not cause an ejection. 
+ loadRq(hosts_, 25, Result::LOCAL_ORIGIN_CONNECT_SUCCESS); + loadRq(hosts_[4], 25, Result::LOCAL_ORIGIN_CONNECT_FAILED); + + time_system_.setMonotonicTime(std::chrono::milliseconds(60001)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000), _)); + interval_timer_->invokeCallback(); + // The 5th host's success rate should be *calculated* since the minimum request volume was met + // for failure percentage ejection, but the host should not be ejected. + EXPECT_EQ(0UL, outlier_detection_ejections_active_.value()); + EXPECT_EQ(50UL, hosts_[4]->outlierDetector().successRate( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, + detector->successRateAverage(DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); + EXPECT_EQ(-1, detector->successRateEjectionThreshold( + DetectorHostMonitor::SuccessRateMonitorType::LocalOrigin)); +} + TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(cluster_.prioritySet(), addMemberUpdateCb(_)); addHosts({"tcp://127.0.0.1:80"}); @@ -1345,6 +1603,36 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { .WillOnce(SaveArg<0>(&log4)); event_logger.logUneject(host); Json::Factory::loadFromString(log4); + + StringViewSaver log5; + EXPECT_CALL(host->outlier_detector_, lastUnejectionTime()).WillOnce(ReturnRef(monotonic_time)); + EXPECT_CALL(host->outlier_detector_, + successRate(DetectorHostMonitor::SuccessRateMonitorType::ExternalOrigin)) + .WillOnce(Return(0)); + EXPECT_CALL(*file, + write(absl::string_view( + "{\"type\":\"FAILURE_PERCENTAGE\",\"cluster_name\":\"fake_cluster\"," + "\"upstream_url\":\"10.0.0.1:443\",\"action\":\"EJECT\"," + "\"num_ejections\":0,\"enforced\":false,\"eject_failure_percentage_event\":{" + "\"host_success_rate\":0},\"timestamp\":\"2018-12-18T09:00:00Z\"," + "\"secs_since_last_action\":\"30\"}\n"))) + .WillOnce(SaveArg<0>(&log5)); + event_logger.logEject(host, detector, + envoy::data::cluster::v2alpha::OutlierEjectionType::FAILURE_PERCENTAGE, + false); + 
Json::Factory::loadFromString(log5); + + StringViewSaver log6; + EXPECT_CALL(host->outlier_detector_, lastEjectionTime()).WillOnce(ReturnRef(monotonic_time)); + EXPECT_CALL(*file, + write(absl::string_view( + "{\"type\":\"CONSECUTIVE_5XX\",\"cluster_name\":\"fake_cluster\"," + "\"upstream_url\":\"10.0.0.1:443\",\"action\":\"UNEJECT\"," + "\"num_ejections\":0,\"enforced\":false,\"timestamp\":\"2018-12-18T09:00:00Z\"," + "\"secs_since_last_action\":\"30\"}\n"))) + .WillOnce(SaveArg<0>(&log6)); + event_logger.logUneject(host); + Json::Factory::loadFromString(log6); } TEST(OutlierUtility, SRThreshold) {