From 3f96697f72817df037879c248f16b9b4e45b6a19 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Mon, 26 Feb 2024 08:18:01 +0100 Subject: [PATCH] Add convergence tracker failure on timeout. Update readme with cross-platform comparison notes. Signed-off-by: Nadia Pinaeva --- kube-burner-workload/README.md | 7 +++++++ kube-burner-workload/convergence_waiter.sh | 11 ++++++++--- .../openshift/openflow-tracker/openflow-tracker.py | 8 ++++++-- .../openflow-tracker/openflow-tracker.py | 8 ++++++-- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/kube-burner-workload/README.md b/kube-burner-workload/README.md index 7a5eae4..ff73ee6 100644 --- a/kube-burner-workload/README.md +++ b/kube-burner-workload/README.md @@ -333,6 +333,13 @@ and also may be reused and improved by the same platform as a part of this frame Every platform may have its own README. +### Comparing different platforms + +To ensure results for different platforms are comparable, set up the convergence tracker logic to be as similar as possible; +all timeouts and variables defining a successful test run should be the same. +Cluster-specific parameters, like resource quotas, enabled services (e.g. observability), and node configurations, may also +affect the results. + ## Tracking the end of the test `CONVERGENCE_TRACKER` env variable enables `convergence-tracker` job. 
diff --git a/kube-burner-workload/convergence_waiter.sh b/kube-burner-workload/convergence_waiter.sh index 94eb06c..653c6b6 100755 --- a/kube-burner-workload/convergence_waiter.sh +++ b/kube-burner-workload/convergence_waiter.sh @@ -2,9 +2,14 @@ TIME_SPENT=0 while [ $TIME_SPENT -le 3600 ]; do - # failure will return 1 because of the "echo FAILED| wc -l" - PODS_COUNT=$( (kubectl get pods -n convergence-tracker-0 --no-headers || echo FAILED) | grep -c -v Completed) - if [ "$PODS_COUNT" -eq 0 ]; then + FAILED_COUNT=$(kubectl get pods -n convergence-tracker-0 --field-selector status.phase=Failed -o name | wc -l) + if [ "$FAILED_COUNT" -ne 0 ]; then + echo "ERROR: convergence tracker pod reported failure" + kubectl get pods -n convergence-tracker-0 --field-selector status.phase=Failed -o name + exit 1 + fi + RUNNING_COUNT=$(kubectl get pods -n convergence-tracker-0 --field-selector status.phase!=Succeeded -o name | wc -l) + if [ "$RUNNING_COUNT" -eq 0 ]; then echo "DONE" exit 0 fi diff --git a/kube-burner-workload/openshift/openflow-tracker/openflow-tracker.py b/kube-burner-workload/openshift/openflow-tracker/openflow-tracker.py index c970ac4..5446f46 100644 --- a/kube-burner-workload/openshift/openflow-tracker/openflow-tracker.py +++ b/kube-burner-workload/openshift/openflow-tracker/openflow-tracker.py @@ -2,6 +2,7 @@ import logging import os import ssl +import sys import time import subprocess @@ -52,6 +53,7 @@ def get_number_of_logical_flows(): # stable_threshold in seconds, for how long number of flows shouldn't change to consider it stable # timout in seconds def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_name): + timed_out = False start = time.time() last_changed = time.time() ovs_flows_num = get_number_of_ovs_flows() @@ -73,8 +75,9 @@ def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_n time.sleep(poll_interval) if time.time() - start >= timeout: + timed_out = True logging.info(f"TIMEOUT: {node_name} 
{timeout} seconds passed") - return last_changed, ovs_flows_num + return last_changed, ovs_flows_num, timed_out def get_db_data(): @@ -161,7 +164,7 @@ def main(): index_result(doc) logging.info(f"Start openflow-tracker {node_name}, threshold {threshold}") - stabilize_time, flow_num = wait_for_flows_to_stabilize( + stabilize_time, flow_num, timed_out = wait_for_flows_to_stabilize( 1, threshold, 3600, node_name ) stabilize_datetime = datetime.datetime.fromtimestamp(stabilize_time) @@ -188,6 +191,7 @@ def main(): "unhealthy_logs": ovn_health_logs, } index_result(doc) + sys.exit(int(timed_out)) if __name__ == "__main__": diff --git a/kube-burner-workload/ovn-kubernetes/openflow-tracker/openflow-tracker.py b/kube-burner-workload/ovn-kubernetes/openflow-tracker/openflow-tracker.py index d6114db..b1a9763 100644 --- a/kube-burner-workload/ovn-kubernetes/openflow-tracker/openflow-tracker.py +++ b/kube-burner-workload/ovn-kubernetes/openflow-tracker/openflow-tracker.py @@ -1,6 +1,7 @@ import datetime import logging import os +import sys import time import subprocess @@ -21,6 +22,7 @@ def get_number_of_flows(): # stable_threshold in seconds, for how long number of flows shouldn't change to consider it stable # timout in seconds def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_name): + timed_out = False start = time.time() last_changed = time.time() flows_num = get_number_of_flows() @@ -35,8 +37,9 @@ def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_n time.sleep(poll_interval) if time.time() - start >= timeout: + timed_out = True logging.info(f"TIMEOUT: {node_name} {timeout} seconds passed") - return last_changed, flows_num + return last_changed, flows_num, timed_out def get_db_data(): @@ -93,7 +96,7 @@ def main(): ) logging.info(f"Start openflow-tracker {node_name}, threshold {threshold}") - stabilize_time, flow_num = wait_for_flows_to_stabilize( + stabilize_time, flow_num, timed_out = wait_for_flows_to_stabilize( 1, 
threshold, 3600, node_name ) stabilize_datetime = datetime.datetime.fromtimestamp(stabilize_time) @@ -107,6 +110,7 @@ def main(): logging.info(f"HEALTHCHECK: {node_name} has no problems") else: logging.info(f"HEALTHCHECK: {node_name} has concerning logs: {ovn_health_logs}") + sys.exit(int(timed_out)) if __name__ == "__main__":