Skip to content

Commit

Permalink
Add convergence tracker failure on timeout.
Browse files Browse the repository at this point in the history
Update readme with cross-platform comparison notes.

Signed-off-by: Nadia Pinaeva <npinaeva@redhat.com>
  • Loading branch information
npinaeva committed Feb 26, 2024
1 parent e323c03 commit 3f96697
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 7 deletions.
7 changes: 7 additions & 0 deletions kube-burner-workload/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,13 @@ and also may be reused and improved by the same platform as a part of this frame

Every platform may have its own README.

### Comparing different platforms

To ensure results for different platforms are comparable, set up the convergence tracker logic to be as similar as possible:
all timeouts and variables defining a successful test run should be the same.
Cluster-specific parameters, such as resource quotas, enabled services (e.g. observability), and node configurations, may also
affect the results.

## Tracking the end of the test

`CONVERGENCE_TRACKER` env variable enables `convergence-tracker` job.
Expand Down
11 changes: 8 additions & 3 deletions kube-burner-workload/convergence_waiter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@

TIME_SPENT=0
while [ $TIME_SPENT -le 3600 ]; do
# failure will return 1 because of the "echo FAILED| wc -l"
PODS_COUNT=$( (kubectl get pods -n convergence-tracker-0 --no-headers || echo FAILED) | grep -c -v Completed)
if [ "$PODS_COUNT" -eq 0 ]; then
FAILED_COUNT=$(kubectl get pods -n convergence-tracker-0 --field-selector status.phase=Failed -o name | wc -l)
if [ "$FAILED_COUNT" -ne 0 ]; then
echo "ERROR: convergence tracker pod reported failure"
kubectl get pods -n convergence-tracker-0 --field-selector status.phase=Failed -o name
exit 1
fi
RUNNING_COUNT=$(kubectl get pods -n convergence-tracker-0 --field-selector status.phase!=Succeeded -o name | wc -l)
if [ "$RUNNING_COUNT" -eq 0 ]; then
echo "DONE"
exit 0
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import ssl
import sys
import time
import subprocess

Expand Down Expand Up @@ -52,6 +53,7 @@ def get_number_of_logical_flows():
# stable_threshold in seconds, for how long number of flows shouldn't change to consider it stable
# timeout in seconds
def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_name):
timed_out = False
start = time.time()
last_changed = time.time()
ovs_flows_num = get_number_of_ovs_flows()
Expand All @@ -73,8 +75,9 @@ def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_n

time.sleep(poll_interval)
if time.time() - start >= timeout:
timed_out = True
logging.info(f"TIMEOUT: {node_name} {timeout} seconds passed")
return last_changed, ovs_flows_num
return last_changed, ovs_flows_num, timed_out


def get_db_data():
Expand Down Expand Up @@ -161,7 +164,7 @@ def main():
index_result(doc)

logging.info(f"Start openflow-tracker {node_name}, threshold {threshold}")
stabilize_time, flow_num = wait_for_flows_to_stabilize(
stabilize_time, flow_num, timed_out = wait_for_flows_to_stabilize(
1, threshold, 3600, node_name
)
stabilize_datetime = datetime.datetime.fromtimestamp(stabilize_time)
Expand All @@ -188,6 +191,7 @@ def main():
"unhealthy_logs": ovn_health_logs,
}
index_result(doc)
sys.exit(int(timed_out))


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import logging
import os
import sys
import time
import subprocess

Expand All @@ -21,6 +22,7 @@ def get_number_of_flows():
# stable_threshold in seconds, for how long number of flows shouldn't change to consider it stable
# timeout in seconds
def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_name):
timed_out = False
start = time.time()
last_changed = time.time()
flows_num = get_number_of_flows()
Expand All @@ -35,8 +37,9 @@ def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_n

time.sleep(poll_interval)
if time.time() - start >= timeout:
timed_out = True
logging.info(f"TIMEOUT: {node_name} {timeout} seconds passed")
return last_changed, flows_num
return last_changed, flows_num, timed_out


def get_db_data():
Expand Down Expand Up @@ -93,7 +96,7 @@ def main():
)

logging.info(f"Start openflow-tracker {node_name}, threshold {threshold}")
stabilize_time, flow_num = wait_for_flows_to_stabilize(
stabilize_time, flow_num, timed_out = wait_for_flows_to_stabilize(
1, threshold, 3600, node_name
)
stabilize_datetime = datetime.datetime.fromtimestamp(stabilize_time)
Expand All @@ -107,6 +110,7 @@ def main():
logging.info(f"HEALTHCHECK: {node_name} has no problems")
else:
logging.info(f"HEALTHCHECK: {node_name} has concerning logs: {ovn_health_logs}")
sys.exit(int(timed_out))


if __name__ == "__main__":
Expand Down

0 comments on commit 3f96697

Please sign in to comment.