Skip to content

Commit

Permalink
Merge pull request #5 from mazdakn/convergence_timeout
Browse files Browse the repository at this point in the history
Make convergence timeout configurable
  • Loading branch information
npinaeva authored Mar 5, 2024
2 parents e323c03 + 91ffae6 commit fc29d43
Show file tree
Hide file tree
Showing 8 changed files with 54 additions and 26 deletions.
7 changes: 4 additions & 3 deletions kube-burner-workload/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,10 @@ The right way to track convergence may differ based on network plugin or cluster
as an example.
For example, ovn-kubernetes network plugin uses OVS flows underneath, therefore this job spins up a pod
on every node and tracks the number of OVS flows; when this number stops changing, it considers the config to be applied.
There is a `THRESHOLD` parameter that defines for how long it waits to consider the number of flows converged.
`convergence_waiter.sh` is a script that waits for all convergence-tracker pod to be completed, before deleting
the workload.
There is a `CONVERGENCE_PERIOD` parameter that defines for how long the number of flows must stay unchanged to be considered converged.
In addition, `CONVERGENCE_TIMEOUT` sets the hard deadline for convergence tracking.
`convergence_waiter.sh` is a script that waits up to `CONVERGENCE_TIMEOUT + CONVERGENCE_PERIOD` seconds for all
convergence-tracker pods to be completed, before deleting the workload.

## Running

Expand Down
3 changes: 2 additions & 1 deletion kube-burner-workload/convergence_waiter.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/bin/bash

TIME_SPENT=0
while [ $TIME_SPENT -le 3600 ]; do
TIMEOUT=$((CONVERGENCE_TIMEOUT + CONVERGENCE_PERIOD))
while [ $TIME_SPENT -le "$TIMEOUT" ]; do
# a kubectl failure yields a count of 1: "echo FAILED" prints one line that does not match "Completed"
PODS_COUNT=$( (kubectl get pods -n convergence-tracker-0 --no-headers || echo FAILED) | grep -c -v Completed)
if [ "$PODS_COUNT" -eq 0 ]; then
Expand Down
11 changes: 8 additions & 3 deletions kube-burner-workload/env
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,18 @@ KUBECONFIG=

# PLATFORM is one of the folders under network-policy workload
PLATFORM=kind-metrics
# CONVERGENCE_TRACKER
# Convergence tracker settings
CONVERGENCE_TRACKER=
# CONVERGENCE_PERIOD and CONVERGENCE_TIMEOUT are convergence tracker parameters.
# CONVERGENCE_PERIOD specifies for how long the system should be stable to be considered converged and
# CONVERGENCE_TIMEOUT is a timer specifying the hard deadline for policy convergence.
# A test failure will be reported by convergence tracker in CONVERGENCE_TIMEOUT + CONVERGENCE_PERIOD seconds.
CONVERGENCE_PERIOD=60
CONVERGENCE_TIMEOUT=3600

# Number of nodes to run convergence tracker on. Has no effect if CONVERGENCE_TRACKER is false
NODES_COUNT=3

# THRESHOLD is a convergence tracker parameter, specifying for how long the system should be stable to be considered converged
THRESHOLD=60
# JOB_PAUSE defines for how long the workload won't be deleted after the test is done
# default behaviour is to wait for 5 minutes after job completion to see how the system
# behaves some time after all work is done
Expand Down
3 changes: 2 additions & 1 deletion kube-burner-workload/network-policy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ jobs:
- objectTemplate: {{.PLATFORM}}/convergence_tracker.yml
replicas: {{.NODES_COUNT}}
inputVars:
threshold: "{{.THRESHOLD}}"
convergence_period: "{{.CONVERGENCE_PERIOD}}"
convergence_timeout: "{{.CONVERGENCE_TIMEOUT}}"
es_server: "{{.ES_SERVER}}"
es_index: {{.ES_INDEX}}
metadata: "netpols_per_namespace: {{.NETPOLS_PER_NAMESPACE}}, pods_per_namespace: {{.PODS_PER_NAMESPACE}},
Expand Down
6 changes: 4 additions & 2 deletions kube-burner-workload/openshift/convergence_tracker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@ spec:
- name: pod-logs
mountPath: /var/log/pods
env:
- name: THRESHOLD
value: "{{.threshold}}"
- name: CONVERGENCE_PERIOD
value: "{{.convergence_period}}"
- name: CONVERGENCE_TIMEOUT
value: "{{.convergence_timeout}}"
- name: ES_SERVER
value: {{.es_server}}
- name: ES_INDEX_NETPOL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,20 @@ def get_number_of_logical_flows():


# poll_interval in seconds, float
# stable_threshold in seconds, for how long number of flows shouldn't change to consider it stable
# convergence_period in seconds, for how long number of flows shouldn't change to consider it stable
# convergence_timeout in seconds, for how long to wait for stabilisation before timing out
# timeout in seconds
def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_name):
def wait_for_flows_to_stabilize(
poll_interval, convergence_period, convergence_timeout, node_name
):
timeout = convergence_timeout + convergence_period
start = time.time()
last_changed = time.time()
ovs_flows_num = get_number_of_ovs_flows()
logical_flows_num = get_number_of_logical_flows()
while (
time.time() - last_changed < stable_threshold and time.time() - start < timeout
time.time() - last_changed < convergence_period
and time.time() - start < timeout
):
new_logical_flows_num = get_number_of_logical_flows()
if new_logical_flows_num != logical_flows_num:
Expand Down Expand Up @@ -141,7 +146,8 @@ def main():
es_index = os.getenv("ES_INDEX_NETPOL")
node_name = os.getenv("MY_NODE_NAME")
uuid = os.getenv("UUID")
threshold = int(os.getenv("THRESHOLD"))
convergence_period = int(os.getenv("CONVERGENCE_PERIOD"))
convergence_timeout = int(os.getenv("CONVERGENCE_TIMEOUT"))
start_time = datetime.datetime.now()

logging.basicConfig(
Expand All @@ -155,14 +161,17 @@ def main():
"workload": "network-policy-perf",
"uuid": uuid,
"source_name": node_name,
"threshold": threshold,
"convergence_period": convergence_period,
"convergence_timeout": convergence_timeout,
"test_metadata": os.getenv("METADATA"),
}
index_result(doc)

logging.info(f"Start openflow-tracker {node_name}, threshold {threshold}")
logging.info(
f"Start openflow-tracker {node_name}, convergence_period {convergence_period}, convergence timeout {convergence_timeout}"
)
stabilize_time, flow_num = wait_for_flows_to_stabilize(
1, threshold, 3600, node_name
1, convergence_period, convergence_timeout, node_name
)
stabilize_datetime = datetime.datetime.fromtimestamp(stabilize_time)
nbdb_data = get_db_data()
Expand Down
6 changes: 4 additions & 2 deletions kube-burner-workload/ovn-kubernetes/convergence_tracker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ spec:
- name: host-var-log-ovs
mountPath: /var/log/openvswitch
env:
- name: THRESHOLD
value: "{{.threshold}}"
- name: CONVERGENCE_PERIOD
value: "{{.convergence_period}}"
- name: CONVERGENCE_TIMEOUT
value: "{{.convergence_timeout}}"
- name: ES_SERVER
value: {{.es_server}}
- name: ES_INDEX_NETPOL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,18 @@ def get_number_of_flows():


# poll_interval in seconds, float
# stable_threshold in seconds, for how long number of flows shouldn't change to consider it stable
# timout in seconds
def wait_for_flows_to_stabilize(poll_interval, stable_threshold, timeout, node_name):
# convergence_period in seconds, for how long number of flows shouldn't change to consider it stable
# convergence_timeout in seconds, for how long to wait for stabilisation before timing out
def wait_for_flows_to_stabilize(
poll_interval, convergence_period, convergence_timeout, node_name
):
timeout = convergence_timeout + convergence_period
start = time.time()
last_changed = time.time()
flows_num = get_number_of_flows()
while (
time.time() - last_changed < stable_threshold and time.time() - start < timeout
time.time() - last_changed < convergence_period
and time.time() - start < timeout
):
new_flows_num = get_number_of_flows()
if new_flows_num != flows_num:
Expand Down Expand Up @@ -84,17 +88,20 @@ def check_ovn_health():

def main():
node_name = os.getenv("MY_NODE_NAME")
threshold = int(os.getenv("THRESHOLD"))
convergence_period = int(os.getenv("CONVERGENCE_PERIOD"))
convergence_timeout = int(os.getenv("CONVERGENCE_TIMEOUT"))

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)

logging.info(f"Start openflow-tracker {node_name}, threshold {threshold}")
logging.info(
f"Start openflow-tracker {node_name}, convergence_period {convergence_period}, convergence timeout {convergence_timeout}"
)
stabilize_time, flow_num = wait_for_flows_to_stabilize(
1, threshold, 3600, node_name
1, convergence_period, convergence_timeout, node_name
)
stabilize_datetime = datetime.datetime.fromtimestamp(stabilize_time)
nbdb_data = get_db_data()
Expand Down

0 comments on commit fc29d43

Please sign in to comment.