Add weekly tests for memory growth (triton-inference-server#3101)
* Add client_memory_growth_weekly test

* Modify repetitions and add memory_growth_weekly test

* Update repetitions

* Modify repetition with condition

* Update test.sh

* Modify repetition with condition

* Modify Dockerfile.QA

* Update the comment for repetitions

* Remove semicolons

* Update symbols

* Update the comment for repetition

* Added copyrights

* Update repetitions

* Modify email subject conditionally

* Update comment

* Update variable EMAIL_SUBJECT

* Change variable to TRITON_PERF_WEEKLY

* Update repetition times

* Update Dockerfile.QA and email subject

* Update repetition times

* Update copyright

* Remove busyop test temporarily

* Update comments for disabled tests

* Update write up for graphs

* Update write up and repetition times

* Remove changes for copyright

Co-authored-by: Kris Hung <krish@krish-dt.nvidia.com>
krishung5 and Kris Hung committed Jul 16, 2021
1 parent 0645cf1 commit f235751
Showing 4 changed files with 110 additions and 70 deletions.
8 changes: 5 additions & 3 deletions qa/L0_client_memory_growth/client_memory_mail.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -35,9 +35,11 @@

if __name__ == '__main__':
today = date.today().strftime("%Y-%m-%d")
subject = "Triton Client Memory Growth Summary: " + today
subject = "Triton Client Memory Growth " + sys.argv[1] + " Summary: " + today
memory_graphs = glob.glob("client_memory_growth*.log")
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Consolas;\">"
write_up = "<p>This test is run for both HTTP and GRPC protocols using C++ and Python test scripts. The max-allowed difference between mean and maximum memory usage is set to 10MB and 1MB for C++ and Python tests individually.</p>"
write_up += "<p><b>&#8226 What to look for</b><br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
for mem_graph in sorted(memory_graphs):
html_content += "\n" + mem_graph + "\n"
with open(mem_graph, "r") as f:
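The client_memory_mail.py change above follows a simple pattern: pick a cadence-specific subject from the command line, glob the per-protocol memory-growth logs, and wrap them in an HTML body together with the reading guide. Below is a minimal, self-contained sketch of that flow; send_report is a hypothetical stand-in for the repository's nightly_email_helper.send, and the write-up text is abbreviated.

#!/usr/bin/env python
# Minimal sketch of the mail-assembly pattern above (illustrative only).
# send_report is a hypothetical stand-in for nightly_email_helper.send.
import glob
import sys
from datetime import date

def send_report(subject, html_content, is_html=True):
    # Stand-in: print the report instead of emailing it.
    print(subject)
    print(html_content)

if __name__ == '__main__':
    cadence = sys.argv[1] if len(sys.argv) > 1 else "Nightly"  # "Nightly" or "Weekly"
    today = date.today().strftime("%Y-%m-%d")
    subject = "Triton Client Memory Growth " + cadence + " Summary: " + today
    write_up = "<p>Growth that never flattens out suggests a leak.</p>"  # abbreviated
    body = "<html><head></head><body><pre>" + write_up + "</pre><pre>"
    for log in sorted(glob.glob("client_memory_growth*.log")):
        with open(log, "r") as f:
            body += "\n" + log + "\n" + f.read() + "\n"
    body += "</pre></body></html>"
    send_report(subject, body, is_html=True)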
20 changes: 16 additions & 4 deletions qa/L0_client_memory_growth/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -53,6 +53,18 @@ SERVER=/opt/tritonserver/bin/tritonserver
SERVER_ARGS="--model-repository=$DATADIR"
source ../common/util.sh

# Set the number of repetitions in nightly and weekly tests
# Set the email subject for nightly and weekly tests
if [ "$TRITON_PERF_WEEKLY" == 1 ]; then
REPETITION_CPP=2000000
REPETITION_PY=2400000
EMAIL_SUBJECT="Weekly"
else
REPETITION_CPP=100000
REPETITION_PY=10000
EMAIL_SUBJECT="Nightly"
fi

mkdir -p $DATADIR/custom_identity_int32/1

RET=0
@@ -77,11 +89,11 @@ for PROTOCOL in http grpc; do
if [ "$LANG" == "c++" ]; then
MEMORY_GROWTH_TEST=$MEMORY_GROWTH_TEST_CPP
MAX_ALLOWED_ALLOC="10"
EXTRA_ARGS="-r 100000 -i ${PROTOCOL}"
EXTRA_ARGS="-r ${REPETITION_CPP} -i ${PROTOCOL}"
else
MEMORY_GROWTH_TEST="python $MEMORY_GROWTH_TEST_PY"
MAX_ALLOWED_ALLOC="1"
EXTRA_ARGS="-r 10000 -i ${PROTOCOL}"
EXTRA_ARGS="-r ${REPETITION_PY} -i ${PROTOCOL}"
fi

$LEAKCHECK $LEAKCHECK_ARGS $MEMORY_GROWTH_TEST $EXTRA_ARGS >> ${CLIENT_LOG} 2>&1
@@ -125,7 +137,7 @@ fi

# Run only if both TRITON_FROM and TRITON_TO_DL are set
if [[ ! -z "$TRITON_FROM" ]] || [[ ! -z "$TRITON_TO_DL" ]]; then
python client_memory_mail.py
python client_memory_mail.py $EMAIL_SUBJECT
fi

exit $RET
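In the test script above, MAX_ALLOWED_ALLOC (10 MB for the C++ client, 1 MB for the Python client) is the largest tolerated gap between mean and peak memory usage reported by Valgrind massif. The following is a minimal sketch of that comparison, assuming the per-snapshot heap sizes have already been extracted; it is illustrative only and is not the repository's MASSIF_TEST script.

# Illustrative threshold check (not the repo's MASSIF_TEST script):
# flag a run when peak heap usage exceeds the mean by more than
# MAX_ALLOWED_ALLOC megabytes.
def exceeds_allowed_growth(heap_bytes, max_allowed_alloc_mb):
    """heap_bytes: per-snapshot heap sizes in bytes."""
    mean_b = sum(heap_bytes) / float(len(heap_bytes))
    growth_mb = (max(heap_bytes) - mean_b) / (1024.0 * 1024.0)
    return growth_mb > max_allowed_alloc_mb

# Toy data with roughly a 2 MB gap between mean and peak.
samples = [100e6, 104e6, 105e6, 105e6, 106e6]
print(exceeds_allowed_growth(samples, 10))  # False: within the 10 MB C++ limit
print(exceeds_allowed_growth(samples, 1))   # True: over the 1 MB Python limit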
23 changes: 18 additions & 5 deletions qa/L0_memory_growth/server_memory_mail.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -35,12 +35,25 @@

if __name__ == '__main__':
today = date.today().strftime("%Y-%m-%d")
subject = "Triton Server Memory Growth Summary: " + today
memory_graphs = glob.glob("memory_growth*.log")
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Consolas;\">"
for mem_graph in sorted(memory_graphs):
subject = "Triton Server Memory Growth " + sys.argv[1] + " Summary: " + today
memory_graphs_resnet = glob.glob("memory_growth_resnet*.log")
memory_graphs_busyop = glob.glob("memory_growth_busyop.log")
write_up = "<p>This test uses perf_analyzer as clients running on 4 different models. The max allowed difference between mean and maximum memory usage is set to 150MB.</p>"
write_up += "<p><b>&#8226 What to look for</b><br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
for mem_graph in sorted(memory_graphs_resnet):
html_content += "\n" + mem_graph + "\n"
with open(mem_graph, "r") as f:
html_content += f.read() + "\n"
# The busy op model causes PTX failures when running the CI.
# Should be uncommented when it's ready for merging.
# TODO Uncomment after PTX issues are resolved.
# write_up = "<p>The busyop test is by design to show that actual memory growth is correctly detected and displayed.</p>"
# write_up += "<p><b>&#8226 What to look for</b><br>The memory usage should increase continually over time, and a linear growth should be observed in the graph below.</p>"
# html_content += "</pre><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
# for mem_graph in sorted(memory_graphs_busyop):
# html_content += "\n" + mem_graph + "\n"
# with open(mem_graph, "r") as f:
# html_content += f.read() + "\n"
html_content += "</pre></body></html>"
nightly_email_helper.send(subject, html_content, is_html=True)
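The "what to look for" paragraph added above amounts to a simple rule: memory growth confined to the warm-up phase of the run is acceptable, while growth that continues through the tail of the run points at a leak. A toy encoding of that rule, under the assumption that the heap-size series has already been sampled, is shown below; it is not part of the test itself.

# Toy encoding of the reading guide above (illustrative only): a series
# that still grows noticeably in its second half looks like a leak;
# one that flattens after an initial ramp-up does not.
def looks_like_leak(heap_bytes, tolerance_bytes=1 << 20):
    tail = heap_bytes[len(heap_bytes) // 2:]
    return max(tail) - min(tail) > tolerance_bytes

ramp_then_flat = [10, 50, 90, 100, 100, 100, 101, 100]   # MB, toy data
keeps_growing = [10, 20, 30, 40, 50, 60, 70, 80]
print(looks_like_leak([mb << 20 for mb in ramp_then_flat]))  # False
print(looks_like_leak([mb << 20 for mb in keeps_growing]))   # True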
129 changes: 71 additions & 58 deletions qa/L0_memory_growth/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -70,6 +70,16 @@ INSTANCE_CNT=2
CONCURRENCY=32
CLIENT_BS=8

# Set the number of repetitions in nightly and weekly tests
# Set the email subject for nightly and weekly tests
if [ "$TRITON_PERF_WEEKLY" == 1 ]; then
REPETITION=200
EMAIL_SUBJECT="Weekly"
else
REPETITION=3
EMAIL_SUBJECT="Nightly"
fi

# Threshold memory growth in MB
MAX_ALLOWED_ALLOC="150"
export MAX_ALLOWED_ALLOC
@@ -131,8 +141,8 @@ for MODEL in $(ls models); do

set +e

# Run the perf analyzer 3 times
for i in {1..3}; do
# Run the perf analyzer 'REPETITION' times
for ((i=1; i<=$REPETITION; i++)); do
$PERF_ANALYZER -v -m $MODEL -i grpc --concurrency-range $CONCURRENCY -b $CLIENT_BS >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
@@ -164,60 +174,63 @@ done
# Next perform a test that has unbound memory growth. Use the busy op model
# with a high delay in order to force requests to sit in the queue, and result
# in memory growth.
BUSY_OP_TEST=busy_op_test.py
DELAY_CYCLES=2100000000
NUM_REQUESTS=100

rm -rf test_repo && mkdir test_repo
cp -r ${DATADIR}/qa_custom_ops/tf_custom_ops/graphdef_busyop test_repo/

# Explicitly set library path so custom ops can find TF
LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow1
SERVER_ARGS="--model-repository=`pwd`/test_repo"
SERVER_LD_PRELOAD="${DATADIR}/qa_custom_ops/tf_custom_ops/libbusyop.so"

LEAKCHECK_LOG="test_busyop.valgrind.log"
MASSIF_LOG="test_busyop.massif"
GRAPH_LOG="memory_growth_busyop.log"
LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --massif-out-file=$MASSIF_LOG --max-threads=3000 --log-file=$LEAKCHECK_LOG"
SERVER_LOG="test_busyop.server.log"
CLIENT_LOG="test_busyop.client.log"

# Run server
run_server_leakcheck
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

set +e

# Run the busy_op test
python $BUSY_OP_TEST -v -m graphdef_busyop -d $DELAY_CYCLES -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test graphdef_busyop Failed\n***"
RET=1
fi
set -e

# Stop Server
kill $SERVER_PID
wait $SERVER_PID

set +e

ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG}
cat ${GRAPH_LOG}
# Check the massif output
python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1
if [ $? -ne 1 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test for graphdef_busyop Failed\n***"
RET=1
fi
set -e
# The busy op model causes PTX failures when running the CI.
# Should be uncommented when it's ready for merging.
# TODO Re-enable after PTX issues are resolved.
# BUSY_OP_TEST=busy_op_test.py
# DELAY_CYCLES=2100000000
# NUM_REQUESTS=100

# rm -rf test_repo && mkdir test_repo
# cp -r ${DATADIR}/qa_custom_ops/tf_custom_ops/graphdef_busyop test_repo/

# # Explicitly set library path so custom ops can find TF
# LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow1
# SERVER_ARGS="--model-repository=`pwd`/test_repo"
# SERVER_LD_PRELOAD="${DATADIR}/qa_custom_ops/tf_custom_ops/libbusyop.so"

# LEAKCHECK_LOG="test_busyop.valgrind.log"
# MASSIF_LOG="test_busyop.massif"
# GRAPH_LOG="memory_growth_busyop.log"
# LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --massif-out-file=$MASSIF_LOG --max-threads=3000 --log-file=$LEAKCHECK_LOG"
# SERVER_LOG="test_busyop.server.log"
# CLIENT_LOG="test_busyop.client.log"

# # Run server
# run_server_leakcheck
# if [ "$SERVER_PID" == "0" ]; then
# echo -e "\n***\n*** Failed to start $SERVER\n***"
# cat $SERVER_LOG
# exit 1
# fi

# set +e

# # Run the busy_op test
# python $BUSY_OP_TEST -v -m graphdef_busyop -d $DELAY_CYCLES -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
# if [ $? -ne 0 ]; then
# cat $CLIENT_LOG
# echo -e "\n***\n*** Test graphdef_busyop Failed\n***"
# RET=1
# fi
# set -e

# # Stop Server
# kill $SERVER_PID
# wait $SERVER_PID

# set +e

# ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG}
# cat ${GRAPH_LOG}
# # Check the massif output
# python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1
# if [ $? -ne 1 ]; then
# cat $CLIENT_LOG
# echo -e "\n***\n*** Test for graphdef_busyop Failed\n***"
# RET=1
# fi
# set -e

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
@@ -227,7 +240,7 @@ fi

# Run only if both TRITON_FROM and TRITON_TO_DL are set
if [[ ! -z "$TRITON_FROM" ]] || [[ ! -z "$TRITON_TO_DL" ]]; then
python server_memory_mail.py
python server_memory_mail.py $EMAIL_SUBJECT
fi

exit $RET
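The disabled busyop section above feeds its Valgrind massif output through ms_print and a massif check invoked with a --start-from-middle flag. The sketch below shows one way such a heap-size series could be read straight from a massif output file; the function, the file name in the usage comment, and the half-way cutoff standing in for --start-from-middle are assumptions, not the repository's implementation.

# Sketch (assumption, not the repo's massif checker): read per-snapshot
# heap sizes directly from a Valgrind massif output file. Dropping the
# first half of the snapshots loosely mimics a start-from-middle mode.
def heap_snapshots(massif_path, start_from_middle=False):
    sizes = []
    with open(massif_path, "r") as f:
        for line in f:
            if line.startswith("mem_heap_B="):
                sizes.append(int(line.strip().split("=", 1)[1]))
    return sizes[len(sizes) // 2:] if start_from_middle else sizes

# Hypothetical usage with the 150 MB server-side threshold:
# sizes = heap_snapshots("test_resnet.massif", start_from_middle=True)
# growth_mb = (max(sizes) - sum(sizes) / float(len(sizes))) / (1024.0 * 1024.0)
# print("FAIL" if growth_mb > 150 else "PASS")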
