script to monitor memory + cpu utilization (pytorch#82006)

Add a python script that runs in the background during test jobs to log cpu + gpu memory usage and cpu utilization of python tests (really any python process) to a file and upload the file as an artifact. I plan on using the gpu memory usage stats to better understand how to parallelize the tests, but it is easy to add on other stats if people want them. In the future, we want to add the ability to track network usage to see if we can decrease it. GPU utilization will also likely need to be improved. Click the hud link to see uploaded usage log artifacts. Pull Request resolved: pytorch#82006 Approved by: https://github.com/huydhn
zmmwl · Jul 25, 2022 · 6f2a88d · 6f2a88d
1 parent 0e995f8
commit 6f2a88d
Show file tree

Hide file tree

Showing 7 changed files with 190 additions and 0 deletions.
diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml
@@ -36,6 +36,20 @@ runs:
         rm -f test-reports-*.zip
         zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
 
+    - name: Zip usage log for upload
+      if: runner.os != 'Windows' && !inputs.use-gha
+      shell: bash
+      env:
+        FILE_SUFFIX: ${{ inputs.file-suffix }}
+      run: |
+        # Remove any previous usage log archives if they exist
+        rm -f usage-log-*.zip
+        # this workflow is also run in bazel build test, but we don't generate usage reports for it
+        # so check to see if the file exists first
+        if [ -f 'usage_log.txt' ]; then
+            zip "usage-log-${FILE_SUFFIX}.zip" 'usage_log.txt'
+        fi
+
     # Windows zip
     - name: Zip JSONs for upload
       if: runner.os == 'Windows' && !inputs.use-gha
@@ -55,6 +69,15 @@ runs:
         # -ir => recursive include all files in pattern
         7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
 
+    - name: Zip usage log for upload
+      if: runner.os == 'Windows' && !inputs.use-gha
+      shell: powershell
+      env:
+        FILE_SUFFIX: ${{ inputs.file-suffix }}
+      run: |
+        # Not every job produces a usage log, so only create the archive when
+        # usage_log.txt exists (same guard as the bash variant of this step);
+        # 7z would otherwise exit non-zero on the missing input file
+        if (Test-Path -LiteralPath 'usage_log.txt' -PathType Leaf) {
+            7z a "usage-log-$Env:FILE_SUFFIX.zip" 'usage_log.txt'
+        }
+
     # S3 upload
     - name: Store Test Downloaded JSONs on S3
       uses: seemethere/upload-artifact-s3@v5
@@ -76,6 +99,16 @@ runs:
         if-no-files-found: error
         path: test-reports-*.zip
 
+    - name: Store Usage Logs on S3
+      uses: seemethere/upload-artifact-s3@v5
+      # runs only when the use-gha input is falsy
+      if: ${{ !inputs.use-gha }}
+      with:
+        s3-prefix: |
+          ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
+        retention-days: 14
+        # the usage log archive is not always produced (the bash zip step skips
+        # it when usage_log.txt is missing), so a missing file is not an error
+        if-no-files-found: ignore
+        path: usage-log-*.zip
+
     # GHA upload
     - name: Store Test Downloaded JSONs on Github
       uses: actions/upload-artifact@v2

diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
@@ -63,6 +63,15 @@ jobs:
             bash .github/scripts/install_nvidia_utils_linux.sh
             echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
 
+      - name: Start monitoring script
+        id: monitor-script
+        shell: bash
+        run: |
+          # psutil and pynvml are imported by tools/stats/monitor.py
+          python3 -m pip install psutil==5.9.1
+          python3 -m pip install pynvml==11.4.1
+          # run the monitor in the background; stdout and stderr both go to usage_log.txt
+          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          # publish the background job's PID (${!}) as a step output so that the
+          # "Stop monitoring script" step can kill it
+          echo "::set-output name=monitor-script-pid::${!}"
+
       - name: Download build artifacts
         uses: ./.github/actions/download-build-artifacts
         with:
@@ -166,6 +175,14 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Stop monitoring script
+        # always(): stop the monitor even when an earlier step failed.
+        # The PID check skips this step when the monitor was never started.
+        # Keep the whole condition inside a single ${{ }}: mixing a bare
+        # expression with an embedded ${{ }} turns the value into a string
+        # template, so the PID check is not evaluated as a boolean.
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
+        shell: bash
+        env:
+          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        run: |
+          kill "$MONITOR_SCRIPT_PID"
+
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
         if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')

diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
@@ -55,6 +55,15 @@ jobs:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
 
+      - name: Start monitoring script
+        id: monitor-script
+        shell: bash
+        run: |
+          # psutil and pynvml are imported by tools/stats/monitor.py
+          python3 -m pip install psutil==5.9.1
+          python3 -m pip install pynvml==11.4.1
+          # run the monitor in the background; stdout and stderr both go to usage_log.txt
+          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          # publish the background job's PID (${!}) as a step output so that the
+          # "Stop monitoring script" step can kill it
+          echo "::set-output name=monitor-script-pid::${!}"
+
       - name: Download build artifacts
         uses: ./.github/actions/download-build-artifacts
         with:
@@ -105,6 +114,14 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Stop monitoring script
+        # always(): stop the monitor even when an earlier step failed.
+        # The PID check skips this step when the monitor was never started.
+        # Keep the whole condition inside a single ${{ }}: mixing a bare
+        # expression with an embedded ${{ }} turns the value into a string
+        # template, so the PID check is not evaluated as a boolean.
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
+        shell: bash
+        env:
+          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        run: |
+          kill "$MONITOR_SCRIPT_PID"
+
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
         if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')

diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
@@ -62,6 +62,15 @@ jobs:
         with:
           docker-image: ${{ inputs.docker-image }}
 
+      - name: Start monitoring script
+        id: monitor-script
+        shell: bash
+        run: |
+          # psutil and pynvml are imported by tools/stats/monitor.py
+          python3 -m pip install psutil==5.9.1
+          python3 -m pip install pynvml==11.4.1
+          # run the monitor in the background; stdout and stderr both go to usage_log.txt
+          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          # publish the background job's PID (${!}) as a step output so that the
+          # "Stop monitoring script" step can kill it
+          echo "::set-output name=monitor-script-pid::${!}"
+
       - name: Download build artifacts
         uses: ./.github/actions/download-build-artifacts
         with:
@@ -167,6 +176,14 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Stop monitoring script
+        # always(): stop the monitor even when an earlier step failed.
+        # The PID check skips this step when the monitor was never started.
+        # Keep the whole condition inside a single ${{ }}: mixing a bare
+        # expression with an embedded ${{ }} turns the value into a string
+        # template, so the PID check is not evaluated as a boolean.
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
+        shell: bash
+        env:
+          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        run: |
+          kill "$MONITOR_SCRIPT_PID"
+
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
         if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')

diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
@@ -52,6 +52,15 @@ jobs:
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Start monitoring script
+        id: monitor-script
+        shell: bash
+        run: |
+          # psutil and pynvml are imported by tools/stats/monitor.py
+          python3 -m pip install psutil==5.9.1
+          python3 -m pip install pynvml==11.4.1
+          # run the monitor in the background; stdout and stderr both go to usage_log.txt
+          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
+          # publish the background job's PID (${!}) as a step output so that the
+          # "Stop monitoring script" step can kill it
+          echo "::set-output name=monitor-script-pid::${!}"
+
       - name: Download PyTorch Build Artifacts
         uses: seemethere/download-artifact-s3@v4
         with:
@@ -112,6 +121,14 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Stop monitoring script
+        # always(): stop the monitor even when an earlier step failed.
+        # The PID check skips this step when the monitor was never started.
+        # Keep the whole condition inside a single ${{ }}: mixing a bare
+        # expression with an embedded ${{ }} turns the value into a string
+        # template, so the PID check is not evaluated as a boolean.
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
+        shell: bash
+        env:
+          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        run: |
+          kill "$MONITOR_SCRIPT_PID"
+
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
         if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')

diff --git a/.gitignore b/.gitignore
@@ -44,6 +44,7 @@ docs/cpp/source/html/
 docs/cpp/source/latex/
 docs/source/generated/
 log
+usage_log.txt
 test-reports/
 test/.coverage
 test/.hypothesis/

diff --git a/tools/stats/monitor.py b/tools/stats/monitor.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+import datetime
+import json
+import signal
+import time
+from typing import Any, Dict, List
+
+import psutil  # type: ignore[import]
+import pynvml  # type: ignore[import]
+
+
+def get_processes_running_python_tests() -> List[Any]:
+    python_processes = []
+    for process in psutil.process_iter():
+        try:
+            if "python" in process.name() and process.cmdline():
+                python_processes.append(process)
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            # access denied or the process died
+            pass
+    return python_processes
+
+
+def get_per_process_cpu_info() -> List[Dict[str, Any]]:
+    processes = get_processes_running_python_tests()
+    per_process_info = []
+    for p in processes:
+        info = {
+            "pid": p.pid,
+            "cmd": " ".join(p.cmdline()),
+            "cpu_percent": p.cpu_percent(),
+            "rss_memory": p.memory_info().rss,
+            "uss_memory": p.memory_full_info().uss,
+        }
+        if "pss" in p.memory_full_info():
+            # only availiable in linux
+            info["pss_memory"] = p.memory_full_info().pss
+        per_process_info.append(info)
+    return per_process_info
+
+
+def get_per_process_gpu_info(handle: Any) -> List[Dict[str, Any]]:
+    processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+    per_process_info = []
+    for p in processes:
+        info = {"pid": p.pid, "gpu_memory": p.usedGpuMemory}
+        per_process_info.append(info)
+    return per_process_info
+
+
+if __name__ == "__main__":
+
+    handle = None
+    try:
+        pynvml.nvmlInit()
+        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+    except pynvml.NVMLError:
+        # no pynvml avaliable, probably because not cuda
+        pass
+
+    kill_now = False
+
+    def exit_gracefully(*args: Any) -> None:
+        global kill_now
+        kill_now = True
+
+    signal.signal(signal.SIGTERM, exit_gracefully)
+
+    while not kill_now:
+        try:
+            stats = {
+                "time": datetime.datetime.utcnow().isoformat("T") + "Z",
+                "total_cpu_percent": psutil.cpu_percent(),
+                "per_process_cpu_info": get_per_process_cpu_info(),
+            }
+            if handle is not None:
+                stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
+                stats["total_gpu_utilizaiton"] = pynvml.nvmlDeviceGetUtilizationRates(
+                    handle
+                ).gpu
+        except Exception as e:
+            stats = {
+                "time": datetime.datetime.utcnow().isoformat("T") + "Z",
+                "error": str(e),
+            }
+        finally:
+            print(json.dumps(stats))
+            time.sleep(1)