forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
script to monitor memory + cpu utilization (pytorch#82006)
Add a python script that runs in the background during test jobs to log cpu + gpu memory usage and cpu utilization of python tests (really any python process) to a file and upload the file as an artifact. I plan on using the gpu memory usage stats to better understand how to parallelize them, but it is easy to add on other stats if people want them. In the future, we want to add the ability to track network usage to see if we can decrease it. GPU utilization will also likely need to be improved. Click the hud link to see uploaded usage log artifacts Pull Request resolved: pytorch#82006 Approved by: https://github.com/huydhn
- Loading branch information
1 parent
0e995f8
commit 6f2a88d
Showing
7 changed files
with
190 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/env python3 | ||
import datetime | ||
import json | ||
import signal | ||
import time | ||
from typing import Any, Dict, List | ||
|
||
import psutil # type: ignore[import] | ||
import pynvml # type: ignore[import] | ||
|
||
|
||
def get_processes_running_python_tests() -> List[Any]:
    """Find every live process that looks like a python test runner.

    A process qualifies when its executable name contains "python" and it
    exposes a non-empty command line. Processes that disappear or deny
    access mid-scan are silently skipped.
    """
    matches = []
    for proc in psutil.process_iter():
        try:
            if "python" in proc.name() and proc.cmdline():
                matches.append(proc)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited, or we lack permission to inspect it.
            continue
    return matches
|
||
|
||
def get_per_process_cpu_info() -> List[Dict[str, Any]]:
    """Collect CPU and memory stats for each running python test process.

    Returns a list of dicts with keys: ``pid``, ``cmd`` (joined command
    line), ``cpu_percent``, ``rss_memory`` and ``uss_memory`` (bytes), plus
    ``pss_memory`` on platforms that report PSS (Linux only).
    """
    per_process_info = []
    for p in get_processes_running_python_tests():
        # memory_full_info() is expensive (reads per-process smaps on
        # Linux) — call it once per process instead of up to three times.
        mem_full = p.memory_full_info()
        info = {
            "pid": p.pid,
            "cmd": " ".join(p.cmdline()),
            "cpu_percent": p.cpu_percent(),
            "rss_memory": p.memory_info().rss,
            "uss_memory": mem_full.uss,
        }
        # Bug fix: the original `"pss" in p.memory_full_info()` tested
        # membership among the namedtuple's *values*, which can never match
        # the string "pss", so pss_memory was never recorded. hasattr()
        # correctly detects the field (only available on Linux).
        if hasattr(mem_full, "pss"):
            info["pss_memory"] = mem_full.pss
        per_process_info.append(info)
    return per_process_info
|
||
|
||
def get_per_process_gpu_info(handle: Any) -> List[Dict[str, Any]]:
    """Report GPU memory usage per compute process on one device.

    ``handle`` is an NVML device handle; the result is a list of
    ``{"pid": ..., "gpu_memory": ...}`` dicts, one per process currently
    running compute work on that device.
    """
    return [
        {"pid": proc.pid, "gpu_memory": proc.usedGpuMemory}
        for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    ]
|
||
|
||
if __name__ == "__main__":

    # Try to grab a handle to GPU 0; on CPU-only machines NVML init raises
    # and we fall through with handle=None, skipping all GPU sampling below.
    handle = None
    try:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    except pynvml.NVMLError:
        # no pynvml available, probably because not cuda
        pass

    # Flag flipped by the SIGTERM handler so the sampling loop can finish
    # its current iteration and exit cleanly.
    kill_now = False

    def exit_gracefully(*args: Any) -> None:
        # Signal handler: request loop shutdown instead of dying mid-write.
        global kill_now
        kill_now = True

    signal.signal(signal.SIGTERM, exit_gracefully)

    # Emit one JSON object per line (JSON-lines), roughly once per second,
    # until SIGTERM. Sampling errors are reported as an "error" record so a
    # transient failure never kills the monitor.
    while not kill_now:
        try:
            stats = {
                "time": datetime.datetime.utcnow().isoformat("T") + "Z",
                "total_cpu_percent": psutil.cpu_percent(),
                "per_process_cpu_info": get_per_process_cpu_info(),
            }
            if handle is not None:
                stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
                # NOTE(review): key is misspelled ("utilizaiton") but kept
                # as-is — downstream log consumers may depend on it.
                stats["total_gpu_utilizaiton"] = pynvml.nvmlDeviceGetUtilizationRates(
                    handle
                ).gpu
        except Exception as e:
            stats = {
                "time": datetime.datetime.utcnow().isoformat("T") + "Z",
                "error": str(e),
            }
        finally:
            print(json.dumps(stats))
            time.sleep(1)