Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Memory Profiling #15866

Merged
merged 21 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ This page provides a list of all publicly accessible modules, methods and classe
options
extension_dtypes
pylibcudf/index.rst
performance_tracking
12 changes: 12 additions & 0 deletions docs/cudf/source/user_guide/api_docs/performance_tracking.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.. _api.performance_tracking:

====================
Performance Tracking
====================

.. currentmodule:: cudf.utils.performance_tracking
.. autosummary::
:toctree: api/

get_memory_records
print_memory_report
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ options
performance-comparisons/index
PandasCompat
copy-on-write
memory-profiling
pandas-2.0-breaking-changes
```
54 changes: 54 additions & 0 deletions docs/cudf/source/user_guide/memory-profiling.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
(memory-profiling-user-doc)=

# Memory Profiling


Peak memory usage is a common concern in GPU programming since the available GPU memory is typically less than available CPU memory. To easily identify memory hotspots, cudf provides a memory profiler.

## Enabling memory profiling

Memory profiling can be enabled in either of two ways:

1. Use `cudf.set_option`:

```python
>>> import cudf
>>> cudf.set_option("memory_profiling", True)
```

2. Set the environment variable `CUDF_MEMORY_PROFILING` to `1` prior to the
launch of the Python interpreter:

```
CUDF_MEMORY_PROFILING="1" python -c "import cudf"
```

## Get profiling result

To get the result of the profiling, use {py:func}`cudf.utils.performance_tracking.print_memory_report`. In the following, we enable profiling, do some work, and then print the profiling results:

```python
>>> import cudf
>>> from cudf.utils.performance_tracking import print_memory_report
>>> cudf.set_option("memory_profiling", True)
>>> cudf.DataFrame({"a": [1, 2, 3]}) # Some work
a
0 1
1 2
2 3
>>> print_memory_report() # Pretty print the result of the profiling
Memory Profiling
================

Legends:
ncalls - number of times the function or code block was called
memory_peak - peak memory allocated in function or code block (in bytes)
memory_total - total memory allocated in function or code block (in bytes)

Ordered by: memory_peak

ncalls memory_peak memory_total filename:lineno(function)
1 32 32 cudf/core/dataframe.py:690(DataFrame.__init__)
2 0 0 cudf/core/index.py:214(RangeIndex.__init__)
6 0 0 cudf/core/index.py:424(RangeIndex.__len__)
```

It is also possible to access the raw profiling data through {py:func}`cudf.utils.performance_tracking.get_memory_records`.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/buffer/spill_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
import rmm.mr

from cudf.options import get_option
from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
from cudf.utils.performance_tracking import _performance_tracking
from cudf.utils.string import format_bytes

if TYPE_CHECKING:
from cudf.core.buffer.spillable_buffer import SpillableBufferOwner

_spill_cudf_nvtx_annotate = partial(
_cudf_nvtx_annotate, domain="cudf_python-spill"
_performance_tracking, domain="cudf_python-spill"
)


Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/buffer/spillable_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple

import numpy
import nvtx
from typing_extensions import Self

import rmm
Expand All @@ -21,7 +22,7 @@
host_memory_allocation,
)
from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer
from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate
from cudf.utils.performance_tracking import _get_color_for_nvtx
from cudf.utils.string import format_bytes

if TYPE_CHECKING:
Expand Down Expand Up @@ -200,7 +201,7 @@ def spill(self, target: str = "cpu") -> None:
)

if (ptr_type, target) == ("gpu", "cpu"):
with annotate(
with nvtx.annotate(
message="SpillDtoH",
color=_get_color_for_nvtx("SpillDtoH"),
domain="cudf_python-spill",
Expand All @@ -218,7 +219,7 @@ def spill(self, target: str = "cpu") -> None:
# trigger a new call to this buffer's `spill()`.
# Therefore, it is important that spilling-on-demand doesn't
# try to unspill an already locked buffer!
with annotate(
with nvtx.annotate(
message="SpillHtoD",
color=_get_color_for_nvtx("SpillHtoD"),
domain="cudf_python-spill",
Expand Down
14 changes: 14 additions & 0 deletions python/cudf/cudf/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,20 @@ def _integer_and_none_validator(val):
_make_contains_validator([False, True]),
)

_register_option(
"memory_profiling",
_env_get_bool("CUDF_MEMORY_PROFILING", False),
textwrap.dedent(
"""
If set to `False`, disables memory profiling.
If set to `True`, enables memory profiling.
Read more at: :ref:`memory-profiling-user-doc`
\tValid values are True or False. Default is False.
"""
),
_make_contains_validator([False, True]),
)


class option_context(ContextDecorator):
"""
Expand Down
38 changes: 38 additions & 0 deletions python/cudf/cudf/tests/test_performance_tracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from io import StringIO

import pytest

import rmm.mr

import cudf
from cudf.utils.performance_tracking import (
get_memory_records,
print_memory_report,
)


@pytest.fixture
def rmm_reset():
    """Fixture to reset the RMM resource before and after the test."""
    # Remember whichever resource is currently active so it can be
    # restored, then hand the test a fresh CudaMemoryResource.
    previous = rmm.mr.get_current_device_resource()
    try:
        rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())
        yield
    finally:
        rmm.mr.set_current_device_resource(previous)


def test_memory_profiling(rmm_reset):
    """End-to-end check of the opt-in memory profiler."""
    frame = cudf.DataFrame({"a": [1, 2, 3]})
    # Profiling is off by default, so nothing has been recorded yet.
    assert len(get_memory_records()) == 0

    cudf.set_option("memory_profiling", True)
    frame.merge(frame)

    assert len(get_memory_records()) > 0

    report = StringIO()
    print_memory_report(file=report)
    assert "DataFrame.merge" in report.getvalue()
35 changes: 8 additions & 27 deletions python/cudf/cudf/utils/nvtx_annotation.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,11 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

import hashlib
from functools import partial

from nvtx import annotate

_NVTX_COLORS = ["green", "blue", "purple", "rapids"]


def _get_color_for_nvtx(name):
m = hashlib.sha256()
m.update(name.encode())
hash_value = int(m.hexdigest(), 16)
idx = hash_value % len(_NVTX_COLORS)
return _NVTX_COLORS[idx]


def _cudf_nvtx_annotate(func, domain="cudf_python"):
"""Decorator for applying nvtx annotations to methods in cudf."""
return annotate(
message=func.__qualname__,
color=_get_color_for_nvtx(func.__qualname__),
domain=domain,
)(func)


_dask_cudf_nvtx_annotate = partial(
_cudf_nvtx_annotate, domain="dask_cudf_python"
from cudf.utils.performance_tracking import (
_dask_cudf_performance_tracking,
_performance_tracking,
)

# TODO: will remove this file and use _performance_tracking before merging
_cudf_nvtx_annotate = _performance_tracking
vyasr marked this conversation as resolved.
Show resolved Hide resolved
_dask_cudf_nvtx_annotate = _dask_cudf_performance_tracking
80 changes: 80 additions & 0 deletions python/cudf/cudf/utils/performance_tracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

import contextlib
import functools
import hashlib
import sys
from typing import Dict

import nvtx

import rmm.statistics

from cudf.options import get_option

_NVTX_COLORS = ["green", "blue", "purple", "rapids"]


def _get_color_for_nvtx(name):
m = hashlib.sha256()
m.update(name.encode())
hash_value = int(m.hexdigest(), 16)
idx = hash_value % len(_NVTX_COLORS)
return _NVTX_COLORS[idx]


def _performance_tracking(func, domain="cudf_python"):
    """Decorator for applying performance tracking (if enabled).

    Two independent trackers may wrap the call:

    * an RMM memory-profiler record, when the cudf ``memory_profiling``
      option is enabled, and
    * an NVTX range, when NVTX profiling is enabled.

    Parameters
    ----------
    func
        The function or method to track.
    domain
        NVTX domain used for the annotation.

    Returns
    -------
    The wrapped function.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with contextlib.ExitStack() as ctx:
            if get_option("memory_profiling"):
                # NOTE(review): enabling statistics pushes a statistics
                # resource adaptor onto the current RMM memory resource
                # and never pops it — a deliberate (if surprising) side
                # effect so that `set_option("memory_profiling", True)`
                # alone suffices to collect records; see PR discussion.
                rmm.statistics.enable_statistics()
                ctx.enter_context(
                    rmm.statistics.profiler(
                        name=rmm.statistics._get_descriptive_name_of_object(
                            func
                        )
                    )
                )
            if nvtx.enabled():
                ctx.enter_context(
                    nvtx.annotate(
                        message=func.__qualname__,
                        color=_get_color_for_nvtx(func.__qualname__),
                        domain=domain,
                    )
                )
            return func(*args, **kwargs)

    return wrapper


# Variant of ``_performance_tracking`` that emits NVTX annotations under
# the dask_cudf domain instead of the default cudf one.
_dask_cudf_performance_tracking = functools.partial(
    _performance_tracking, domain="dask_cudf_python"
)


def get_memory_records() -> (
    Dict[str, rmm.statistics.ProfilerRecords.MemoryRecord]
):
    """Get the memory records from the memory profiling.

    Returns
    -------
    Dict mapping function names to their memory records. Empty if
    memory profiling is disabled.
    """
    # All records accumulate on RMM's module-level default profiler.
    profiler_records = rmm.statistics.default_profiler_records
    return profiler_records.records


def print_memory_report(file=sys.stdout) -> None:
    """Pretty print the result of the memory profiling

    Parameters
    ----------
    file
        The output stream the report is written to (default: stdout).
    """
    print(rmm.statistics.default_profiler_records.report(), file=file)
5 changes: 3 additions & 2 deletions python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,9 @@ def _external_only_api(func, alternative=""):
@functools.wraps(func)
def wrapper(*args, **kwargs):
# Check the immediately preceding frame to see if it's in cudf.
frame, lineno = next(traceback.walk_stack(None))
fn = frame.f_code.co_filename
pre_frame = traceback.extract_stack(limit=2)[0]
fn = pre_frame.filename
lineno = pre_frame.lineno
wence- marked this conversation as resolved.
Show resolved Hide resolved
if _cudf_root in fn and _tests_root not in fn:
raise RuntimeError(
f"External-only API called in {fn} at line {lineno}. "
Expand Down
Loading