[REVIEW] compile_udf: Cache PTX for similar functions #7371

Merged: 4 commits, Feb 26, 2021
Changes from 2 commits
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
@@ -60,6 +60,7 @@ dependencies:
  - packaging
  - protobuf
  - nvtx>=0.2.1
  - cachetools
  - pip:
      - git+https://github.com/dask/dask.git@master
      - git+https://github.com/dask/distributed.git@master
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
@@ -60,6 +60,7 @@ dependencies:
  - packaging
  - protobuf
  - nvtx>=0.2.1
  - cachetools
  - pip:
      - git+https://github.com/dask/dask.git@master
      - git+https://github.com/dask/distributed.git@master
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
@@ -60,6 +60,7 @@ dependencies:
  - packaging
  - protobuf
  - nvtx>=0.2.1
  - cachetools
  - pip:
      - git+https://github.com/dask/dask.git@master
      - git+https://github.com/dask/distributed.git@master
1 change: 1 addition & 0 deletions conda/recipes/cudf/meta.yaml
@@ -45,6 +45,7 @@ requirements:
    - fsspec>=0.6.0
    - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
    - nvtx >=0.2.1
    - cachetools

test:
  requires:
102 changes: 102 additions & 0 deletions python/cudf/cudf/tests/test_compile_udf.py
@@ -0,0 +1,102 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf.utils import cudautils
from numba import types


def setup_function():
    cudautils._udf_code_cache.clear()


def assert_cache_size(size):
    assert cudautils._udf_code_cache.currsize == size


def test_first_compile_sets_cache_entry():
    # The first compilation should put an entry in the cache
    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
    assert_cache_size(1)


def test_code_cache_same_code_different_function_hit():
    # Compilation of a distinct function with the same code and signature
    # should reuse the cached entry

    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
    assert_cache_size(1)

    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
    assert_cache_size(1)


def test_code_cache_different_types_miss():
    # Compilation of a distinct function with the same code but different types
    # should create an additional cache entry

    cudautils.compile_udf(lambda x: x + 1, (types.float32,))
    assert_cache_size(1)

    cudautils.compile_udf(lambda x: x + 1, (types.float64,))
    assert_cache_size(2)


def test_code_cache_different_cvars_miss():
    # Compilation of a distinct function with the same types and code as an
    # existing entry but different closure variables should create an
    # additional cache entry

    def gen_closure(y):
        return lambda x: x + y

    cudautils.compile_udf(gen_closure(1), (types.float32,))
    assert_cache_size(1)

    cudautils.compile_udf(gen_closure(2), (types.float32,))
    assert_cache_size(2)


def test_same_function_and_types():
    # Compilation of the same function with the same types should not hit the
    # UDF code cache, but instead use the LRU cache wrapping the compile_udf
    # function. We test for this by compiling the function for:
    #
    # 1. A float32 argument
    # 2. A float64 argument
    # 3. A float32 argument
    #
    # then popping the least-recently used item. The LRU item should have a
    # float32 argument, as the last type the code cache saw had a float64
    # argument - the float32 argument for compilation 3 should have been
    # serviced by the LRU cache wrapping the compile_udf function, not the code
    # cache.
    def f(x):
        return x + 1

    cudautils.compile_udf(f, (types.float32,))
    cudautils.compile_udf(f, (types.float64,))
    cudautils.compile_udf(f, (types.float32,))

    k, v = cudautils._udf_code_cache.popitem()
    # First element of the key is the type signature, then get the first (only)
    # argument
    argtype = k[0][0]

    assert argtype == types.float32


def test_lambda_in_loop_code_cached():
    # Compiling a UDF defined in a loop should result in the code cache being
    # reused for each loop iteration after the first. We check for this by
    # ensuring that there is only one entry in the code cache after the loop.
    # We expect the LRU cache wrapping compile_udf to contain three entries,
    # because it doesn't recognize that distinct functions with the same code,
    # closure variables and type are the same.

    # Clear the LRU cache to ensure we know how many entries it should have
    cudautils.compile_udf.cache_clear()

    for i in range(3):
        cudautils.compile_udf(lambda x: x + 1, (types.float32,))

    assert_cache_size(1)
    assert cudautils.compile_udf.cache_info().currsize == 3
33 changes: 32 additions & 1 deletion python/cudf/cudf/utils/cudautils.py
@@ -1,9 +1,11 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
from functools import lru_cache

import cachetools
Collaborator:

Do we need a new dependency for this? Can we not use functools.lru_cache?

Contributor Author:

I didn't see a way to control the key with functools.lru_cache - is there a way I have missed?

Collaborator:

Fair enough. Let's move forward as is for now but we should generally try to avoid adding new dependencies whenever possible unless they're needed.
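To make the key-control point above concrete, here is a minimal sketch in plain Python (the function names and the toy "compile" step are illustrative only, not code from this PR): functools.lru_cache keys on the arguments themselves, and function objects hash by identity, so two textually identical lambdas always miss; an explicit cachetools.LRUCache lets the caller build the key from the bytecode and closure contents instead.

from functools import lru_cache
from pickle import dumps

import cachetools


@lru_cache(maxsize=32)
def compile_by_identity(udf, type_signature):
    # lru_cache keys on (udf, type_signature); a lambda hashes by identity,
    # so a second, textually identical lambda is a different key.
    return ("fake ptx", udf.__name__, type_signature)


_code_cache = cachetools.LRUCache(maxsize=32)


def compile_by_code(udf, type_signature):
    # The key is built explicitly from the bytecode and closure contents,
    # so distinct-but-identical functions share one entry.
    closure = udf.__closure__ or ()
    cvarbytes = dumps(tuple(c.cell_contents for c in closure))
    key = (type_signature, udf.__code__.co_code, cvarbytes)
    if key not in _code_cache:
        _code_cache[key] = ("fake ptx", udf.__name__, type_signature)
    return _code_cache[key]


compile_by_identity(lambda x: x + 1, ("float32",))
compile_by_identity(lambda x: x + 1, ("float32",))
print(compile_by_identity.cache_info().currsize)  # 2 entries: keyed on identity

compile_by_code(lambda x: x + 1, ("float32",))
compile_by_code(lambda x: x + 1, ("float32",))
print(len(_code_cache))  # 1 entry: keyed on code and closure contents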

Contributor Author:

I added the cachetools dependency as I noticed that something that cuDF depends on seems to depend on it, so it was already installed in my environment... However, the relevant code is approximately 160 lines - would vendoring it be better?

Collaborator:

Nah this is fine. Wasn't aware this was already in the dependency tree.

Member:

@gmarkall, @kkraus14, since this is already somewhere in our dependency tree, should it still be added as an explicit dependency in the integration repo?

Contributor Author:

@ajschmidt8 Would that be added to the run section in https://github.com/rapidsai/integration/blob/branch-0.18/conda/recipes/rapids-build-env/meta.yaml? If that's intended to install all packages that cuDF might use, then I'd guess it should be (my answer is a bit tentative because I'm not particularly familiar with the integration repo).

Member:

cool, i will open a PR to add it there

Member:

PR added here: rapidsai/integration#220

import cupy
import numpy as np
from numba import cuda
from pickle import dumps

import cudf
from cudf.utils.utils import check_equals_float, check_equals_int
@@ -235,6 +237,13 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
    return window_sizes


# This cache is keyed on the (signature, code, closure variables) of UDFs, so
# it can hit for distinct functions that are similar. The lru_cache wrapping
# compile_udf misses for these similar functions, but doesn't need to serialize
# closure variables to check for a hit.
_udf_code_cache = cachetools.LRUCache(maxsize=32)


@lru_cache(maxsize=32)
def compile_udf(udf, type_signature):
"""Compile ``udf`` with `numba`
@@ -266,8 +275,30 @@ def compile_udf(udf, type_signature):
        An numpy type

    """

    # Check if we've already compiled a similar (but possibly distinct)
    # function before
    codebytes = udf.__code__.co_code
    if udf.__closure__ is not None:
        cvars = tuple([x.cell_contents for x in udf.__closure__])
        cvarbytes = dumps(cvars)
    else:
        cvarbytes = b""

    key = (type_signature, codebytes, cvarbytes)
    res = _udf_code_cache.get(key)
    if res:
        return res

    # We haven't compiled a function like this before, so need to fall back to
    # compilation with Numba
    ptx_code, return_type = cuda.compile_ptx_for_current_device(
        udf, type_signature, device=True
    )
    output_type = numpy_support.as_dtype(return_type)
    return (ptx_code, output_type.type)

    # Populate the cache for this function
    res = (ptx_code, output_type.type)
    _udf_code_cache[key] = res

    return res
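In the hunk above, the original return (ptx_code, output_type.type) is the single deleted line reported in the file header; the cache-population lines that follow replace it. As an aside on the cache key design described in the comment above _udf_code_cache: the key includes the pickled closure contents because two lambdas created from the same source share bytecode, so co_code alone cannot distinguish x + 1 from x + 2 when the added value is captured from an enclosing scope. A minimal illustration in plain Python (nothing cuDF-specific; make_adder is a throwaway helper for this sketch):

from pickle import dumps


def make_adder(y):
    return lambda x: x + y


add1, add2 = make_adder(1), make_adder(2)

# Same source means the same bytecode, so co_code alone cannot tell them apart
assert add1.__code__.co_code == add2.__code__.co_code

# ...but the pickled closure contents differ, which is what keeps y=1 and y=2
# from colliding in the code cache
cvars1 = dumps(tuple(c.cell_contents for c in add1.__closure__))
cvars2 = dumps(tuple(c.cell_contents for c in add2.__closure__))
assert cvars1 != cvars2

# Independently created but identical closures produce equal key parts, so a
# cache keyed on (signature, code, closure bytes) serves both with one entry
add1_again = make_adder(1)
assert add1_again.__code__.co_code == add1.__code__.co_code
assert dumps(tuple(c.cell_contents for c in add1_again.__closure__)) == cvars1

This is the behaviour exercised by test_code_cache_different_cvars_miss and test_code_cache_same_code_different_function_hit in the test file above.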