From d34442994863b67606016db5a0fbfeae91c5f773 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 7 Feb 2022 12:53:35 -0800 Subject: [PATCH 1/8] Add decorator for identifying uses of APIs that we don't want to call internally. --- python/cudf/cudf/utils/utils.py | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4143cbd1d66..e8cca17e88b 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,6 +2,8 @@ import decimal import functools +import os +import traceback from collections.abc import Sequence from typing import FrozenSet, Set, Union @@ -37,6 +39,45 @@ } +NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") + +if NO_EXTERNAL_ONLY_APIS in ("True", "1", "TRUE"): + _cudf_root = os.path.join("python", "cudf", "cudf") + _tests_root = os.path.join(_cudf_root, "tests") + + def _external_only_api(func): + """Decorator to indicate that a function should not be used internally. + + cudf contains many APIs that exist for pandas compatibility but are + intrinsically inefficient. For some of these cudf has internal + equivalents that are much faster. Usage of the slow public APIs inside + our implementation can lead to unnecessary performance bottlenecks. + Applying this decorator to such functions and then setting the + environment variable NO_EXTERNAL_ONLY_APIS will cause such functions to + raise exceptions if they are called from anywhere inside cudf, making + it easy to identify and excise such usage. + """ + + def wrapper(*args, **kwargs): + # Check the immediately preceding frame to see if it's in cudf. + frame, lineno = next(traceback.walk_stack(None)) + fn = frame.f_code.co_filename + if _cudf_root in fn and _tests_root not in fn: + raise RuntimeError( + f"External-only API called in {fn} at line {lineno}." + ) + return func(*args, **kwargs) + + return wrapper + + +else: + + def _external_only_api(func): + """The default implementation is a no-op.""" + return func + + def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)): From a94d9bea120d018ec78325b4cb84fa55845b36ca Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 10 Feb 2022 12:15:11 -0800 Subject: [PATCH 2/8] Add support for providing an alternative to the API. --- python/cudf/cudf/utils/utils.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index e8cca17e88b..585b7d36230 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -45,7 +45,7 @@ _cudf_root = os.path.join("python", "cudf", "cudf") _tests_root = os.path.join(_cudf_root, "tests") - def _external_only_api(func): + def _external_only_api(func, alternative=""): """Decorator to indicate that a function should not be used internally. cudf contains many APIs that exist for pandas compatibility but are @@ -58,13 +58,25 @@ def _external_only_api(func): it easy to identify and excise such usage. """ + # If the first arg is a string then an alternative function to use in + # place of this API was provided, so we pass that to a subsequent call. + # It would be cleaner to implement this pattern by using a class + # decorator with a factory method, but there is no way to generically + # wrap docstrings on a class (we would need the docstring to be on the + # class itself, not instances, because that's what `help` looks at) and + # there is also no way to make mypy happy with that approach. + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + + @functools.wraps(func) def wrapper(*args, **kwargs): # Check the immediately preceding frame to see if it's in cudf. frame, lineno = next(traceback.walk_stack(None)) fn = frame.f_code.co_filename if _cudf_root in fn and _tests_root not in fn: raise RuntimeError( - f"External-only API called in {fn} at line {lineno}." + f"External-only API called in {fn} at line {lineno}. " + f"{alternative}" ) return func(*args, **kwargs) @@ -73,8 +85,11 @@ def wrapper(*args, **kwargs): else: - def _external_only_api(func): + def _external_only_api(func, alternative=""): """The default implementation is a no-op.""" + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + return func From 5a58883503f9b1174333daf0c6d39e5ef6d31bda Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 10 Feb 2022 13:14:14 -0800 Subject: [PATCH 3/8] Make pytest always set the env var rather than relying on CI only. --- python/cudf/cudf/tests/conftest.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 041bd055f0a..fc0f1b40e54 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,3 +1,4 @@ +import os import pathlib import pytest @@ -8,3 +9,24 @@ @pytest.fixture(scope="session") def datadir(): return pathlib.Path(__file__).parent / "data" + + +# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use +# the sessionstart and sessionfinish hooks rather than a simple autouse, +# session-scope fixture because we need to set these variable before collection +# occurs because the envrionment variable will be checked as soon as cudf is +# imported anywhere. +def pytest_sessionstart(session): + """ + Called after the Session object has been created and + before performing collection and entering the run test loop. + """ + os.environ["NO_EXTERNAL_ONLY_APIS"] = "1" + + +def pytest_sessionfinish(session, exitstatus): + """ + Called after whole test run finished, right before + returning the exit status to the system. + """ + del os.environ["NO_EXTERNAL_ONLY_APIS"] From 469d88afa1daccd6d986f9ce52f43a241d0b5342 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 11 Feb 2022 18:30:14 -0800 Subject: [PATCH 4/8] Make paths set by pytest more robust to builds out of the source tree and different test file locations. --- python/cudf/cudf/tests/conftest.py | 4 ++++ python/cudf/cudf/utils/utils.py | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index fc0f1b40e54..1aa8665f952 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -5,6 +5,8 @@ import rmm # noqa: F401 +_CURRENT_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) + @pytest.fixture(scope="session") def datadir(): @@ -22,6 +24,7 @@ def pytest_sessionstart(session): before performing collection and entering the run test loop. """ os.environ["NO_EXTERNAL_ONLY_APIS"] = "1" + os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY def pytest_sessionfinish(session, exitstatus): @@ -30,3 +33,4 @@ def pytest_sessionfinish(session, exitstatus): returning the exit status to the system. """ del os.environ["NO_EXTERNAL_ONLY_APIS"] + del os.environ["_CUDF_TEST_ROOT"] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 585b7d36230..115cd683938 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -39,11 +39,16 @@ } +# The test root is set by pytest to support situations where tests are run from +# a source tree on a built version of cudf. NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") +_CUDF_TEST_ROOT = os.getenv("_CUDF_TEST_ROOT") if NO_EXTERNAL_ONLY_APIS in ("True", "1", "TRUE"): - _cudf_root = os.path.join("python", "cudf", "cudf") - _tests_root = os.path.join(_cudf_root, "tests") + _cudf_root = cudf.__file__.replace("/__init__.py", "") + # If the environment variable for the test root is not set, we default to + # using the path relative to the cudf root directory. + _tests_root = _CUDF_TEST_ROOT or os.path.join(_cudf_root, "tests") def _external_only_api(func, alternative=""): """Decorator to indicate that a function should not be used internally. From 57608eba6c3af4bd555fb0a9eec8adef94c476cd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 09:12:18 -0800 Subject: [PATCH 5/8] Specify paths explicitly when executing tests so that pytest hooks are discovered. --- ci/gpu/build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 53ad948b61c..4acdc372817 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. ############################################## # cuDF GPU build and test script for CI # ############################################## @@ -249,15 +249,15 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf cd "$WORKSPACE/python/custreamz" gpuci_logger "Python py.test for cuStreamz" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz gpuci_logger "Test notebooks" "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log From 05dc3957bda3d98e039306c67fb07e172039cd6b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 09:24:42 -0800 Subject: [PATCH 6/8] Only delete keys if they exist, which depending on the invocation they may not. --- python/cudf/cudf/tests/conftest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 1aa8665f952..fa9061629df 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -32,5 +32,8 @@ def pytest_sessionfinish(session, exitstatus): Called after whole test run finished, right before returning the exit status to the system. """ - del os.environ["NO_EXTERNAL_ONLY_APIS"] - del os.environ["_CUDF_TEST_ROOT"] + try: + del os.environ["NO_EXTERNAL_ONLY_APIS"] + del os.environ["_CUDF_TEST_ROOT"] + except KeyError: + pass From 55e81ec1c40d6a2ea491f7349fd3be2c21204059 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 12:05:06 -0800 Subject: [PATCH 7/8] Add copyright to conftest.py. --- python/cudf/cudf/tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index fa9061629df..b60cc835f01 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import os import pathlib From cc72147136aaf5277b5db34d69c75c32d2ef906c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 15 Feb 2022 13:58:14 -0800 Subject: [PATCH 8/8] Address PR comments. --- python/cudf/cudf/tests/conftest.py | 4 +- python/cudf/cudf/utils/utils.py | 91 ++++++++++++++---------------- 2 files changed, 45 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index b60cc835f01..4d5b5926d6e 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -7,7 +7,7 @@ import rmm # noqa: F401 -_CURRENT_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) +_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) @pytest.fixture(scope="session") @@ -18,7 +18,7 @@ def datadir(): # To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use # the sessionstart and sessionfinish hooks rather than a simple autouse, # session-scope fixture because we need to set these variable before collection -# occurs because the envrionment variable will be checked as soon as cudf is +# occurs because the environment variable will be checked as soon as cudf is # imported anywhere. def pytest_sessionstart(session): """ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 115cd683938..b283f89873d 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -42,61 +42,56 @@ # The test root is set by pytest to support situations where tests are run from # a source tree on a built version of cudf. NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") -_CUDF_TEST_ROOT = os.getenv("_CUDF_TEST_ROOT") - -if NO_EXTERNAL_ONLY_APIS in ("True", "1", "TRUE"): - _cudf_root = cudf.__file__.replace("/__init__.py", "") - # If the environment variable for the test root is not set, we default to - # using the path relative to the cudf root directory. - _tests_root = _CUDF_TEST_ROOT or os.path.join(_cudf_root, "tests") - - def _external_only_api(func, alternative=""): - """Decorator to indicate that a function should not be used internally. - - cudf contains many APIs that exist for pandas compatibility but are - intrinsically inefficient. For some of these cudf has internal - equivalents that are much faster. Usage of the slow public APIs inside - our implementation can lead to unnecessary performance bottlenecks. - Applying this decorator to such functions and then setting the - environment variable NO_EXTERNAL_ONLY_APIS will cause such functions to - raise exceptions if they are called from anywhere inside cudf, making - it easy to identify and excise such usage. - """ - - # If the first arg is a string then an alternative function to use in - # place of this API was provided, so we pass that to a subsequent call. - # It would be cleaner to implement this pattern by using a class - # decorator with a factory method, but there is no way to generically - # wrap docstrings on a class (we would need the docstring to be on the - # class itself, not instances, because that's what `help` looks at) and - # there is also no way to make mypy happy with that approach. - if isinstance(func, str): - return lambda actual_func: _external_only_api(actual_func, func) - - @functools.wraps(func) - def wrapper(*args, **kwargs): - # Check the immediately preceding frame to see if it's in cudf. - frame, lineno = next(traceback.walk_stack(None)) - fn = frame.f_code.co_filename - if _cudf_root in fn and _tests_root not in fn: - raise RuntimeError( - f"External-only API called in {fn} at line {lineno}. " - f"{alternative}" - ) - return func(*args, **kwargs) - return wrapper +_cudf_root = os.path.dirname(cudf.__file__) +# If the environment variable for the test root is not set, we default to +# using the path relative to the cudf root directory. +_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests") + +def _external_only_api(func, alternative=""): + """Decorator to indicate that a function should not be used internally. -else: + cudf contains many APIs that exist for pandas compatibility but are + intrinsically inefficient. For some of these cudf has internal + equivalents that are much faster. Usage of the slow public APIs inside + our implementation can lead to unnecessary performance bottlenecks. + Applying this decorator to such functions and setting the environment + variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise + exceptions if they are called from anywhere inside cudf, making it easy + to identify and excise such usage. - def _external_only_api(func, alternative=""): - """The default implementation is a no-op.""" - if isinstance(func, str): - return lambda actual_func: _external_only_api(actual_func, func) + The `alternative` should be a complete phrase or sentence since it will + be used verbatim in error messages. + """ + # If the first arg is a string then an alternative function to use in + # place of this API was provided, so we pass that to a subsequent call. + # It would be cleaner to implement this pattern by using a class + # decorator with a factory method, but there is no way to generically + # wrap docstrings on a class (we would need the docstring to be on the + # class itself, not instances, because that's what `help` looks at) and + # there is also no way to make mypy happy with that approach. + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + + if not NO_EXTERNAL_ONLY_APIS: return func + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check the immediately preceding frame to see if it's in cudf. + frame, lineno = next(traceback.walk_stack(None)) + fn = frame.f_code.co_filename + if _cudf_root in fn and _tests_root not in fn: + raise RuntimeError( + f"External-only API called in {fn} at line {lineno}. " + f"{alternative}" + ) + return func(*args, **kwargs) + + return wrapper + def scalar_broadcast_to(scalar, size, dtype=None):