Use generator expressions in any/all functions. (#10736)

This PR uses generator expressions in `any(...)` and `all(...)` to avoid allocating a list in memory while maximizing the potential benefit of early exit from the `any`/`all` function. I also fixed a few miscellaneous things (~ 10 lines): - Use `cls` in `classmethod`s - Simplify a lambda expression - Use `super()` with no arguments if the arguments are the parent class and `self` - Parenthesize multi-line strings with implicit concatenation to clarify the behavior when written in a tuple of values Note: Some of these were caught by https://codereview.doctor/rapidsai/cudf. In some places, the bot correctly identified a problem but its suggestions were invalid or incomplete. I identified steps for improvement beyond what the bot suggested for most of these cases. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Michael Wang (https://github.com/isVoid) URL: #10736
rapidsai · Apr 30, 2022 · 027c34a · 027c34a
1 parent bf10a94
commit 027c34a
Show file tree

Hide file tree

Showing 11 changed files with 32 additions and 36 deletions.
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import random
 from collections import OrderedDict
@@ -312,13 +312,9 @@ def sanitize(value, struct_type):
         return tuple(values_list)
 
     has_nulls_or_nullable_dtype = any(
-        [
-            True
-            if df[col].dtype in pandas_dtypes_to_np_dtypes
-            or df[col].isnull().any()
-            else False
-            for col in df.columns
-        ]
+        (col := df[colname]).dtype in pandas_dtypes_to_np_dtypes
+        or col.isnull().any()
+        for colname in df.columns
     )
     pdf = df.copy(deep=True)
     for field in arrow_table_schema:

diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 import pandas as pd
 import pyarrow as pa
 
@@ -39,7 +39,7 @@ def closed(self):
         return self._closed
 
     @classmethod
-    def from_arrow(self, data):
+    def from_arrow(cls, data):
         new_col = super().from_arrow(data.storage)
         size = len(data)
         dtype = IntervalDtype.from_arrow(data.type)

diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
@@ -31,7 +31,7 @@ def base_size(self):
             return len(self.base_children[0])
 
     @classmethod
-    def from_arrow(self, data):
+    def from_arrow(cls, data):
         size = len(data)
         dtype = cudf.core.dtypes.StructDtype.from_arrow(data.type)
 

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -160,7 +160,7 @@ def _can_downcast_to_series(self, df, arg):
                 ):
                     return True
             dtypes = df.dtypes.values.tolist()
-            all_numeric = all([is_numeric_dtype(t) for t in dtypes])
+            all_numeric = all(is_numeric_dtype(t) for t in dtypes)
             if all_numeric:
                 return True
         if ncols == 1:
@@ -720,7 +720,7 @@ def _init_from_series_list(self, data, columns, index):
 
             final_index = as_index(index)
 
-        series_lengths = list(map(lambda x: len(x), data))
+        series_lengths = list(map(len, data))
         data = numeric_normalize_types(*data)
         if series_lengths.count(series_lengths[0]) == len(series_lengths):
             # Calculating the final dataframe columns by
@@ -2999,11 +2999,11 @@ def agg(self, aggs, axis=None):
 
         elif isinstance(aggs, dict):
             cols = aggs.keys()
-            if any([callable(val) for val in aggs.values()]):
+            if any(callable(val) for val in aggs.values()):
                 raise NotImplementedError(
                     "callable parameter is not implemented yet"
                 )
-            elif all([isinstance(val, str) for val in aggs.values()]):
+            elif all(isinstance(val, str) for val in aggs.values()):
                 result = cudf.Series(index=cols)
                 for key, value in aggs.items():
                     col = df_normalized[key]
@@ -3013,7 +3013,7 @@ def agg(self, aggs, axis=None):
                             f"'Series' object"
                         )
                     result[key] = getattr(col, value)()
-            elif all([isinstance(val, abc.Iterable) for val in aggs.values()]):
+            elif all(isinstance(val, abc.Iterable) for val in aggs.values()):
                 idxs = set()
                 for val in aggs.values():
                     if isinstance(val, str):
@@ -6032,9 +6032,7 @@ def append(
             if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all():
                 other = other.reindex(columns=cols)
 
-        return super(DataFrame, self)._append(
-            other, ignore_index, verify_integrity, sort
-        )
+        return super()._append(other, ignore_index, verify_integrity, sort)
 
     @_cudf_nvtx_annotate
     @copy_docstring(reshape.pivot)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
@@ -629,7 +629,7 @@ def _generate_months_column(self, size, op):
     def _is_no_op(self) -> bool:
         # some logic could be implemented here for more complex cases
         # such as +1 year, -12 months
-        return all([i == 0 for i in self._kwds.values()])
+        return all(i == 0 for i in self._kwds.values())
 
     def __neg__(self):
         new_scalars = {k: -v for k, v in self._kwds.items()}

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
@@ -56,7 +56,7 @@ def _write_parquet(
         "row_group_size_rows": row_group_size_rows,
         "partitions_info": partitions_info,
     }
-    if all([ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs]):
+    if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
         with ExitStack() as stack:
             fsspec_objs = [stack.enter_context(file) for file in paths_or_bufs]
             file_objs = [

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
@@ -133,7 +133,7 @@ def assert_eq(left, right, **kwargs):
         # Use the overloaded __eq__ of the operands
         if left == right:
             return True
-        elif any([np.issubdtype(type(x), np.floating) for x in (left, right)]):
+        elif any(np.issubdtype(type(x), np.floating) for x in (left, right)):
             np.testing.assert_almost_equal(left, right)
         else:
             np.testing.assert_equal(left, right)

diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import pandas as pd
 import pytest
@@ -17,7 +17,7 @@ def __init__(self, obj):
     @staticmethod
     def _validate(obj):
         cols = obj.columns
-        if not all([vertex in cols for vertex in ["x", "y"]]):
+        if not all(vertex in cols for vertex in ["x", "y"]):
             raise AttributeError("Must have vertices 'x', 'y'.")
 
     @property

diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
@@ -189,10 +189,8 @@ def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array):
         )
     elif isinstance(column.dtype, StructDtype):
         return array.type.equals(column.dtype.to_arrow()) and all(
-            [
-                assert_column_array_dtype_equal(child, array.field(i))
-                for i, child in enumerate(column.base_children)
-            ]
+            assert_column_array_dtype_equal(child, array.field(i))
+            for i, child in enumerate(column.base_children)
         )
     elif isinstance(
         column.dtype, (Decimal128Dtype, Decimal64Dtype, Decimal32Dtype)

diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
@@ -762,7 +762,7 @@ def test_multiindex_copy_deep(data, deep):
         lptrs = [child.base_data.ptr for child in lchildren]
         rptrs = [child.base_data.ptr for child in rchildren]
 
-        assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
+        assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
     elif isinstance(data, cudf.MultiIndex):
         mi1 = data
@@ -772,19 +772,19 @@ def test_multiindex_copy_deep(data, deep):
         lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels]
         rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels]
 
-        assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
+        assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
         # Assert ._codes identity
         lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()]
         rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()]
 
-        assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
+        assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
         # Assert ._data identity
         lptrs = [d.base_data.ptr for _, d in mi1._data.items()]
         rptrs = [d.base_data.ptr for _, d in mi2._data.items()]
 
-        assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
+        assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
 
 @pytest.mark.parametrize(

diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
@@ -308,8 +308,10 @@ def test_character_tokenize_series():
             "hello world",
             "sdf",
             None,
-            "goodbye, one-two:three~four+five_six@sev"
-            "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé Ǆ",
+            (
+                "goodbye, one-two:three~four+five_six@sev"
+                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé Ǆ"
+            ),
         ]
     )
     expected = cudf.Series(
@@ -423,8 +425,10 @@ def test_character_tokenize_index():
             "hello world",
             "sdf",
             None,
-            "goodbye, one-two:three~four+five_six@sev"
-            "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé Ǆ",
+            (
+                "goodbye, one-two:three~four+five_six@sev"
+                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé Ǆ"
+            ),
         ]
     )
     expected = cudf.core.index.as_index(