Optimize cudf.concat for axis=0 (#9222)

This PR optimizes `cudf.concat` when `axis=0` by not materializing the `RangeIndex` objects that serve as the index of the input `DataFrame` objects. Partially addresses #9200; this is 1/2 of the full optimizations. A follow-up PR to optimize `axis=1` will be opened separately, as there are multiple large changes.

Here is a benchmark:

On `branch-21.10`:
```ipython
IPython 7.27.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: import cudf

In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*100})

In [3]: df2 = cudf.DataFrame({'a':[1, 2, 3]*100}, index=cudf.RangeIndex(300, 600))

In [4]: %timeit cudf.concat([df, df2])
806 µs ± 8.02 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

This PR:
```ipython
IPython 7.27.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: import cudf

In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*100})

In [3]: df2 = cudf.DataFrame({'a':[1, 2, 3]*100}, index=cudf.RangeIndex(300, 600))

In [4]: %timeit cudf.concat([df, df2])
434 µs ± 4.35 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

Authors:
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- Ashwin Srinath (https://github.com/shwina)

URL: #9222
rapidsai · Sep 14, 2021 · cc13060 · cc13060
1 parent eb09d14
commit cc13060
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 40 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -1058,6 +1058,10 @@ def _concat(
         # the number of empty input frames
         num_empty_input_frames = 0
 
+        # Flag indicating whether all of the input DataFrames
+        # have a RangeIndex as their index
+        are_all_range_index = False
+
         for i, obj in enumerate(objs):
             # shallow-copy the input DFs in case the same DF instance
             # is concatenated with itself
@@ -1076,6 +1080,10 @@ def _concat(
                 result_index_length += len(obj)
                 empty_has_index = empty_has_index or len(obj) > 0
 
+            are_all_range_index = (
+                True if i == 0 else are_all_range_index
+            ) and isinstance(obj.index, cudf.RangeIndex)
+
         if join == "inner":
             sets_of_column_names = [set(obj._column_names) for obj in objs]
 
@@ -1150,7 +1158,8 @@ def _concat(
         columns = [
             (
                 []
-                if (ignore_index and not empty_has_index)
+                if are_all_range_index
+                or (ignore_index and not empty_has_index)
                 else list(f._index._data.columns)
             )
             + [f._data[name] if name in f._data else None for name in names]
@@ -1205,14 +1214,21 @@ def _concat(
 
         # Concatenate the Tables
         out = cls._from_data(
-            *libcudf.concat.concat_tables(tables, ignore_index)
+            *libcudf.concat.concat_tables(
+                tables, ignore_index=ignore_index or are_all_range_index
+            )
         )
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex
         # to the result frame.
         if empty_has_index and num_empty_input_frames == len(objs):
             out._index = cudf.RangeIndex(result_index_length)
+        elif are_all_range_index and not ignore_index:
+            out._index = cudf.core.index.GenericIndex._concat(
+                [o._index for o in objs]
+            )
+
         # Reassign the categories for any categorical table cols
         _reassign_categories(
             categories, out._data, indices[first_data_column_position:]

diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
@@ -64,13 +64,25 @@ def test_concat_dataframe(index, nulls, axis):
     # DataFrame
     res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas()
     sol = pd.concat([df, df2, df, df_empty1], axis=axis)
-    assert_eq(res, sol, check_names=False, check_categorical=False)
+    assert_eq(
+        res,
+        sol,
+        check_names=False,
+        check_categorical=False,
+        check_index_type=True,
+    )
 
     # Series
     for c in [i for i in ("x", "y", "z") if i != index]:
         res = gd.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas()
         sol = pd.concat([df[c], df2[c], df[c]], axis=axis)
-        assert_eq(res, sol, check_names=False, check_categorical=False)
+        assert_eq(
+            res,
+            sol,
+            check_names=False,
+            check_categorical=False,
+            check_index_type=True,
+        )
 
     # Index
     res = gd.concat([gdf.index, gdf2.index], axis=axis).to_pandas()
@@ -91,7 +103,13 @@ def test_concat_all_nulls(values):
     gb = gd.Series([None])
     gs = gd.concat([ga, gb])
 
-    assert_eq(ps, gs, check_dtype=False, check_categorical=False)
+    assert_eq(
+        ps,
+        gs,
+        check_dtype=False,
+        check_categorical=False,
+        check_index_type=True,
+    )
 
 
 def test_concat_errors():
@@ -167,7 +185,13 @@ def test_concat_misordered_columns():
     res = gd.concat([gdf, gdf2]).to_pandas()
     sol = pd.concat([df, df2], sort=False)
 
-    assert_eq(res, sol, check_names=False, check_categorical=False)
+    assert_eq(
+        res,
+        sol,
+        check_names=False,
+        check_categorical=False,
+        check_index_type=True,
+    )
 
 
 @pytest.mark.parametrize("axis", [1, "columns"])
@@ -182,7 +206,7 @@ def test_concat_columns(axis):
     expect = pd.concat([pdf1, pdf2], axis=axis)
     got = gd.concat([gdf1, gdf2], axis=axis)
 
-    assert_eq(expect, got)
+    assert_eq(expect, got, check_index_type=True)
 
 
 def test_concat_multiindex_dataframe():
@@ -201,7 +225,9 @@ def test_concat_multiindex_dataframe():
     gdg1 = gd.from_pandas(pdg1)
     gdg2 = gd.from_pandas(pdg2)
     assert_eq(
-        gd.concat([gdg1, gdg2]).astype("float64"), pd.concat([pdg1, pdg2])
+        gd.concat([gdg1, gdg2]).astype("float64"),
+        pd.concat([pdg1, pdg2]),
+        check_index_type=True,
     )
     assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))
 
@@ -221,7 +247,9 @@ def test_concat_multiindex_series():
     pdg2 = pdg["z"]
     gdg1 = gd.from_pandas(pdg1)
     gdg2 = gd.from_pandas(pdg2)
-    assert_eq(gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]))
+    assert_eq(
+        gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]), check_index_type=True
+    )
     assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1))
 
 
@@ -363,10 +391,19 @@ def test_concat_mixed_input():
     assert_eq(
         pd.concat([pdf1, None, pdf2, None]),
         gd.concat([gdf1, None, gdf2, None]),
+        check_index_type=True,
+    )
+    assert_eq(
+        pd.concat([pdf1, None]), gd.concat([gdf1, None]), check_index_type=True
+    )
+    assert_eq(
+        pd.concat([None, pdf2]), gd.concat([None, gdf2]), check_index_type=True
+    )
+    assert_eq(
+        pd.concat([None, pdf2, pdf1]),
+        gd.concat([None, gdf2, gdf1]),
+        check_index_type=True,
     )
-    assert_eq(pd.concat([pdf1, None]), gd.concat([gdf1, None]))
-    assert_eq(pd.concat([None, pdf2]), gd.concat([None, gdf2]))
-    assert_eq(pd.concat([None, pdf2, pdf1]), gd.concat([None, gdf2, gdf1]))
 
 
 @pytest.mark.parametrize(
@@ -540,7 +577,7 @@ def test_concat_empty_dataframes(df, other, ignore_index):
             else:
                 expected[key] = expected[key].fillna(-1)
                 actual[key] = col.fillna(-1)
-        assert_eq(expected, actual, check_dtype=False)
+        assert_eq(expected, actual, check_dtype=False, check_index_type=True)
     else:
         assert_eq(
             expected, actual, check_index_type=False if gdf.empty else True
@@ -564,7 +601,7 @@ def test_concat_empty_and_nonempty_series(ignore_index, data, axis):
     got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
     expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)
 
-    assert_eq(got, expect)
+    assert_eq(got, expect, check_index_type=True)
 
 
 @pytest.mark.parametrize("ignore_index", [True, False])
@@ -577,7 +614,7 @@ def test_concat_two_empty_series(ignore_index, axis):
     got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
     expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)
 
-    assert_eq(got, expect)
+    assert_eq(got, expect, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -670,6 +707,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis):
             ignore_index=ignore_index,
             axis=axis,
         ),
+        check_index_type=True,
     )
 
 
@@ -1247,6 +1285,7 @@ def test_concat_preserve_order():
     assert_eq(
         pd.concat(dfs, join="inner"),
         gd.concat([gd.DataFrame(df) for df in dfs], join="inner"),
+        check_index_type=True,
     )
 
 
@@ -1255,7 +1294,11 @@ def test_concat_preserve_order():
 def test_concat_single_object(ignore_index, typ):
     """Ensure that concat on a single object does not change it."""
     obj = typ([1, 2, 3])
-    assert_eq(gd.concat([obj], ignore_index=ignore_index, axis=0), obj)
+    assert_eq(
+        gd.concat([obj], ignore_index=ignore_index, axis=0),
+        obj,
+        check_index_type=True,
+    )
 
 
 @pytest.mark.parametrize("ltype", [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2)])
@@ -1277,7 +1320,7 @@ def test_concat_decimal_dataframe(ltype, rtype):
     got = gd.concat([gdf1, gdf2])
     expected = pd.concat([pdf1, pdf2])
 
-    assert_eq(expected, got)
+    assert_eq(expected, got, check_index_type=True)
 
 
 @pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)])
@@ -1294,7 +1337,7 @@ def test_concat_decimal_series(ltype, rtype):
     got = gd.concat([gs1, gs2])
     expected = pd.concat([ps1, ps2])
 
-    assert_eq(expected, got)
+    assert_eq(expected, got, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1395,7 +1438,7 @@ def test_concat_decimal_series(ltype, rtype):
 )
 def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
     df = gd.concat([df1, df2, df3])
-    assert_eq(df, expected)
+    assert_eq(df, expected, check_index_type=True)
     assert_eq(df.val.dtype, expected.val.dtype)
 
 
@@ -1487,7 +1530,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected):
 )
 def test_concat_decimal_numeric_series(s1, s2, s3, expected):
     s = gd.concat([s1, s2, s3])
-    assert_eq(s, expected)
+    assert_eq(s, expected, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1558,7 +1601,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected):
 )
 def test_concat_decimal_non_numeric(s1, s2, expected):
     s = gd.concat([s1, s2])
-    assert_eq(s, expected)
+    assert_eq(s, expected, check_index_type=True)
 
 
 @pytest.mark.parametrize(
@@ -1581,4 +1624,4 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
 )
 def test_concat_struct_column(s1, s2, expected):
     s = gd.concat([s1, s2])
-    assert_eq(s, expected)
+    assert_eq(s, expected, check_index_type=True)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -1487,7 +1487,7 @@ def test_csv_writer_file_append(tmpdir):
 
     result = cudf.read_csv(gdf_df_fname)
     expected = cudf.concat([gdf1, gdf2], ignore_index=True)
-    assert_eq(result, expected)
+    assert_eq(result, expected, check_index_type=True)
 
 
 def test_csv_writer_buffer(tmpdir):