From 210e34dcbc7716254d56d6c74164a025638e067b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 11 Nov 2020 07:41:52 -0800
Subject: [PATCH 01/16] fix typo

---
 python/cudf/cudf/_lib/parquet.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 3c2ff128685..b97529d39d1 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -142,7 +142,7 @@ cpdef generate_pandas_metadata(Table table, index):
                             "'category' column dtypes are currently not "
                             + "supported by the gpu accelerated parquet writer"
                         )
-                    elif is_list_dtype(col):
+                    elif is_list_dtype(idx):
                         types.append(col.dtype.to_arrow())
                     else:
                         types.append(np_to_pa_dtype(idx.dtype))

From f16ab6a1cedbfab0c71f80fcbf527c484bbae336 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 11 Nov 2020 20:54:01 -0600
Subject: [PATCH 02/16] Fix writing index in parquet writer.

---
 python/cudf/cudf/_lib/parquet.pyx      | 98 +++++++++++++++-----------
 python/cudf/cudf/tests/test_parquet.py | 34 +++++++++
 2 files changed, 91 insertions(+), 41 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index b97529d39d1..a213d8c14fe 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -119,37 +119,34 @@ cpdef generate_pandas_metadata(Table table, index):
 
     # Indexes
     if index is not False:
-        for name in table._index.names:
-            if name is not None:
-                if isinstance(table._index, cudf.core.multiindex.MultiIndex):
-                    idx = table.index.get_level_values(name)
-                else:
-                    idx = table.index
-
-                if isinstance(idx, cudf.core.index.RangeIndex):
-                    descr = {
-                        "kind": "range",
-                        "name": table.index.name,
-                        "start": table.index._start,
-                        "stop": table.index._stop,
-                        "step": 1,
-                    }
-                else:
-                    descr = name
-                    col_names.append(name)
-                    if is_categorical_dtype(idx):
-                        raise ValueError(
-                            "'category' column dtypes are currently not "
-                            + "supported by the gpu accelerated parquet writer"
-                        )
-                    elif is_list_dtype(idx):
-                        types.append(col.dtype.to_arrow())
-                    else:
-                        types.append(np_to_pa_dtype(idx.dtype))
-                    index_levels.append(idx)
-                index_descriptors.append(descr)
+        for level, name in enumerate(table._index.names):
+            if isinstance(table._index, cudf.core.multiindex.MultiIndex):
+                idx = table.index.get_level_values(level)
+            else:
+                idx = table.index
+
+            if isinstance(idx, cudf.core.index.RangeIndex):
+                descr = {
+                    "kind": "range",
+                    "name": table.index.name,
+                    "start": table.index._start,
+                    "stop": table.index._stop,
+                    "step": 1,
+                }
             else:
-                col_names.append(name)
+                descr = _index_level_name(idx.name, level, col_names)
+                if is_categorical_dtype(idx):
+                    raise ValueError(
+                        "'category' column dtypes are currently not "
+                        + "supported by the gpu accelerated parquet writer"
+                    )
+                elif is_list_dtype(idx):
+                    types.append(idx.dtype.to_arrow())
+                else:
+                    types.append(np_to_pa_dtype(idx.dtype))
+                index_levels.append(idx)
+            col_names.append(name)
+            index_descriptors.append(descr)
 
     metadata = pa.pandas_compat.construct_metadata(
         table,
@@ -295,21 +292,20 @@ cpdef write_parquet(
 
     cdef vector[string] column_names
     cdef map[string, string] user_data
-    cdef table_view tv = table.data_view()
+    cdef table_view tv
     cdef unique_ptr[cudf_io_types.data_sink] _data_sink
     cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink)
 
-    if index is not False:
+    if index is not False and not isinstance(table._index, cudf.RangeIndex):
         tv = table.view()
-        if isinstance(table._index, cudf.core.multiindex.MultiIndex):
-            for idx_name in table._index.names:
-                column_names.push_back(str.encode(idx_name))
-        else:
-            if table._index.name is not None:
-                column_names.push_back(str.encode(table._index.name))
-            else:
-                # No named index exists so just write out columns
-                tv = table.data_view()
+        for level, idx_name in enumerate(table._index.names):
+            column_names.push_back(
+                str.encode(
+                    _index_level_name(idx_name, level, table._column_names)
+                )
+            )
+    else:
+        tv = table.data_view()
 
     for col_name in table._column_names:
         column_names.push_back(str.encode(col_name))
@@ -541,3 +537,23 @@ cdef Column _update_column_struct_field_names(
             )
         col.set_base_children(tuple(children))
     return col
+
+
+def _index_level_name(index_name, level, column_names):
+    """
+    Return the index level name, or a default if it is None or a column name.
+
+    Parameters
+    ----------
+    index_name : name of the index level; may be None
+    level : position of the level, used to build the default name
+    column_names : names already taken by columns (tested with `in`)
+
+    Returns
+    -------
+    name : `index_name` as given, or the str ``__index_level_<level>__``
+    """
+    if index_name is not None and index_name not in column_names:
+        return index_name
+    else:
+        return f"__index_level_{level}__"
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 633f5b472e7..73035a4717c 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1579,3 +1579,37 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
 
     assert_eq(actual_gdf, expected_gdf)
+
+
+@pytest.mark.parametrize(
+    "pdf",
+    [
+        pd.DataFrame(index=[1, 2, 3]),
+        # pd.DataFrame(index=pd.RangeIndex(0, 10, 1)),
+        pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]),
+        pd.DataFrame(
+            {"b": [11, 22, 33], "c": ["a", "b", "c"]},
+            index=pd.Index(["a", "b", "c"], name="custom name"),
+        ),
+        pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")),
+    ],
+)
+@pytest.mark.parametrize("index", [None, True, False])
+def test_parquet_index(tmpdir, pdf, index):
+    pandas_path = tmpdir.join("pandas_index.parquet")
+    cudf_path = tmpdir.join("cudf_index.parquet")
+
+    gdf = cudf.from_pandas(pdf)
+
+    pdf.to_parquet(pandas_path, index=index)
+    gdf.to_parquet(cudf_path, index=index)
+    # File written by cuDF: the pandas and cuDF readers must agree on it.
+    expected = pd.read_parquet(cudf_path)
+    actual = cudf.read_parquet(cudf_path)
+
+    assert_eq(expected, actual)
+    # File written by pandas: the pandas and cuDF readers must agree on it.
+    expected = pd.read_parquet(pandas_path)
+    actual = cudf.read_parquet(pandas_path)
+
+    assert_eq(expected, actual)

From 614500a5fac0505ac9fde0e20fb53daf9224193e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Nov 2020 14:52:34 -0600
Subject: [PATCH 03/16] Fix Parquet reader index handling

---
 python/cudf/cudf/_lib/parquet.pyx             | 60 ++++++++++++++-----
 python/cudf/cudf/core/dataframe.py            |  9 +--
 python/cudf/cudf/core/index.py                |  7 +++
 python/cudf/cudf/tests/test_pandas_interop.py | 10 ++--
 python/cudf/cudf/tests/test_parquet.py        | 52 +++++++++-------
 5 files changed, 89 insertions(+), 49 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index a213d8c14fe..3d8d04e07a9 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -6,6 +6,7 @@ import cudf
 import errno
 import os
 import pyarrow as pa
+from collections import OrderedDict
 
 try:
     import ujson as json
@@ -129,9 +130,9 @@ cpdef generate_pandas_metadata(Table table, index):
                 descr = {
                     "kind": "range",
                     "name": table.index.name,
-                    "start": table.index._start,
-                    "stop": table.index._stop,
-                    "step": 1,
+                    "start": table.index.start,
+                    "stop": table.index.stop,
+                    "step": table.index.step,
                 }
             else:
                 descr = _index_level_name(idx.name, level, col_names)
@@ -222,15 +223,24 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     column_names = [x.decode() for x in c_out_table.metadata.column_names]
 
     # Access the Parquet user_data json to find the index
-    index_col = ''
+    index_col = None
     cdef map[string, string] user_data = c_out_table.metadata.user_data
     json_str = user_data[b'pandas'].decode('utf-8')
     meta = None
     if json_str != "":
         meta = json.loads(json_str)
         if 'index_columns' in meta and len(meta['index_columns']) > 0:
-            index_col = meta['index_columns'][0]
-
+            index_col = meta['index_columns']
+            if isinstance(index_col[0], dict) and \
+                    index_col[0]['kind'] == 'range':
+                is_range_index = True
+            else:
+                is_range_index = False
+                index_col_names = OrderedDict()
+                for idx_col in index_col:
+                    for c in meta['columns']:
+                        if c['field_name'] == idx_col:
+                            index_col_names[idx_col] = c['name']
     df = cudf.DataFrame._from_table(
         Table.from_unique_ptr(
             move(c_out_table.tbl),
@@ -247,7 +257,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
 
         if not column_names:
             column_names = [o['name'] for o in meta['columns']]
-            if index_col in cols_dtype_map:
+            if not is_range_index and index_col in cols_dtype_map:
                 column_names.remove(index_col)
 
         for col in column_names:
@@ -258,16 +268,38 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
             )
 
     # Set the index column
-    if index_col is not '' and isinstance(index_col, str):
-        if index_col in column_names:
-            df = df.set_index(index_col)
-            new_index_name = pa.pandas_compat._backwards_compatible_index_name(
-                df.index.name, df.index.name
+    if index_col is not None and len(index_col) > 0:
+        if is_range_index:
+            range_index_meta = index_col[0]
+            idx = cudf.RangeIndex(
+                start=range_index_meta['start'],
+                stop=range_index_meta['stop'],
+                step=range_index_meta['step'],
+                name=range_index_meta['name']
             )
-            df.index.name = new_index_name
+            if skiprows is not None:
+                idx = idx[skiprows:]
+            if num_rows is not None:
+                idx = idx[:num_rows]
+            df.index = idx
+        elif set(index_col).issubset(column_names):
+            index_data = df[index_col]
+            actual_index_names = list(index_col_names.values())
+            if len(index_data._data) == 1:
+                idx = cudf.Index(
+                    index_data._data.columns[0],
+                    name=actual_index_names[0]
+                )
+            else:
+                idx = cudf.MultiIndex.from_frame(
+                    index_data,
+                    names=actual_index_names
+                )
+            df.drop(columns=index_col, inplace=True)
+            df.index = idx
         else:
             if use_pandas_metadata:
-                df.index.name = index_col
+                df.index.names = index_col
 
     return df
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a730e3488eb..107d2d20e38 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4940,10 +4940,7 @@ def from_pandas(cls, dataframe, nan_as_null=None):
             df.columns = dataframe.columns
 
         # Set index
-        if isinstance(dataframe.index, pd.MultiIndex):
-            index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
-        else:
-            index = dataframe.index
+        index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
         result = df.set_index(index)
 
         return result
@@ -7137,10 +7134,8 @@ def from_pandas(obj, nan_as_null=None):
     elif isinstance(obj, pd.MultiIndex):
         return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)
     elif isinstance(obj, pd.RangeIndex):
-        if obj._step and obj._step != 1:
-            raise ValueError("cudf RangeIndex requires step == 1")
         return cudf.core.index.RangeIndex(
-            obj._start, stop=obj._stop, name=obj.name
+            start=obj.start, stop=obj.stop, step=obj.step, name=obj.name
         )
     elif isinstance(obj, pd.Index):
         return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 7485b99b0ce..56348e4a1a4 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1532,6 +1532,13 @@ def stop(self):
         """
         return self._stop
 
+    @property
+    def step(self):
+        """
+        The value of the step parameter.
+        """
+        return self._step
+
     @property
     def _num_columns(self):
         return 1
diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py
index 064b73f1052..15b1acdfc08 100644
--- a/python/cudf/cudf/tests/test_pandas_interop.py
+++ b/python/cudf/cudf/tests/test_pandas_interop.py
@@ -1,8 +1,7 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
-import pytest
 
 import cudf
 from cudf.core import DataFrame
@@ -85,6 +84,7 @@ def test_from_pandas_rangeindex():
 
 
 def test_from_pandas_rangeindex_step():
-    idx1 = pd.RangeIndex(start=0, stop=8, step=2, name="myindex")
-    with pytest.raises(ValueError):
-        cudf.from_pandas(idx1)
+    expected = pd.RangeIndex(start=0, stop=8, step=2, name="myindex")
+    actual = cudf.from_pandas(expected)
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 73035a4717c..e66adff7d5e 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2019-2020, NVIDIA CORPORATION.
+
 import os
 import pathlib
 import random
@@ -967,15 +968,7 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
     src.to_parquet(fname)
     assert os.path.exists(fname)
 
-    expect = pd.DataFrame(
-        {
-            "a": list_gen(int_gen, skip, num_rows - skip, 80, 50),
-            "b": list_gen(string_gen, skip, num_rows - skip, 80, 50),
-            "c": list_gen(
-                int_gen, skip, num_rows - skip, 80, 50, include_validity=True
-            ),
-        }
-    )
+    expect = src.iloc[skip:]
     got = cudf.read_parquet(fname, skiprows=skip)
     assert_eq(expect, got, check_dtype=False)
 
@@ -998,18 +991,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir):
     assert os.path.exists(fname)
 
     rows_to_read = min(3, num_rows - skip)
-    expect = pd.DataFrame(
-        {
-            "a": list_gen(int_gen, skip, rows_to_read, 80, 50),
-            "b": list_gen(string_gen, skip, rows_to_read, 80, 50),
-            "c": list_gen(
-                int_gen, skip, rows_to_read, 80, 50, include_validity=True
-            ),
-            "d": list_gen(
-                string_gen, skip, rows_to_read, 80, 50, include_validity=True
-            ),
-        }
-    )
+    expect = src.iloc[skip:].head(rows_to_read)
     got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
     assert_eq(expect, got, check_dtype=False)
 
@@ -1514,7 +1496,7 @@ def test_parquet_writer_sliced(tmpdir):
     df_select = df.iloc[1:3]
 
     df_select.to_parquet(cudf_path)
-    assert_eq(cudf.read_parquet(cudf_path), df_select.reset_index(drop=True))
+    assert_eq(cudf.read_parquet(cudf_path), df_select)
 
 
 def test_parquet_writer_list_basic(tmpdir):
@@ -1585,13 +1567,37 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     "pdf",
     [
         pd.DataFrame(index=[1, 2, 3]),
-        # pd.DataFrame(index=pd.RangeIndex(0, 10, 1)),
+        pytest.param(
+            pd.DataFrame(index=pd.RangeIndex(0, 10, 1)),
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/37897 "
+                "https://github.com/pandas-dev/pandas/issues/37896"
+            ),
+        ),
         pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]),
         pd.DataFrame(
             {"b": [11, 22, 33], "c": ["a", "b", "c"]},
             index=pd.Index(["a", "b", "c"], name="custom name"),
         ),
+        pd.DataFrame(
+            {"a": [10, 11, 12], "b": [99, 88, 77]},
+            index=pd.RangeIndex(12, 17, 2),
+        ),
+        pd.DataFrame(
+            {"b": [99, 88, 77]},
+            index=pd.RangeIndex(22, 27, 2, name="hello index"),
+        ),
         pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")),
+        pd.DataFrame(
+            {"a": ["a", "bb", "cc"], "b": [10, 21, 32]},
+            index=pd.MultiIndex.from_tuples([[1, 2], [10, 11], [15, 16]]),
+        ),
+        pd.DataFrame(
+            {"a": ["a", "bb", "cc"], "b": [10, 21, 32]},
+            index=pd.MultiIndex.from_tuples(
+                [[1, 2], [10, 11], [15, 16]], names=["first", "second"]
+            ),
+        ),
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])

From 3bb0aecc152f45ec3a5477ac32c3184d1dcca2a0 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 16 Nov 2020 15:01:46 -0600
Subject: [PATCH 04/16] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff00d2ac33b..8375bbd4026 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -123,6 +123,7 @@
 - PR #6742 Fix concat bug in dask_cudf Series/Index creation
 - PR #6632 Fix DataFrame initialization from list of dicts
 - PR #6767 Fix sort order of parameters in `test_scalar_invalid_implicit_conversion` pytest
+- PR #6771 Fix index handling in parquet reader and writer
 
 
 # cuDF 0.16.0 (21 Oct 2020)

From 01c6423fe8c7298516a2fbc61f5bcd38b9b048f7 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 23 Nov 2020 10:48:18 -0800
Subject: [PATCH 05/16] enable respecting columns and index

---
 python/cudf/cudf/core/dataframe.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a730e3488eb..6fd0d6a8040 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -200,6 +200,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
         """
         super().__init__()
 
+        if isinstance(columns, (Series, cudf.Index)):
+            columns = columns.to_pandas()
+
         if isinstance(data, ColumnAccessor):
             self._data = data
             if index is None:
@@ -207,16 +210,15 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
             self.index = as_index(index)
             return None
 
-        if isinstance(data, DataFrame):
-            self._data = data._data
-            self._index = data._index
-            self.columns = data.columns
-            return
+        if isinstance(data, (DataFrame, pd.DataFrame)):
+            if columns is not None:
+                data = data[columns]
+
+            if isinstance(data, pd.DataFrame):
+                data = self.from_pandas(data)
 
-        if isinstance(data, pd.DataFrame):
-            data = self.from_pandas(data)
             self._data = data._data
-            self._index = data._index
+            self._index = data._index if index is None else as_index(index)
             self.columns = data.columns
             return
 
@@ -226,8 +228,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
             else:
                 self._index = as_index(index)
             if columns is not None:
-                if isinstance(columns, (Series, cudf.Index)):
-                    columns = columns.to_pandas()
 
                 self._data = ColumnAccessor(
                     OrderedDict.fromkeys(

From a9d920011df06156affc4a26aef71f4a344a5447 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 23 Nov 2020 13:59:00 -0800
Subject: [PATCH 06/16] fix columnAccessor constructor

---
 python/cudf/cudf/core/dataframe.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6fd0d6a8040..c4c3ad6e1f1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -204,6 +204,15 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
             columns = columns.to_pandas()
 
         if isinstance(data, ColumnAccessor):
+            if columns is not None:
+                data = ColumnAccessor(
+                    data=OrderedDict(
+                        (col_name, data[col_name]) for col_name in columns
+                    ),
+                    multiindex=data.multiindex,
+                    level_names=data.level_names,
+                )
+
             self._data = data
             if index is None:
                 index = as_index(range(self._data.nrows))

From b2c45b3b72da6e013d3daa4e29ce92ae4c49226a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 23 Nov 2020 17:22:52 -0800
Subject: [PATCH 07/16] handle non-existent columns

---
 python/cudf/cudf/core/dataframe.py       | 37 +++++++++++++++++-------
 python/cudf/cudf/tests/test_dataframe.py | 34 ++++++++++++++++++++++
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c4c3ad6e1f1..35838f1189e 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -205,12 +205,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
 
         if isinstance(data, ColumnAccessor):
             if columns is not None:
-                data = ColumnAccessor(
-                    data=OrderedDict(
-                        (col_name, data[col_name]) for col_name in columns
-                    ),
-                    multiindex=data.multiindex,
-                    level_names=data.level_names,
+                data = _get_columns_from_column_accessor(
+                    data=data, columns=columns
                 )
 
             self._data = data
@@ -220,13 +216,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
             return None
 
         if isinstance(data, (DataFrame, pd.DataFrame)):
-            if columns is not None:
-                data = data[columns]
-
             if isinstance(data, pd.DataFrame):
                 data = self.from_pandas(data)
 
-            self._data = data._data
+            if columns is not None:
+                self._data = _get_columns_from_column_accessor(
+                    data=data._data, columns=columns
+                )
+            else:
+                self._data = data._data
+
             self._index = data._index if index is None else as_index(index)
             self.columns = data.columns
             return
@@ -7306,3 +7305,21 @@ def _get_host_unique(array):
         return [array]
     else:
         return set(array)
+
+
+def _get_columns_from_column_accessor(data, columns):
+    return ColumnAccessor(
+        data=OrderedDict(
+            (
+                col_name,
+                data[col_name]
+                if col_name in data
+                else cudf.core.column.column_empty(
+                    row_count=data.nrows, dtype="float64", masked=True
+                ),
+            )
+            for col_name in columns
+        ),
+        multiindex=data.multiindex,
+        level_names=data.level_names,
+    )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 69eb70e7201..82ce87c668b 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -7995,3 +7995,37 @@ def test_dataframe_from_pandas_duplicate_columns():
         ValueError, match="Duplicate column names are not allowed"
     ):
         gd.from_pandas(pdf)
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        pd.DataFrame(
+            {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]}
+        )
+    ],
+)
+@pytest.mark.parametrize(
+    "columns",
+    [
+        None,
+        ["a"],
+        ["c", "a"],
+        ["b", "a", "c"],
+        [],
+        pd.Index(["c", "a"]),
+        gd.Index(["c", "a"]),
+    ],
+)
+def test_dataframe_constructor_columns(df, columns):
+    gdf = gd.from_pandas(df)
+
+    expected = pd.DataFrame(
+        df,
+        columns=columns.to_pandas()
+        if isinstance(columns, gd.Index)
+        else columns,
+    )
+    actual = gd.DataFrame(gdf, columns=columns)
+
+    assert_eq(expected, actual)

From c1cf47ec9e9de25e9749f38d294475422fe49401 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 24 Nov 2020 13:00:17 -0600
Subject: [PATCH 08/16] Handle more cases and add tests for the same.

---
 python/cudf/cudf/core/dataframe.py       | 34 +++++++++++-------------
 python/cudf/cudf/tests/test_dataframe.py | 26 +++++++++++++-----
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 35838f1189e..d07b03db7c2 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -204,33 +204,29 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
             columns = columns.to_pandas()
 
         if isinstance(data, ColumnAccessor):
+            if index is None:
+                index = as_index(range(data.nrows))
+
             if columns is not None:
                 data = _get_columns_from_column_accessor(
-                    data=data, columns=columns
+                    column_accessor=data, columns=columns
                 )
-
             self._data = data
-            if index is None:
-                index = as_index(range(self._data.nrows))
             self.index = as_index(index)
-            return None
-
-        if isinstance(data, (DataFrame, pd.DataFrame)):
+        elif isinstance(data, (DataFrame, pd.DataFrame)):
             if isinstance(data, pd.DataFrame):
                 data = self.from_pandas(data)
 
             if columns is not None:
                 self._data = _get_columns_from_column_accessor(
-                    data=data._data, columns=columns
+                    column_accessor=data._data, columns=columns
                 )
             else:
                 self._data = data._data
+                self.columns = data.columns
 
             self._index = data._index if index is None else as_index(index)
-            self.columns = data.columns
-            return
-
-        if data is None:
+        elif data is None:
             if index is None:
                 self._index = RangeIndex(0)
             else:
@@ -7307,19 +7303,21 @@ def _get_host_unique(array):
         return set(array)
 
 
-def _get_columns_from_column_accessor(data, columns):
+def _get_columns_from_column_accessor(column_accessor, columns):
     return ColumnAccessor(
         data=OrderedDict(
             (
                 col_name,
-                data[col_name]
-                if col_name in data
+                column_accessor[col_name]
+                if col_name in column_accessor
                 else cudf.core.column.column_empty(
-                    row_count=data.nrows, dtype="float64", masked=True
+                    row_count=column_accessor.nrows,
+                    dtype="object",
+                    masked=True,
                 ),
             )
             for col_name in columns
         ),
-        multiindex=data.multiindex,
-        level_names=data.level_names,
+        multiindex=column_accessor.multiindex,
+        level_names=column_accessor.level_names,
     )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 82ce87c668b..7476e516679 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8015,17 +8015,29 @@ def test_dataframe_from_pandas_duplicate_columns():
         [],
         pd.Index(["c", "a"]),
         gd.Index(["c", "a"]),
+        ["abc", "a"],
+        ["column_not_exists1", "column_not_exists2"],
     ],
 )
 def test_dataframe_constructor_columns(df, columns):
-    gdf = gd.from_pandas(df)
+    def assert_local_eq(actual, df, expected, host_columns):
+        if host_columns is not None and any(
+            col not in df.columns for col in host_columns
+        ):
+            assert_eq(expected, actual, check_dtype=False)
+        else:
+            assert_eq(expected, actual)
 
-    expected = pd.DataFrame(
-        df,
-        columns=columns.to_pandas()
-        if isinstance(columns, gd.Index)
-        else columns,
+    gdf = gd.from_pandas(df)
+    host_columns = (
+        columns.to_pandas() if isinstance(columns, gd.Index) else columns
     )
+
+    expected = pd.DataFrame(df, columns=host_columns)
     actual = gd.DataFrame(gdf, columns=columns)
 
-    assert_eq(expected, actual)
+    assert_local_eq(actual, df, expected, host_columns)
+
+    actual = gd.DataFrame(gdf._data, columns=columns)
+
+    assert_local_eq(actual, df, expected, host_columns)

From 264c4cf6dcf40280a399a10e8d23cfc0ae6295b6 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 24 Nov 2020 16:05:44 -0600
Subject: [PATCH 09/16] Fix dask issue and add tests

---
 python/cudf/cudf/core/dataframe.py       |  5 ++++-
 python/cudf/cudf/tests/test_dataframe.py | 13 ++++++++-----
 python/dask_cudf/dask_cudf/sorting.py    |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d07b03db7c2..fd5d31c1329 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -217,6 +217,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
             if isinstance(data, pd.DataFrame):
                 data = self.from_pandas(data)
 
+            if index is not None and not data.index.equals(index):
+                data = data.reindex(index)
+
             if columns is not None:
                 self._data = _get_columns_from_column_accessor(
                     column_accessor=data._data, columns=columns
@@ -225,7 +228,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
                 self._data = data._data
                 self.columns = data.columns
 
-            self._index = data._index if index is None else as_index(index)
+            self._index = data._index
         elif data is None:
             if index is None:
                 self._index = RangeIndex(0)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 7476e516679..fb73bf35b80 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8019,7 +8019,8 @@ def test_dataframe_from_pandas_duplicate_columns():
         ["column_not_exists1", "column_not_exists2"],
     ],
 )
-def test_dataframe_constructor_columns(df, columns):
+@pytest.mark.parametrize("index", [None, ["abc", "def", "ghi"]])
+def test_dataframe_constructor_columns(df, columns, index):
     def assert_local_eq(actual, df, expected, host_columns):
         if host_columns is not None and any(
             col not in df.columns for col in host_columns
@@ -8033,11 +8034,13 @@ def assert_local_eq(actual, df, expected, host_columns):
         columns.to_pandas() if isinstance(columns, gd.Index) else columns
     )
 
-    expected = pd.DataFrame(df, columns=host_columns)
-    actual = gd.DataFrame(gdf, columns=columns)
+    expected = pd.DataFrame(df, columns=host_columns, index=index)
+    actual = gd.DataFrame(gdf, columns=columns, index=index)
 
     assert_local_eq(actual, df, expected, host_columns)
 
-    actual = gd.DataFrame(gdf._data, columns=columns)
-
+    expected = pd.DataFrame(df, columns=host_columns)
+    actual = gd.DataFrame(gdf._data, columns=columns, index=index)
+    if index is not None:
+        expected.index = index
     assert_local_eq(actual, df, expected, host_columns)
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index 16454019929..0a908ba1389 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -96,7 +96,7 @@ def _append_counts(val, count):
     index = lower  # alias; we no longer need lower
     index[mask] = upper[mask]
     rv = combined_vals.iloc[index]
-    return rv.reset_index(drop=True)
+    return rv.reset_index(drop=True)._data
 
 
 def _approximate_quantile(df, q):

From cfeef0823f298fd682319d03e6847bd1365eede7 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 24 Nov 2020 16:08:10 -0600
Subject: [PATCH 10/16] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af326d16ebb..6a8ea5908e3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -158,6 +158,7 @@
 - PR #6824 Fix JNI build
 - PR #6826 Fix resource management in Java ColumnBuilder
 - PR #6830 Fix categorical scalar insertion
+- PR #6838 Fix `columns` & `index` handling in dataframe constructor
 
 
 # cuDF 0.16.0 (21 Oct 2020)

From 004f7c1691a0779298099fa5022b4c66991d440b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 25 Nov 2020 04:32:39 -0600
Subject: [PATCH 11/16] Size missing columns from index when data is empty

---
 python/cudf/cudf/core/dataframe.py       | 27 ++++++++++++++++--------
 python/cudf/cudf/tests/test_dataframe.py | 20 +++++++++++++-----
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fd5d31c1329..57251008a04 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -206,12 +206,18 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
         if isinstance(data, ColumnAccessor):
             if index is None:
                 index = as_index(range(data.nrows))
+            else:
+                index = as_index(index)
 
             if columns is not None:
-                data = _get_columns_from_column_accessor(
-                    column_accessor=data, columns=columns
+                self._data = _get_columns_from_column_accessor(
+                    column_accessor=data,
+                    columns=columns,
+                    nrows=len(index) if data.nrows == 0 else data.nrows,
                 )
-            self._data = data
+            else:
+                self._data = data
+
             self.index = as_index(index)
         elif isinstance(data, (DataFrame, pd.DataFrame)):
             if isinstance(data, pd.DataFrame):
@@ -219,16 +225,21 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
 
             if index is not None and not data.index.equals(index):
                 data = data.reindex(index)
+                index = data._index
 
             if columns is not None:
                 self._data = _get_columns_from_column_accessor(
-                    column_accessor=data._data, columns=columns
+                    column_accessor=data._data,
+                    columns=columns,
+                    nrows=len(index)
+                    if data._data.nrows == 0
+                    else data._data.nrows,
                 )
             else:
                 self._data = data._data
                 self.columns = data.columns
 
-            self._index = data._index
+            self._index = index
         elif data is None:
             if index is None:
                 self._index = RangeIndex(0)
@@ -7306,7 +7317,7 @@ def _get_host_unique(array):
         return set(array)
 
 
-def _get_columns_from_column_accessor(column_accessor, columns):
+def _get_columns_from_column_accessor(column_accessor, columns, nrows):
     return ColumnAccessor(
         data=OrderedDict(
             (
@@ -7314,9 +7325,7 @@ def _get_columns_from_column_accessor(column_accessor, columns):
                 column_accessor[col_name]
                 if col_name in column_accessor
                 else cudf.core.column.column_empty(
-                    row_count=column_accessor.nrows,
-                    dtype="object",
-                    masked=True,
+                    row_count=nrows, dtype="object", masked=True,
                 ),
             )
             for col_name in columns
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index fb73bf35b80..a370217efd4 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8002,7 +8002,8 @@ def test_dataframe_from_pandas_duplicate_columns():
     [
         pd.DataFrame(
             {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]}
-        )
+        ),
+        pd.DataFrame(),
     ],
 )
 @pytest.mark.parametrize(
@@ -8019,15 +8020,21 @@ def test_dataframe_from_pandas_duplicate_columns():
         ["column_not_exists1", "column_not_exists2"],
     ],
 )
-@pytest.mark.parametrize("index", [None, ["abc", "def", "ghi"]])
+@pytest.mark.parametrize("index", [["abc", "def", "ghi"]])
 def test_dataframe_constructor_columns(df, columns, index):
     def assert_local_eq(actual, df, expected, host_columns):
+        check_index_type = False if expected.empty else True
         if host_columns is not None and any(
             col not in df.columns for col in host_columns
         ):
-            assert_eq(expected, actual, check_dtype=False)
+            assert_eq(
+                expected,
+                actual,
+                check_dtype=False,
+                check_index_type=check_index_type,
+            )
         else:
-            assert_eq(expected, actual)
+            assert_eq(expected, actual, check_index_type=check_index_type)
 
     gdf = gd.from_pandas(df)
     host_columns = (
@@ -8042,5 +8049,8 @@ def assert_local_eq(actual, df, expected, host_columns):
     expected = pd.DataFrame(df, columns=host_columns)
     actual = gd.DataFrame(gdf._data, columns=columns, index=index)
     if index is not None:
-        expected.index = index
+        if df.shape == (0, 0):
+            expected = pd.DataFrame(columns=host_columns, index=index)
+        else:
+            expected.index = index
     assert_local_eq(actual, df, expected, host_columns)

From 12849ba4d30491aa0a7ca0206a3f6c9936996cb9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Dec 2020 11:24:17 -0800
Subject: [PATCH 12/16] Handle index slicing when row groups are used

---
 python/cudf/cudf/_lib/parquet.pyx      | 48 ++++++++++++++++++++------
 python/cudf/cudf/tests/test_parquet.py |  5 ++-
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 67ff8b6404b..b63d7762269 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -271,16 +271,44 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     if index_col is not None and len(index_col) > 0:
         if is_range_index:
             range_index_meta = index_col[0]
-            idx = cudf.RangeIndex(
-                start=range_index_meta['start'],
-                stop=range_index_meta['stop'],
-                step=range_index_meta['step'],
-                name=range_index_meta['name']
-            )
-            if skiprows is not None:
-                idx = idx[skiprows:]
-            if num_rows is not None:
-                idx = idx[:num_rows]
+            if row_groups is not None:
+                per_file_metadata = []
+                for s in filepaths_or_buffers:
+                    per_file_metadata.append(pa.parquet.read_metadata(s))
+
+                filtered_idx = []
+                for i, file_meta in enumerate(per_file_metadata):
+                    row_groups_i = []
+                    start = 0
+                    for row_group in range(file_meta.num_row_groups):
+                        stop = start + file_meta.row_group(row_group).num_rows
+                        row_groups_i.append((start, stop))
+                        start = stop
+
+                    for rg in row_groups[i]:
+                        filtered_idx.append(
+                            cudf.RangeIndex(
+                                start=rg[k][0],
+                                stop=rg[k][1],
+                                step=range_index_meta['step']
+                            )
+                        )
+
+                if len(filtered_idx) > 0:
+                    idx = cudf.concat(filtered_idx)
+                else:
+                    idx = cudf.Index([])
+            else:
+                idx = cudf.RangeIndex(
+                    start=range_index_meta['start'],
+                    stop=range_index_meta['stop'],
+                    step=range_index_meta['step'],
+                    name=range_index_meta['name']
+                )
+                if skiprows is not None:
+                    idx = idx[skiprows:]
+                if num_rows is not None:
+                    idx = idx[:num_rows]
             df.index = idx
         elif set(index_col).issubset(column_names):
             index_data = df[index_col]
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 3f5ae04f341..54b5901c9eb 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -460,7 +460,10 @@ def test_parquet_read_filtered_multiple_files(tmpdir):
         [fname_0, fname_1, fname_2], filters=[("x", "==", 2)]
     )
     assert_eq(
-        filtered_df, cudf.DataFrame({"x": [2, 3, 2, 3], "y": list("bbcc")})
+        filtered_df,
+        cudf.DataFrame(
+            {"x": [2, 3, 2, 3], "y": list("bbcc")}, index=[2, 3, 2, 3]
+        ),
     )
 
 

From fa8aa69cb88b2cc74abc79acf20f08c27e6d74ef Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Dec 2020 13:55:26 -0800
Subject: [PATCH 13/16] address review comments

---
 python/cudf/cudf/_lib/parquet.pyx      | 12 ++++++------
 python/cudf/cudf/tests/test_parquet.py |  3 +--
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index b63d7762269..19da062f7c2 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -272,9 +272,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         if is_range_index:
             range_index_meta = index_col[0]
             if row_groups is not None:
-                per_file_metadata = []
-                for s in filepaths_or_buffers:
-                    per_file_metadata.append(pa.parquet.read_metadata(s))
+                per_file_metadata = [
+                    pa.parquet.read_metadata(s) for s in filepaths_or_buffers
+                ]
 
                 filtered_idx = []
                 for i, file_meta in enumerate(per_file_metadata):
@@ -288,8 +288,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                     for rg in row_groups[i]:
                         filtered_idx.append(
                             cudf.RangeIndex(
-                                start=rg[k][0],
-                                stop=rg[k][1],
+                                start=row_groups_i[rg][0],
+                                stop=row_groups_i[rg][1],
                                 step=range_index_meta['step']
                             )
                         )
@@ -297,7 +297,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                 if len(filtered_idx) > 0:
                     idx = cudf.concat(filtered_idx)
                 else:
-                    idx = cudf.Index([])
+                    idx = cudf.Index(cudf.core.column.column_empty(0))
             else:
                 idx = cudf.RangeIndex(
                     start=range_index_meta['start'],
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 54b5901c9eb..fb8c293017a 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1636,8 +1636,7 @@ def test_parquet_nullable_boolean(tmpdir, engine):
         pytest.param(
             pd.DataFrame(index=pd.RangeIndex(0, 10, 1)),
             marks=pytest.mark.xfail(
-                reason="https://github.com/pandas-dev/pandas/issues/37897"
-                "https://github.com/pandas-dev/pandas/issues/37896"
+                reason="https://issues.apache.org/jira/browse/ARROW-10643"
             ),
         ),
         pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]),

From 820307ce9561de42a58a5edba99b14edb0a56ea2 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Dec 2020 14:07:48 -0800
Subject: [PATCH 14/16] remove unrelated commits

---
 CHANGELOG.md                             |  1 -
 python/cudf/cudf/core/dataframe.py       | 75 +++++++-----------------
 python/cudf/cudf/tests/test_dataframe.py | 59 -------------------
 3 files changed, 21 insertions(+), 114 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad757afe5f3..db2142805c4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -176,7 +176,6 @@
 - PR #6855 Fix `.str.replace_with_backrefs` docs examples
 - PR #6853 Fix contiguous split of null string columns
 - PR #6861 Fix compile error in type_dispatch_benchmark.cu
-- PR #6838 Fix `columns` & `index` handling in dataframe constructor
 
 
 # cuDF 0.16.0 (21 Oct 2020)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index af8aef2180e..d299f6e63fc 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -200,52 +200,34 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
         """
         super().__init__()
 
-        if isinstance(columns, (Series, cudf.Index)):
-            columns = columns.to_pandas()
-
         if isinstance(data, ColumnAccessor):
+            self._data = data
             if index is None:
-                index = as_index(range(data.nrows))
-            else:
-                index = as_index(index)
-
-            if columns is not None:
-                self._data = _get_columns_from_column_accessor(
-                    column_accessor=data,
-                    columns=columns,
-                    nrows=len(index) if data.nrows == 0 else data.nrows,
-                )
-            else:
-                self._data = data
-
+                index = as_index(range(self._data.nrows))
             self.index = as_index(index)
-        elif isinstance(data, (DataFrame, pd.DataFrame)):
-            if isinstance(data, pd.DataFrame):
-                data = self.from_pandas(data)
+            return None
 
-            if index is not None and not data.index.equals(index):
-                data = data.reindex(index)
-                index = data._index
+        if isinstance(data, DataFrame):
+            self._data = data._data
+            self._index = data._index
+            self.columns = data.columns
+            return
 
-            if columns is not None:
-                self._data = _get_columns_from_column_accessor(
-                    column_accessor=data._data,
-                    columns=columns,
-                    nrows=len(index)
-                    if data._data.nrows == 0
-                    else data._data.nrows,
-                )
-            else:
-                self._data = data._data
-                self.columns = data.columns
+        if isinstance(data, pd.DataFrame):
+            data = self.from_pandas(data)
+            self._data = data._data
+            self._index = data._index
+            self.columns = data.columns
+            return
 
-            self._index = index
-        elif data is None:
+        if data is None:
             if index is None:
                 self._index = RangeIndex(0)
             else:
                 self._index = as_index(index)
             if columns is not None:
+                if isinstance(columns, (Series, cudf.Index)):
+                    columns = columns.to_pandas()
 
                 self._data = ColumnAccessor(
                     OrderedDict.fromkeys(
@@ -4958,7 +4940,10 @@ def from_pandas(cls, dataframe, nan_as_null=None):
             df.columns = dataframe.columns
 
         # Set index
-        index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
+        if isinstance(dataframe.index, pd.MultiIndex):
+            index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
+        else:
+            index = dataframe.index
         result = df.set_index(index)
 
         return result
@@ -7310,21 +7295,3 @@ def _get_host_unique(array):
         return [array]
     else:
         return set(array)
-
-
-def _get_columns_from_column_accessor(column_accessor, columns, nrows):
-    return ColumnAccessor(
-        data=OrderedDict(
-            (
-                col_name,
-                column_accessor[col_name]
-                if col_name in column_accessor
-                else cudf.core.column.column_empty(
-                    row_count=nrows, dtype="object", masked=True,
-                ),
-            )
-            for col_name in columns
-        ),
-        multiindex=column_accessor.multiindex,
-        level_names=column_accessor.level_names,
-    )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index a370217efd4..69eb70e7201 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -7995,62 +7995,3 @@ def test_dataframe_from_pandas_duplicate_columns():
         ValueError, match="Duplicate column names are not allowed"
     ):
         gd.from_pandas(pdf)
-
-
-@pytest.mark.parametrize(
-    "df",
-    [
-        pd.DataFrame(
-            {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]}
-        ),
-        pd.DataFrame(),
-    ],
-)
-@pytest.mark.parametrize(
-    "columns",
-    [
-        None,
-        ["a"],
-        ["c", "a"],
-        ["b", "a", "c"],
-        [],
-        pd.Index(["c", "a"]),
-        gd.Index(["c", "a"]),
-        ["abc", "a"],
-        ["column_not_exists1", "column_not_exists2"],
-    ],
-)
-@pytest.mark.parametrize("index", [["abc", "def", "ghi"]])
-def test_dataframe_constructor_columns(df, columns, index):
-    def assert_local_eq(actual, df, expected, host_columns):
-        check_index_type = False if expected.empty else True
-        if host_columns is not None and any(
-            col not in df.columns for col in host_columns
-        ):
-            assert_eq(
-                expected,
-                actual,
-                check_dtype=False,
-                check_index_type=check_index_type,
-            )
-        else:
-            assert_eq(expected, actual, check_index_type=check_index_type)
-
-    gdf = gd.from_pandas(df)
-    host_columns = (
-        columns.to_pandas() if isinstance(columns, gd.Index) else columns
-    )
-
-    expected = pd.DataFrame(df, columns=host_columns, index=index)
-    actual = gd.DataFrame(gdf, columns=columns, index=index)
-
-    assert_local_eq(actual, df, expected, host_columns)
-
-    expected = pd.DataFrame(df, columns=host_columns)
-    actual = gd.DataFrame(gdf._data, columns=columns, index=index)
-    if index is not None:
-        if df.shape == (0, 0):
-            expected = pd.DataFrame(columns=host_columns, index=index)
-        else:
-            expected.index = index
-    assert_local_eq(actual, df, expected, host_columns)

From d509894b2a844c2d405a1f68ff7b0caa71919115 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Dec 2020 14:08:59 -0800
Subject: [PATCH 15/16] revert unrelated changes

---
 python/dask_cudf/dask_cudf/sorting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index 0a908ba1389..16454019929 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -96,7 +96,7 @@ def _append_counts(val, count):
     index = lower  # alias; we no longer need lower
     index[mask] = upper[mask]
     rv = combined_vals.iloc[index]
-    return rv.reset_index(drop=True)._data
+    return rv.reset_index(drop=True)
 
 
 def _approximate_quantile(df, q):

From 2a9b65c1f8127634ae17015c52131ab47c56ec5a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 1 Dec 2020 14:31:20 -0800
Subject: [PATCH 16/16] add back required change

---
 python/cudf/cudf/core/dataframe.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d299f6e63fc..107d2d20e38 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4940,10 +4940,7 @@ def from_pandas(cls, dataframe, nan_as_null=None):
             df.columns = dataframe.columns
 
         # Set index
-        if isinstance(dataframe.index, pd.MultiIndex):
-            index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
-        else:
-            index = dataframe.index
+        index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
         result = df.set_index(index)
 
         return result