From 94ee5c966ac74ca2454c4324de9ce9bba54e968f Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 17 Mar 2021 05:59:51 -0500 Subject: [PATCH 1/3] code changes --- cpp/src/io/orc/orc.cpp | 4 ++-- cpp/src/io/orc/reader_impl.cu | 3 +++ python/cudf/cudf/tests/test_orc.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 6c0216a6d6b..fa98105e859 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -550,14 +550,14 @@ std::vector metadata::select_columns(std::vector use_names, } } else { // For now, only select all leaf nodes - for (int i = 0; i < get_num_columns(); ++i) { + for (int i = 1; i < get_num_columns(); ++i) { if (ff.types[i].subtypes.empty()) { selection.emplace_back(i); if (ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; } } } } - CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns"); + CUDF_EXPECTS(use_names.empty() or selection.size() > 0, "Filtered out all columns"); return selection; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 61adef26dab..73908cc1553 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -425,6 +425,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector> out_columns; table_metadata out_metadata; + // There are no columns in table + if (_selected_columns.size() == 0) return {std::make_unique(), std::move(out_metadata)}; + // Select only stripes required (aka row groups) const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index ca8aa00f80c..b50ac06e0d0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -738,3 +738,19 @@ def test_nanoseconds_overflow(): pyarrow_got = pa.orc.ORCFile(buffer).read() assert_eq(expected.to_pandas(), pyarrow_got.to_pandas()) + + +def test_empty_dataframe(): + buffer = BytesIO() + expected = cudf.DataFrame() + expected.to_orc(buffer) + + # Raise error if column name is mentioned, but it doesn't exist. + with pytest.raises(RuntimeError): + cudf.read_orc(buffer, columns=["a"]) + + got_df = cudf.read_orc(buffer) + expected_pdf = pd.read_orc(buffer) + + assert_eq(expected, got_df) + assert_eq(expected_pdf, got_df) From 34d5722d322b529f69c0d04667760efcc34b9b45 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Wed, 17 Mar 2021 23:45:52 -0500 Subject: [PATCH 2/3] review changes --- cpp/src/io/orc/orc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index fa98105e859..6cec7bbe6f6 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -548,6 +548,7 @@ std::vector metadata::select_columns(std::vector use_names, } } } + CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns"); } else { // For now, only select all leaf nodes for (int i = 1; i < get_num_columns(); ++i) { @@ -557,7 +558,6 @@ std::vector metadata::select_columns(std::vector use_names, } } } - CUDF_EXPECTS(use_names.empty() or selection.size() > 0, "Filtered out all columns"); return selection; } From 7afc0f0a223a39010474365602aab691e726fae7 Mon Sep 17 00:00:00 2001 From: Ramakrishna Prabhu Date: Thu, 18 Mar 2021 09:28:29 -0500 Subject: [PATCH 3/3] review changes to handle invalid column names --- cpp/src/io/orc/orc.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 6cec7bbe6f6..e1b6c3ace6c 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -538,17 +538,19 @@ std::vector metadata::select_columns(std::vector use_names, if (not use_names.empty()) { int index = 0; for (const auto &use_name : use_names) { + bool name_found = false; for (int i = 0; i < get_num_columns(); ++i, ++index) { if (index >= get_num_columns()) { index = 0; } if (get_column_name(index) == use_name) { + name_found = true; selection.emplace_back(index); if (ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; } index++; break; } } + CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name)); } - CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns"); } else { // For now, only select all leaf nodes for (int i = 1; i < get_num_columns(); ++i) {