Fix ORC reader for empty DataFrame/Table (#7624)

By default, `ff.types` contains a [root struct type](https://github.com/rapidsai/cudf/blob/0146f743987a6f2a51aab08f34771eb4d3531afc/cpp/src/io/orc/writer_impl.cu#L1278) at index 0, under which all other columns originate. This root entry is not a column itself, so it must be skipped and column selection must start at index 1. (See `Type Information` in the [ORC Specification](https://orc.apache.org/specification/ORCv1/).) In addition, we need to handle the scenario where the user requests a specific column name that does not exist, as is the case for an empty DataFrame/table. Added a test case to validate both scenarios. Closes #7356 Authors: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) Approvers: - Devavret Makkar (@devavret) - @nvdbaranec - Vukasin Milovanovic (@vuule) - Michael Wang (@isVoid) URL: #7624
rapidsai · Mar 22, 2021 · 8632ca0 · 8632ca0
1 parent c21bd0e
commit 8632ca0
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 2 deletions.
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
@@ -538,26 +538,28 @@ std::vector<int> metadata::select_columns(std::vector<std::string> use_names,
   if (not use_names.empty()) {
     int index = 0;
     for (const auto &use_name : use_names) {
+      bool name_found = false;
       for (int i = 0; i < get_num_columns(); ++i, ++index) {
         if (index >= get_num_columns()) { index = 0; }
         if (get_column_name(index) == use_name) {
+          name_found = true;
           selection.emplace_back(index);
           if (ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
           index++;
           break;
         }
       }
+      CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
     }
   } else {
     // For now, only select all leaf nodes
-    for (int i = 0; i < get_num_columns(); ++i) {
+    for (int i = 1; i < get_num_columns(); ++i) {
       if (ff.types[i].subtypes.empty()) {
         selection.emplace_back(i);
         if (ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
       }
     }
   }
-  CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns");
 
   return selection;
 }

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
@@ -419,6 +419,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   std::vector<std::unique_ptr<column>> out_columns;
   table_metadata out_metadata;
 
+  // There are no columns in table
+  if (_selected_columns.size() == 0) return {std::make_unique<table>(), std::move(out_metadata)};
+
   // Select only stripes required (aka row groups)
   const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);
 

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
@@ -740,6 +740,22 @@ def test_nanoseconds_overflow():
     assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())
 
 
+def test_empty_dataframe():
+    buffer = BytesIO()
+    expected = cudf.DataFrame()
+    expected.to_orc(buffer)
+
+    # Raise error if column name is mentioned, but it doesn't exist.
+    with pytest.raises(RuntimeError):
+        cudf.read_orc(buffer, columns=["a"])
+
+    got_df = cudf.read_orc(buffer)
+    expected_pdf = pd.read_orc(buffer)
+
+    assert_eq(expected, got_df)
+    assert_eq(expected_pdf, got_df)
+
+
 @pytest.mark.parametrize(
     "data", [[None, ""], ["", None], [None, None], ["", ""]]
 )