From 2bdba7ec44bf6f5343c1f8d13262b77345be3128 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 14 Oct 2021 14:11:12 -0700
Subject: [PATCH 01/24] add column path

---
 cpp/src/io/orc/orc.cpp | 24 ++++++++++++++----------
 cpp/src/io/orc/orc.h   |  6 ++++++
 2 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index b275496c705..534187dfc0e 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -467,17 +467,21 @@ void metadata::init_column_names() const
 {
   auto const schema_idxs = get_schema_indexes();
   auto const& types      = ff.types;
-  for (int32_t col_id = 0; col_id < get_num_columns(); ++col_id) {
-    std::string col_name;
-    if (schema_idxs[col_id].parent >= 0 and schema_idxs[col_id].field >= 0) {
-      auto const parent_idx = static_cast<uint32_t>(schema_idxs[col_id].parent);
-      auto const field_idx  = static_cast<uint32_t>(schema_idxs[col_id].field);
-      if (field_idx < types[parent_idx].fieldNames.size()) {
-        col_name = types[parent_idx].fieldNames[field_idx];
-      }
+  // root ORC column
+  column_names.push_back("");
+  column_paths.push_back("");
+  for (int32_t col_id = 1; col_id < get_num_columns(); ++col_id) {
+    auto const parent_idx = schema_idxs[col_id].parent;
+    auto const field_idx  = schema_idxs[col_id].field;
+    if (field_idx >= 0 and field_idx < static_cast<int32_t>(types[parent_idx].fieldNames.size())) {
+      column_names.push_back(types[parent_idx].fieldNames[field_idx]);
+    } else {
+      // If we have no name, generate a name
+      column_names.push_back("col" + std::to_string(col_id));
     }
-    // If we have no name (root column), generate a name
-    column_names.push_back(col_name.empty() ? "col" + std::to_string(col_id) : col_name);
+
+    column_paths.push_back(column_paths[parent_idx] +
+                           (column_paths[parent_idx].empty() ? "" : ".") + column_names.back());
   }
 }
 
diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index 405bf7c2ecc..13b816d929f 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -591,6 +591,11 @@ class metadata {
     if (column_names.empty() && get_num_columns() != 0) { init_column_names(); }
     return column_names[column_id];
   }
+  std::string const& get_column_path(int32_t column_id) const
+  {
+    if (column_paths.empty() && get_num_columns() != 0) { init_column_names(); }
+    return column_paths[column_id];
+  }
   int get_row_index_stride() const { return ff.rowIndexStride; }
 
  public:
@@ -610,6 +615,7 @@ class metadata {
   void init_column_names() const;
 
   mutable std::vector<std::string> column_names;
+  mutable std::vector<std::string> column_paths;
 };
 
 /**

From 5af011d8c491ab4cc9d4c68aa27c9a969205bd87 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 14 Oct 2021 16:41:47 -0700
Subject: [PATCH 02/24] simplify add_column; remove _has_nested_column

---
 cpp/src/io/orc/reader_impl.cu  | 19 +++++++------------
 cpp/src/io/orc/reader_impl.hpp |  1 -
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 00ae00a9171..bb741c5054f 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -467,14 +467,12 @@ class aggregate_orc_metadata {
    * @param level current level of nesting.
    * @param id current column id that needs to be added.
    * @param has_timestamp_column True if timestamp column present and false otherwise.
-   * @param has_nested_column True if any of the selected column is a nested type.
    */
   void add_column(std::vector<std::vector<orc_column_meta>>& selection,
                   std::vector<SchemaType> const& types,
                   const size_t level,
                   const uint32_t id,
-                  bool& has_timestamp_column,
-                  bool& has_nested_column)
+                  bool& has_timestamp_column)
   {
     if (level == selection.size()) { selection.emplace_back(); }
     selection[level].push_back({id, 0});
@@ -483,11 +481,10 @@ class aggregate_orc_metadata {
 
     if (types[id].kind == orc::MAP or types[id].kind == orc::LIST or
         types[id].kind == orc::STRUCT) {
-      has_nested_column = true;
       for (const auto child_id : types[id].subtypes) {
         // Since nested column needs to be processed before its child can be processed,
         // child column is being added to next level
-        add_column(selection, types, level + 1, child_id, has_timestamp_column, has_nested_column);
+        add_column(selection, types, level + 1, child_id, has_timestamp_column);
       }
       selection[level][col_id].num_children = types[id].subtypes.size();
     }
@@ -498,12 +495,11 @@ class aggregate_orc_metadata {
    *
    * @param use_names List of column names to select
    * @param has_timestamp_column True if timestamp column present and false otherwise
-   * @param has_nested_column True if any of the selected column is a nested type.
    *
    * @return Vector of list of ORC column meta-data
    */
   std::vector<std::vector<orc_column_meta>> select_columns(
-    std::vector<std::string> const& use_names, bool& has_timestamp_column, bool& has_nested_column)
+    std::vector<std::string> const& use_names, bool& has_timestamp_column)
   {
     auto const& pfm = per_file_metadata[0];
     std::vector<std::vector<orc_column_meta>> selection;
@@ -520,7 +516,7 @@ class aggregate_orc_metadata {
           auto col_id = pfm.ff.types[0].subtypes[index];
           if (pfm.get_column_name(col_id) == use_name) {
             name_found = true;
-            add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_nested_column);
+            add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column);
             // Should start with next index
             index = i + 1;
             break;
@@ -530,7 +526,7 @@ class aggregate_orc_metadata {
       }
     } else {
       for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_nested_column);
+        add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column);
       }
     }
 
@@ -1164,8 +1160,7 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
   _metadata = std::make_unique<aggregate_orc_metadata>(_sources);
 
   // Select only columns required by the options
-  _selected_columns =
-    _metadata->select_columns(options.get_columns(), _has_timestamp_column, _has_nested_column);
+  _selected_columns = _metadata->select_columns(options.get_columns(), _has_timestamp_column);
 
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
@@ -1187,7 +1182,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                        const std::vector<std::vector<size_type>>& stripes,
                                        rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(skip_rows == 0 or (not _has_nested_column),
+  CUDF_EXPECTS(skip_rows == 0 or _selected_columns.size() == 1,
                "skip_rows is not supported by nested columns");
 
   std::vector<std::unique_ptr<column>> out_columns;
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 4be75b6cc2a..8fed64ed64c 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -215,7 +215,6 @@ class reader::impl {
   bool _use_index            = true;
   bool _use_np_dtypes        = true;
   bool _has_timestamp_column = false;
-  bool _has_nested_column    = false;
   std::vector<std::string> _decimal_cols_as_float;
   data_type _timestamp_type{type_id::EMPTY};
   reader_column_meta _col_meta;

From d76abd07b60f0074a11e2fa009ea958625b4b72e Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 14 Oct 2021 17:11:43 -0700
Subject: [PATCH 03/24] further simplify add_column

---
 cpp/src/io/orc/reader_impl.cu | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index bb741c5054f..159b7895c96 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -475,18 +475,13 @@ class aggregate_orc_metadata {
                   bool& has_timestamp_column)
   {
     if (level == selection.size()) { selection.emplace_back(); }
-    selection[level].push_back({id, 0});
-    const int col_id = selection[level].size() - 1;
+    selection[level].push_back({id, types[id].subtypes.size()});
     if (types[id].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
 
-    if (types[id].kind == orc::MAP or types[id].kind == orc::LIST or
-        types[id].kind == orc::STRUCT) {
-      for (const auto child_id : types[id].subtypes) {
-        // Since nested column needs to be processed before its child can be processed,
-        // child column is being added to next level
-        add_column(selection, types, level + 1, child_id, has_timestamp_column);
-      }
-      selection[level][col_id].num_children = types[id].subtypes.size();
+    for (const auto child_id : types[id].subtypes) {
+      // Since nested column needs to be processed before its child can be processed,
+      // child column is being added to next level
+      add_column(selection, types, level + 1, child_id, has_timestamp_column);
     }
   }
 

From 7c8af1f53247950c8477aa7667e2c67a61cd47ad Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 14 Oct 2021 17:59:14 -0700
Subject: [PATCH 04/24] cast

---
 cpp/src/io/orc/reader_impl.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 159b7895c96..aff3afa4b53 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -475,7 +475,7 @@ class aggregate_orc_metadata {
                   bool& has_timestamp_column)
   {
     if (level == selection.size()) { selection.emplace_back(); }
-    selection[level].push_back({id, types[id].subtypes.size()});
+    selection[level].push_back({id, static_cast<uint32_t>(types[id].subtypes.size())});
     if (types[id].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
 
     for (const auto child_id : types[id].subtypes) {

From 06fca7a1de8454bfacb2ffe435cc6c814fe93270 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 20 Oct 2021 00:08:15 -0700
Subject: [PATCH 05/24] remove has_timestamp_column

---
 cpp/src/io/orc/reader_impl.cu  | 84 +++++++++++++++++++++++++++-------
 cpp/src/io/orc/reader_impl.hpp | 14 ++++--
 2 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index d1c8c3661f4..a73205ca3f6 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -466,22 +466,19 @@ class aggregate_orc_metadata {
    * @param types A vector of schema types of columns.
    * @param level current level of nesting.
    * @param id current column id that needs to be added.
-   * @param has_timestamp_column True if timestamp column present and false otherwise.
    */
   void add_column(std::vector<std::vector<orc_column_meta>>& selection,
                   std::vector<SchemaType> const& types,
                   const size_t level,
-                  const uint32_t id,
-                  bool& has_timestamp_column)
+                  const uint32_t id)
   {
     if (level == selection.size()) { selection.emplace_back(); }
     selection[level].push_back({id, static_cast<uint32_t>(types[id].subtypes.size())});
-    if (types[id].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
 
     for (const auto child_id : types[id].subtypes) {
       // Since nested column needs to be processed before its child can be processed,
       // child column is being added to next level
-      add_column(selection, types, level + 1, child_id, has_timestamp_column);
+      add_column(selection, types, level + 1, child_id);
     }
   }
 
@@ -489,12 +486,52 @@ class aggregate_orc_metadata {
    * @brief Filters and reduces down to a selection of columns
    *
    * @param use_names List of column names to select
-   * @param has_timestamp_column True if timestamp column present and false otherwise
+   * @return Vector of list of ORC column meta-data
+   */
+  std::vector<std::vector<orc_column_meta>> select_columns2(
+    std::vector<std::string> const& use_names)
+  {
+    auto const& pfm = per_file_metadata[0];
+    std::vector<std::vector<orc_column_meta>> selection;
+
+    if (not use_names.empty()) {
+      uint32_t index = 0;
+      // Have to check only parent columns
+      auto const num_columns = pfm.ff.types[0].subtypes.size();
+
+      for (const auto& use_name : use_names) {
+        bool name_found = false;
+        for (uint32_t i = 0; i < num_columns; ++i, ++index) {
+          if (index >= num_columns) { index = 0; }
+          auto col_id = pfm.ff.types[0].subtypes[index];
+          if (pfm.get_column_name(col_id) == use_name) {
+            name_found = true;
+            add_column(selection, pfm.ff.types, 0, col_id);
+            // Should start with next index
+            index = i + 1;
+            break;
+          }
+        }
+        CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
+      }
+    } else {
+      for (auto const& col_id : pfm.ff.types[0].subtypes) {
+        add_column(selection, pfm.ff.types, 0, col_id);
+      }
+    }
+
+    return selection;
+  }
+
+  /**
+   * @brief Filters and reduces down to a selection of columns
+   *
+   * @param use_names List of column names to select
    *
    * @return Vector of list of ORC column meta-data
    */
   std::vector<std::vector<orc_column_meta>> select_columns(
-    std::vector<std::string> const& use_names, bool& has_timestamp_column)
+    std::vector<std::string> const& use_names)
   {
     auto const& pfm = per_file_metadata[0];
     std::vector<std::vector<orc_column_meta>> selection;
@@ -511,7 +548,7 @@ class aggregate_orc_metadata {
           auto col_id = pfm.ff.types[0].subtypes[index];
           if (pfm.get_column_name(col_id) == use_name) {
             name_found = true;
-            add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column);
+            add_column(selection, pfm.ff.types, 0, col_id);
             // Should start with next index
             index = i + 1;
             break;
@@ -521,7 +558,7 @@ class aggregate_orc_metadata {
       }
     } else {
       for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column);
+        add_column(selection, pfm.ff.types, 0, col_id);
       }
     }
 
@@ -1155,7 +1192,7 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
   _metadata = std::make_unique<aggregate_orc_metadata>(_sources);
 
   // Select only columns required by the options
-  _selected_columns = _metadata->select_columns(options.get_columns(), _has_timestamp_column);
+  _selected_columns = _metadata->select_columns2(options.get_columns());
 
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
@@ -1172,6 +1209,24 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
   _decimal_cols_as_float = options.get_decimal_cols_as_float();
 }
 
+timezone_table reader::impl::compute_timezone_table(
+  const std::vector<cudf::io::orc::metadata::stripe_source_mapping>& selected_stripes,
+  rmm::cuda_stream_view stream)
+{
+  if (selected_stripes.empty()) return {};
+
+  auto const has_timestamp_column =
+    std::any_of(_selected_columns.cbegin(), _selected_columns.cend(), [&](auto& col_lvl) {
+      return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto& col_meta) {
+        return _metadata->get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
+      });
+    });
+  if (not has_timestamp_column) return {};
+
+  return build_timezone_transition_table(selected_stripes[0].stripe_info[0].second->writerTimezone,
+                                         stream);
+}
+
 table_with_metadata reader::impl::read(size_type skip_rows,
                                        size_type num_rows,
                                        const std::vector<std::vector<size_type>>& stripes,
@@ -1196,6 +1251,8 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   // Select only stripes required (aka row groups)
   const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);
 
+  auto const tz_table = compute_timezone_table(selected_stripes, stream);
+
   // Iterates through levels of nested columns, child column will be one level down
   // compared to parent column.
   for (size_t level = 0; level < _selected_columns.size(); level++) {
@@ -1471,13 +1528,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
           }
         }
 
-        // Setup table for converting timestamp columns from local to UTC time
-        auto const tz_table =
-          _has_timestamp_column
-            ? build_timezone_transition_table(
-                selected_stripes[0].stripe_info[0].second->writerTimezone, stream)
-            : timezone_table{};
-
         for (size_t i = 0; i < column_types.size(); ++i) {
           bool is_nullable = false;
           for (size_t j = 0; j < total_num_stripes; ++j) {
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 8fed64ed64c..9eb3ed29341 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -205,6 +205,15 @@ class reader::impl {
                                               column_name_info& schema_info,
                                               rmm::cuda_stream_view stream);
 
+  /**
+   * @brief Setup table for converting timestamp columns from local to UTC time
+   *
+   * @return Timezone table with timestamp offsets
+   */
+  timezone_table compute_timezone_table(
+    const std::vector<cudf::io::orc::metadata::stripe_source_mapping>& selected_stripes,
+    rmm::cuda_stream_view stream);
+
  private:
   rmm::mr::device_memory_resource* _mr = nullptr;
   std::vector<std::unique_ptr<datasource>> _sources;
@@ -212,9 +221,8 @@ class reader::impl {
   // _output_columns associated schema indices
   std::vector<std::vector<orc_column_meta>> _selected_columns;
 
-  bool _use_index            = true;
-  bool _use_np_dtypes        = true;
-  bool _has_timestamp_column = false;
+  bool _use_index     = true;
+  bool _use_np_dtypes = true;
   std::vector<std::string> _decimal_cols_as_float;
   data_type _timestamp_type{type_id::EMPTY};
   reader_column_meta _col_meta;

From 06f7cdbc5e6aa0c7253206e1fb44e363685ac70d Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 20 Oct 2021 17:26:09 -0700
Subject: [PATCH 06/24] switch to new slection - works for existing cases

---
 cpp/src/io/orc/orc.h          |   4 +-
 cpp/src/io/orc/reader_impl.cu | 147 ++++++++++++++++++----------------
 2 files changed, 81 insertions(+), 70 deletions(-)

diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index 13b816d929f..aed903348c2 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -555,8 +555,8 @@ class OrcDecompressor {
  *
  */
 struct orc_column_meta {
-  uint32_t id;            // orc id for the column
-  uint32_t num_children;  // number of children at the same level of nesting in case of struct
+  int32_t id;            // orc id for the column
+  int32_t num_children;  // number of children at the same level of nesting in case of struct
 };
 
 /**
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index a73205ca3f6..5a9b31bac9c 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -359,6 +359,15 @@ class aggregate_orc_metadata {
     return per_file_metadata[source_idx].get_column_name(column_idx);
   }
 
+  auto get_column_path(const int source_idx, const int column_idx) const
+  {
+    CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
+                 "Out of range source_idx provided");
+    CUDF_EXPECTS(column_idx <= per_file_metadata[source_idx].get_num_columns(),
+                 "Out of range column_idx provided");
+    return per_file_metadata[source_idx].get_column_path(column_idx);
+  }
+
   auto is_row_grp_idx_present() const { return row_grp_idx_present; }
 
   std::vector<cudf::io::orc::metadata::stripe_source_mapping> select_stripes(
@@ -469,11 +478,11 @@ class aggregate_orc_metadata {
    */
   void add_column(std::vector<std::vector<orc_column_meta>>& selection,
                   std::vector<SchemaType> const& types,
-                  const size_t level,
-                  const uint32_t id)
+                  size_t level,
+                  int32_t id)
   {
     if (level == selection.size()) { selection.emplace_back(); }
-    selection[level].push_back({id, static_cast<uint32_t>(types[id].subtypes.size())});
+    selection[level].push_back({id, static_cast<int32_t>(types[id].subtypes.size())});
 
     for (const auto child_id : types[id].subtypes) {
       // Since nested column needs to be processed before its child can be processed,
@@ -482,87 +491,82 @@ class aggregate_orc_metadata {
     }
   }
 
-  /**
-   * @brief Filters and reduces down to a selection of columns
-   *
-   * @param use_names List of column names to select
-   * @return Vector of list of ORC column meta-data
-   */
-  std::vector<std::vector<orc_column_meta>> select_columns2(
-    std::vector<std::string> const& use_names)
+  void add_column_new(std::map<int32_t, std::vector<int32_t>>& selection,
+                      std::vector<SchemaType> const& types,
+                      int32_t id)
   {
-    auto const& pfm = per_file_metadata[0];
-    std::vector<std::vector<orc_column_meta>> selection;
-
-    if (not use_names.empty()) {
-      uint32_t index = 0;
-      // Have to check only parent columns
-      auto const num_columns = pfm.ff.types[0].subtypes.size();
-
-      for (const auto& use_name : use_names) {
-        bool name_found = false;
-        for (uint32_t i = 0; i < num_columns; ++i, ++index) {
-          if (index >= num_columns) { index = 0; }
-          auto col_id = pfm.ff.types[0].subtypes[index];
-          if (pfm.get_column_name(col_id) == use_name) {
-            name_found = true;
-            add_column(selection, pfm.ff.types, 0, col_id);
-            // Should start with next index
-            index = i + 1;
-            break;
-          }
-        }
-        CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
-      }
-    } else {
-      for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        add_column(selection, pfm.ff.types, 0, col_id);
-      }
+    for (auto child_id : types[id].subtypes) {
+      selection[id].push_back(child_id);
+      add_column_new(selection, types, child_id);
     }
+  }
+
+  void levelize(std::map<int32_t, std::vector<int32_t>>& selection,
+                int32_t id,
+                int32_t level,
+                std::vector<std::vector<orc_column_meta>>& sorted_levels)
+  {
+    if (static_cast<int32_t>(sorted_levels.size()) == level) sorted_levels.emplace_back();
+
+    sorted_levels[level].push_back({id, static_cast<int32_t>(selection[id].size())});
 
-    return selection;
+    for (auto child_id : selection[id]) {
+      // Since nested column needs to be processed before its child can be processed,
+      // child column is being added to next level
+      levelize(selection, child_id, level + 1, sorted_levels);
+    }
+  }
+
+  std::vector<std::vector<orc_column_meta>> sort_into_levels(
+    std::map<int32_t, std::vector<int32_t>>& selection)
+  {
+    std::vector<std::vector<orc_column_meta>> sorted_levels;
+    for (auto col_id : selection[0]) {
+      levelize(selection, col_id, 0, sorted_levels);
+    }
+    return sorted_levels;
   }
 
   /**
    * @brief Filters and reduces down to a selection of columns
    *
    * @param use_names List of column names to select
-   *
    * @return Vector of list of ORC column meta-data
    */
   std::vector<std::vector<orc_column_meta>> select_columns(
     std::vector<std::string> const& use_names)
   {
     auto const& pfm = per_file_metadata[0];
-    std::vector<std::vector<orc_column_meta>> selection;
-
-    if (not use_names.empty()) {
-      uint32_t index = 0;
-      // Have to check only parent columns
-      auto const num_columns = pfm.ff.types[0].subtypes.size();
-
-      for (const auto& use_name : use_names) {
-        bool name_found = false;
-        for (uint32_t i = 0; i < num_columns; ++i, ++index) {
-          if (index >= num_columns) { index = 0; }
-          auto col_id = pfm.ff.types[0].subtypes[index];
-          if (pfm.get_column_name(col_id) == use_name) {
-            name_found = true;
-            add_column(selection, pfm.ff.types, 0, col_id);
-            // Should start with next index
-            index = i + 1;
-            break;
-          }
-        }
-        CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
-      }
-    } else {
+
+    if (use_names.empty()) {
+      std::vector<std::vector<orc_column_meta>> selection;
       for (auto const& col_id : pfm.ff.types[0].subtypes) {
         add_column(selection, pfm.ff.types, 0, col_id);
       }
+      return selection;
     }
 
-    return selection;
+    std::map<int32_t, std::vector<int32_t>> selected_column_map;
+    for (const auto& use_name : use_names) {
+      bool name_found = false;
+      for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
+        if (pfm.get_column_path(col_id) == use_name) {
+          name_found = true;
+          selected_column_map[0].push_back(col_id);
+          add_column_new(selected_column_map, pfm.ff.types, col_id);
+          break;
+        }
+      }
+      CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
+    }
+    /*
+    for (auto [k, v] : sel) {
+      std::cout << "key: " << k << std::endl;
+      for (auto& ev : v)
+        std::cout << ev << ' ';
+      std::cout << std::endl;
+    }*/
+    return sort_into_levels(selected_column_map);
   }
 };
 
@@ -1004,7 +1008,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
         for (size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups;
              rowgroup_id++, processed_row_groups++) {
           const auto child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows;
-          for (uint32_t id = 0; id < p_col.num_children; id++) {
+          for (int32_t id = 0; id < p_col.num_children; id++) {
             const auto child_col_idx                                  = index + id;
             rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows;
             rwgrp_meta[processed_row_groups][child_col_idx].num_rows  = child_rows;
@@ -1015,7 +1019,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
 
       // Aggregate start row, number of rows per chunk and total number of rows in a column
       const auto child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
-      for (uint32_t id = 0; id < p_col.num_children; id++) {
+      for (int32_t id = 0; id < p_col.num_children; id++) {
         const auto child_col_idx = index + id;
 
         num_child_rows[child_col_idx] += child_rows;
@@ -1033,7 +1037,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
     auto parent_valid_map  = out_buffers[parent_col_idx].null_mask();
     auto num_rows          = out_buffers[parent_col_idx].size;
 
-    for (uint32_t id = 0; id < p_col.num_children; id++) {
+    for (int32_t id = 0; id < p_col.num_children; id++) {
       const auto child_col_idx                     = index + id;
       _col_meta.parent_column_index[child_col_idx] = parent_col_idx;
       if (type == type_id::STRUCT) {
@@ -1192,7 +1196,14 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
   _metadata = std::make_unique<aggregate_orc_metadata>(_sources);
 
   // Select only columns required by the options
-  _selected_columns = _metadata->select_columns2(options.get_columns());
+  _selected_columns = _metadata->select_columns(options.get_columns());
+  /*
+    for (auto& lvl : _selected_columns) {
+      for (auto& c : lvl)
+        std::cout << c.id << ',' << c.num_children << ' ';
+      std::cout << std::endl;
+    }*/
+  // CUDF_FAIL("stop");
 
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {

From 03bc301153fce1fac2fbd9d67f437fd72aabe07b Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 21 Oct 2021 00:01:06 -0700
Subject: [PATCH 07/24] remove old add_column

---
 cpp/src/io/orc/reader_impl.cu | 59 ++++++++++-------------------------
 1 file changed, 17 insertions(+), 42 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 5a9b31bac9c..22f8c8af8db 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -467,37 +467,13 @@ class aggregate_orc_metadata {
     return selected_stripes_mapping;
   }
 
-  /**
-   * @brief Adds column as per the request and saves metadata about children.
-   *        Children of a column will be added to the next level.
-   *
-   * @param selection A vector that saves list of columns as per levels of nesting.
-   * @param types A vector of schema types of columns.
-   * @param level current level of nesting.
-   * @param id current column id that needs to be added.
-   */
-  void add_column(std::vector<std::vector<orc_column_meta>>& selection,
+  void add_column(std::map<int32_t, std::vector<int32_t>>& selection,
                   std::vector<SchemaType> const& types,
-                  size_t level,
                   int32_t id)
-  {
-    if (level == selection.size()) { selection.emplace_back(); }
-    selection[level].push_back({id, static_cast<int32_t>(types[id].subtypes.size())});
-
-    for (const auto child_id : types[id].subtypes) {
-      // Since nested column needs to be processed before its child can be processed,
-      // child column is being added to next level
-      add_column(selection, types, level + 1, child_id);
-    }
-  }
-
-  void add_column_new(std::map<int32_t, std::vector<int32_t>>& selection,
-                      std::vector<SchemaType> const& types,
-                      int32_t id)
   {
     for (auto child_id : types[id].subtypes) {
       selection[id].push_back(child_id);
-      add_column_new(selection, types, child_id);
+      add_column(selection, types, child_id);
     }
   }
 
@@ -538,26 +514,25 @@ class aggregate_orc_metadata {
   {
     auto const& pfm = per_file_metadata[0];
 
+    std::map<int32_t, std::vector<int32_t>> selected_columns_map;
     if (use_names.empty()) {
-      std::vector<std::vector<orc_column_meta>> selection;
       for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        add_column(selection, pfm.ff.types, 0, col_id);
+        selected_columns_map[0].push_back(col_id);
+        add_column(selected_columns_map, pfm.ff.types, col_id);
       }
-      return selection;
-    }
-
-    std::map<int32_t, std::vector<int32_t>> selected_column_map;
-    for (const auto& use_name : use_names) {
-      bool name_found = false;
-      for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
-        if (pfm.get_column_path(col_id) == use_name) {
-          name_found = true;
-          selected_column_map[0].push_back(col_id);
-          add_column_new(selected_column_map, pfm.ff.types, col_id);
-          break;
+    } else {
+      for (const auto& use_name : use_names) {
+        bool name_found = false;
+        for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
+          if (pfm.get_column_path(col_id) == use_name) {
+            name_found = true;
+            selected_columns_map[0].push_back(col_id);
+            add_column(selected_columns_map, pfm.ff.types, col_id);
+            break;
+          }
         }
+        CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
       }
-      CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
     }
     /*
     for (auto [k, v] : sel) {
@@ -566,7 +541,7 @@ class aggregate_orc_metadata {
         std::cout << ev << ' ';
       std::cout << std::endl;
     }*/
-    return sort_into_levels(selected_column_map);
+    return sort_into_levels(selected_columns_map);
   }
 };
 

From 23287af0e320868de5d939ceaf2236e1d82d0252 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 21 Oct 2021 00:26:31 -0700
Subject: [PATCH 08/24] bit of refactor

---
 cpp/src/io/orc/reader_impl.cu | 46 +++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 22f8c8af8db..22e5d5f8fd7 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -467,38 +467,54 @@ class aggregate_orc_metadata {
     return selected_stripes_mapping;
   }
 
-  void add_column(std::map<int32_t, std::vector<int32_t>>& selection,
-                  std::vector<SchemaType> const& types,
-                  int32_t id)
+  void add_column_and_nested(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                             std::vector<SchemaType> const& types,
+                             int32_t id)
   {
     for (auto child_id : types[id].subtypes) {
-      selection[id].push_back(child_id);
-      add_column(selection, types, child_id);
+      selected_columns[id].push_back(child_id);
+      add_column_and_nested(selected_columns, types, child_id);
     }
   }
 
-  void levelize(std::map<int32_t, std::vector<int32_t>>& selection,
+  void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                             cudf::io::orc::metadata const& metadata,
+                             int32_t id)
+  {
+    selected_columns[0].push_back(id);
+    // TODO walk back up the tree and update
+  }
+
+  void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                             cudf::io::orc::metadata const& metadata,
+                             int32_t id)
+  {
+    update_parent_mapping(selected_columns, metadata, id);
+    add_column_and_nested(selected_columns, metadata.ff.types, id);
+  }
+
+  void levelize(std::map<int32_t, std::vector<int32_t>>& selected_columns,
                 int32_t id,
                 int32_t level,
                 std::vector<std::vector<orc_column_meta>>& sorted_levels)
   {
     if (static_cast<int32_t>(sorted_levels.size()) == level) sorted_levels.emplace_back();
 
-    sorted_levels[level].push_back({id, static_cast<int32_t>(selection[id].size())});
+    sorted_levels[level].push_back({id, static_cast<int32_t>(selected_columns[id].size())});
 
-    for (auto child_id : selection[id]) {
+    for (auto child_id : selected_columns[id]) {
       // Since nested column needs to be processed before its child can be processed,
       // child column is being added to next level
-      levelize(selection, child_id, level + 1, sorted_levels);
+      levelize(selected_columns, child_id, level + 1, sorted_levels);
     }
   }
 
   std::vector<std::vector<orc_column_meta>> sort_into_levels(
-    std::map<int32_t, std::vector<int32_t>>& selection)
+    std::map<int32_t, std::vector<int32_t>>& selected_columns)
   {
     std::vector<std::vector<orc_column_meta>> sorted_levels;
-    for (auto col_id : selection[0]) {
-      levelize(selection, col_id, 0, sorted_levels);
+    for (auto col_id : selected_columns[0]) {
+      levelize(selected_columns, col_id, 0, sorted_levels);
     }
     return sorted_levels;
   }
@@ -517,8 +533,7 @@ class aggregate_orc_metadata {
     std::map<int32_t, std::vector<int32_t>> selected_columns_map;
     if (use_names.empty()) {
       for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        selected_columns_map[0].push_back(col_id);
-        add_column(selected_columns_map, pfm.ff.types, col_id);
+        add_column_to_mapping(selected_columns_map, pfm, col_id);
       }
     } else {
       for (const auto& use_name : use_names) {
@@ -526,8 +541,7 @@ class aggregate_orc_metadata {
         for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
           if (pfm.get_column_path(col_id) == use_name) {
             name_found = true;
-            selected_columns_map[0].push_back(col_id);
-            add_column(selected_columns_map, pfm.ff.types, col_id);
+            add_column_to_mapping(selected_columns_map, pfm, col_id);
             break;
           }
         }

From 8e18a0df26275214214455ca94d60ab15be4151e Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 21 Oct 2021 11:58:04 -0700
Subject: [PATCH 09/24] levelize -> lambda

---
 cpp/src/io/orc/reader_impl.cu | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 22e5d5f8fd7..d01ec27c969 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -493,28 +493,25 @@ class aggregate_orc_metadata {
     add_column_and_nested(selected_columns, metadata.ff.types, id);
   }
 
-  void levelize(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                int32_t id,
-                int32_t level,
-                std::vector<std::vector<orc_column_meta>>& sorted_levels)
-  {
-    if (static_cast<int32_t>(sorted_levels.size()) == level) sorted_levels.emplace_back();
-
-    sorted_levels[level].push_back({id, static_cast<int32_t>(selected_columns[id].size())});
-
-    for (auto child_id : selected_columns[id]) {
-      // Since nested column needs to be processed before its child can be processed,
-      // child column is being added to next level
-      levelize(selected_columns, child_id, level + 1, sorted_levels);
-    }
-  }
-
   std::vector<std::vector<orc_column_meta>> sort_into_levels(
     std::map<int32_t, std::vector<int32_t>>& selected_columns)
   {
     std::vector<std::vector<orc_column_meta>> sorted_levels;
+
+    std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
+      if (static_cast<int32_t>(sorted_levels.size()) == level) sorted_levels.emplace_back();
+
+      sorted_levels[level].push_back({id, static_cast<int32_t>(selected_columns[id].size())});
+
+      for (auto child_id : selected_columns[id]) {
+        // Since nested column needs to be processed before its child can be processed,
+        // child column is being added to next level
+        levelize(child_id, level + 1);
+      }
+    };
+
     for (auto col_id : selected_columns[0]) {
-      levelize(selected_columns, col_id, 0, sorted_levels);
+      levelize(col_id, 0);
     }
     return sorted_levels;
   }

From 536857fb1c921312fb474b24df2cb4f360c184b3 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 21 Oct 2021 16:52:40 -0700
Subject: [PATCH 10/24] functional column selection

---
 cpp/src/io/orc/orc.cpp        | 27 +++++++++-------------
 cpp/src/io/orc/orc.h          | 25 +++++++++------------
 cpp/src/io/orc/reader_impl.cu | 42 +++++++++++++++++------------------
 3 files changed, 41 insertions(+), 53 deletions(-)

diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index 534187dfc0e..2d8e402894f 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -459,20 +459,19 @@ metadata::metadata(datasource* const src) : source(src)
   auto md_data     = decompressor->Decompress(buffer->data(), ps.metadataLength, &md_length);
   orc::ProtobufReader(md_data, md_length).read(md);
 
-  // Initialize the column names
+  init_schema();
   init_column_names();
 }
 
-void metadata::init_column_names() const
+void metadata::init_column_names()
 {
-  auto const schema_idxs = get_schema_indexes();
-  auto const& types      = ff.types;
+  auto const& types = ff.types;
   // root ORC column
   column_names.push_back("");
   column_paths.push_back("");
   for (int32_t col_id = 1; col_id < get_num_columns(); ++col_id) {
-    auto const parent_idx = schema_idxs[col_id].parent;
-    auto const field_idx  = schema_idxs[col_id].field;
+    auto const parent_idx = schema[col_id].parent;
+    auto const field_idx  = schema[col_id].field;
     if (field_idx >= 0 and field_idx < static_cast<int32_t>(types[parent_idx].fieldNames.size())) {
       column_names.push_back(types[parent_idx].fieldNames[field_idx]);
     } else {
@@ -485,26 +484,22 @@ void metadata::init_column_names() const
   }
 }
 
-std::vector<metadata::schema_indexes> metadata::get_schema_indexes() const
+void metadata::init_schema()
 {
-  std::vector<schema_indexes> result(ff.types.size());
+  auto const schema_size = static_cast<uint32_t>(ff.types.size());
+  schema.resize(schema_size);
 
-  auto const schema_size = static_cast<uint32_t>(result.size());
   for (uint32_t i = 0; i < schema_size; i++) {
     auto const& subtypes    = ff.types[i].subtypes;
     auto const num_children = static_cast<uint32_t>(subtypes.size());
-    if (result[i].parent == -1) {  // Not initialized
-      result[i].parent = i;        // set root node as its own parent
-    }
     for (uint32_t j = 0; j < num_children; j++) {
       auto const column_id = subtypes[j];
       CUDF_EXPECTS(column_id > i && column_id < schema_size, "Invalid column id");
-      CUDF_EXPECTS(result[column_id].parent == -1, "Same node referenced twice");
-      result[column_id].parent = i;
-      result[column_id].field  = j;
+      CUDF_EXPECTS(schema[column_id].parent == -1, "Same node referenced twice");
+      schema[column_id].parent = i;
+      schema[column_id].field  = j;
     }
   }
-  return result;
 }
 
 }  // namespace orc
diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index aed903348c2..2ee4117a06f 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -586,18 +586,12 @@ class metadata {
   size_t get_total_rows() const { return ff.numberOfRows; }
   int get_num_stripes() const { return ff.stripes.size(); }
   int get_num_columns() const { return ff.types.size(); }
-  std::string const& get_column_name(int32_t column_id) const
-  {
-    if (column_names.empty() && get_num_columns() != 0) { init_column_names(); }
-    return column_names[column_id];
-  }
-  std::string const& get_column_path(int32_t column_id) const
-  {
-    if (column_paths.empty() && get_num_columns() != 0) { init_column_names(); }
-    return column_paths[column_id];
-  }
+  std::string const& get_column_name(int32_t column_id) const { return column_names[column_id]; }
+  std::string const& get_column_path(int32_t column_id) const { return column_paths[column_id]; }
   int get_row_index_stride() const { return ff.rowIndexStride; }
 
+  int32_t parent_id(int32_t column_id) const { return schema[column_id].parent; }
+
  public:
   PostScript ps;
   FileFooter ff;
@@ -607,15 +601,16 @@ class metadata {
   datasource* const source;
 
  private:
-  struct schema_indexes {
+  struct column_schema_indexes {
     int32_t parent = -1;
     int32_t field  = -1;
   };
-  std::vector<schema_indexes> get_schema_indexes() const;
-  void init_column_names() const;
+  void init_schema();
+  std::vector<column_schema_indexes> schema;
 
-  mutable std::vector<std::string> column_names;
-  mutable std::vector<std::string> column_paths;
+  void init_column_names();
+  std::vector<std::string> column_names;
+  std::vector<std::string> column_paths;
 };
 
 /**
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index d01ec27c969..fb477d78831 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -467,13 +467,16 @@ class aggregate_orc_metadata {
     return selected_stripes_mapping;
   }
 
-  void add_column_and_nested(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                             std::vector<SchemaType> const& types,
-                             int32_t id)
+  void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                          std::vector<SchemaType> const& types,
+                          int32_t id)
   {
     for (auto child_id : types[id].subtypes) {
-      selected_columns[id].push_back(child_id);
-      add_column_and_nested(selected_columns, types, child_id);
+      if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
+          selected_columns[id].end()) {
+        selected_columns[id].push_back(child_id);
+        add_nested_columns(selected_columns, types, child_id);
+      }
     }
   }
 
@@ -481,8 +484,17 @@ class aggregate_orc_metadata {
                              cudf::io::orc::metadata const& metadata,
                              int32_t id)
   {
-    selected_columns[0].push_back(id);
-    // TODO walk back up the tree and update
+    auto current_id = id;
+    auto parent_id  = metadata.parent_id(current_id);
+    while (parent_id != -1) {
+      if (std::find(selected_columns[parent_id].cbegin(),
+                    selected_columns[parent_id].cend(),
+                    current_id) == selected_columns[parent_id].end()) {
+        selected_columns[parent_id].push_back(current_id);
+      }
+      current_id = parent_id;
+      parent_id  = metadata.parent_id(current_id);
+    }
   }
 
   void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
@@ -490,7 +502,7 @@ class aggregate_orc_metadata {
                              int32_t id)
   {
     update_parent_mapping(selected_columns, metadata, id);
-    add_column_and_nested(selected_columns, metadata.ff.types, id);
+    add_nested_columns(selected_columns, metadata.ff.types, id);
   }
 
   std::vector<std::vector<orc_column_meta>> sort_into_levels(
@@ -545,13 +557,6 @@ class aggregate_orc_metadata {
         CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
       }
     }
-    /*
-    for (auto [k, v] : sel) {
-      std::cout << "key: " << k << std::endl;
-      for (auto& ev : v)
-        std::cout << ev << ' ';
-      std::cout << std::endl;
-    }*/
     return sort_into_levels(selected_columns_map);
   }
 };
@@ -1183,13 +1188,6 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
 
   // Select only columns required by the options
   _selected_columns = _metadata->select_columns(options.get_columns());
-  /*
-    for (auto& lvl : _selected_columns) {
-      for (auto& c : lvl)
-        std::cout << c.id << ',' << c.num_children << ' ';
-      std::cout << std::endl;
-    }*/
-  // CUDF_FAIL("stop");
 
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {

From 1de85ba6dcd77fb299cda522dc6d7c432a85adf3 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 21 Oct 2021 23:55:38 -0700
Subject: [PATCH 11/24] refactor selected columns into a hierarchy class

---
 cpp/src/io/orc/reader_impl.cu  | 131 +++++++++++++++------------------
 cpp/src/io/orc/reader_impl.hpp |  12 ++-
 2 files changed, 69 insertions(+), 74 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index fb477d78831..59c91499c7c 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -116,9 +116,6 @@ constexpr std::pair<gpu::StreamIndexType, uint32_t> get_index_type_and_pos(
   }
 }
 
-}  // namespace
-
-namespace {
 /**
  * @brief struct to store buffer data and size of list buffer
  */
@@ -246,6 +243,23 @@ bool should_convert_decimal_column_to_float(const std::vector<std::string>& colu
 
 }  // namespace
 
+column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(child_map)}
+{
+  std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
+    if (static_cast<int32_t>(levels.size()) == level) levels.emplace_back();
+
+    levels[level].push_back({id, static_cast<int32_t>(children[id].size())});
+
+    for (auto child_id : children[id]) {
+      levelize(child_id, level + 1);
+    }
+  };
+
+  for (auto col_id : children[0]) {
+    levelize(col_id, 0);
+  }
+}
+
 /**
  * @brief In order to support multiple input files/buffers we need to gather
  * the metadata across all of those input(s). This class provides a place
@@ -505,44 +519,20 @@ class aggregate_orc_metadata {
     add_nested_columns(selected_columns, metadata.ff.types, id);
   }
 
-  std::vector<std::vector<orc_column_meta>> sort_into_levels(
-    std::map<int32_t, std::vector<int32_t>>& selected_columns)
-  {
-    std::vector<std::vector<orc_column_meta>> sorted_levels;
-
-    std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
-      if (static_cast<int32_t>(sorted_levels.size()) == level) sorted_levels.emplace_back();
-
-      sorted_levels[level].push_back({id, static_cast<int32_t>(selected_columns[id].size())});
-
-      for (auto child_id : selected_columns[id]) {
-        // Since nested column needs to be processed before its child can be processed,
-        // child column is being added to next level
-        levelize(child_id, level + 1);
-      }
-    };
-
-    for (auto col_id : selected_columns[0]) {
-      levelize(col_id, 0);
-    }
-    return sorted_levels;
-  }
-
   /**
    * @brief Filters and reduces down to a selection of columns
    *
    * @param use_names List of column names to select
    * @return Vector of list of ORC column meta-data
    */
-  std::vector<std::vector<orc_column_meta>> select_columns(
-    std::vector<std::string> const& use_names)
+  column_hierarchy select_columns(std::vector<std::string> const& use_names)
   {
     auto const& pfm = per_file_metadata[0];
 
-    std::map<int32_t, std::vector<int32_t>> selected_columns_map;
+    column_hierarchy::nesting_map selected_columns;
     if (use_names.empty()) {
       for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        add_column_to_mapping(selected_columns_map, pfm, col_id);
+        add_column_to_mapping(selected_columns, pfm, col_id);
       }
     } else {
       for (const auto& use_name : use_names) {
@@ -550,14 +540,14 @@ class aggregate_orc_metadata {
         for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
           if (pfm.get_column_path(col_id) == use_name) {
             name_found = true;
-            add_column_to_mapping(selected_columns_map, pfm, col_id);
+            add_column_to_mapping(selected_columns, pfm, col_id);
             break;
           }
         }
-        CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
+        CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(use_name));
       }
     }
-    return sort_into_levels(selected_columns_map);
+    return {std::move(selected_columns)};
   }
 };
 
@@ -960,14 +950,14 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
 {
   const auto num_of_stripes         = chunks.size().first;
   const auto num_of_rowgroups       = row_groups.size().first;
-  const auto num_parent_cols        = _selected_columns[level].size();
-  const auto num_child_cols         = _selected_columns[level + 1].size();
+  const auto num_parent_cols        = selected_columns.levels[level].size();
+  const auto num_child_cols         = selected_columns.levels[level + 1].size();
   const auto number_of_child_chunks = num_child_cols * num_of_stripes;
   auto& num_child_rows              = _col_meta.num_child_rows;
   auto& parent_column_data          = _col_meta.parent_column_data;
 
   // Reset the meta to store child column details.
-  num_child_rows.resize(_selected_columns[level + 1].size());
+  num_child_rows.resize(selected_columns.levels[level + 1].size());
   std::fill(num_child_rows.begin(), num_child_rows.end(), 0);
   parent_column_data.resize(number_of_child_chunks);
   _col_meta.parent_column_index.resize(number_of_child_chunks);
@@ -1168,8 +1158,8 @@ void reader::impl::create_columns(std::vector<std::vector<column_buffer>>&& col_
                                   std::vector<column_name_info>& schema_info,
                                   rmm::cuda_stream_view stream)
 {
-  std::transform(_selected_columns[0].begin(),
-                 _selected_columns[0].end(),
+  std::transform(selected_columns.levels[0].begin(),
+                 selected_columns.levels[0].end(),
                  std::back_inserter(out_columns),
                  [&](auto const col_meta) {
                    schema_info.emplace_back("");
@@ -1181,14 +1171,11 @@ void reader::impl::create_columns(std::vector<std::vector<column_buffer>>&& col_
 reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
                    orc_reader_options const& options,
                    rmm::mr::device_memory_resource* mr)
-  : _mr(mr), _sources(std::move(sources))
+  : _mr(mr),
+    _sources(std::move(sources)),
+    _metadata{std::make_unique<aggregate_orc_metadata>(_sources)},
+    selected_columns{_metadata->select_columns(options.get_columns())}
 {
-  // Open and parse the source(s) dataset metadata
-  _metadata = std::make_unique<aggregate_orc_metadata>(_sources);
-
-  // Select only columns required by the options
-  _selected_columns = _metadata->select_columns(options.get_columns());
-
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
     _timestamp_type = options.get_timestamp_type();
@@ -1210,8 +1197,8 @@ timezone_table reader::impl::compute_timezone_table(
 {
   if (selected_stripes.empty()) return {};
 
-  auto const has_timestamp_column =
-    std::any_of(_selected_columns.cbegin(), _selected_columns.cend(), [&](auto& col_lvl) {
+  auto const has_timestamp_column = std::any_of(
+    selected_columns.levels.cbegin(), selected_columns.levels.cend(), [&](auto& col_lvl) {
       return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto& col_meta) {
         return _metadata->get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
       });
@@ -1228,20 +1215,21 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                        rmm::cuda_stream_view stream)
 {
   // Selected columns at different levels of nesting are stored in different elements
-  // of `_selected_columns`; thus, size == 1 means no nested columns
-  CUDF_EXPECTS(skip_rows == 0 or _selected_columns.size() == 1,
+  // of `selected_columns`; thus, size == 1 means no nested columns
+  CUDF_EXPECTS(skip_rows == 0 or selected_columns.num_levels() == 1,
                "skip_rows is not supported by nested columns");
 
   std::vector<std::unique_ptr<column>> out_columns;
   // buffer and stripe data are stored as per nesting level
-  std::vector<std::vector<column_buffer>> out_buffers(_selected_columns.size());
+  std::vector<std::vector<column_buffer>> out_buffers(selected_columns.num_levels());
   std::vector<column_name_info> schema_info;
-  std::vector<std::vector<rmm::device_buffer>> lvl_stripe_data(_selected_columns.size());
+  std::vector<std::vector<rmm::device_buffer>> lvl_stripe_data(selected_columns.num_levels());
   std::vector<std::vector<rmm::device_uvector<uint32_t>>> null_count_prefix_sums;
   table_metadata out_metadata;
 
   // There are no columns in the table
-  if (_selected_columns.size() == 0) return {std::make_unique<table>(), std::move(out_metadata)};
+  if (selected_columns.num_levels() == 0)
+    return {std::make_unique<table>(), std::move(out_metadata)};
 
   // Select only stripes required (aka row groups)
   const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);
@@ -1250,8 +1238,8 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
   // Iterates through levels of nested columns, child column will be one level down
   // compared to parent column.
-  for (size_t level = 0; level < _selected_columns.size(); level++) {
-    auto& selected_columns = _selected_columns[level];
+  for (size_t level = 0; level < selected_columns.num_levels(); level++) {
+    auto& columns_level = selected_columns.levels[level];
     // Association between each ORC column and its cudf::column
     _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1);
     std::vector<orc_column_meta> nested_col;
@@ -1259,7 +1247,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
     // Get a list of column data types
     std::vector<data_type> column_types;
-    for (auto& col : selected_columns) {
+    for (auto& col : columns_level) {
       // If the column type is orc::DECIMAL see if the user
       // desires it to be converted to float64 or not
       auto const decimal_as_float64 = should_convert_decimal_column_to_float(
@@ -1289,8 +1277,8 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
     // If no rows or stripes to read, return empty columns
     if (num_rows <= 0 || selected_stripes.empty()) {
-      std::transform(_selected_columns[0].begin(),
-                     _selected_columns[0].end(),
+      std::transform(selected_columns.levels[0].begin(),
+                     selected_columns.levels[0].end(),
                      std::back_inserter(out_columns),
                      [&](auto const col_meta) {
                        schema_info.emplace_back("");
@@ -1306,7 +1294,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                         [](size_t sum, auto& stripe_source_mapping) {
                           return sum + stripe_source_mapping.stripe_info.size();
                         });
-      const auto num_columns = selected_columns.size();
+      const auto num_columns = columns_level.size();
       cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(
         total_num_stripes, num_columns, stream);
       memset(chunks.base_host_ptr(), 0, chunks.memory_size());
@@ -1327,12 +1315,13 @@ table_with_metadata reader::impl::read(size_type skip_rows,
       std::vector<orc_stream_info> stream_info;
 
       null_count_prefix_sums.emplace_back();
-      null_count_prefix_sums.back().reserve(_selected_columns[level].size());
-      std::generate_n(
-        std::back_inserter(null_count_prefix_sums.back()), _selected_columns[level].size(), [&]() {
-          return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(total_num_stripes,
-                                                                          stream);
-        });
+      null_count_prefix_sums.back().reserve(selected_columns.levels[level].size());
+      std::generate_n(std::back_inserter(null_count_prefix_sums.back()),
+                      selected_columns.levels[level].size(),
+                      [&]() {
+                        return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
+                          total_num_stripes, stream);
+                      });
 
       // Tracker for eventually deallocating compressed and uncompressed data
       auto& stripe_data = lvl_stripe_data[level];
@@ -1430,19 +1419,17 @@ table_with_metadata reader::impl::read(size_type skip_rows,
               (level == 0)
                 ? nullptr
                 : null_count_prefix_sums[level - 1][_col_meta.parent_column_index[col_idx]].data();
-            chunk.encoding_kind = stripe_footer->columns[selected_columns[col_idx].id].kind;
+            chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
             chunk.type_kind     = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                                .ff.types[selected_columns[col_idx].id]
+                                .ff.types[columns_level[col_idx].id]
                                 .kind;
             // num_child_rows for a struct column will be same, for other nested types it will be
             // calculated.
-            chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
-            auto const decimal_as_float64 =
-              should_convert_decimal_column_to_float(_decimal_cols_as_float,
-                                                     _metadata->per_file_metadata[0],
-                                                     selected_columns[col_idx].id);
+            chunk.num_child_rows          = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
+            auto const decimal_as_float64 = should_convert_decimal_column_to_float(
+              _decimal_cols_as_float, _metadata->per_file_metadata[0], columns_level[col_idx].id);
             chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                                    .ff.types[selected_columns[col_idx].id]
+                                    .ff.types[columns_level[col_idx].id]
                                     .scale.value_or(0) |
                                   (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
 
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 9eb3ed29341..bf1b6eefffe 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -71,6 +71,15 @@ struct reader_column_meta {
   std::vector<row_group_meta> rwgrp_meta;  // rowgroup metadata [rowgroup][column]
 };
 
+struct column_hierarchy {
+  using nesting_map = std::map<int32_t, std::vector<int32_t>>;
+  nesting_map children;
+  std::vector<std::vector<orc_column_meta>> levels;
+
+  column_hierarchy(nesting_map child_map);
+  auto num_levels() const { return levels.size(); }
+};
+
 /**
  * @brief Implementation for ORC reader
  */
@@ -218,8 +227,7 @@ class reader::impl {
   rmm::mr::device_memory_resource* _mr = nullptr;
   std::vector<std::unique_ptr<datasource>> _sources;
   std::unique_ptr<aggregate_orc_metadata> _metadata;
-  // _output_columns associated schema indices
-  std::vector<std::vector<orc_column_meta>> _selected_columns;
+  column_hierarchy selected_columns;
 
   bool _use_index     = true;
   bool _use_np_dtypes = true;

From a32eb966f8f42f2717c6bad0bb7329ad7527bc85 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 22 Oct 2021 00:20:10 -0700
Subject: [PATCH 12/24] "final" fixes!

---
 cpp/src/io/orc/reader_impl.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 59c91499c7c..112140f8ecb 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1122,7 +1122,7 @@ column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
   switch (kind) {
     case orc::LIST:
     case orc::STRUCT:
-      for (auto const& col : _metadata->get_col_type(orc_col_id).subtypes) {
+      for (auto const& col : selected_columns.children[orc_col_id]) {
         col_buffer.children.emplace_back(assemble_buffer(col, col_buffers, level + 1, stream));
       }
 
@@ -1130,9 +1130,9 @@ column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
     case orc::MAP: {
       std::vector<column_buffer> child_col_buffers;
       // Get child buffers
-      for (size_t idx = 0; idx < _metadata->get_col_type(orc_col_id).subtypes.size(); idx++) {
+      for (size_t idx = 0; idx < selected_columns.children[orc_col_id].size(); idx++) {
         auto name = get_map_child_col_name(idx);
-        auto col  = _metadata->get_col_type(orc_col_id).subtypes[idx];
+        auto col  = selected_columns.children[orc_col_id][idx];
         child_col_buffers.emplace_back(assemble_buffer(col, col_buffers, level + 1, stream));
         child_col_buffers.back().name = name;
       }

From fb79740300ebf98be0176b675e9275f3797f4bd4 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 22 Oct 2021 13:36:18 -0700
Subject: [PATCH 13/24] select nested even if a subset has previously been
 selected

---
 cpp/src/io/orc/reader_impl.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 112140f8ecb..3d5cd22aa55 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -489,8 +489,8 @@ class aggregate_orc_metadata {
       if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
           selected_columns[id].end()) {
         selected_columns[id].push_back(child_id);
-        add_nested_columns(selected_columns, types, child_id);
       }
+      add_nested_columns(selected_columns, types, child_id);
     }
   }
 

From 7ceaac4ed3c9a97fb3a930a51b7eb1be2ae97c5c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 22 Oct 2021 15:17:37 -0700
Subject: [PATCH 14/24] fix C++ test; clean up start

---
 cpp/src/io/orc/orc.cpp    | 19 +++++++++++--------
 cpp/tests/io/orc_test.cpp |  2 +-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index 2d8e402894f..0d74859f8c0 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -465,22 +465,25 @@ metadata::metadata(datasource* const src) : source(src)
 
 void metadata::init_column_names()
 {
-  auto const& types = ff.types;
   // root ORC column
   column_names.push_back("");
   column_paths.push_back("");
+
   for (int32_t col_id = 1; col_id < get_num_columns(); ++col_id) {
-    auto const parent_idx = schema[col_id].parent;
-    auto const field_idx  = schema[col_id].field;
-    if (field_idx >= 0 and field_idx < static_cast<int32_t>(types[parent_idx].fieldNames.size())) {
-      column_names.push_back(types[parent_idx].fieldNames[field_idx]);
+    auto const& types              = ff.types;
+    auto const parent_idx          = schema[col_id].parent;
+    auto const& parent_field_names = types[parent_idx].fieldNames;
+    auto const field_idx           = schema[col_id].field;
+    if (field_idx >= 0 and field_idx < static_cast<int32_t>(parent_field_names.size())) {
+      column_names.push_back(parent_field_names[field_idx]);
     } else {
       // If we have no name, generate a name
-      column_names.push_back("col" + std::to_string(col_id));
+      column_names.push_back("_col" + std::to_string(col_id));
     }
 
-    column_paths.push_back(column_paths[parent_idx] +
-                           (column_paths[parent_idx].empty() ? "" : ".") + column_names.back());
+    // Don't include ORC root column name in path
+    column_paths.push_back((parent_idx == 0 ? "" : column_paths[parent_idx] + ".") +
+                           column_names.back());
   }
 }
 
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index f2d5952d0ed..817a6c9cb10 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1000,7 +1000,7 @@ TEST_F(OrcStatisticsTest, Basic)
   auto const stats = cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath});
 
   auto const expected_column_names =
-    std::vector<std::string>{"col0", "_col0", "_col1", "_col2", "_col3", "_col4"};
+    std::vector<std::string>{"", "_col0", "_col1", "_col2", "_col3", "_col4"};
   EXPECT_EQ(stats.column_names, expected_column_names);
 
   auto validate_statistics = [&](std::vector<cudf_io::column_statistics> const& stats) {

From c2b087f3b0264e2382730a168fa7311cc16ad948 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 22 Oct 2021 16:27:26 -0700
Subject: [PATCH 15/24] metadata clean up

---
 cpp/src/io/orc/orc.cpp | 66 ++++++++++++++++++++++--------------------
 cpp/src/io/orc/orc.h   | 18 +++++++-----
 2 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index 0d74859f8c0..aabef514891 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -15,10 +15,13 @@
  */
 
 #include "orc.h"
-#include <string>
 #include "orc_field_reader.hpp"
 #include "orc_field_writer.hpp"
 
+#include <thrust/tabulate.h>
+
+#include <string>
+
 namespace cudf {
 namespace io {
 namespace orc {
@@ -459,48 +462,47 @@ metadata::metadata(datasource* const src) : source(src)
   auto md_data     = decompressor->Decompress(buffer->data(), ps.metadataLength, &md_length);
   orc::ProtobufReader(md_data, md_length).read(md);
 
-  init_schema();
+  init_parent_descriptors();
   init_column_names();
 }
 
 void metadata::init_column_names()
 {
-  // root ORC column
-  column_names.push_back("");
-  column_paths.push_back("");
-
-  for (int32_t col_id = 1; col_id < get_num_columns(); ++col_id) {
-    auto const& types              = ff.types;
-    auto const parent_idx          = schema[col_id].parent;
-    auto const& parent_field_names = types[parent_idx].fieldNames;
-    auto const field_idx           = schema[col_id].field;
-    if (field_idx >= 0 and field_idx < static_cast<int32_t>(parent_field_names.size())) {
-      column_names.push_back(parent_field_names[field_idx]);
-    } else {
-      // If we have no name, generate a name
-      column_names.push_back("_col" + std::to_string(col_id));
-    }
-
+  column_names.resize(get_num_columns());
+  thrust::tabulate(column_names.begin(), column_names.end(), [&](auto col_id) {
+    auto const parent_id = parents[col_id].id;
+    if (parent_id < 0) return std::string{};
+    auto const& parent_field_names = ff.types[parent_id].fieldNames;
+    auto const field_idx           = parents[col_id].field_idx;
+    // Child columns of lists don't have a name in ORC files, generate placeholder in that case
+    return field_idx < static_cast<int32_t>(parent_field_names.size())
+             ? parent_field_names[field_idx]
+             : std::to_string(col_id);
+  });
+
+  column_paths.resize(get_num_columns());
+  thrust::tabulate(column_paths.begin(), column_paths.end(), [&](auto col_id) {
+    auto const parent_id = parents[col_id].id;
+    if (parent_id < 0) return std::string{};
     // Don't include ORC root column name in path
-    column_paths.push_back((parent_idx == 0 ? "" : column_paths[parent_idx] + ".") +
-                           column_names.back());
-  }
+    return (parent_id == 0 ? "" : column_paths[parent_id] + ".") + column_names[col_id];
+  });
 }
 
-void metadata::init_schema()
+void metadata::init_parent_descriptors()
 {
-  auto const schema_size = static_cast<uint32_t>(ff.types.size());
-  schema.resize(schema_size);
+  auto const num_columns = static_cast<uint32_t>(ff.types.size());
+  parents.resize(num_columns);
 
-  for (uint32_t i = 0; i < schema_size; i++) {
-    auto const& subtypes    = ff.types[i].subtypes;
+  for (uint32_t col_id = 0; col_id < num_columns; ++col_id) {
+    auto const& subtypes    = ff.types[col_id].subtypes;
     auto const num_children = static_cast<uint32_t>(subtypes.size());
-    for (uint32_t j = 0; j < num_children; j++) {
-      auto const column_id = subtypes[j];
-      CUDF_EXPECTS(column_id > i && column_id < schema_size, "Invalid column id");
-      CUDF_EXPECTS(schema[column_id].parent == -1, "Same node referenced twice");
-      schema[column_id].parent = i;
-      schema[column_id].field  = j;
+    for (uint32_t field_idx = 0; field_idx < num_children; ++field_idx) {
+      auto const child_id = subtypes[field_idx];
+      CUDF_EXPECTS(child_id > col_id && child_id < num_columns, "Invalid column id");
+      CUDF_EXPECTS(parents[child_id].id == -1, "Same node referenced twice");
+      parents[child_id].id        = col_id;
+      parents[child_id].field_idx = field_idx;
     }
   }
 }
diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index 2ee4117a06f..a0b94927591 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -586,11 +586,11 @@ class metadata {
   size_t get_total_rows() const { return ff.numberOfRows; }
   int get_num_stripes() const { return ff.stripes.size(); }
   int get_num_columns() const { return ff.types.size(); }
-  std::string const& get_column_name(int32_t column_id) const { return column_names[column_id]; }
-  std::string const& get_column_path(int32_t column_id) const { return column_paths[column_id]; }
+  std::string const& get_column_name(int32_t column_id) const { return column_names.at(column_id); }
+  std::string const& get_column_path(int32_t column_id) const { return column_paths.at(column_id); }
   int get_row_index_stride() const { return ff.rowIndexStride; }
 
-  int32_t parent_id(int32_t column_id) const { return schema[column_id].parent; }
+  int32_t parent_id(int32_t column_id) const { return parents.at(column_id).id; }
 
  public:
   PostScript ps;
@@ -601,12 +601,14 @@ class metadata {
   datasource* const source;
 
  private:
-  struct column_schema_indexes {
-    int32_t parent = -1;
-    int32_t field  = -1;
+  struct column_parent {
+    // parent's ID
+    int32_t id = -1;
+    // Index of this column in the parent's list of children
+    int32_t field_idx = -1;
   };
-  void init_schema();
-  std::vector<column_schema_indexes> schema;
+  void init_parent_descriptors();
+  std::vector<column_parent> parents;
 
   void init_column_names();
   std::vector<std::string> column_names;

From 7cf566daf77ce9b344372e707fb0225e7f03773b Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 22 Oct 2021 16:58:05 -0700
Subject: [PATCH 16/24] final(?) clean up; python test

---
 cpp/src/io/orc/orc.h               |  12 +++-
 cpp/src/io/orc/reader_impl.cu      | 102 ++++++++++++++++-------------
 cpp/src/io/orc/reader_impl.hpp     |   5 ++
 python/cudf/cudf/tests/test_orc.py |  21 ++++++
 4 files changed, 92 insertions(+), 48 deletions(-)

diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index a0b94927591..c0293e1d59b 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -586,8 +586,16 @@ class metadata {
   size_t get_total_rows() const { return ff.numberOfRows; }
   int get_num_stripes() const { return ff.stripes.size(); }
   int get_num_columns() const { return ff.types.size(); }
-  std::string const& get_column_name(int32_t column_id) const { return column_names.at(column_id); }
-  std::string const& get_column_path(int32_t column_id) const { return column_paths.at(column_id); }
+  std::string const& get_column_name(int32_t column_id) const
+  {
+    CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided");
+    return column_names[column_id];
+  }
+  std::string const& get_column_path(int32_t column_id) const
+  {
+    CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided");
+    return column_paths[column_id];
+  }
   int get_row_index_stride() const { return ff.rowIndexStride; }
 
   int32_t parent_id(int32_t column_id) const { return parents.at(column_id).id; }
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 3d5cd22aa55..83205d540dc 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -245,6 +245,7 @@ bool should_convert_decimal_column_to_float(const std::vector<std::string>& colu
 
 column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(child_map)}
 {
+  // Sort columns by nesting levels
   std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
     if (static_cast<int32_t>(levels.size()) == level) levels.emplace_back();
 
@@ -260,6 +261,7 @@ column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(c
   }
 }
 
+// TODO: move class to a separate hpp/cpp files
 /**
  * @brief In order to support multiple input files/buffers we need to gather
  * the metadata across all of those input(s). This class provides a place
@@ -268,6 +270,56 @@ column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(c
 class aggregate_orc_metadata {
   using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;
 
+  /**
+   * @brief Goes up to the root to include the column with the given id and its parents.
+   */
+  void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                             cudf::io::orc::metadata const& metadata,
+                             int32_t id)
+  {
+    auto current_id = id;
+    auto parent_id  = metadata.parent_id(current_id);
+    while (parent_id != -1) {
+      if (std::find(selected_columns[parent_id].cbegin(),
+                    selected_columns[parent_id].cend(),
+                    current_id) == selected_columns[parent_id].end()) {
+        selected_columns[parent_id].push_back(current_id);
+      }
+      current_id = parent_id;
+      parent_id  = metadata.parent_id(current_id);
+    }
+  }
+
+  /**
+   * @brief Adds all columns nested under the column with the given id to the nesting map.
+   */
+  void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                          std::vector<SchemaType> const& types,
+                          int32_t id)
+  {
+    for (auto child_id : types[id].subtypes) {
+      if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
+          selected_columns[id].end()) {
+        selected_columns[id].push_back(child_id);
+      }
+      add_nested_columns(selected_columns, types, child_id);
+    }
+  }
+
+  /**
+   * @brief Adds the column with the given id to the mapping
+   *
+   * All nested columns and direct ancestors of column `id` are included.
+   * Columns that are not on the direct path are excluded, which may result in prunning.
+   */
+  void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                             cudf::io::orc::metadata const& metadata,
+                             int32_t id)
+  {
+    update_parent_mapping(selected_columns, metadata, id);
+    add_nested_columns(selected_columns, metadata.ff.types, id);
+  }
+
  public:
   mutable std::vector<cudf::io::orc::metadata> per_file_metadata;
   size_type const num_rows;
@@ -364,22 +416,18 @@ class aggregate_orc_metadata {
 
   int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; }
 
-  auto get_column_name(const int source_idx, const int column_idx) const
+  auto get_column_name(const int source_idx, const int column_id) const
   {
     CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
                  "Out of range source_idx provided");
-    CUDF_EXPECTS(column_idx <= per_file_metadata[source_idx].get_num_columns(),
-                 "Out of range column_idx provided");
-    return per_file_metadata[source_idx].get_column_name(column_idx);
+    return per_file_metadata[source_idx].get_column_name(column_id);
   }
 
-  auto get_column_path(const int source_idx, const int column_idx) const
+  auto get_column_path(const int source_idx, const int column_id) const
   {
     CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
                  "Out of range source_idx provided");
-    CUDF_EXPECTS(column_idx <= per_file_metadata[source_idx].get_num_columns(),
-                 "Out of range column_idx provided");
-    return per_file_metadata[source_idx].get_column_path(column_idx);
+    return per_file_metadata[source_idx].get_column_path(column_id);
   }
 
   auto is_row_grp_idx_present() const { return row_grp_idx_present; }
@@ -481,44 +529,6 @@ class aggregate_orc_metadata {
     return selected_stripes_mapping;
   }
 
-  void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                          std::vector<SchemaType> const& types,
-                          int32_t id)
-  {
-    for (auto child_id : types[id].subtypes) {
-      if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
-          selected_columns[id].end()) {
-        selected_columns[id].push_back(child_id);
-      }
-      add_nested_columns(selected_columns, types, child_id);
-    }
-  }
-
-  void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                             cudf::io::orc::metadata const& metadata,
-                             int32_t id)
-  {
-    auto current_id = id;
-    auto parent_id  = metadata.parent_id(current_id);
-    while (parent_id != -1) {
-      if (std::find(selected_columns[parent_id].cbegin(),
-                    selected_columns[parent_id].cend(),
-                    current_id) == selected_columns[parent_id].end()) {
-        selected_columns[parent_id].push_back(current_id);
-      }
-      current_id = parent_id;
-      parent_id  = metadata.parent_id(current_id);
-    }
-  }
-
-  void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                             cudf::io::orc::metadata const& metadata,
-                             int32_t id)
-  {
-    update_parent_mapping(selected_columns, metadata, id);
-    add_nested_columns(selected_columns, metadata.ff.types, id);
-  }
-
   /**
    * @brief Filters and reduces down to a selection of columns
    *
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index bf1b6eefffe..81ebb3dfbd1 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -71,9 +71,14 @@ struct reader_column_meta {
   std::vector<row_group_meta> rwgrp_meta;  // rowgroup metadata [rowgroup][column]
 };
 
+/**
+ * @brief Describes a column hierarchy, which may exclude some input columns.
+ */
 struct column_hierarchy {
   using nesting_map = std::map<int32_t, std::vector<int32_t>>;
+  // Children IDs of each column
   nesting_map children;
+  // Each element contains column at the given nesting level
   std::vector<std::vector<orc_column_meta>> levels;
 
   column_hierarchy(nesting_map child_map);
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index d94198bcd4d..a1280f6cf97 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1492,3 +1492,24 @@ def test_empty_statistics():
         assert stats[0]["i"].get("minimum") == 1
         assert stats[0]["i"].get("maximum") == 1
         assert stats[0]["i"].get("sum") == 1
+
+
+@pytest.mark.filterwarnings("ignore:.*struct.*experimental")
+@pytest.mark.parametrize(
+    "columns",
+    [
+        (["lvl1_struct.a", "lvl1_struct.b"], ["lvl1_struct"]),
+        (["lvl1_struct", "lvl1_struct.a"], ["lvl1_struct"]),
+        (["lvl1_struct.a", "lvl1_struct"], ["lvl1_struct"]),
+        (["lvl1_struct.b", "lvl1_struct.a"], ["lvl1_struct.b", "lvl1_struct"]),
+        (["lvl2_struct.lvl1_struct", "lvl2_struct"], ["lvl2_struct"]),
+        (
+            ["lvl2_struct.a", "lvl2_struct.lvl1_struct.c", "lvl2_struct"],
+            ["lvl2_struct"],
+        ),
+    ],
+)
+def test_select_nested(list_struct_buff, columns):
+    df_cols1 = cudf.read_orc(list_struct_buff, columns=columns[0])
+    df_cols2 = cudf.read_orc(list_struct_buff, columns=columns[1])
+    assert_eq(df_cols1, df_cols2)

From 7babfef6e75d22a9cb58fc869776ebd12a7ce77a Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Sat, 23 Oct 2021 00:45:39 -0700
Subject: [PATCH 17/24] add C++ test

---
 cpp/tests/io/orc_test.cpp | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 817a6c9cb10..86cd48de417 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1371,4 +1371,40 @@ TEST_F(OrcWriterTest, TestMap)
   cudf::test::expect_metadata_equal(expected_metadata, result.metadata);
 }
 
+TEST_F(OrcReaderTest, NestedColumnSelection)
+{
+  auto const num_rows  = 1000;
+  auto child_col1_data = random_values<int32_t>(num_rows);
+  auto child_col2_data = random_values<int64_t>(num_rows);
+  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; });
+  column_wrapper<int32_t> child_col1 = {child_col1_data.begin(), child_col1_data.end(), validity};
+  column_wrapper<int64_t> child_col2 = {child_col2_data.begin(), child_col2_data.end(), validity};
+  auto struct_col                    = cudf::test::structs_column_wrapper{child_col1, child_col2};
+  table_view expected({struct_col});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("struct_s");
+  expected_metadata.column_metadata[0].child(0).set_name("field_a");
+  expected_metadata.column_metadata[0].child(1).set_name("field_b");
+
+  auto filepath = temp_env->get_temp_filepath("OrcNestedSelection.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options in_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath})
+      .use_index(false)
+      .columns({"struct_s.field_b"});
+  auto result = cudf_io::read_orc(in_opts);
+
+  // Verify that only one child column is included in the output table
+  ASSERT_EQ(1, result.tbl->view().column(0).num_children());
+  // Verify that the first child column is `field_b`
+  column_wrapper<int64_t> expected_col = {child_col2_data.begin(), child_col2_data.end(), validity};
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_col, result.tbl->view().column(0).child(0));
+  ASSERT_EQ("field_b", result.metadata.schema_info[0].children[0].name);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 14dd923a4e335da3050f163c732993b3a0da9490 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 25 Oct 2021 15:04:54 -0700
Subject: [PATCH 18/24] encapsulate parent info in metadata

---
 cpp/src/io/orc/orc.cpp        | 31 ++++++++++++++-----------------
 cpp/src/io/orc/orc.h          | 22 ++++++++++++++++++----
 cpp/src/io/orc/reader_impl.cu |  5 ++---
 3 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index aabef514891..b850fec82e8 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -470,39 +470,36 @@ void metadata::init_column_names()
 {
   column_names.resize(get_num_columns());
   thrust::tabulate(column_names.begin(), column_names.end(), [&](auto col_id) {
-    auto const parent_id = parents[col_id].id;
-    if (parent_id < 0) return std::string{};
-    auto const& parent_field_names = ff.types[parent_id].fieldNames;
-    auto const field_idx           = parents[col_id].field_idx;
+    if (not column_has_parent(col_id)) return std::string{};
+    auto const& parent_field_names = ff.types[parent_id(col_id)].fieldNames;
     // Child columns of lists don't have a name in ORC files, generate placeholder in that case
-    return field_idx < static_cast<int32_t>(parent_field_names.size())
-             ? parent_field_names[field_idx]
+    return field_index(col_id) < static_cast<int32_t>(parent_field_names.size())
+             ? parent_field_names[field_index(col_id)]
              : std::to_string(col_id);
   });
 
   column_paths.resize(get_num_columns());
   thrust::tabulate(column_paths.begin(), column_paths.end(), [&](auto col_id) {
-    auto const parent_id = parents[col_id].id;
-    if (parent_id < 0) return std::string{};
+    if (not column_has_parent(col_id)) return std::string{};
     // Don't include ORC root column name in path
-    return (parent_id == 0 ? "" : column_paths[parent_id] + ".") + column_names[col_id];
+    return (parent_id(col_id) == 0 ? "" : column_paths[parent_id(col_id)] + ".") +
+           column_names[col_id];
   });
 }
 
 void metadata::init_parent_descriptors()
 {
-  auto const num_columns = static_cast<uint32_t>(ff.types.size());
+  auto const num_columns = static_cast<int32_t>(ff.types.size());
   parents.resize(num_columns);
 
-  for (uint32_t col_id = 0; col_id < num_columns; ++col_id) {
+  for (int32_t col_id = 0; col_id < num_columns; ++col_id) {
     auto const& subtypes    = ff.types[col_id].subtypes;
-    auto const num_children = static_cast<uint32_t>(subtypes.size());
-    for (uint32_t field_idx = 0; field_idx < num_children; ++field_idx) {
-      auto const child_id = subtypes[field_idx];
+    auto const num_children = static_cast<int32_t>(subtypes.size());
+    for (int32_t field_idx = 0; field_idx < num_children; ++field_idx) {
+      auto const child_id = static_cast<int32_t>(subtypes[field_idx]);
       CUDF_EXPECTS(child_id > col_id && child_id < num_columns, "Invalid column id");
-      CUDF_EXPECTS(parents[child_id].id == -1, "Same node referenced twice");
-      parents[child_id].id        = col_id;
-      parents[child_id].field_idx = field_idx;
+      CUDF_EXPECTS(not column_has_parent(child_id), "Same node referenced twice");
+      parents[child_id] = {col_id, field_idx};
     }
   }
 }
diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index c0293e1d59b..450e006dca8 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -598,7 +598,20 @@ class metadata {
   }
   int get_row_index_stride() const { return ff.rowIndexStride; }
 
-  int32_t parent_id(int32_t column_id) const { return parents.at(column_id).id; }
+  /**
+   * @brief Returns the ID of the parent column of the given column.
+   */
+  int32_t parent_id(int32_t column_id) const { return parents.at(column_id).value().id; }
+
+  /**
+   * @brief Returns the index the given column has in its parent's children list.
+   */
+  int32_t field_index(int32_t column_id) const { return parents.at(column_id).value().field_idx; }
+
+  /**
+   * @brief Returns whether the given column has a parent.
+   */
+  int32_t column_has_parent(int32_t column_id) const { return parents.at(column_id).has_value(); }
 
  public:
   PostScript ps;
@@ -611,12 +624,13 @@ class metadata {
  private:
   struct column_parent {
     // parent's ID
-    int32_t id = -1;
+    int32_t id;
     // Index of this column in the parent's list of children
-    int32_t field_idx = -1;
+    int32_t field_idx;
+    column_parent(int32_t parent_id, int32_t field_idx) : id{parent_id}, field_idx{field_idx} {}
   };
   void init_parent_descriptors();
-  std::vector<column_parent> parents;
+  std::vector<std::optional<column_parent>> parents;
 
   void init_column_names();
   std::vector<std::string> column_names;
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 83205d540dc..287a3a5e6c1 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -278,15 +278,14 @@ class aggregate_orc_metadata {
                              int32_t id)
   {
     auto current_id = id;
-    auto parent_id  = metadata.parent_id(current_id);
-    while (parent_id != -1) {
+    while (metadata.column_has_parent(current_id)) {
+      auto parent_id = metadata.parent_id(current_id);
       if (std::find(selected_columns[parent_id].cbegin(),
                     selected_columns[parent_id].cend(),
                     current_id) == selected_columns[parent_id].end()) {
         selected_columns[parent_id].push_back(current_id);
       }
       current_id = parent_id;
-      parent_id  = metadata.parent_id(current_id);
     }
   }
 

From 5564d098d271e73e18b14cb884704b5a0a4cb7e6 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 25 Oct 2021 16:35:28 -0700
Subject: [PATCH 19/24] move aggregate metdata to a separate file

---
 cpp/CMakeLists.txt                        |   1 +
 cpp/src/io/orc/aggregate_orc_metadata.cpp | 283 +++++++++++++++
 cpp/src/io/orc/aggregate_orc_metadata.hpp | 116 +++++++
 cpp/src/io/orc/reader_impl.cu             | 403 +++-------------------
 cpp/src/io/orc/reader_impl.hpp            |  21 +-
 5 files changed, 445 insertions(+), 379 deletions(-)
 create mode 100644 cpp/src/io/orc/aggregate_orc_metadata.cpp
 create mode 100644 cpp/src/io/orc/aggregate_orc_metadata.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 8aed7089dc5..7c01985e5aa 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -270,6 +270,7 @@ add_library(cudf
     src/io/functions.cpp
     src/io/json/json_gpu.cu
     src/io/json/reader_impl.cu
+    src/io/orc/aggregate_orc_metadata.cpp
     src/io/orc/dict_enc.cu
     src/io/orc/orc.cpp
     src/io/orc/reader_impl.cu
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
new file mode 100644
index 00000000000..9a673c44ee0
--- /dev/null
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "aggregate_orc_metadata.hpp"
+
+#include <algorithm>
+#include <numeric>
+
+namespace cudf::io::orc::detail {
+
+column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(child_map)}
+{
+  // Sort columns by nesting levels
+  std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
+    if (static_cast<int32_t>(levels.size()) == level) levels.emplace_back();
+
+    levels[level].push_back({id, static_cast<int32_t>(children[id].size())});
+
+    for (auto child_id : children[id]) {
+      levelize(child_id, level + 1);
+    }
+  };
+
+  for (auto col_id : children[0]) {
+    levelize(col_id, 0);
+  }
+}
+
+/**
+ * @brief Goes up to the root to include the column with the given id and its parents.
+ */
+void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                           cudf::io::orc::metadata const& metadata,
+                           int32_t id)
+{
+  auto current_id = id;
+  while (metadata.column_has_parent(current_id)) {
+    auto parent_id = metadata.parent_id(current_id);
+    if (std::find(selected_columns[parent_id].cbegin(),
+                  selected_columns[parent_id].cend(),
+                  current_id) == selected_columns[parent_id].end()) {
+      selected_columns[parent_id].push_back(current_id);
+    }
+    current_id = parent_id;
+  }
+}
+
+/**
+ * @brief Adds all columns nested under the column with the given id to the nesting map.
+ */
+void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                        std::vector<SchemaType> const& types,
+                        int32_t id)
+{
+  for (auto child_id : types[id].subtypes) {
+    if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
+        selected_columns[id].end()) {
+      selected_columns[id].push_back(child_id);
+    }
+    add_nested_columns(selected_columns, types, child_id);
+  }
+}
+
+/**
+ * @brief Adds the column with the given id to the mapping
+ *
+ * All nested columns and direct ancestors of column `id` are included.
+ * Columns that are not on the direct path are excluded, which may result in prunning.
+ */
+void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+                           cudf::io::orc::metadata const& metadata,
+                           int32_t id)
+{
+  update_parent_mapping(selected_columns, metadata, id);
+  add_nested_columns(selected_columns, metadata.ff.types, id);
+}
+
+/**
+ * @brief Create a metadata object from each element in the source vector
+ */
+auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const& sources)
+{
+  std::vector<cudf::io::orc::metadata> metadatas;
+  std::transform(
+    sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) {
+      return cudf::io::orc::metadata(source.get());
+    });
+  return metadatas;
+}
+
+size_type aggregate_orc_metadata::calc_num_rows() const
+{
+  return std::accumulate(
+    per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
+      return sum + pfm.get_total_rows();
+    });
+}
+
+size_type aggregate_orc_metadata::calc_num_cols() const
+{
+  if (not per_file_metadata.empty()) { return per_file_metadata[0].get_num_columns(); }
+  return 0;
+}
+
+size_type aggregate_orc_metadata::calc_num_stripes() const
+{
+  return std::accumulate(
+    per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
+      return sum + pfm.get_num_stripes();
+    });
+}
+
+aggregate_orc_metadata::aggregate_orc_metadata(
+  std::vector<std::unique_ptr<datasource>> const& sources)
+  : per_file_metadata(metadatas_from_sources(sources)),
+    num_rows(calc_num_rows()),
+    num_columns(calc_num_cols()),
+    num_stripes(calc_num_stripes())
+{
+  // Verify that the input files have the same number of columns,
+  // as well as matching types, compression, and names
+  for (auto const& pfm : per_file_metadata) {
+    CUDF_EXPECTS(per_file_metadata[0].get_num_columns() == pfm.get_num_columns(),
+                 "All sources must have the same number of columns");
+    CUDF_EXPECTS(per_file_metadata[0].ps.compression == pfm.ps.compression,
+                 "All sources must have the same compression type");
+
+    // Check the types, column names, and decimal scale
+    for (size_t i = 0; i < pfm.ff.types.size(); i++) {
+      CUDF_EXPECTS(pfm.ff.types[i].kind == per_file_metadata[0].ff.types[i].kind,
+                   "Column types across all input sources must be the same");
+      CUDF_EXPECTS(std::equal(pfm.ff.types[i].fieldNames.begin(),
+                              pfm.ff.types[i].fieldNames.end(),
+                              per_file_metadata[0].ff.types[i].fieldNames.begin()),
+                   "All source column names must be the same");
+      CUDF_EXPECTS(
+        pfm.ff.types[i].scale.value_or(0) == per_file_metadata[0].ff.types[i].scale.value_or(0),
+        "All scale values must be the same");
+    }
+  }
+}
+
+std::vector<cudf::io::orc::metadata::stripe_source_mapping> aggregate_orc_metadata::select_stripes(
+  std::vector<std::vector<size_type>> const& user_specified_stripes,
+  size_type& row_start,
+  size_type& row_count)
+{
+  std::vector<cudf::io::orc::metadata::stripe_source_mapping> selected_stripes_mapping;
+
+  if (!user_specified_stripes.empty()) {
+    CUDF_EXPECTS(user_specified_stripes.size() == get_num_source_files(),
+                 "Must specify stripes for each source");
+    // row_start is 0 if stripes are set. If this is not true anymore, then
+    // row_start needs to be subtracted to get the correct row_count
+    CUDF_EXPECTS(row_start == 0, "Start row index should be 0");
+
+    row_count = 0;
+    // Each vector entry represents a source file; each nested vector represents the
+    // user_defined_stripes to get from that source file
+    for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) {
+      std::vector<OrcStripeInfo> stripe_infos;
+
+      // Coalesce stripe info at the source file later since that makes downstream processing much
+      // easier in impl::read
+      for (const size_t& stripe_idx : user_specified_stripes[src_file_idx]) {
+        CUDF_EXPECTS(stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size(),
+                     "Invalid stripe index");
+        stripe_infos.push_back(
+          std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
+        row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+      }
+      selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
+    }
+  } else {
+    row_start = std::max(row_start, 0);
+    if (row_count < 0) {
+      row_count = static_cast<size_type>(
+        std::min<int64_t>(get_num_rows(), std::numeric_limits<size_type>::max()));
+    }
+    row_count = std::min(row_count, get_num_rows() - row_start);
+    CUDF_EXPECTS(row_count >= 0, "Invalid row count");
+    CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start");
+
+    size_type count            = 0;
+    size_type stripe_skip_rows = 0;
+    // Iterate all source files, each source file has corelating metadata
+    for (size_t src_file_idx = 0;
+         src_file_idx < per_file_metadata.size() && count < row_start + row_count;
+         ++src_file_idx) {
+      std::vector<OrcStripeInfo> stripe_infos;
+
+      for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() &&
+                                  count < row_start + row_count;
+           ++stripe_idx) {
+        count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        if (count > row_start || count == 0) {
+          stripe_infos.push_back(
+            std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
+        } else {
+          stripe_skip_rows = count;
+        }
+      }
+
+      selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
+    }
+    // Need to remove skipped rows from the stripes which are not selected.
+    row_start -= stripe_skip_rows;
+  }
+
+  // Read each stripe's stripefooter metadata
+  if (not selected_stripes_mapping.empty()) {
+    for (auto& mapping : selected_stripes_mapping) {
+      // Resize to all stripe_info for the source level
+      per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size());
+
+      for (size_t i = 0; i < mapping.stripe_info.size(); i++) {
+        const auto stripe         = mapping.stripe_info[i].first;
+        const auto sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength;
+        const auto sf_comp_length = stripe->footerLength;
+        CUDF_EXPECTS(
+          sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(),
+          "Invalid stripe information");
+        const auto buffer =
+          per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length);
+        size_t sf_length = 0;
+        auto sf_data     = per_file_metadata[mapping.source_idx].decompressor->Decompress(
+          buffer->data(), sf_comp_length, &sf_length);
+        ProtobufReader(sf_data, sf_length)
+          .read(per_file_metadata[mapping.source_idx].stripefooters[i]);
+        mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i];
+        if (stripe->indexLength == 0) { row_grp_idx_present = false; }
+      }
+    }
+  }
+
+  return selected_stripes_mapping;
+}
+
+/**
+ * @brief Filters and reduces down to a selection of columns
+ *
+ * @param use_names List of column names to select
+ * @return Vector of list of ORC column meta-data
+ */
+column_hierarchy aggregate_orc_metadata::select_columns(std::vector<std::string> const& use_names)
+{
+  auto const& pfm = per_file_metadata[0];
+
+  column_hierarchy::nesting_map selected_columns;
+  if (use_names.empty()) {
+    for (auto const& col_id : pfm.ff.types[0].subtypes) {
+      add_column_to_mapping(selected_columns, pfm, col_id);
+    }
+  } else {
+    for (const auto& use_name : use_names) {
+      bool name_found = false;
+      for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
+        if (pfm.get_column_path(col_id) == use_name) {
+          name_found = true;
+          add_column_to_mapping(selected_columns, pfm, col_id);
+          break;
+        }
+      }
+      CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(use_name));
+    }
+  }
+  return {std::move(selected_columns)};
+}
+
+}  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
new file mode 100644
index 00000000000..2951c660886
--- /dev/null
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc.h"
+
+#include <map>
+#include <vector>
+
+namespace cudf::io::orc::detail {
+
+/**
+ * @brief Describes a column hierarchy, which may exclude some input columns.
+ */
+struct column_hierarchy {
+  using nesting_map = std::map<int32_t, std::vector<int32_t>>;
+  // Children IDs of each column
+  nesting_map children;
+  // Each element contains column at the given nesting level
+  std::vector<std::vector<orc_column_meta>> levels;
+
+  column_hierarchy(nesting_map child_map);
+  auto num_levels() const { return levels.size(); }
+};
+
+/**
+ * @brief In order to support multiple input files/buffers we need to gather
+ * the metadata across all of those input(s). This class provides a place
+ * to aggregate that metadata from all the files.
+ */
+class aggregate_orc_metadata {
+  using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;
+
+  /**
+   * @brief Sums up the number of rows of each source
+   */
+  size_type calc_num_rows() const;
+
+  /**
+   * @brief Number of columns in a ORC file.
+   */
+  size_type calc_num_cols() const;
+
+  /**
+   * @brief Sums up the number of stripes of each source
+   */
+  size_type calc_num_stripes() const;
+
+ public:
+  mutable std::vector<cudf::io::orc::metadata> per_file_metadata;  // TODO needed to be mutable?
+  size_type const num_rows;
+  size_type const num_columns;
+  size_type const num_stripes;
+  bool row_grp_idx_present = true;
+
+  aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const& sources);
+
+  auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; }
+
+  auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; }
+
+  auto get_num_rows() const { return num_rows; }
+
+  auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); }
+
+  auto get_num_stripes() const { return num_stripes; }
+
+  auto get_num_source_files() const { return per_file_metadata.size(); }
+
+  auto const& get_types() const { return per_file_metadata[0].ff.types; }
+
+  int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; }
+
+  auto get_column_name(const int source_idx, const int column_id) const
+  {
+    CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
+                 "Out of range source_idx provided");
+    return per_file_metadata[source_idx].get_column_name(column_id);
+  }
+
+  auto get_column_path(const int source_idx, const int column_id) const
+  {
+    CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
+                 "Out of range source_idx provided");
+    return per_file_metadata[source_idx].get_column_path(column_id);
+  }
+
+  auto is_row_grp_idx_present() const { return row_grp_idx_present; }
+
+  std::vector<cudf::io::orc::metadata::stripe_source_mapping> select_stripes(
+    std::vector<std::vector<size_type>> const& user_specified_stripes,
+    size_type& row_start,
+    size_type& row_count);
+
+  /**
+   * @brief Filters and reduces down to a selection of columns
+   *
+   * @param use_names List of column names to select
+   * @return Vector of list of ORC column meta-data
+   */
+  column_hierarchy select_columns(std::vector<std::string> const& use_names);
+};
+
+}  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 287a3a5e6c1..9cfe7f735cf 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -19,13 +19,13 @@
  * @brief cuDF-IO ORC reader class implementation
  */
 
-#include "io/orc/orc_gpu.h"
+#include "orc.h"
+#include "orc_gpu.h"
 #include "reader_impl.hpp"
 #include "timezone.cuh"
 
 #include <io/comp/gpuinflate.h>
 #include <io/utilities/time_utils.cuh>
-#include "orc.h"
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
@@ -33,7 +33,6 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <iterator>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
@@ -42,15 +41,13 @@
 #include <nvcomp/snappy.h>
 
 #include <algorithm>
-#include <array>
+#include <iterator>
 
 namespace cudf {
 namespace io {
 namespace detail {
 namespace orc {
-// Import functionality that's independent of legacy code
 using namespace cudf::io::orc;
-using namespace cudf::io;
 
 namespace {
 /**
@@ -243,323 +240,6 @@ bool should_convert_decimal_column_to_float(const std::vector<std::string>& colu
 
 }  // namespace
 
-column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(child_map)}
-{
-  // Sort columns by nesting levels
-  std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
-    if (static_cast<int32_t>(levels.size()) == level) levels.emplace_back();
-
-    levels[level].push_back({id, static_cast<int32_t>(children[id].size())});
-
-    for (auto child_id : children[id]) {
-      levelize(child_id, level + 1);
-    }
-  };
-
-  for (auto col_id : children[0]) {
-    levelize(col_id, 0);
-  }
-}
-
-// TODO: move class to a separate hpp/cpp files
-/**
- * @brief In order to support multiple input files/buffers we need to gather
- * the metadata across all of those input(s). This class provides a place
- * to aggregate that metadata from all the files.
- */
-class aggregate_orc_metadata {
-  using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;
-
-  /**
-   * @brief Goes up to the root to include the column with the given id and its parents.
-   */
-  void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                             cudf::io::orc::metadata const& metadata,
-                             int32_t id)
-  {
-    auto current_id = id;
-    while (metadata.column_has_parent(current_id)) {
-      auto parent_id = metadata.parent_id(current_id);
-      if (std::find(selected_columns[parent_id].cbegin(),
-                    selected_columns[parent_id].cend(),
-                    current_id) == selected_columns[parent_id].end()) {
-        selected_columns[parent_id].push_back(current_id);
-      }
-      current_id = parent_id;
-    }
-  }
-
-  /**
-   * @brief Adds all columns nested under the column with the given id to the nesting map.
-   */
-  void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                          std::vector<SchemaType> const& types,
-                          int32_t id)
-  {
-    for (auto child_id : types[id].subtypes) {
-      if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
-          selected_columns[id].end()) {
-        selected_columns[id].push_back(child_id);
-      }
-      add_nested_columns(selected_columns, types, child_id);
-    }
-  }
-
-  /**
-   * @brief Adds the column with the given id to the mapping
-   *
-   * All nested columns and direct ancestors of column `id` are included.
-   * Columns that are not on the direct path are excluded, which may result in prunning.
-   */
-  void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                             cudf::io::orc::metadata const& metadata,
-                             int32_t id)
-  {
-    update_parent_mapping(selected_columns, metadata, id);
-    add_nested_columns(selected_columns, metadata.ff.types, id);
-  }
-
- public:
-  mutable std::vector<cudf::io::orc::metadata> per_file_metadata;
-  size_type const num_rows;
-  size_type const num_columns;
-  size_type const num_stripes;
-  bool row_grp_idx_present = true;
-
-  /**
-   * @brief Create a metadata object from each element in the source vector
-   */
-  auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const& sources)
-  {
-    std::vector<cudf::io::orc::metadata> metadatas;
-    std::transform(
-      sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) {
-        return cudf::io::orc::metadata(source.get());
-      });
-    return metadatas;
-  }
-
-  /**
-   * @brief Sums up the number of rows of each source
-   */
-  size_type calc_num_rows() const
-  {
-    return std::accumulate(
-      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
-        return sum + pfm.get_total_rows();
-      });
-  }
-
-  /**
-   * @brief Number of columns in a ORC file.
-   */
-  size_type calc_num_cols() const
-  {
-    if (not per_file_metadata.empty()) { return per_file_metadata[0].get_num_columns(); }
-    return 0;
-  }
-
-  /**
-   * @brief Sums up the number of stripes of each source
-   */
-  size_type calc_num_stripes() const
-  {
-    return std::accumulate(
-      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
-        return sum + pfm.get_num_stripes();
-      });
-  }
-
-  aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const& sources)
-    : per_file_metadata(metadatas_from_sources(sources)),
-      num_rows(calc_num_rows()),
-      num_columns(calc_num_cols()),
-      num_stripes(calc_num_stripes())
-  {
-    // Verify that the input files have the same number of columns,
-    // as well as matching types, compression, and names
-    for (auto const& pfm : per_file_metadata) {
-      CUDF_EXPECTS(per_file_metadata[0].get_num_columns() == pfm.get_num_columns(),
-                   "All sources must have the same number of columns");
-      CUDF_EXPECTS(per_file_metadata[0].ps.compression == pfm.ps.compression,
-                   "All sources must have the same compression type");
-
-      // Check the types, column names, and decimal scale
-      for (size_t i = 0; i < pfm.ff.types.size(); i++) {
-        CUDF_EXPECTS(pfm.ff.types[i].kind == per_file_metadata[0].ff.types[i].kind,
-                     "Column types across all input sources must be the same");
-        CUDF_EXPECTS(std::equal(pfm.ff.types[i].fieldNames.begin(),
-                                pfm.ff.types[i].fieldNames.end(),
-                                per_file_metadata[0].ff.types[i].fieldNames.begin()),
-                     "All source column names must be the same");
-        CUDF_EXPECTS(
-          pfm.ff.types[i].scale.value_or(0) == per_file_metadata[0].ff.types[i].scale.value_or(0),
-          "All scale values must be the same");
-      }
-    }
-  }
-
-  auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; }
-
-  auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; }
-
-  auto get_num_rows() const { return num_rows; }
-
-  auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); }
-
-  auto get_num_stripes() const { return num_stripes; }
-
-  auto get_num_source_files() const { return per_file_metadata.size(); }
-
-  auto const& get_types() const { return per_file_metadata[0].ff.types; }
-
-  int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; }
-
-  auto get_column_name(const int source_idx, const int column_id) const
-  {
-    CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
-                 "Out of range source_idx provided");
-    return per_file_metadata[source_idx].get_column_name(column_id);
-  }
-
-  auto get_column_path(const int source_idx, const int column_id) const
-  {
-    CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
-                 "Out of range source_idx provided");
-    return per_file_metadata[source_idx].get_column_path(column_id);
-  }
-
-  auto is_row_grp_idx_present() const { return row_grp_idx_present; }
-
-  std::vector<cudf::io::orc::metadata::stripe_source_mapping> select_stripes(
-    std::vector<std::vector<size_type>> const& user_specified_stripes,
-    size_type& row_start,
-    size_type& row_count)
-  {
-    std::vector<cudf::io::orc::metadata::stripe_source_mapping> selected_stripes_mapping;
-
-    if (!user_specified_stripes.empty()) {
-      CUDF_EXPECTS(user_specified_stripes.size() == get_num_source_files(),
-                   "Must specify stripes for each source");
-      // row_start is 0 if stripes are set. If this is not true anymore, then
-      // row_start needs to be subtracted to get the correct row_count
-      CUDF_EXPECTS(row_start == 0, "Start row index should be 0");
-
-      row_count = 0;
-      // Each vector entry represents a source file; each nested vector represents the
-      // user_defined_stripes to get from that source file
-      for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) {
-        std::vector<OrcStripeInfo> stripe_infos;
-
-        // Coalesce stripe info at the source file later since that makes downstream processing much
-        // easier in impl::read
-        for (const size_t& stripe_idx : user_specified_stripes[src_file_idx]) {
-          CUDF_EXPECTS(stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size(),
-                       "Invalid stripe index");
-          stripe_infos.push_back(
-            std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
-          row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
-        }
-        selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
-      }
-    } else {
-      row_start = std::max(row_start, 0);
-      if (row_count < 0) {
-        row_count = static_cast<size_type>(
-          std::min<int64_t>(get_num_rows(), std::numeric_limits<size_type>::max()));
-      }
-      row_count = std::min(row_count, get_num_rows() - row_start);
-      CUDF_EXPECTS(row_count >= 0, "Invalid row count");
-      CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start");
-
-      size_type count            = 0;
-      size_type stripe_skip_rows = 0;
-      // Iterate all source files, each source file has corelating metadata
-      for (size_t src_file_idx = 0;
-           src_file_idx < per_file_metadata.size() && count < row_start + row_count;
-           ++src_file_idx) {
-        std::vector<OrcStripeInfo> stripe_infos;
-
-        for (size_t stripe_idx = 0;
-             stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() &&
-             count < row_start + row_count;
-             ++stripe_idx) {
-          count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
-          if (count > row_start || count == 0) {
-            stripe_infos.push_back(
-              std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
-          } else {
-            stripe_skip_rows = count;
-          }
-        }
-
-        selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
-      }
-      // Need to remove skipped rows from the stripes which are not selected.
-      row_start -= stripe_skip_rows;
-    }
-
-    // Read each stripe's stripefooter metadata
-    if (not selected_stripes_mapping.empty()) {
-      for (auto& mapping : selected_stripes_mapping) {
-        // Resize to all stripe_info for the source level
-        per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size());
-
-        for (size_t i = 0; i < mapping.stripe_info.size(); i++) {
-          const auto stripe         = mapping.stripe_info[i].first;
-          const auto sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength;
-          const auto sf_comp_length = stripe->footerLength;
-          CUDF_EXPECTS(
-            sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(),
-            "Invalid stripe information");
-          const auto buffer =
-            per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length);
-          size_t sf_length = 0;
-          auto sf_data     = per_file_metadata[mapping.source_idx].decompressor->Decompress(
-            buffer->data(), sf_comp_length, &sf_length);
-          ProtobufReader(sf_data, sf_length)
-            .read(per_file_metadata[mapping.source_idx].stripefooters[i]);
-          mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i];
-          if (stripe->indexLength == 0) { row_grp_idx_present = false; }
-        }
-      }
-    }
-
-    return selected_stripes_mapping;
-  }
-
-  /**
-   * @brief Filters and reduces down to a selection of columns
-   *
-   * @param use_names List of column names to select
-   * @return Vector of list of ORC column meta-data
-   */
-  column_hierarchy select_columns(std::vector<std::string> const& use_names)
-  {
-    auto const& pfm = per_file_metadata[0];
-
-    column_hierarchy::nesting_map selected_columns;
-    if (use_names.empty()) {
-      for (auto const& col_id : pfm.ff.types[0].subtypes) {
-        add_column_to_mapping(selected_columns, pfm, col_id);
-      }
-    } else {
-      for (const auto& use_name : use_names) {
-        bool name_found = false;
-        for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
-          if (pfm.get_column_path(col_id) == use_name) {
-            name_found = true;
-            add_column_to_mapping(selected_columns, pfm, col_id);
-            break;
-          }
-        }
-        CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(use_name));
-      }
-    }
-    return {std::move(selected_columns)};
-  }
-};
-
 void snappy_decompress(device_span<gpu_inflate_input_s> comp_in,
                        device_span<gpu_inflate_status_s> comp_stat,
                        size_t max_uncomp_page_size,
@@ -1048,17 +728,17 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
                                                           column_name_info& schema_info,
                                                           rmm::cuda_stream_view stream)
 {
-  schema_info.name = _metadata->get_column_name(0, orc_col_id);
+  schema_info.name = _metadata.get_column_name(0, orc_col_id);
   // If the column type is orc::DECIMAL see if the user
   // desires it to be converted to float64 or not
   auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-    _decimal_cols_as_float, _metadata->per_file_metadata[0], orc_col_id);
+    _decimal_cols_as_float, _metadata.per_file_metadata[0], orc_col_id);
   auto const type = to_type_id(
-    _metadata->get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+    _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
   int32_t scale = 0;
   std::vector<std::unique_ptr<column>> child_columns;
   std::unique_ptr<column> out_col = nullptr;
-  auto kind                       = _metadata->get_col_type(orc_col_id).kind;
+  auto kind                       = _metadata.get_col_type(orc_col_id).kind;
 
   switch (kind) {
     case orc::LIST:
@@ -1068,7 +748,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
         0,
         make_empty_column(data_type(type_id::INT32)),
         create_empty_column(
-          _metadata->get_col_type(orc_col_id).subtypes[0], schema_info.children.back(), stream),
+          _metadata.get_col_type(orc_col_id).subtypes[0], schema_info.children.back(), stream),
         0,
         rmm::device_buffer{0, stream},
         stream);
@@ -1076,8 +756,8 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
     case orc::MAP: {
       schema_info.children.emplace_back("offsets");
       schema_info.children.emplace_back("struct");
-      const auto child_column_ids = _metadata->get_col_type(orc_col_id).subtypes;
-      for (size_t idx = 0; idx < _metadata->get_col_type(orc_col_id).subtypes.size(); idx++) {
+      const auto child_column_ids = _metadata.get_col_type(orc_col_id).subtypes;
+      for (size_t idx = 0; idx < _metadata.get_col_type(orc_col_id).subtypes.size(); idx++) {
         auto& children_schema = schema_info.children.back().children;
         children_schema.emplace_back("");
         child_columns.push_back(create_empty_column(
@@ -1096,7 +776,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
     } break;
 
     case orc::STRUCT:
-      for (const auto col : _metadata->get_col_type(orc_col_id).subtypes) {
+      for (const auto col : _metadata.get_col_type(orc_col_id).subtypes) {
         schema_info.children.emplace_back("");
         child_columns.push_back(create_empty_column(col, schema_info.children.back(), stream));
       }
@@ -1106,7 +786,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
 
     case orc::DECIMAL:
       if (type == type_id::DECIMAL64) {
-        scale = -static_cast<int32_t>(_metadata->get_types()[orc_col_id].scale.value_or(0));
+        scale = -static_cast<int32_t>(_metadata.get_types()[orc_col_id].scale.value_or(0));
       }
       out_col = make_empty_column(data_type(type, scale));
       break;
@@ -1126,8 +806,8 @@ column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
   auto const col_id = _col_meta.orc_col_map[level][orc_col_id];
   auto& col_buffer  = col_buffers[level][col_id];
 
-  col_buffer.name = _metadata->get_column_name(0, orc_col_id);
-  auto kind       = _metadata->get_col_type(orc_col_id).kind;
+  col_buffer.name = _metadata.get_column_name(0, orc_col_id);
+  auto kind       = _metadata.get_col_type(orc_col_id).kind;
   switch (kind) {
     case orc::LIST:
     case orc::STRUCT:
@@ -1182,8 +862,8 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
                    rmm::mr::device_memory_resource* mr)
   : _mr(mr),
     _sources(std::move(sources)),
-    _metadata{std::make_unique<aggregate_orc_metadata>(_sources)},
-    selected_columns{_metadata->select_columns(options.get_columns())}
+    _metadata{_sources},
+    selected_columns{_metadata.select_columns(options.get_columns())}
 {
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
@@ -1209,7 +889,7 @@ timezone_table reader::impl::compute_timezone_table(
   auto const has_timestamp_column = std::any_of(
     selected_columns.levels.cbegin(), selected_columns.levels.cend(), [&](auto& col_lvl) {
       return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto& col_meta) {
-        return _metadata->get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
+        return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
       });
     });
   if (not has_timestamp_column) return {};
@@ -1241,7 +921,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
     return {std::make_unique<table>(), std::move(out_metadata)};
 
   // Select only stripes required (aka row groups)
-  const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);
+  const auto selected_stripes = _metadata.select_stripes(stripes, skip_rows, num_rows);
 
   auto const tz_table = compute_timezone_table(selected_stripes, stream);
 
@@ -1250,7 +930,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   for (size_t level = 0; level < selected_columns.num_levels(); level++) {
     auto& columns_level = selected_columns.levels[level];
     // Association between each ORC column and its cudf::column
-    _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1);
+    _col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1);
     std::vector<orc_column_meta> nested_col;
     bool is_data_empty = false;
 
@@ -1260,19 +940,19 @@ table_with_metadata reader::impl::read(size_type skip_rows,
       // If the column type is orc::DECIMAL see if the user
       // desires it to be converted to float64 or not
       auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-        _decimal_cols_as_float, _metadata->per_file_metadata[0], col.id);
+        _decimal_cols_as_float, _metadata.per_file_metadata[0], col.id);
       auto col_type = to_type_id(
-        _metadata->get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+        _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
       CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
       // Remove this once we support Decimal128 data type
       CUDF_EXPECTS(
-        (col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col.id).precision <= 18),
+        (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18),
         "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
       if (col_type == type_id::DECIMAL64) {
         // sign of the scale is changed since cuDF follows c++ libraries like CNL
         // which uses negative scaling, but liborc and other libraries
         // follow positive scaling.
-        auto const scale = -static_cast<int32_t>(_metadata->get_col_type(col.id).scale.value_or(0));
+        auto const scale = -static_cast<int32_t>(_metadata.get_col_type(col.id).scale.value_or(0));
         column_types.emplace_back(col_type, scale);
       } else {
         column_types.emplace_back(col_type);
@@ -1311,11 +991,11 @@ table_with_metadata reader::impl::read(size_type skip_rows,
       const bool use_index =
         (_use_index == true) &&
         // Do stripes have row group index
-        _metadata->is_row_grp_idx_present() &&
+        _metadata.is_row_grp_idx_present() &&
         // Only use if we don't have much work with complete columns & stripes
         // TODO: Consider nrows, gpu, and tune the threshold
-        (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) &&
-         _metadata->get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) &&
+        (num_rows > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) &&
+         _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) &&
         // Only use if first row is aligned to a stripe boundary
         // TODO: Fix logic to handle unaligned rows
         (skip_rows == 0);
@@ -1352,7 +1032,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                                           stripe_info,
                                                           stripe_footer,
                                                           _col_meta.orc_col_map[level],
-                                                          _metadata->get_types(),
+                                                          _metadata.get_types(),
                                                           use_index,
                                                           &num_dict_entries,
                                                           chunks,
@@ -1384,16 +1064,16 @@ table_with_metadata reader::impl::read(size_type skip_rows,
               len += stream_info[stream_count].length;
               stream_count++;
             }
-            if (_metadata->per_file_metadata[stripe_source_mapping.source_idx]
+            if (_metadata.per_file_metadata[stripe_source_mapping.source_idx]
                   .source->is_device_read_preferred(len)) {
               read_tasks.push_back(
-                std::make_pair(_metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                std::make_pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx]
                                  .source->device_read_async(offset, len, d_dst, stream),
                                len));
 
             } else {
               const auto buffer =
-                _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
+                _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
                   offset, len);
               CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
               CUDA_TRY(cudaMemcpyAsync(
@@ -1406,8 +1086,8 @@ table_with_metadata reader::impl::read(size_type skip_rows,
           const auto rowgroup_id         = num_rowgroups;
           auto stripe_num_rowgroups      = 0;
           if (use_index) {
-            stripe_num_rowgroups = (num_rows_per_stripe + _metadata->get_row_index_stride() - 1) /
-                                   _metadata->get_row_index_stride();
+            stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) /
+                                   _metadata.get_row_index_stride();
           }
           // Update chunks to reference streams pointers
           for (size_t col_idx = 0; col_idx < num_columns; col_idx++) {
@@ -1429,15 +1109,15 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                 ? nullptr
                 : null_count_prefix_sums[level - 1][_col_meta.parent_column_index[col_idx]].data();
             chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
-            chunk.type_kind     = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
+            chunk.type_kind     = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
                                 .ff.types[columns_level[col_idx].id]
                                 .kind;
             // num_child_rows for a struct column will be same, for other nested types it will be
             // calculated.
             chunk.num_child_rows          = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
             auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-              _decimal_cols_as_float, _metadata->per_file_metadata[0], columns_level[col_idx].id);
-            chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
+              _decimal_cols_as_float, _metadata.per_file_metadata[0], columns_level[col_idx].id);
+            chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
                                     .ff.types[columns_level[col_idx].id]
                                     .scale.value_or(0) |
                                   (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
@@ -1490,15 +1170,15 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                          });
         }
         // Setup row group descriptors if using indexes
-        if (_metadata->per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) {
+        if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) {
           auto decomp_data =
             decompress_stripe_data(chunks,
                                    stripe_data,
-                                   _metadata->per_file_metadata[0].decompressor.get(),
+                                   _metadata.per_file_metadata[0].decompressor.get(),
                                    stream_info,
                                    total_num_stripes,
                                    row_groups,
-                                   _metadata->get_row_index_stride(),
+                                   _metadata.get_row_index_stride(),
                                    level == 0,
                                    stream);
           stripe_data.clear();
@@ -1513,7 +1193,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                     num_columns,
                                     total_num_stripes,
                                     num_rowgroups,
-                                    _metadata->get_row_index_stride(),
+                                    _metadata.get_row_index_stride(),
                                     level == 0,
                                     stream);
           }
@@ -1540,7 +1220,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                              skip_rows,
                              tz_table.view(),
                              row_groups,
-                             _metadata->get_row_index_stride(),
+                             _metadata.get_row_index_stride(),
                              out_buffers[level],
                              level,
                              stream);
@@ -1589,7 +1269,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
   out_metadata.schema_info = std::move(schema_info);
 
-  for (const auto& meta : _metadata->per_file_metadata) {
+  for (const auto& meta : _metadata.per_file_metadata) {
     for (const auto& kv : meta.ff.metadata) {
       out_metadata.user_data.insert({kv.name, kv.value});
     }
@@ -1616,6 +1296,7 @@ table_with_metadata reader::read(orc_reader_options const& options, rmm::cuda_st
   return _impl->read(
     options.get_skip_rows(), options.get_num_rows(), options.get_stripes(), stream);
 }
+
 }  // namespace orc
 }  // namespace detail
 }  // namespace io
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 81ebb3dfbd1..2a1d4c3425a 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "aggregate_orc_metadata.hpp"
 #include "orc.h"
 #include "orc_gpu.h"
 
@@ -38,7 +39,6 @@ namespace io {
 namespace detail {
 namespace orc {
 using namespace cudf::io::orc;
-using namespace cudf::io;
 
 // Forward declarations
 class metadata;
@@ -46,7 +46,6 @@ namespace {
 struct orc_stream_info;
 struct stripe_source_mapping;
 }  // namespace
-class aggregate_orc_metadata;
 
 /**
  * @brief Keeps track of orc mapping and child column details.
@@ -71,20 +70,6 @@ struct reader_column_meta {
   std::vector<row_group_meta> rwgrp_meta;  // rowgroup metadata [rowgroup][column]
 };
 
-/**
- * @brief Describes a column hierarchy, which may exclude some input columns.
- */
-struct column_hierarchy {
-  using nesting_map = std::map<int32_t, std::vector<int32_t>>;
-  // Children IDs of each column
-  nesting_map children;
-  // Each element contains column at the given nesting level
-  std::vector<std::vector<orc_column_meta>> levels;
-
-  column_hierarchy(nesting_map child_map);
-  auto num_levels() const { return levels.size(); }
-};
-
 /**
  * @brief Implementation for ORC reader
  */
@@ -231,8 +216,8 @@ class reader::impl {
  private:
   rmm::mr::device_memory_resource* _mr = nullptr;
   std::vector<std::unique_ptr<datasource>> _sources;
-  std::unique_ptr<aggregate_orc_metadata> _metadata;
-  column_hierarchy selected_columns;
+  cudf::io::orc::detail::aggregate_orc_metadata _metadata;
+  cudf::io::orc::detail::column_hierarchy selected_columns;
 
   bool _use_index     = true;
   bool _use_np_dtypes = true;

From 1281df12955ce2b28831d4ee31af5db2b60fc7a3 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 26 Oct 2021 14:00:00 -0700
Subject: [PATCH 20/24] add comments

---
 cpp/src/io/orc/aggregate_orc_metadata.hpp | 1 +
 python/cudf/cudf/tests/test_orc.py        | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index 2951c660886..b74d3fb64f1 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -25,6 +25,7 @@ namespace cudf::io::orc::detail {
  * @brief Describes a column hierarchy, which may exclude some input columns.
  */
 struct column_hierarchy {
+  // Maps column IDs to the IDs of their children columns
   using nesting_map = std::map<int32_t, std::vector<int32_t>>;
   // Children IDs of each column
   nesting_map children;
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index a1280f6cf97..dfec1eaadc1 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1496,7 +1496,7 @@ def test_empty_statistics():
 
 @pytest.mark.filterwarnings("ignore:.*struct.*experimental")
 @pytest.mark.parametrize(
-    "columns",
+    "equivalent_columns",
     [
         (["lvl1_struct.a", "lvl1_struct.b"], ["lvl1_struct"]),
         (["lvl1_struct", "lvl1_struct.a"], ["lvl1_struct"]),
@@ -1509,7 +1509,8 @@ def test_empty_statistics():
         ),
     ],
 )
-def test_select_nested(list_struct_buff, columns):
-    df_cols1 = cudf.read_orc(list_struct_buff, columns=columns[0])
-    df_cols2 = cudf.read_orc(list_struct_buff, columns=columns[1])
+def test_select_nested(list_struct_buff, equivalent_columns):
+    # The two column selections should be equivalent
+    df_cols1 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[0])
+    df_cols2 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[1])
     assert_eq(df_cols1, df_cols2)

From 5c47de3827c7f9d309743272c30c48fce6cd8102 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 26 Oct 2021 23:32:03 -0700
Subject: [PATCH 21/24] Apply suggestions from code review

Co-authored-by: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
---
 cpp/src/io/orc/aggregate_orc_metadata.cpp | 4 ++--
 cpp/src/io/orc/aggregate_orc_metadata.hpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index 9a673c44ee0..d7007071b96 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -104,7 +104,7 @@ auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const& sour
 size_type aggregate_orc_metadata::calc_num_rows() const
 {
   return std::accumulate(
-    per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
+    per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto const& sum, auto const& pfm) {
       return sum + pfm.get_total_rows();
     });
 }
@@ -118,7 +118,7 @@ size_type aggregate_orc_metadata::calc_num_cols() const
 size_type aggregate_orc_metadata::calc_num_stripes() const
 {
   return std::accumulate(
-    per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
+    per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto const& sum, auto const& pfm) {
       return sum + pfm.get_num_stripes();
     });
 }
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index b74d3fb64f1..eb5f848c331 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -64,7 +64,7 @@ class aggregate_orc_metadata {
   size_type const num_rows;
   size_type const num_columns;
   size_type const num_stripes;
-  bool row_grp_idx_present = true;
+  bool row_grp_idx_present{true};
 
   aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const& sources);
 

From 03b3932d03753285e3e263b62e0caaf3b6a7b4f0 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 27 Oct 2021 12:39:54 -0700
Subject: [PATCH 22/24] add docs; remove unused members

---
 cpp/src/io/functions.cpp                  |  2 +-
 cpp/src/io/orc/aggregate_orc_metadata.cpp | 47 +++++++++--------------
 cpp/src/io/orc/aggregate_orc_metadata.hpp | 43 +++++++++++++--------
 cpp/src/io/orc/orc.h                      | 16 +++++++-
 cpp/src/io/orc/reader_impl.cu             |  6 +--
 5 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 511a1a22ee7..d825f40cd00 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -255,7 +255,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info)
 
   // Get column names
   for (auto i = 0; i < metadata.get_num_columns(); i++) {
-    result.column_names.push_back(metadata.get_column_name(i));
+    result.column_names.push_back(metadata.column_name(i));
   }
 
   // Get file-level statistics, statistics of each column of file
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index d7007071b96..afae6c41b59 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -34,16 +34,17 @@ column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(c
     }
   };
 
-  for (auto col_id : children[0]) {
-    levelize(col_id, 0);
-  }
+  std::for_each(
+    children[0].cbegin(), children[0].cend(), [&](auto col_id) { levelize(col_id, 0); });
 }
 
+namespace {
+
 /**
  * @brief Goes up to the root to include the column with the given id and its parents.
  */
 void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                           cudf::io::orc::metadata const& metadata,
+                           metadata const& metadata,
                            int32_t id)
 {
   auto current_id = id;
@@ -81,7 +82,7 @@ void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_column
  * Columns that are not on the direct path are excluded, which may result in prunning.
  */
 void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
-                           cudf::io::orc::metadata const& metadata,
+                           metadata const& metadata,
                            int32_t id)
 {
   update_parent_mapping(selected_columns, metadata, id);
@@ -93,14 +94,16 @@ void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_col
  */
 auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const& sources)
 {
-  std::vector<cudf::io::orc::metadata> metadatas;
+  std::vector<metadata> metadatas;
   std::transform(
     sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) {
-      return cudf::io::orc::metadata(source.get());
+      return metadata(source.get());
     });
   return metadatas;
 }
 
+}  // namespace
+
 size_type aggregate_orc_metadata::calc_num_rows() const
 {
   return std::accumulate(
@@ -109,12 +112,6 @@ size_type aggregate_orc_metadata::calc_num_rows() const
     });
 }
 
-size_type aggregate_orc_metadata::calc_num_cols() const
-{
-  if (not per_file_metadata.empty()) { return per_file_metadata[0].get_num_columns(); }
-  return 0;
-}
-
 size_type aggregate_orc_metadata::calc_num_stripes() const
 {
   return std::accumulate(
@@ -127,7 +124,6 @@ aggregate_orc_metadata::aggregate_orc_metadata(
   std::vector<std::unique_ptr<datasource>> const& sources)
   : per_file_metadata(metadatas_from_sources(sources)),
     num_rows(calc_num_rows()),
-    num_columns(calc_num_cols()),
     num_stripes(calc_num_stripes())
 {
   // Verify that the input files have the same number of columns,
@@ -153,15 +149,15 @@ aggregate_orc_metadata::aggregate_orc_metadata(
   }
 }
 
-std::vector<cudf::io::orc::metadata::stripe_source_mapping> aggregate_orc_metadata::select_stripes(
+std::vector<metadata::stripe_source_mapping> aggregate_orc_metadata::select_stripes(
   std::vector<std::vector<size_type>> const& user_specified_stripes,
   size_type& row_start,
   size_type& row_count)
 {
-  std::vector<cudf::io::orc::metadata::stripe_source_mapping> selected_stripes_mapping;
+  std::vector<metadata::stripe_source_mapping> selected_stripes_mapping;
 
   if (!user_specified_stripes.empty()) {
-    CUDF_EXPECTS(user_specified_stripes.size() == get_num_source_files(),
+    CUDF_EXPECTS(user_specified_stripes.size() == per_file_metadata.size(),
                  "Must specify stripes for each source");
     // row_start is 0 if stripes are set. If this is not true anymore, then
     // row_start needs to be subtracted to get the correct row_count
@@ -249,32 +245,27 @@ std::vector<cudf::io::orc::metadata::stripe_source_mapping> aggregate_orc_metada
   return selected_stripes_mapping;
 }
 
-/**
- * @brief Filters and reduces down to a selection of columns
- *
- * @param use_names List of column names to select
- * @return Vector of list of ORC column meta-data
- */
-column_hierarchy aggregate_orc_metadata::select_columns(std::vector<std::string> const& use_names)
+column_hierarchy aggregate_orc_metadata::select_columns(
+  std::vector<std::string> const& column_paths)
 {
   auto const& pfm = per_file_metadata[0];
 
   column_hierarchy::nesting_map selected_columns;
-  if (use_names.empty()) {
+  if (column_paths.empty()) {
     for (auto const& col_id : pfm.ff.types[0].subtypes) {
       add_column_to_mapping(selected_columns, pfm, col_id);
     }
   } else {
-    for (const auto& use_name : use_names) {
+    for (const auto& path : column_paths) {
       bool name_found = false;
       for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
-        if (pfm.get_column_path(col_id) == use_name) {
+        if (pfm.column_path(col_id) == path) {
           name_found = true;
           add_column_to_mapping(selected_columns, pfm, col_id);
           break;
         }
       }
-      CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(use_name));
+      CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(path));
     }
   }
   return {std::move(selected_columns)};
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index eb5f848c331..5eb837fc1df 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -60,9 +60,8 @@ class aggregate_orc_metadata {
   size_type calc_num_stripes() const;
 
  public:
-  mutable std::vector<cudf::io::orc::metadata> per_file_metadata;  // TODO needed to be mutable?
+  std::vector<metadata> per_file_metadata;
   size_type const num_rows;
-  size_type const num_columns;
   size_type const num_stripes;
   bool row_grp_idx_present{true};
 
@@ -78,40 +77,54 @@ class aggregate_orc_metadata {
 
   auto get_num_stripes() const { return num_stripes; }
 
-  auto get_num_source_files() const { return per_file_metadata.size(); }
-
   auto const& get_types() const { return per_file_metadata[0].ff.types; }
 
   int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; }
 
-  auto get_column_name(const int source_idx, const int column_id) const
+  auto is_row_grp_idx_present() const { return row_grp_idx_present; }
+
+  /**
+   * @brief Returns the name of the given column from the given source.
+   */
+  auto column_name(const int source_idx, const int column_id) const
   {
     CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
                  "Out of range source_idx provided");
-    return per_file_metadata[source_idx].get_column_name(column_id);
+    return per_file_metadata[source_idx].column_name(column_id);
   }
 
-  auto get_column_path(const int source_idx, const int column_id) const
+  /**
+   * @brief Returns the full name of the given column from the given source.
+   *
+   * Full name includes ancestor columns' names.
+   */
+  auto column_path(const int source_idx, const int column_id) const
   {
     CUDF_EXPECTS(source_idx <= static_cast<int>(per_file_metadata.size()),
                  "Out of range source_idx provided");
-    return per_file_metadata[source_idx].get_column_path(column_id);
+    return per_file_metadata[source_idx].column_path(column_id);
   }
 
-  auto is_row_grp_idx_present() const { return row_grp_idx_present; }
-
-  std::vector<cudf::io::orc::metadata::stripe_source_mapping> select_stripes(
+  /**
+   * @brief Selects the stripes to read, based on the row/stripe selection parameters.
+   *
+   * Stripes are potentially selected from multiple files.
+   */
+  std::vector<metadata::stripe_source_mapping> select_stripes(
     std::vector<std::vector<size_type>> const& user_specified_stripes,
     size_type& row_start,
     size_type& row_count);
 
   /**
-   * @brief Filters and reduces down to a selection of columns
+   * @brief Filters ORC file to a selection of columns, based on their paths in the file.
+   *
+   * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
+   * ommited to match the cuDF table hierarchy.
    *
-   * @param use_names List of column names to select
-   * @return Vector of list of ORC column meta-data
+   * @param column_paths List of full column names (i.e. paths) to select from the ORC file
+   * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
    */
-  column_hierarchy select_columns(std::vector<std::string> const& use_names);
+  column_hierarchy select_columns(std::vector<std::string> const& column_paths);
 };
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index 450e006dca8..51cc5cb780a 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -586,12 +586,24 @@ class metadata {
   size_t get_total_rows() const { return ff.numberOfRows; }
   int get_num_stripes() const { return ff.stripes.size(); }
   int get_num_columns() const { return ff.types.size(); }
-  std::string const& get_column_name(int32_t column_id) const
+  /**
+   * @brief Returns the name of the column with the given ID.
+   *
+   * Name might not be unique in the ORC file, since columns with different parents are allowed to
+   * have the same names.
+   */
+  std::string const& column_name(int32_t column_id) const
   {
     CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided");
     return column_names[column_id];
   }
-  std::string const& get_column_path(int32_t column_id) const
+  /**
+   * @brief Returns the full name of the column with the given ID - includes the ancestor columns
+   * names.
+   *
+   * Each column in the ORC file has a unique path.
+   */
+  std::string const& column_path(int32_t column_id) const
   {
     CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided");
     return column_paths[column_id];
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 9cfe7f735cf..325c454ac5b 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -235,7 +235,7 @@ bool should_convert_decimal_column_to_float(const std::vector<std::string>& colu
 {
   return (std::find(columns_to_convert.begin(),
                     columns_to_convert.end(),
-                    metadata.get_column_name(column_index)) != columns_to_convert.end());
+                    metadata.column_name(column_index)) != columns_to_convert.end());
 }
 
 }  // namespace
@@ -728,7 +728,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
                                                           column_name_info& schema_info,
                                                           rmm::cuda_stream_view stream)
 {
-  schema_info.name = _metadata.get_column_name(0, orc_col_id);
+  schema_info.name = _metadata.column_name(0, orc_col_id);
   // If the column type is orc::DECIMAL see if the user
   // desires it to be converted to float64 or not
   auto const decimal_as_float64 = should_convert_decimal_column_to_float(
@@ -806,7 +806,7 @@ column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
   auto const col_id = _col_meta.orc_col_map[level][orc_col_id];
   auto& col_buffer  = col_buffers[level][col_id];
 
-  col_buffer.name = _metadata.get_column_name(0, orc_col_id);
+  col_buffer.name = _metadata.column_name(0, orc_col_id);
   auto kind       = _metadata.get_col_type(orc_col_id).kind;
   switch (kind) {
     case orc::LIST:

From 2148b941af71b690706a893be66537518c1cb9b4 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 27 Oct 2021 12:55:18 -0700
Subject: [PATCH 23/24] use size_type

---
 cpp/src/io/orc/aggregate_orc_metadata.cpp | 14 ++++++------
 cpp/src/io/orc/aggregate_orc_metadata.hpp |  2 +-
 cpp/src/io/orc/orc.h                      | 26 ++++++++++++++---------
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index afae6c41b59..45d60605936 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -24,7 +24,7 @@ namespace cudf::io::orc::detail {
 column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(child_map)}
 {
   // Sort columns by nesting levels
-  std::function<void(int32_t, int32_t)> levelize = [&](int32_t id, int32_t level) {
+  std::function<void(size_type, int32_t)> levelize = [&](size_type id, int32_t level) {
     if (static_cast<int32_t>(levels.size()) == level) levels.emplace_back();
 
     levels[level].push_back({id, static_cast<int32_t>(children[id].size())});
@@ -43,9 +43,9 @@ namespace {
 /**
  * @brief Goes up to the root to include the column with the given id and its parents.
  */
-void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+void update_parent_mapping(std::map<size_type, std::vector<size_type>>& selected_columns,
                            metadata const& metadata,
-                           int32_t id)
+                           size_type id)
 {
   auto current_id = id;
   while (metadata.column_has_parent(current_id)) {
@@ -62,9 +62,9 @@ void update_parent_mapping(std::map<int32_t, std::vector<int32_t>>& selected_col
 /**
  * @brief Adds all columns nested under the column with the given id to the nesting map.
  */
-void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+void add_nested_columns(std::map<size_type, std::vector<size_type>>& selected_columns,
                         std::vector<SchemaType> const& types,
-                        int32_t id)
+                        size_type id)
 {
   for (auto child_id : types[id].subtypes) {
     if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) ==
@@ -81,9 +81,9 @@ void add_nested_columns(std::map<int32_t, std::vector<int32_t>>& selected_column
  * All nested columns and direct ancestors of column `id` are included.
  * Columns that are not on the direct path are excluded, which may result in prunning.
  */
-void add_column_to_mapping(std::map<int32_t, std::vector<int32_t>>& selected_columns,
+void add_column_to_mapping(std::map<size_type, std::vector<size_type>>& selected_columns,
                            metadata const& metadata,
-                           int32_t id)
+                           size_type id)
 {
   update_parent_mapping(selected_columns, metadata, id);
   add_nested_columns(selected_columns, metadata.ff.types, id);
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index 5eb837fc1df..356d20843e8 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -26,7 +26,7 @@ namespace cudf::io::orc::detail {
  */
 struct column_hierarchy {
   // Maps column IDs to the IDs of their children columns
-  using nesting_map = std::map<int32_t, std::vector<int32_t>>;
+  using nesting_map = std::map<size_type, std::vector<size_type>>;
   // Children IDs of each column
   nesting_map children;
   // Each element contains column at the given nesting level
diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h
index 51cc5cb780a..d75b76a0341 100644
--- a/cpp/src/io/orc/orc.h
+++ b/cpp/src/io/orc/orc.h
@@ -555,8 +555,8 @@ class OrcDecompressor {
  *
  */
 struct orc_column_meta {
-  int32_t id;            // orc id for the column
-  int32_t num_children;  // number of children at the same level of nesting in case of struct
+  size_type id;            // orc id for the column
+  size_type num_children;  // number of children at the same level of nesting in case of struct
 };
 
 /**
@@ -592,7 +592,7 @@ class metadata {
    * Name might not be unique in the ORC file, since columns with different parents are allowed to
    * have the same names.
    */
-  std::string const& column_name(int32_t column_id) const
+  std::string const& column_name(size_type column_id) const
   {
     CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided");
     return column_names[column_id];
@@ -603,7 +603,7 @@ class metadata {
    *
    * Each column in the ORC file has a unique path.
    */
-  std::string const& column_path(int32_t column_id) const
+  std::string const& column_path(size_type column_id) const
   {
     CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided");
     return column_paths[column_id];
@@ -613,17 +613,23 @@ class metadata {
   /**
    * @brief Returns the ID of the parent column of the given column.
    */
-  int32_t parent_id(int32_t column_id) const { return parents.at(column_id).value().id; }
+  size_type parent_id(size_type column_id) const { return parents.at(column_id).value().id; }
 
   /**
    * @brief Returns the index the given column has in its parent's children list.
    */
-  int32_t field_index(int32_t column_id) const { return parents.at(column_id).value().field_idx; }
+  size_type field_index(size_type column_id) const
+  {
+    return parents.at(column_id).value().field_idx;
+  }
 
   /**
    * @brief Returns whether the given column has a parent.
    */
-  int32_t column_has_parent(int32_t column_id) const { return parents.at(column_id).has_value(); }
+  size_type column_has_parent(size_type column_id) const
+  {
+    return parents.at(column_id).has_value();
+  }
 
  public:
   PostScript ps;
@@ -636,10 +642,10 @@ class metadata {
  private:
   struct column_parent {
     // parent's ID
-    int32_t id;
+    size_type id;
     // Index of this column in the parent's list of children
-    int32_t field_idx;
-    column_parent(int32_t parent_id, int32_t field_idx) : id{parent_id}, field_idx{field_idx} {}
+    size_type field_idx;
+    column_parent(size_type parent_id, size_type field_idx) : id{parent_id}, field_idx{field_idx} {}
   };
   void init_parent_descriptors();
   std::vector<std::optional<column_parent>> parents;

From 3c0d862cd600f7a6de49f9e0d9bb92cf83e5571e Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 27 Oct 2021 13:11:37 -0700
Subject: [PATCH 24/24] more size_type use

---
 cpp/src/io/orc/orc.cpp         | 12 ++++++------
 cpp/src/io/orc/reader_impl.cu  | 17 +++++++++--------
 cpp/src/io/orc/reader_impl.hpp |  6 +++---
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index b850fec82e8..89eac0c9901 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -473,7 +473,7 @@ void metadata::init_column_names()
     if (not column_has_parent(col_id)) return std::string{};
     auto const& parent_field_names = ff.types[parent_id(col_id)].fieldNames;
     // Child columns of lists don't have a name in ORC files, generate placeholder in that case
-    return field_index(col_id) < static_cast<int32_t>(parent_field_names.size())
+    return field_index(col_id) < static_cast<size_type>(parent_field_names.size())
              ? parent_field_names[field_index(col_id)]
              : std::to_string(col_id);
   });
@@ -489,14 +489,14 @@ void metadata::init_column_names()
 
 void metadata::init_parent_descriptors()
 {
-  auto const num_columns = static_cast<int32_t>(ff.types.size());
+  auto const num_columns = static_cast<size_type>(ff.types.size());
   parents.resize(num_columns);
 
-  for (int32_t col_id = 0; col_id < num_columns; ++col_id) {
+  for (size_type col_id = 0; col_id < num_columns; ++col_id) {
     auto const& subtypes    = ff.types[col_id].subtypes;
-    auto const num_children = static_cast<int32_t>(subtypes.size());
-    for (int32_t field_idx = 0; field_idx < num_children; ++field_idx) {
-      auto const child_id = static_cast<int32_t>(subtypes[field_idx]);
+    auto const num_children = static_cast<size_type>(subtypes.size());
+    for (size_type field_idx = 0; field_idx < num_children; ++field_idx) {
+      auto const child_id = static_cast<size_type>(subtypes[field_idx]);
       CUDF_EXPECTS(child_id > col_id && child_id < num_columns, "Invalid column id");
       CUDF_EXPECTS(not column_has_parent(child_id), "Same node referenced twice");
       parents[child_id] = {col_id, field_idx};
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 325c454ac5b..da525ab8592 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -635,7 +635,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
                                         cudf::detail::host_2dspan<gpu::RowGroup> row_groups,
                                         std::vector<column_buffer>& out_buffers,
                                         std::vector<orc_column_meta> const& list_col,
-                                        const int32_t level)
+                                        const size_type level)
 {
   const auto num_of_stripes         = chunks.size().first;
   const auto num_of_rowgroups       = row_groups.size().first;
@@ -678,7 +678,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
         for (size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups;
              rowgroup_id++, processed_row_groups++) {
           const auto child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows;
-          for (int32_t id = 0; id < p_col.num_children; id++) {
+          for (size_type id = 0; id < p_col.num_children; id++) {
             const auto child_col_idx                                  = index + id;
             rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows;
             rwgrp_meta[processed_row_groups][child_col_idx].num_rows  = child_rows;
@@ -689,7 +689,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
 
       // Aggregate start row, number of rows per chunk and total number of rows in a column
       const auto child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
-      for (int32_t id = 0; id < p_col.num_children; id++) {
+      for (size_type id = 0; id < p_col.num_children; id++) {
         const auto child_col_idx = index + id;
 
         num_child_rows[child_col_idx] += child_rows;
@@ -707,7 +707,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
     auto parent_valid_map  = out_buffers[parent_col_idx].null_mask();
     auto num_rows          = out_buffers[parent_col_idx].size;
 
-    for (int32_t id = 0; id < p_col.num_children; id++) {
+    for (size_type id = 0; id < p_col.num_children; id++) {
       const auto child_col_idx                     = index + id;
       _col_meta.parent_column_index[child_col_idx] = parent_col_idx;
       if (type == type_id::STRUCT) {
@@ -724,7 +724,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDes
 
 std::string get_map_child_col_name(size_t const idx) { return (idx == 0) ? "key" : "value"; }
 
-std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_id,
+std::unique_ptr<column> reader::impl::create_empty_column(const size_type orc_col_id,
                                                           column_name_info& schema_info,
                                                           rmm::cuda_stream_view stream)
 {
@@ -798,7 +798,7 @@ std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_
 }
 
 // Adds child column buffers to parent column
-column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
+column_buffer&& reader::impl::assemble_buffer(const size_type orc_col_id,
                                               std::vector<std::vector<column_buffer>>& col_buffers,
                                               const size_t level,
                                               rmm::cuda_stream_view stream)
@@ -952,7 +952,8 @@ table_with_metadata reader::impl::read(size_type skip_rows,
         // sign of the scale is changed since cuDF follows c++ libraries like CNL
         // which uses negative scaling, but liborc and other libraries
         // follow positive scaling.
-        auto const scale = -static_cast<int32_t>(_metadata.get_col_type(col.id).scale.value_or(0));
+        auto const scale =
+          -static_cast<size_type>(_metadata.get_col_type(col.id).scale.value_or(0));
         column_types.emplace_back(col_type, scale);
       } else {
         column_types.emplace_back(col_type);
@@ -1127,7 +1128,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                     ? sizeof(string_index_pair)
                                   : ((column_types[col_idx].id() == type_id::LIST) or
                                  (column_types[col_idx].id() == type_id::STRUCT))
-                                    ? sizeof(int32_t)
+                                    ? sizeof(size_type)
                                     : cudf::size_of(column_types[col_idx]);
             chunk.num_rowgroups = stripe_num_rowgroups;
             if (chunk.type_kind == orc::TIMESTAMP) {
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 2a1d4c3425a..c9de2211d48 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -51,7 +51,7 @@ struct stripe_source_mapping;
  * @brief Keeps track of orc mapping and child column details.
  */
 struct reader_column_meta {
-  std::vector<std::vector<int32_t>>
+  std::vector<std::vector<size_type>>
     orc_col_map;                         // Mapping between column id in orc to processing order.
   std::vector<uint32_t> num_child_rows;  // number of rows in child columns
 
@@ -173,7 +173,7 @@ class reader::impl {
    * @param col_buffers Column buffers for columns and children.
    * @param level Current nesting level.
    */
-  column_buffer&& assemble_buffer(const int32_t orc_col_id,
+  column_buffer&& assemble_buffer(const size_type orc_col_id,
                                   std::vector<std::vector<column_buffer>>& col_buffers,
                                   const size_t level,
                                   rmm::cuda_stream_view stream);
@@ -200,7 +200,7 @@ class reader::impl {
    *
    * @return An empty column equivalent to orc column type.
    */
-  std::unique_ptr<column> create_empty_column(const int32_t orc_col_id,
+  std::unique_ptr<column> create_empty_column(const size_type orc_col_id,
                                               column_name_info& schema_info,
                                               rmm::cuda_stream_view stream);