diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 39acc362450..3e875b71ca6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -139,6 +139,8 @@ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) include(cmake/thirdparty/CUDF_GetGTest.cmake) # Stringify libcudf and libcudacxx headers used in JIT operations include(cmake/Modules/StringifyJITHeaders.cmake) +# find cuFile +include(cmake/Modules/FindcuFile.cmake) ################################################################################################### # - library targets ------------------------------------------------------------------------------- @@ -244,6 +246,7 @@ add_library(cudf src/io/statistics/column_stats.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp + src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/type_conversion.cpp src/jit/cache.cpp @@ -422,7 +425,8 @@ target_include_directories(cudf "$" PRIVATE "$" INTERFACE "$" - "$") + "$" + "$") # Add Conda library paths if specified if(CONDA_LINK_DIRS) @@ -469,6 +473,11 @@ else() target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver) endif() +# Add cuFile interface if available +if(TARGET cuFile::cuFile_interface) + target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) +endif() + file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" [=[ SECTIONS @@ -570,6 +579,9 @@ install(DIRECTORY ${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf) +install(DIRECTORY ${Thrust_SOURCE_DIR}/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust) + include(CMakePackageConfigHelpers) configure_package_config_file(cmake/cudf-config.cmake.in "${CUDF_BINARY_DIR}/cmake/cudf-config.cmake" diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dfc340b1459..0bf92ff54bb 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -180,13 +180,16 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp) # - strings benchmark ------------------------------------------------------------------- ConfigureBench(STRINGS_BENCH string/case_benchmark.cpp + string/combine_benchmark.cpp string/contains_benchmark.cpp string/convert_durations_benchmark.cpp string/convert_floats_benchmark.cpp string/copy_benchmark.cpp string/extract_benchmark.cpp + string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp string/split_benchmark.cpp + string/substring_benchmark.cpp string/url_decode_benchmark.cpp) diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ad2ce095b6e..dd1bbcba0b4 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -88,4 +88,4 @@ class benchmark : public ::benchmark::Fixture { std::shared_ptr mr; }; -}; // namespace cudf +} // namespace cudf diff --git a/cpp/benchmarks/string/combine_benchmark.cpp b/cpp/benchmarks/string/combine_benchmark.cpp new file mode 100644 index 00000000000..2a5013a9ae7 --- /dev/null +++ b/cpp/benchmarks/string/combine_benchmark.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include + +class StringCombine : public cudf::benchmark { +}; + +static void BM_combine(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 2, row_count{n_rows}, table_profile); + cudf::strings_column_view input1(table->view().column(0)); + cudf::strings_column_view input2(table->view().column(1)); + cudf::string_scalar separator("+"); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::strings::concatenate(table->view(), separator); + } + + state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size())); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 4; + int const max_rowlen = 1 << 11; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringCombine, name) \ + (::benchmark::State & st) { BM_combine(st); } \ + BENCHMARK_REGISTER_F(StringCombine, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(concat) diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu new file mode 100644 index 00000000000..6c5dceffaa8 --- /dev/null +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? 
string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..05ed1bf5b33 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 +17,10 @@ #include +#include + +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp new file mode 100644 index 00000000000..d47c42e45be --- /dev/null +++ b/cpp/benchmarks/string/substring_benchmark.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +class StringSubstring : public cudf::benchmark { +}; + +enum substring_type { position, multi_position, delimiter, multi_delimiter }; + +static void BM_substring(benchmark::State& state, substring_type rt) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto starts_itr = thrust::constant_iterator(1); + auto stops_itr = thrust::constant_iterator(max_str_length / 2); + cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); + cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); + auto delim_itr = thrust::constant_iterator(" "); + cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (rt) { + case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; + case multi_position: cudf::strings::slice_strings(input, starts, stops); break; + case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break; + case multi_delimiter: + cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringSubstring, name) \ + (::benchmark::State & st) { BM_substring(st, substring_type::name); } \ + BENCHMARK_REGISTER_F(StringSubstring, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(position) +STRINGS_BENCHMARK_DEFINE(multi_position) +STRINGS_BENCHMARK_DEFINE(delimiter) +STRINGS_BENCHMARK_DEFINE(multi_delimiter) diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index e67b79d9d60..4f67e186f42 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -93,6 +93,12 @@ find_package_handle_standard_args(cuFile cuFile_VERSION ) +if (cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) + add_library(cuFile::cuFile_interface IMPORTED INTERFACE) + target_include_directories(cuFile::cuFile_interface INTERFACE "$") + target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") + target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND) +endif () if (cuFile_FOUND AND NOT TARGET cuFile::cuFile) add_library(cuFile::cuFile UNKNOWN IMPORTED) diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in index 1147e1160e7..0a478516f18 100644 --- a/cpp/cmake/cudf-config.cmake.in +++ b/cpp/cmake/cudf-config.cmake.in @@ -15,12 +15,22 @@ find_dependency(CUDAToolkit) 
find_dependency(Threads) find_dependency(ZLIB) +# Don't look for a Boost CMake configuration file because it adds the +# `-DBOOST_ALL_NO_LIB` and `-DBOOST_FILESYSTEM_DYN_LINK` compile defs +set(Boost_NO_BOOST_CMAKE ON) +find_dependency(Boost @CUDF_MIN_VERSION_Boost@ COMPONENTS filesystem) + find_dependency(Arrow @CUDF_VERSION_Arrow@) + +set(ArrowCUDA_DIR "${Arrow_DIR}") find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@) -find_dependency(Boost @CUDF_MIN_VERSION_Boost@) find_dependency(rmm @CUDF_MIN_VERSION_rmm@) -find_dependency(gtest @CUDF_MIN_VERSION_gtest@) +find_dependency(GTest @CUDF_MIN_VERSION_GTest@) + +set(Thrust_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust") +find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@) +thrust_create_target(cudf::Thrust FROM_OPTIONS) list(POP_FRONT CMAKE_MODULE_PATH) diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake index 2911e4fce29..e346dce1730 100644 --- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake @@ -48,6 +48,6 @@ function(find_and_configure_gtest VERSION) endif() endfunction() -set(CUDF_MIN_VERSION_gtest 1.10.0) +set(CUDF_MIN_VERSION_GTest 1.10.0) -find_and_configure_gtest(${CUDF_MIN_VERSION_gtest}) +find_and_configure_gtest(${CUDF_MIN_VERSION_GTest}) diff --git a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake index e045093104a..5a304f234d2 100644 --- a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake @@ -24,6 +24,7 @@ function(find_and_configure_thrust VERSION) thrust_create_target(cudf::Thrust FROM_OPTIONS) set(THRUST_LIBRARY "cudf::Thrust" PARENT_SCOPE) + set(Thrust_SOURCE_DIR "${Thrust_SOURCE_DIR}" PARENT_SCOPE) endfunction() set(CUDF_MIN_VERSION_Thrust 1.10.0) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 0ae403458a0..e0eb60af070 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,23 +107,35 @@ class data_sink { */ virtual bool supports_device_write() const { return false; } + /** + * @brief Estimates whether a direct device write would be more optimal for the given size. + * + * @param size Number of bytes to write + * @return whether the device write is expected to be more performant for the given size + */ + virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } + /** * @brief Append the buffer content to the sink from a gpu address * - * @param[in] data Pointer to the buffer to be written into the sink object - * @param[in] size Number of bytes to write + * For optimal performance, this function should only be called when `is_device_write_preferred` returns `true`. + * Data sink implementations that don't support direct device writes don't need to override + * this function. + * - * @return void + * @throws cudf::logic_error when the object does not support direct device writes, i.e. + * `supports_device_write` returns `false`.
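+ *
+ * @code{.cpp}
+ * // Illustrative caller-side pattern, not part of this patch: consult
+ * // is_device_write_preferred() to pick the write path. `sink`, `gpu_data`,
+ * // `size`, and `stream` are assumed to exist in the caller.
+ * if (sink->is_device_write_preferred(size)) {
+ *   sink->device_write(gpu_data, size, stream);
+ * } else {
+ *   std::vector<char> h_data(size);
+ *   CUDA_TRY(cudaMemcpyAsync(
+ *     h_data.data(), gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+ *   stream.synchronize();
+ *   sink->host_write(h_data.data(), size);
+ * }
+ * @endcode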
+ * + * @param gpu_data Pointer to the buffer to be written into the sink object + * @param size Number of bytes to write + * @param stream CUDA stream to use */ virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { - CUDF_FAIL("data_sink classes that support device_write must override this function."); + CUDF_FAIL("data_sink classes that support device_write must override it."); } /** * @brief Flush the data written into the sink - * - * @return void */ virtual void flush() = 0; diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 88f2bd187e2..8fcc045e6d2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -50,12 +52,15 @@ class datasource { /** * @brief Returns the address of the data in the buffer. */ - virtual const uint8_t* data() const = 0; + virtual uint8_t const* data() const = 0; /** * @brief Base class destructor */ virtual ~buffer() {} + + template + static std::unique_ptr create(Container&& data_owner); }; /** @@ -147,37 +152,57 @@ class datasource { */ virtual bool supports_device_read() const { return false; } + /** + * @brief Estimates whether a direct device read would be more optimal for the given size. + * + * @param size Number of bytes to read + * @return whether the device read is expected to be more performant for the given size + */ + virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + /** * @brief Returns a device buffer with a subset of data from the source. * + * For optimal performance, this function should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read + * @throws cudf::logic_error when the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use * * @return The data buffer in the device memory */ - virtual std::unique_ptr device_read(size_t offset, size_t size) + virtual std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** * @brief Reads a selected range into a preallocated device buffer * + * For optimal performance, this function should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read - * @param[in] dst Address of the existing device memory + * @throws cudf::logic_error when the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`.
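+ *
+ * @code{.cpp}
+ * // Illustrative dispatch, not part of this patch: prefer the direct device
+ * // read when the source reports it is faster for this size, otherwise read
+ * // on the host and copy. `source`, `offset`, `size`, `dst`, and `stream`
+ * // are assumed to exist in the caller.
+ * if (source->is_device_read_preferred(size)) {
+ *   source->device_read(offset, size, dst, stream);
+ * } else {
+ *   auto const buffer = source->host_read(offset, size);
+ *   CUDA_TRY(cudaMemcpyAsync(
+ *     dst, buffer->data(), buffer->size(), cudaMemcpyHostToDevice, stream.value()));
+ * }
+ * @endcode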
+ * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use * * @return The number of bytes read (can be smaller than size) */ - virtual size_t device_read(size_t offset, size_t size, uint8_t* dst) + virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** @@ -205,14 +230,57 @@ class datasource { size_t size() const override { return _size; } - const uint8_t* data() const override { return _data; } + uint8_t const* data() const override { return _data; } private: uint8_t* const _data; size_t const _size; }; + + /** + * @brief Derived implementation of `buffer` that owns the data. + * + * Can use different container types to hold the data buffer. + * + * @tparam Container Type of the container object that owns the data + */ + template + class owning_buffer : public buffer { + public: + /** + * @brief Moves the input container into the newly created object. + */ + owning_buffer(Container&& data_owner) + : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) + { + } + + /** + * @brief Moves the input container into the newly created object, and exposes a subspan of the + * buffer. + */ + owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) + : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + { + } + + size_t size() const override { return _size; } + + uint8_t const* data() const override { return static_cast(_data_ptr); } + + private: + Container _data; + void const* _data_ptr; + size_t _size; + }; }; +template +std::unique_ptr datasource::buffer::create(Container&& data_owner) +{ + return std::make_unique>(std::move(data_owner)); +} + /** * @brief Implementation class for reading from an Apache Arrow file. The file * could be a memory-mapped file or other implementation supported by Arrow. @@ -230,7 +298,7 @@ class arrow_io_source : public datasource { { } size_t size() const override { return arrow_buffer->size(); } - const uint8_t* data() const override { return arrow_buffer->data(); } + uint8_t const* data() const override { return arrow_buffer->data(); } }; public: diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index cd3b7bf27da..3e63e8fc770 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -376,6 +376,162 @@ table_with_metadata read_parquet( * @{ * @file */ +class table_input_metadata; + +class column_in_metadata { + friend table_input_metadata; + std::string _name = ""; + thrust::optional _nullable; + // TODO: This isn't implemented yet + bool _list_column_is_map = false; + bool _use_int96_timestamp = false; + // bool _output_as_binary = false; + thrust::optional _decimal_precision; + std::vector children; + + public: + /** + * @brief Set the name of this column + * + * @return this for chaining + */ + column_in_metadata& set_name(std::string const& name) + { + _name = name; + return *this; + } + + /** + * @brief Set the nullability of this column + * + * Only valid in case of chunked writes. In single writes, this option is ignored. 
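+ *
+ * @code{.cpp}
+ * // Sketch of chained use with the table_input_metadata constructor declared
+ * // below; the column index and the `table` variable are illustrative.
+ * table_input_metadata metadata(table);
+ * metadata.column_metadata[0].set_name("col0").set_nullability(false);
+ * @endcode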
* + * @return this for chaining + */ + column_in_metadata& set_nullability(bool nullable) + { + _nullable = nullable; + return *this; + } + + /** + * @brief Specify that this list column should be encoded as a map in the written parquet file + * + * The column must have the structure list<struct<key, value>>. This option is invalid otherwise. + * + * @return this for chaining + */ + column_in_metadata& set_list_column_as_map() + { + _list_column_is_map = true; + return *this; + } + + /** + * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 + * physical type. Only valid for the following column types: + * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns + * + * @param req True = use int96 physical type. False = use int64 physical type + * @return this for chaining + */ + column_in_metadata& set_int96_timestamps(bool req) + { + _use_int96_timestamp = req; + return *this; + } + + /** + * @brief Set the decimal precision of this column. Only valid if this column is a decimal + * (fixed-point) type + * + * @param precision The integer precision to set for this decimal column + * @return this for chaining + */ + column_in_metadata& set_decimal_precision(uint8_t precision) + { + _decimal_precision = precision; + return *this; + } + + /** + * @brief Get reference to a child of this column + * + * @param i Index of the child to get + * @return reference to the i-th child's metadata + */ + column_in_metadata& child(size_type i) { return children[i]; } + + /** + * @brief Get const reference to a child of this column + * + * @param i Index of the child to get + * @return const reference to the i-th child's metadata + */ + column_in_metadata const& child(size_type i) const { return children[i]; } + + /** + * @brief Get the name of this column + */ + std::string get_name() const { return _name; } + + /** + * @brief Get whether nullability has been explicitly set for this column. + */ + bool is_nullability_defined() const { return _nullable.has_value(); } + + /** + * @brief Gets the explicitly set nullability for this column. + * @throws If nullability is not explicitly defined for this column. + * Check using `is_nullability_defined()` first. + */ + bool nullable() const { return _nullable.value(); } + + /** + * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. + */ + bool is_map() const { return _list_column_is_map; } + + /** + * @brief Get whether to encode this timestamp column using deprecated int96 physical type + */ + bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + + /** + * @brief Get whether precision has been set for this decimal column + */ + bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + + /** + * @brief Get the decimal precision that was set for this column. + * @throws If decimal precision was not set for this column. + * Check using `is_decimal_precision_set()` first. + */ + uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + + /** + * @brief Get the number of children of this column + */ + size_type num_children() const { return children.size(); } +}; + +class table_input_metadata { + public: + table_input_metadata() = default; // Required by cython + + /** + * @brief Construct a new table_input_metadata from a table_view.
* + * The constructed table_input_metadata has the same structure as the passed table_view. + * + * @param table The table_view to construct metadata for + * @param user_data Optional additional metadata to encode, as key-value pairs + */ + table_input_metadata(table_view const& table, std::map user_data = {}); + + std::vector column_metadata; + std::map user_data; //!< Format-dependent metadata as key-value pairs +}; /** * @brief Class to build `parquet_writer_options`. @@ -395,14 +551,12 @@ class parquet_writer_options { // Sets of columns to output table_view _table; // Optional associated metadata - const table_metadata* _metadata = nullptr; - // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. + table_input_metadata const* _metadata = nullptr; + // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. + // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; - /// vector of precision values for decimal writing. Exactly one entry - /// per decimal column. Optional unless decimals are being written. - std::vector _decimal_precision; /** * @brief Constructor from sink and table. @@ -465,7 +619,7 @@ class parquet_writer_options { /** * @brief Returns associated metadata. */ - table_metadata const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns `true` if timestamps will be written as INT96 @@ -477,17 +631,12 @@ */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } - /** - * @brief Returns a constant reference to the decimal precision vector. - */ - std::vector const& get_decimal_precision() const { return _decimal_precision; } - /** * @brief Sets metadata. * * @param metadata Associated metadata. */ - void set_metadata(table_metadata const* metadata) { _metadata = metadata; } + void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } /** * @brief Sets the level of statistics. @@ -520,11 +669,6 @@ { _column_chunks_file_path.assign(file_path); } - - /** - * @brief Sets the decimal precision vector data. - */ - void set_decimal_precision(std::vector dp) { _decimal_precision = std::move(dp); } }; class parquet_writer_options_builder { @@ -555,7 +699,7 @@ class parquet_writer_options_builder { * @param metadata Associated metadata. * @return this for chaining. */ - parquet_writer_options_builder& metadata(table_metadata const* metadata) + parquet_writer_options_builder& metadata(table_input_metadata const* metadata) { options._metadata = metadata; return *this; @@ -672,11 +816,10 @@ class chunked_parquet_writer_options { // Specify the level of statistics in the output file statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Optional associated metadata. - const table_metadata_with_nullability* _nullable_metadata = nullptr; - // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. + table_input_metadata const* _metadata = nullptr; + // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. + // If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false; - // Optional decimal precision data - must be present if writing decimals - std::vector _decimal_precision = {}; /** * @brief Constructor from sink. @@ -711,17 +854,9 @@ class chunked_parquet_writer_options { statistics_freq get_stats_level() const { return _stats_level; } /** - * @brief Returns nullable metadata information. + * @brief Returns metadata information. */ - const table_metadata_with_nullability* get_nullable_metadata() const - { - return _nullable_metadata; - } - - /** - * @brief Returns decimal precision pointer. - */ - std::vector const& get_decimal_precision() const { return _decimal_precision; } + table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns `true` if timestamps will be written as INT96 @@ -729,22 +864,11 @@ class chunked_parquet_writer_options { bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** - * @brief Sets nullable metadata. + * @brief Sets metadata. * * @param metadata Associated metadata. */ - void set_nullable_metadata(const table_metadata_with_nullability* metadata) - { - _nullable_metadata = metadata; - } - - /** - * @brief Sets decimal precision data. - * - * @param v Vector of precision data flattened with exactly one entry per - * decimal column. - */ - void set_decimal_precision_data(std::vector const& v) { _decimal_precision = v; } + void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } /** * @brief Sets the level of statistics in parquet_writer_options. @@ -797,15 +921,14 @@ class chunked_parquet_writer_options_builder { chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink){}; /** - * @brief Sets nullable metadata to chunked_parquet_writer_options. + * @brief Sets metadata to chunked_parquet_writer_options. * * @param metadata Associated metadata. * @return this for chaining. */ - chunked_parquet_writer_options_builder& nullable_metadata( - const table_metadata_with_nullability* metadata) + chunked_parquet_writer_options_builder& metadata(table_input_metadata const* metadata) { - options._nullable_metadata = metadata; + options._metadata = metadata; return *this; } @@ -821,18 +944,6 @@ class chunked_parquet_writer_options_builder { return *this; } - /** - * @brief Sets decimal precision data. - * - * @param v Vector of precision data flattened with exactly one entry per - * decimal column. - */ - chunked_parquet_writer_options_builder& decimal_precision(std::vector const& v) - { - options._decimal_precision = v; - return *this; - } - /** * @brief Sets compression type to chunked_parquet_writer_options. * diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. 
- * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. 
+ * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. + */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? 
strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..92cf537454c 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. + * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? 
item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. 
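+ *
+ * @code{.cpp}
+ * // Minimal sketch with host iterators encoding the strings "ab" and "cde";
+ * // the `stream` and `mr` arguments are assumed to exist in the caller.
+ * std::string const chars = "abcde";
+ * std::vector<cudf::size_type> const offsets{0, 2, 5};
+ * auto column = make_strings_column(chars.begin(), chars.end(),
+ *                                   offsets.begin(), offsets.end(),
+ *                                   0, rmm::device_buffer{}, stream, mr);
+ * @endcode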
+ * + * @tparam CharIterator iterator over character bytes (int8) + * @tparam OffsetIterator iterator over offset values (size_type) + * + * @param chars_begin First character byte (inclusive) + * @param chars_end Last character byte (exclusive) + * @param offset_begin First offset value (inclusive) + * @param offset_end Last offset value (exclusive) + * @param null_count Number of null rows + * @param null_mask The validity bitmask in Arrow format + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(CharIterator chars_begin, CharIterator chars_end, diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e045476ea77..1e0d45d081d 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type) return cudf::type_dispatcher(type, is_unsigned_impl{}); } +/** + * @brief Indicates whether the `Iterator` value type is signed. + * + * @tparam Iterator The type to verify + * @return true if the iterator's value type is signed + */ +template +constexpr inline bool is_signed_iterator() +{ + return std::is_signed::value_type>::value; +} + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 332e8aff7fc..76580122fe6 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -351,7 +351,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) // during the conversion stage const std::string quotechar(1, opts.quotechar); const std::string dblquotechar(2, opts.quotechar); - std::unique_ptr col = cudf::make_strings_column(out_buffers[i]._strings, stream); + std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); out_columns.emplace_back( cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); } else { diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index dda2e0704f6..f7e153d71f4 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -416,36 +416,28 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars().data(); - if (out_sink_->supports_device_write()) { - // host algorithm call, but the underlying call - // is a device_write taking a device buffer; - // + if (out_sink_->is_device_write_preferred(total_num_bytes)) { + // Direct write from device memory out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); - out_sink_->device_write(newline.data(), - newline.size(), - stream); // needs newline at the end, to separate from next chunk } else { - // no device write possible; - // - // copy the bytes to host, too: - // + // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - // host algorithm call, where the underlying call - // is also host_write taking a host buffer; - // - char const* ptr_h_bytes = h_bytes.data(); - out_sink_->host_write(ptr_h_bytes, total_num_bytes); + out_sink_->host_write(h_bytes.data(), total_num_bytes); + } + + // Needs newline at the end, to separate from next chunk + if (out_sink_->is_device_write_preferred(newline.size())) { + out_sink_->device_write(newline.data(), newline.size(), stream); + } else { out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator() - .size()); // needs newline at the end, to separate from next chunk + options_.get_line_terminator().size()); } } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 1b7635f8d0d..bc6d36a0328 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -419,6 +419,22 @@ std::unique_ptr> merge_rowgroup_metadata( return detail_parquet::writer::merge_rowgroup_metadata(metadata_list); } +table_input_metadata::table_input_metadata(table_view const& table, + std::map user_data) + : user_data{std::move(user_data)} +{ + // Create a metadata hierarchy using `table` + std::function get_children = [&](column_view const& col) { + auto col_meta = column_in_metadata{}; + std::transform( + col.child_begin(), col.child_end(), std::back_inserter(col_meta.children), get_children); + return col_meta; + }; + + std::transform( + table.begin(), table.end(), std::back_inserter(this->column_metadata), get_children); +} + /** * @copydoc cudf::io::write_parquet */ diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 5a82c9891b8..1a1fa8d0602 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -158,7 +158,7 @@ std::unique_ptr create_json_keys_info_table(const parse_options_view &opt auto const info_table_mdv = mutable_table_device_view::create(info_table->mutable_view(), stream); // Reset the key counter - now used for indexing - key_counter.set_value(0, stream); + key_counter.set_value_zero(stream); // Fill the allocated columns cudf::io::json::gpu::collect_keys_info( options, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 73908cc1553..9f88c6584ce 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include @@ -223,7 +223,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( const OrcDecompressor *decompressor, 
std::vector &stream_info, size_t num_stripes, - rmm::device_vector &row_groups, + device_span row_groups, size_t row_index_stride, rmm::cuda_stream_view stream) { @@ -254,9 +254,9 @@ rmm::device_buffer reader::impl::decompress_stripe_data( CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found"); rmm::device_buffer decomp_data(total_decomp_size, stream); - rmm::device_vector inflate_in(num_compressed_blocks + - num_uncompressed_blocks); - rmm::device_vector inflate_out(num_compressed_blocks); + rmm::device_uvector inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector inflate_out(num_compressed_blocks, stream); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; @@ -265,9 +265,9 @@ rmm::device_buffer reader::impl::decompress_stripe_data( for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].decctl = inflate_in.data().get() + start_pos; - compinfo[i].decstatus = inflate_out.data().get() + start_pos; - compinfo[i].copyctl = inflate_in.data().get() + start_pos_uncomp; + compinfo[i].decctl = inflate_in.data() + start_pos; + compinfo[i].decstatus = inflate_out.data() + start_pos; + compinfo[i].copyctl = inflate_in.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; @@ -285,19 +285,18 @@ rmm::device_buffer reader::impl::decompress_stripe_data( if (num_compressed_blocks > 0) { switch (decompressor->GetKind()) { case orc::ZLIB: - CUDA_TRY(gpuinflate( - inflate_in.data().get(), inflate_out.data().get(), num_compressed_blocks, 0, stream)); + CUDA_TRY( + gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); break; case orc::SNAPPY: - CUDA_TRY(gpu_unsnap( - inflate_in.data().get(), inflate_out.data().get(), num_compressed_blocks, stream)); + CUDA_TRY(gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); break; default: CUDF_EXPECTS(false, "Unexpected decompression dispatch"); break; } } if (num_uncompressed_blocks > 0) { CUDA_TRY(gpu_copy_uncompressed_blocks( - inflate_in.data().get() + num_compressed_blocks, num_uncompressed_blocks, stream)); + inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream)); } gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); @@ -324,7 +323,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( if (not row_groups.empty()) { chunks.host_to_device(stream); - gpu::ParseRowGroupIndex(row_groups.data().get(), + gpu::ParseRowGroupIndex(row_groups.data(), compinfo.device_ptr(), chunks.device_ptr(), num_columns, @@ -341,8 +340,8 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks size_t num_dicts, size_t skip_rows, size_t num_rows, - timezone_table const &tz_table, - const rmm::device_vector &row_groups, + timezone_table_view tz_table, + device_span row_groups, size_t row_index_stride, std::vector &out_buffers, rmm::cuda_stream_view stream) @@ -360,24 +359,19 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks } // Allocate global dictionary for deserializing - rmm::device_vector global_dict(num_dicts); + rmm::device_uvector global_dict(num_dicts, stream); chunks.host_to_device(stream); - gpu::DecodeNullsAndStringDictionaries(chunks.device_ptr(), - global_dict.data().get(), - num_columns, - num_stripes, - num_rows, - skip_rows, - stream); + 
gpu::DecodeNullsAndStringDictionaries( + chunks.device_ptr(), global_dict.data(), num_columns, num_stripes, num_rows, skip_rows, stream); gpu::DecodeOrcColumnData(chunks.device_ptr(), - global_dict.data().get(), + global_dict.data(), num_columns, num_stripes, num_rows, skip_rows, - tz_table.view(), - row_groups.data().get(), + tz_table, + row_groups.data(), row_groups.size() / num_columns, row_index_stride, stream); @@ -538,9 +532,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); } for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - if (chunk.strm_len[k] > 0) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; } } stripe_start_row += stripe_info->numberOfRows; @@ -553,7 +545,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Process dataset chunk pages into output columns if (stripe_data.size() != 0) { // Setup row group descriptors if using indexes - rmm::device_vector row_groups(num_rowgroups * num_columns); + rmm::device_uvector row_groups(num_rowgroups * num_columns, stream); if (_metadata->ps.compression != orc::NONE) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, @@ -566,9 +558,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, stripe_data.clear(); stripe_data.push_back(std::move(decomp_data)); } else { - if (not row_groups.empty()) { + if (not row_groups.is_empty()) { chunks.host_to_device(stream); - gpu::ParseRowGroupIndex(row_groups.data().get(), + gpu::ParseRowGroupIndex(row_groups.data(), nullptr, chunks.device_ptr(), num_columns, @@ -582,7 +574,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Setup table for converting timestamp columns from local to UTC time auto const tz_table = _has_timestamp_column - ? build_timezone_transition_table(selected_stripes[0].second->writerTimezone) + ? build_timezone_transition_table(selected_stripes[0].second->writerTimezone, stream) : timezone_table{}; std::vector out_buffers; @@ -601,7 +593,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, num_dict_entries, skip_rows, num_rows, - tz_table, + tz_table.view(), row_groups, _metadata->get_row_index_stride(), out_buffers, diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 818e70b15e7..3a2913c5548 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -97,7 +97,7 @@ class reader::impl { const OrcDecompressor *decompressor, std::vector &stream_info, size_t num_stripes, - rmm::device_vector &row_groups, + device_span row_groups, size_t row_index_stride, rmm::cuda_stream_view stream); @@ -118,8 +118,8 @@ class reader::impl { size_t num_dicts, size_t skip_rows, size_t num_rows, - timezone_table const &tz_table, - const rmm::device_vector &row_groups, + timezone_table_view tz_table, + device_span row_groups, size_t row_index_stride, std::vector &out_buffers, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1ff752034ad..6206d98773f 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1572,7 +1572,7 @@ __global__ void __launch_bounds__(block_size) if (t == 0) { s->top.data.buffered_count = n; } } - numvals = min(numvals * 8, is_last_set ? s->top.data.max_vals : blockDim.x); + numvals = min(numvals * 8, is_last_set ? 
(s->top.data.max_vals + 7) & (~0x7) : blockDim.x); } else if (s->chunk.type_kind == LONG || s->chunk.type_kind == TIMESTAMP || s->chunk.type_kind == DECIMAL) { diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index bf8b96b89dc..81ffa954c1a 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -374,7 +374,8 @@ static int64_t get_transition_time(dst_transition_s const &trans, int year) return trans.time + day * day_seconds; } -timezone_table build_timezone_transition_table(std::string const &timezone_name) +timezone_table build_timezone_transition_table(std::string const &timezone_name, + rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC @@ -459,7 +460,22 @@ timezone_table build_timezone_transition_table(std::string const &timezone_name) year_timestamp += (365 + is_leap_year(year)) * day_seconds; } - return {get_gmt_offset(ttimes, offsets, orc_utc_offset), ttimes, offsets}; + rmm::device_uvector d_ttimes{ttimes.size(), stream}; + CUDA_TRY(cudaMemcpyAsync(d_ttimes.data(), + ttimes.data(), + ttimes.size() * sizeof(int64_t), + cudaMemcpyDefault, + stream.value())); + rmm::device_uvector d_offsets{offsets.size(), stream}; + CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), + offsets.data(), + offsets.size() * sizeof(int32_t), + cudaMemcpyDefault, + stream.value())); + auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); + stream.synchronize(); + + return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; } } // namespace io diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 3a87f28391c..b0231ca9e7d 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -20,8 +20,8 @@ #include #include -#include #include +#include #include #include @@ -108,8 +108,15 @@ inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes struct timezone_table { int32_t gmt_offset = 0; - rmm::device_vector ttimes; - rmm::device_vector offsets; + rmm::device_uvector ttimes; + rmm::device_uvector offsets; + timezone_table() : ttimes{0, rmm::cuda_stream_default}, offsets{0, rmm::cuda_stream_default} {} + timezone_table(int32_t gmt_offset, + rmm::device_uvector &&ttimes, + rmm::device_uvector &&offsets) + : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} + { + } timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } }; @@ -119,10 +126,12 @@ struct timezone_table { * Uses system's TZif files. Assumes little-endian platform when parsing these files. 
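The one-line kernel change in stripe_data.cu above swaps a clamp against `max_vals` for a clamp against `max_vals` rounded up to the next multiple of 8, evidently so that a trailing, partially filled byte of bit-packed values is still counted in whole-byte units. `(x + 7) & ~0x7` is the usual power-of-two round-up idiom; a small self-contained check:

```cpp
#include <cassert>
#include <cstdint>

// Round x up to the next multiple of a power-of-two alignment: adding
// (align - 1) carries into the next block, masking clears the remainder bits.
constexpr uint32_t round_up_pow2(uint32_t x, uint32_t align)
{
  return (x + align - 1) & ~(align - 1);
}

int main()
{
  static_assert(round_up_pow2(0, 8) == 0, "zero stays zero");
  static_assert(round_up_pow2(1, 8) == 8, "rounds up to one whole byte");
  static_assert(round_up_pow2(8, 8) == 8, "exact multiples are unchanged");
  static_assert(round_up_pow2(9, 8) == 16, "carries into the next byte");
  // For align == 8 this is exactly the patch's `(max_vals + 7) & (~0x7)`.
  assert(round_up_pow2(13, 8) == ((13 + 7) & ~0x7u));
}
```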
* * @param timezone_name standard timezone name (for example, "US/Pacific") + * @param stream CUDA stream used for any device memory operations and kernel launches * * @return The transition table for the given timezone */ -timezone_table build_timezone_transition_table(std::string const &timezone_name); +timezone_table build_timezone_transition_table(std::string const &timezone_name, + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index d984cc1e44f..46d471d5cf7 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -36,7 +36,7 @@ struct dict_state_s { uint32_t num_dict_entries; //!< Dictionary entries in current fragment to add uint32_t frag_dict_size; EncColumnChunk ck; - EncColumnDesc col; + parquet_column_device_view col; PageFragment frag; volatile uint32_t scratch_red[32]; uint16_t frag_dict[max_page_fragment_size]; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 8b99248e2fd..3b29394686f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -44,7 +44,7 @@ constexpr int init_hash_bits = 12; constexpr uint32_t rle_buffer_size = (1 << 9); struct frag_init_state_s { - EncColumnDesc col; + parquet_column_device_view col; PageFragment frag; uint32_t total_dupes; size_type start_value_idx; @@ -70,7 +70,7 @@ struct page_enc_state_s { volatile uint32_t scratch_red[32]; EncPage page; EncColumnChunk ck; - EncColumnDesc col; + parquet_column_device_view col; gpu_inflate_input_s comp_in; gpu_inflate_status_s comp_out; uint16_t vals[rle_buffer_size]; @@ -111,12 +111,13 @@ inline __device__ uint32_t uint64_init_hash(uint64_t v) */ // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) gpuInitPageFragments(PageFragment *frag, - const EncColumnDesc *col_desc, - int32_t num_fragments, - int32_t num_columns, - uint32_t fragment_size, - uint32_t max_num_rows) +__global__ void __launch_bounds__(block_size) + gpuInitPageFragments(PageFragment *frag, + const parquet_column_device_view *col_desc, + int32_t num_fragments, + int32_t num_columns, + uint32_t fragment_size, + uint32_t max_num_rows) { __shared__ __align__(16) frag_init_state_s state_g; @@ -158,12 +159,18 @@ __global__ void __launch_bounds__(block_size) gpuInitPageFragments(PageFragment } else { auto col = *(s->col.parent_column); auto current_start_value_idx = start_row; - while (col.type().id() == type_id::LIST) { - auto offset_col = col.child(lists_column_view::offsets_column_index); - current_start_value_idx = - offset_col.element(current_start_value_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + current_start_value_idx += col.offset(); + end_value_idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + current_start_value_idx = + offset_col.element(current_start_value_idx + col.offset()); + end_value_idx = offset_col.element(end_value_idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } } s->start_value_idx = current_start_value_idx; } @@ -372,12 +379,13 @@ __global__ void __launch_bounds__(block_size) gpuInitPageFragments(PageFragment } // blockDim {128,1,1} -__global__ void __launch_bounds__(128) 
gpuInitFragmentStats(statistics_group *groups, - const PageFragment *fragments, - const EncColumnDesc *col_desc, - int32_t num_fragments, - int32_t num_columns, - uint32_t fragment_size) +__global__ void __launch_bounds__(128) + gpuInitFragmentStats(statistics_group *groups, + const PageFragment *fragments, + const parquet_column_device_view *col_desc, + int32_t num_fragments, + int32_t num_columns, + uint32_t fragment_size) { __shared__ __align__(8) statistics_group group_g[4]; @@ -397,13 +405,13 @@ __global__ void __launch_bounds__(128) gpuInitFragmentStats(statistics_group *gr // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(EncColumnChunk *chunks, EncPage *pages, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, statistics_merge_group *page_grstats, statistics_merge_group *chunk_grstats, int32_t num_rowgroups, int32_t num_columns) { - __shared__ __align__(8) EncColumnDesc col_g; + __shared__ __align__(8) parquet_column_device_view col_g; __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) PageFragment frag_g; __shared__ __align__(8) EncPage page_g; @@ -541,8 +549,8 @@ __global__ void __launch_bounds__(128) gpuInitPages(EncColumnChunk *chunks, page_g.num_rows = rows_in_page; page_g.num_leaf_values = leaf_values_in_page; page_g.num_values = values_in_page; - uint32_t def_level_bits = col_g.level_bits & 0xf; - uint32_t rep_level_bits = col_g.level_bits >> 4; + uint32_t def_level_bits = col_g.num_def_level_bits(); + uint32_t rep_level_bits = col_g.num_rep_level_bits(); // Run length = 4, max(rle/bitpack header) = 5, add one byte per 256 values for overhead // TODO (dm): Improve readability of these calculations. uint32_t def_level_size = @@ -936,10 +944,12 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, __syncthreads(); // Encode Repetition and Definition levels - if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.level_bits != 0 && - s->col.parent_column == nullptr) { + if (s->page.page_type != PageType::DICTIONARY_PAGE && + (s->col.num_def_level_bits()) != 0 && // This means max definition level is not 0 (nullable) + (s->col.num_rep_level_bits()) == 0 // This means there are no repetition levels (non-list) + ) { // Calculate definition levels from validity - uint32_t def_lvl_bits = s->col.level_bits & 0xf; + uint32_t def_lvl_bits = s->col.num_def_level_bits(); if (def_lvl_bits != 0) { if (!t) { s->rle_run = 0; @@ -954,9 +964,32 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, uint32_t row = s->page.start_row + rle_numvals + t; // Definition level encodes validity. Checks the valid map and if it is valid, then sets the // def_lvl accordingly and sets it in s->vals which is then given to RleEncode to encode - uint32_t def_lvl = (rle_numvals + t < s->page.num_rows && row < s->col.num_rows) - ? 
s->col.leaf_column->is_valid(row) - : 0; + uint32_t def_lvl = [&]() { + bool within_bounds = rle_numvals + t < s->page.num_rows && row < s->col.num_rows; + if (not within_bounds) { return 0u; } + uint32_t def = 0; + size_type l = 0; + bool is_col_struct = false; + auto col = *s->col.parent_column; + do { + // If col not nullable then it does not contribute to def levels + if (s->col.nullability[l]) { + if (col.is_valid(row)) { + ++def; + } else { + // We have found the shallowest level at which this row is null + break; + } + } + is_col_struct = (col.type().id() == type_id::STRUCT); + if (is_col_struct) { + row += col.offset(); + col = col.child(0); + ++l; + } + } while (is_col_struct); + return def; + }(); s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; __syncthreads(); rle_numvals += nrows; @@ -974,7 +1007,9 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, if (t == 0) { s->cur = rle_out; } } } - } else if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.parent_column != nullptr) { + } else if (s->page.page_type != PageType::DICTIONARY_PAGE && + s->col.num_rep_level_bits() != 0 // This means there ARE repetition levels (has list) + ) { auto encode_levels = [&](uint8_t const *lvl_val_data, uint32_t nbits) { // For list types, the repetition and definition levels are pre-calculated. We just need to // encode and write them now. @@ -1010,9 +1045,9 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, if (t == 0) { s->cur = rle_out; } } }; - encode_levels(s->col.rep_values, s->col.level_bits >> 4); + encode_levels(s->col.rep_values, s->col.num_rep_level_bits()); __syncthreads(); - encode_levels(s->col.def_values, s->col.level_bits & 0xf); + encode_levels(s->col.def_values, s->col.num_def_level_bits()); } // Encode data values __syncthreads(); @@ -1041,10 +1076,15 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, if (s->col.parent_column != nullptr) { auto col = *(s->col.parent_column); auto current_page_start_val = s->page_start_val; - while (col.type().id() == type_id::LIST) { - current_page_start_val = col.child(lists_column_view::offsets_column_index) - .element(current_page_start_val + col.offset()); - col = col.child(lists_column_view::child_column_index); + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + current_page_start_val += col.offset(); + col = col.child(0); + } else { + current_page_start_val = col.child(lists_column_view::offsets_column_index) + .element(current_page_start_val + col.offset()); + col = col.child(lists_column_view::child_column_index); + } } s->page_start_val = current_page_start_val; } @@ -1156,11 +1196,13 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, auto const ret = convert_nanoseconds([&]() { using namespace cuda::std::chrono; - switch (s->col.converted_type) { - case TIMESTAMP_MILLIS: { + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { return sys_time{milliseconds{v}}; } break; - case TIMESTAMP_MICROS: { + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { return sys_time{microseconds{v}}; } break; } @@ -1383,7 +1425,7 @@ class header_encoder { __device__ uint8_t *EncodeStatistics(uint8_t *start, const statistics_chunk *s, - const EncColumnDesc *col, + const parquet_column_device_view *col, float *fp_scratch) { uint8_t *end, dtype, dtype_len; @@ -1441,7 
+1483,7 @@ __global__ void __launch_bounds__(128) gpuEncodePageHeaders(EncPage *pages, const statistics_chunk *chunk_stats, uint32_t start_page) { - __shared__ __align__(8) EncColumnDesc col_g; + __shared__ __align__(8) parquet_column_device_view col_g; __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; __shared__ __align__(8) float fp_scratch[2]; @@ -1567,6 +1609,42 @@ __global__ void __launch_bounds__(1024) gpuGatherPages(EncColumnChunk *chunks, c } } +/** + * @brief Functor to get definition level value for a nested struct column until the leaf level or + * the first list level. + * + */ +struct def_level_fn { + column_device_view const *parent_col; + uint8_t const *d_nullability; + uint8_t sub_level_start; + uint8_t curr_def_level; + + __device__ uint32_t operator()(size_type i) + { + uint32_t def = curr_def_level; + uint8_t l = sub_level_start; + bool is_col_struct = false; + auto col = *parent_col; + do { + // If col not nullable then it does not contribute to def levels + if (d_nullability[l]) { + if (not col.nullable() or bit_is_set(col.null_mask(), i)) { + ++def; + } else { // We have found the shallowest level at which this row is null + break; + } + } + is_col_struct = (col.type().id() == type_id::STRUCT); + if (is_col_struct) { + col = col.child(0); + ++l; + } + } while (is_col_struct); + return def; + } +}; + /** * @brief Get the dremel offsets and repetition and definition levels for a LIST column * * @@ -1633,16 +1711,53 @@ __global__ void __launch_bounds__(1024) gpuGatherPages(EncColumnChunk *chunks, c * ``` * * Similarly we merge up all the way till level 0 offsets + * + * STRUCT COLUMNS : + * In case of struct columns, we don't have to merge struct levels with their children because a + * struct is the same size as its children. e.g. for a column `struct<int, float>`, if the row `i` + * is null, then the children columns `int` and `float` are also null at `i`. They also have the + * null entry represented in their respective null masks. So for any case of strictly struct based + * nesting, we can get the definition levels merely by iterating over the nesting for the same row. + * + * In case struct and lists are intermixed, the definition levels of all the contiguous struct + * levels can be constructed using the aforementioned iterative method. Only when we reach a list + * level, we need to do a merge with the subsequent level. + * + * So, for a column like `struct<list<int>>`, we are going to merge between the levels `struct<list` + * and `int`. + * For a column like `list<struct<int>>`, we are going to merge between `list` and `struct<int>`. + * + * In general, one nesting level is the list level and any struct level that precedes it. + * + * A few more examples to visualize the partitioning of column hierarchy into nesting levels: + * (L is list, S is struct, i is integer (leaf data level), angle brackets omitted) + * ``` + * 1. LSi = L Si + * - | -- + * + * 2. LLSi = L L Si + * - | - | -- + * + * 3. SSLi = SSL i + * --- | - + * + * 4. 
LLSLSSi = L L SL SSi + * - | - | -- | --- +``` */ dremel_data get_dremel_data(column_view h_col, - std::vector const &level_nullability, + // TODO(cp): use device_span once it is converted to a single hd_vec + rmm::device_uvector const &d_nullability, + std::vector const &nullability, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(h_col.type().id() == type_id::LIST, - "Can only get rep/def levels for LIST type column"); + auto get_list_level = [](column_view col) { + while (col.type().id() == type_id::STRUCT) { col = col.child(0); } + return col; + }; auto get_empties = [&](column_view col, size_type start, size_type end) { - auto lcv = lists_column_view(col); + auto lcv = lists_column_view(get_list_level(col)); rmm::device_uvector empties_idx(lcv.size(), stream); rmm::device_uvector empties(lcv.size(), stream); auto d_off = lcv.offsets().data(); @@ -1663,38 +1778,60 @@ dremel_data get_dremel_data(column_view h_col, return std::make_tuple(std::move(empties), std::move(empties_idx), empties_size); }; - // Reverse the nesting in order to merge the deepest level with the leaf first and merge bottom - // up - auto curr_col = h_col; - size_t max_vals_size = 0; + auto curr_col = h_col; std::vector nesting_levels; std::vector def_at_level; - size_type level = 0; - auto add_def_at_level = [&](size_type level) { - auto is_level_nullable = - curr_col.nullable() or (not level_nullability.empty() and level_nullability[level]); - def_at_level.push_back(is_level_nullable ? 2 : 1); + std::vector start_at_sub_level; + uint8_t curr_nesting_level_idx = 0; + + auto add_def_at_level = [&](column_view col) { + // Add up all def level contributions in this column all the way till the first list column + // appears in the hierarchy or until we get to leaf + uint32_t def = 0; + start_at_sub_level.push_back(curr_nesting_level_idx); + while (col.type().id() == type_id::STRUCT) { + def += (nullability[curr_nesting_level_idx]) ? 1 : 0; + col = col.child(0); + ++curr_nesting_level_idx; + } + // At the end of all those structs is either a list column or the leaf. Leaf column contributes + // at least one def level. It doesn't matter what the leaf contributes because it'll be at the + // end of the exclusive scan. + def += (nullability[curr_nesting_level_idx]) ? 2 : 1; + def_at_level.push_back(def); + ++curr_nesting_level_idx; }; - while (curr_col.type().id() == type_id::LIST) { + while (cudf::is_nested(curr_col.type())) { nesting_levels.push_back(curr_col); - add_def_at_level(level); - auto lcv = lists_column_view(curr_col); - max_vals_size += lcv.offsets().size(); - curr_col = lcv.child(); - level++; + add_def_at_level(curr_col); + while (curr_col.type().id() == type_id::STRUCT) { + // Go down the hierarchy until we get to the LIST or the leaf level + curr_col = curr_col.child(0); + } + if (curr_col.type().id() == type_id::LIST) { + curr_col = curr_col.child(lists_column_view::child_column_index); + if (not is_nested(curr_col.type())) { + // Special case: when the leaf data column is the immediate child of the list col then we + // want it to be included right away. Otherwise the struct containing it will be included in + // the next iteration of this loop. 
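Both `def_level_fn` and the `add_def_at_level` lambda above apply the same rule to struct nesting: walk down the levels, add one for every nullable level at which the row is valid, and stop at the shallowest null. A host-side sketch of that rule, with plain vectors standing in for null masks (the `level`/`def_level` names are illustrative, not cudf's):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// One level of a hypothetical struct chain: whether the schema marks it
// nullable, plus per-row validity at that level.
struct level {
  bool nullable;
  std::vector<bool> valid;
};

// Mirror of the patch's rule: each nullable level that is valid at this row
// contributes 1; the walk stops at the shallowest null.
uint32_t def_level(std::vector<level> const& levels, size_t row)
{
  uint32_t def = 0;
  for (auto const& l : levels) {
    if (l.nullable) {
      if (l.valid[row]) {
        ++def;
      } else {
        break;  // shallowest level at which this row is null
      }
    }
  }
  return def;
}

int main()
{
  // struct<int>, both levels nullable, three rows:
  // row 0: struct valid, int valid -> def 2
  // row 1: struct valid, int null  -> def 1
  // row 2: struct null             -> def 0
  std::vector<level> levels{{true, {true, true, false}}, {true, {true, false, true}}};
  assert(def_level(levels, 0) == 2);
  assert(def_level(levels, 1) == 1);
  assert(def_level(levels, 2) == 0);
}
```

Row 2 is why the walk must break rather than continue: a null struct makes everything deeper undefined, so deeper levels must not contribute.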
+ nesting_levels.push_back(curr_col); + add_def_at_level(curr_col); + break; + } + } } - // One more entry for leaf col - add_def_at_level(level); - max_vals_size += curr_col.size(); - // Add one more value at the end so that we can have the max def level - def_at_level.push_back(0); + std::unique_ptr device_view_owners; + column_device_view *d_nesting_levels; + std::tie(device_view_owners, d_nesting_levels) = + contiguous_copy_column_device_views(nesting_levels, stream); + thrust::exclusive_scan( thrust::host, def_at_level.begin(), def_at_level.end(), def_at_level.begin()); // Sliced list column views only have offsets applied to top level. Get offsets for each level. - rmm::device_uvector d_column_offsets(nesting_levels.size() + 1, stream); - rmm::device_uvector d_column_ends(nesting_levels.size() + 1, stream); + rmm::device_uvector d_column_offsets(nesting_levels.size(), stream); + rmm::device_uvector d_column_ends(nesting_levels.size(), stream); auto d_col = column_device_view::create(h_col, stream); cudf::detail::device_single_thread( @@ -1709,24 +1846,29 @@ dremel_data get_dremel_data(column_view h_col, end_idx_at_level[level] = end; ++level; // Apply offset recursively until we get to leaf data - while (curr_col.type().id() == type_id::LIST) { - off = curr_col.child(lists_column_view::offsets_column_index).element(off); - end = curr_col.child(lists_column_view::offsets_column_index).element(end); - offset_at_level[level] = off; - end_idx_at_level[level] = end; - ++level; - curr_col = curr_col.child(lists_column_view::child_column_index); + // Skip doing the following for any structs we encounter in between. + while (curr_col.type().id() == type_id::LIST or curr_col.type().id() == type_id::STRUCT) { + if (curr_col.type().id() == type_id::LIST) { + off = curr_col.child(lists_column_view::offsets_column_index).element(off); + end = curr_col.child(lists_column_view::offsets_column_index).element(end); + offset_at_level[level] = off; + end_idx_at_level[level] = end; + ++level; + curr_col = curr_col.child(lists_column_view::child_column_index); + } else { + curr_col = curr_col.child(0); + } } }, stream); - thrust::host_vector column_offsets(nesting_levels.size() + 1); + thrust::host_vector column_offsets(d_column_offsets.size()); CUDA_TRY(cudaMemcpyAsync(column_offsets.data(), d_column_offsets.data(), d_column_offsets.size() * sizeof(size_type), cudaMemcpyDeviceToHost, stream.value())); - thrust::host_vector column_ends(nesting_levels.size() + 1); + thrust::host_vector column_ends(d_column_ends.size()); CUDA_TRY(cudaMemcpyAsync(column_ends.data(), d_column_ends.data(), d_column_ends.size() * sizeof(size_type), @@ -1735,6 +1877,11 @@ dremel_data get_dremel_data(column_view h_col, stream.synchronize(); + size_t max_vals_size = 0; + for (size_t l = 0; l < column_offsets.size(); ++l) { + max_vals_size += column_ends[l] - column_offsets[l]; + } + rmm::device_uvector rep_level(max_vals_size, stream); rmm::device_uvector def_level(max_vals_size, stream); @@ -1745,9 +1892,13 @@ dremel_data get_dremel_data(column_view h_col, { // At this point, curr_col contains the leaf column. Max nesting level is // nesting_levels.size(). - size_t level = nesting_levels.size() - 1; + + // We are going to start by merging the last column in nesting_levels (the leaf, which is at the + // index `nesting_levels.size() - 1`) with the second-to-last (which is at + // `nesting_levels.size() - 2`). 
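The `thrust::exclusive_scan(thrust::host, ...)` above turns each nesting level's def-level contribution into the def level at which that level starts, which is the `curr_def_level` that `def_level_fn` is later seeded with. The host-only equivalent, with an illustrative contribution vector rather than one derived from a real column:

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main()
{
  // Hypothetical contributions: level 0 adds 1, levels 1 and 2 add 2 each
  // (the real values depend on nullability, as computed in add_def_at_level).
  std::vector<uint32_t> def_at_level{1, 2, 2};

  // In-place exclusive scan: element i becomes the sum of all contributions
  // before it, i.e. the starting def level of nesting level i.
  std::exclusive_scan(def_at_level.begin(), def_at_level.end(), def_at_level.begin(), 0u);

  assert((def_at_level == std::vector<uint32_t>{0u, 1u, 3u}));
}
```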
+ size_t level = nesting_levels.size() - 2; curr_col = nesting_levels[level]; - auto lcv = lists_column_view(curr_col); + auto lcv = lists_column_view(get_list_level(curr_col)); auto offset_size_at_level = column_ends[level] - column_offsets[level] + 1; // Get empties at this level @@ -1760,25 +1911,21 @@ dremel_data get_dremel_data(column_view h_col, // Merge empty at deepest parent level with the rep, def level vals at leaf level auto input_parent_rep_it = thrust::make_constant_iterator(level); - auto input_parent_def_it = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [idx = empties_idx.data(), - mask = lcv.null_mask(), - level_nullable = level_nullability.empty() ? false : level_nullability[level], - curr_def_level = def_at_level[level]] __device__(auto i) { - return curr_def_level + - ((mask && bit_is_set(mask, idx[i]) or (!mask && level_nullable)) ? 1 : 0); - }); - - auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size()); - auto input_child_def_it = thrust::make_transform_iterator( - thrust::make_counting_iterator(column_offsets[level + 1]), - [mask = lcv.child().null_mask(), - level_nullable = level_nullability.empty() ? false : level_nullability[level + 1], - curr_def_level = def_at_level[level + 1]] __device__(auto i) { - return curr_def_level + - ((mask && bit_is_set(mask, i) or (!mask && level_nullable)) ? 1 : 0); - }); + auto input_parent_def_it = + thrust::make_transform_iterator(empties_idx.begin(), + def_level_fn{d_nesting_levels + level, + d_nullability.data(), + start_at_sub_level[level], + def_at_level[level]}); + + // `nesting_levels.size()` == no of list levels + leaf. Max repetition level = no of list levels + auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size() - 1); + auto input_child_def_it = + thrust::make_transform_iterator(thrust::make_counting_iterator(column_offsets[level + 1]), + def_level_fn{d_nesting_levels + level + 1, + d_nullability.data(), + start_at_sub_level[level + 1], + def_at_level[level + 1]}); // Zip the input and output value iterators so that merge operation is done only once auto input_parent_zip_it = @@ -1831,9 +1978,11 @@ dremel_data get_dremel_data(column_view h_col, rep_level.begin()); } - for (int level = nesting_levels.size() - 2; level >= 0; level--) { + // Having already merged the last two levels, we are now going to merge the result with the + // third-last level which is at index `nesting_levels.size() - 3`. + for (int level = nesting_levels.size() - 3; level >= 0; level--) { curr_col = nesting_levels[level]; - auto lcv = lists_column_view(curr_col); + auto lcv = lists_column_view(get_list_level(curr_col)); auto offset_size_at_level = column_ends[level] - column_offsets[level] + 1; // Get empties at this level @@ -1857,15 +2006,12 @@ dremel_data get_dremel_data(column_view h_col, auto transformed_empties = thrust::make_transform_iterator(empties.begin(), offset_transformer); auto input_parent_rep_it = thrust::make_constant_iterator(level); - auto input_parent_def_it = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [idx = empties_idx.data(), - mask = lcv.null_mask(), - level_nullable = level_nullability.empty() ? false : level_nullability[level], - curr_def_level = def_at_level[level]] __device__(auto i) { - return curr_def_level + - ((mask && bit_is_set(mask, idx[i]) or (!mask && level_nullable)) ? 
1 : 0); - }); + auto input_parent_def_it = + thrust::make_transform_iterator(empties_idx.begin(), + def_level_fn{d_nesting_levels + level, + d_nullability.data(), + start_at_sub_level[level], + def_at_level[level]}); // Zip the input and output value iterators so that merge operation is done only once auto input_parent_zip_it = @@ -1927,16 +2073,10 @@ dremel_data get_dremel_data(column_view h_col, stream.synchronize(); - size_type leaf_col_offset = column_offsets[column_offsets.size() - 1]; - size_type leaf_data_size = column_ends[column_ends.size() - 1] - leaf_col_offset; - uint8_t max_def_level = def_at_level.back() - 1; + size_type leaf_data_size = column_ends.back() - column_offsets.back(); - return dremel_data{std::move(new_offsets), - std::move(rep_level), - std::move(def_level), - leaf_col_offset, - leaf_data_size, - max_def_level}; + return dremel_data{ + std::move(new_offsets), std::move(rep_level), std::move(def_level), leaf_data_size}; } /** @@ -1949,7 +2089,7 @@ dremel_data get_dremel_data(column_view h_col, * @param[in] stream CUDA stream to use, default 0 */ void InitPageFragments(PageFragment *frag, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, @@ -1974,7 +2114,7 @@ void InitPageFragments(PageFragment *frag, */ void InitFragmentStatistics(statistics_group *groups, const PageFragment *fragments, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, @@ -1999,7 +2139,7 @@ void InitFragmentStatistics(statistics_group *groups, */ void InitEncoderPages(EncColumnChunk *chunks, EncPage *pages, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, int32_t num_rowgroups, int32_t num_columns, statistics_merge_group *page_grstats, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 43d144ec980..ad3c214069f 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,7 +184,7 @@ struct ColumnChunkDesc { { } - uint8_t *compressed_data; // pointer to compressed column chunk data + uint8_t const *compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk @@ -215,7 +215,7 @@ struct ColumnChunkDesc { /** * @brief Struct describing an encoder column */ -struct EncColumnDesc : stats_column_desc { +struct parquet_column_device_view : stats_column_desc { uint32_t *dict_index; //!< Dictionary index [row] uint32_t *dict_data; //!< Dictionary data (unique row indices) uint8_t physical_type; //!< physical data type @@ -223,12 +223,17 @@ struct EncColumnDesc : stats_column_desc { // TODO (dm): Evaluate if this is sufficient. 
At 4 bits, this allows a maximum 16 level nesting uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels + constexpr uint8_t num_def_level_bits() { return level_bits & 0xf; } + constexpr uint8_t num_rep_level_bits() { return level_bits >> 4; } size_type const *const *nesting_offsets; //!< If column is a nested type, contains offset array of each nesting level size_type const *level_offsets; //!< Offset array for per-row pre-calculated rep/def level values uint8_t const *rep_values; //!< Pre-calculated repetition level values uint8_t const *def_values; //!< Pre-calculated definition level values + uint8_t *nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + //!< nullability of parent_column. May be different from col.nullable() in + //!< case of chunked writing. }; constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment @@ -299,15 +304,15 @@ inline size_t __device__ __host__ GetMaxCompressedBfrSize(size_t uncomp_size, * @brief Struct describing an encoder column chunk */ struct EncColumnChunk { - const EncColumnDesc *col_desc; //!< Column description - PageFragment *fragments; //!< First fragment in chunk - uint8_t *uncompressed_bfr; //!< Uncompressed page data - uint8_t *compressed_bfr; //!< Compressed page data - const statistics_chunk *stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk + const parquet_column_device_view *col_desc; //!< Column description + PageFragment *fragments; //!< First fragment in chunk + uint8_t *uncompressed_bfr; //!< Uncompressed page data + uint8_t *compressed_bfr; //!< Compressed page data + const statistics_chunk *stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk uint32_t num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk uint32_t first_page; //!< First page of chunk @@ -398,9 +403,7 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type leaf_col_offset; size_type leaf_data_size; - uint8_t max_def_level; }; /** @@ -423,8 +426,9 @@ struct dremel_data { * @return A struct containing dremel data */ dremel_data get_dremel_data(column_view h_col, - std::vector const &level_nullability = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + rmm::device_uvector const &d_nullability, + std::vector const &nullability, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for initializing encoder page fragments @@ -438,7 +442,7 @@ dremel_data get_dremel_data(column_view h_col, * @param[in] stream CUDA stream to use, default 0 */ void InitPageFragments(PageFragment *frag, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, @@ -458,7 +462,7 @@ void InitPageFragments(PageFragment *frag, */ void InitFragmentStatistics(statistics_group *groups, const PageFragment *fragments, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, @@ -478,7 +482,7 @@ void InitFragmentStatistics(statistics_group *groups, */ void InitEncoderPages(EncColumnChunk *chunks, EncPage *pages, - const EncColumnDesc *col_desc, + const parquet_column_device_view *col_desc, int32_t num_rowgroups, int32_t num_columns, statistics_merge_group *page_grstats = nullptr, diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a7a02cc6108..16cf0877c23 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -822,7 +822,7 @@ void generate_depth_remappings(std::map, std::ve * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector &page_data, + std::vector> &page_data, hostdevice_vector &chunks, // TODO const? 
size_t begin_chunk, size_t end_chunk, @@ -850,9 +850,15 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto buffer = _sources[chunk_source_map[chunk]]->host_read(io_offset, io_size); - page_data[chunk] = rmm::device_buffer(buffer->data(), buffer->size(), stream); - uint8_t *d_compdata = static_cast(page_data[chunk].data()); + auto &source = _sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + page_data[chunk] = source->device_read(io_offset, io_size, stream); + } else { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -1414,7 +1420,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - std::vector page_data(num_chunks); + std::vector> page_data(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1516,10 +1522,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, decomp_page_data = decompress_page_data(chunks, pages, stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED && page_data[c].size() != 0) { - page_data[c].resize(0); - page_data[c].shrink_to_fit(); - } + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 137fca03bfd..ca200936134 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,7 +91,7 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. * */ - void read_column_chunks(std::vector &page_data, + void read_column_chunks(std::vector> &page_data, hostdevice_vector &chunks, size_t begin_chunk, size_t end_chunk, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a645ca0fd91..31baf419f45 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -70,364 +70,453 @@ parquet::Compression to_parquet_compression(compression_type compression) } } -std::vector> get_per_column_nullability(table_view const &table, - std::vector const &col_nullable) -{ - auto get_depth = [](column_view const &col) { - column_view curr_col = col; - uint16_t depth = 1; - while (curr_col.type().id() == type_id::LIST) { - depth++; - curr_col = lists_column_view{curr_col}.child(); +} // namespace + +struct linked_column_view; + +using LinkedColPtr = std::shared_ptr; +using LinkedColVector = std::vector; + +/** + * @brief column_view with the added member pointer to the parent of this column. + * + */ +struct linked_column_view : public column_view { + // TODO(cp): we are currently keeping all column_view children info multiple times - once for each + // copy of this object. Options: + // 1. Inherit from column_view_base. Only lose out on children vector. That is not needed. 
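`read_column_chunks` above now asks the datasource, per chunk, whether a direct device read is worthwhile before falling back to the old host-read-plus-upload path. A sketch of that dispatch shape; the `source` type, its threshold, and the byte-vector return types are illustrative stand-ins, not the cudf `datasource` API:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical datasource that can read either straight into device memory
// or into a host buffer that the caller must upload afterwards.
struct source {
  // Small reads don't amortize the fixed cost of a direct device read.
  static constexpr size_t device_read_threshold = 128 * 1024;  // illustrative only

  bool is_device_read_preferred(size_t size) const { return size >= device_read_threshold; }

  // Stand-ins; a real device_read would return device memory.
  std::vector<uint8_t> host_read(size_t, size_t size) { return std::vector<uint8_t>(size); }
  std::vector<uint8_t> device_read(size_t, size_t size) { return std::vector<uint8_t>(size); }
};

int main()
{
  source src;
  size_t const io_offset = 0;

  for (size_t io_size : {size_t{4} * 1024, size_t{1024} * 1024}) {
    if (src.is_device_read_preferred(io_size)) {
      auto page = src.device_read(io_offset, io_size);  // no host bounce buffer
      std::cout << "device read of " << page.size() << " bytes\n";
    } else {
      auto buffer = src.host_read(io_offset, io_size);  // read to host ...
      // ... then upload, as the patch does with rmm::device_buffer(data, size, stream)
      std::cout << "host read + upload of " << buffer.size() << " bytes\n";
    }
  }
}
```

Deciding per chunk rather than per file lets a source backed by something like cuFile take large reads directly into device memory while tiny reads still go through the page cache.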
+ // 2. Don't inherit at all. make linked_column_view keep a reference wrapper to its column_view + linked_column_view(column_view const &col) : column_view(col), parent(nullptr) + { + for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) { + children.push_back(std::make_shared(this, *child_it)); } - return depth; - }; + } - // for each column, check depth and add subsequent bool values to its nullable vector - std::vector> per_column_nullability; - auto null_it = col_nullable.begin(); - auto const_it = thrust::make_constant_iterator(true); - for (auto const &col : table) { - uint16_t depth = get_depth(col); - if (col_nullable.empty()) { - // If no per-column nullability is specified then assume that all columns are nullable - per_column_nullability.emplace_back(const_it, const_it + depth); - } else { - CUDF_EXPECTS( - null_it + depth <= col_nullable.end(), - "Mismatch between size of column nullability passed in user_metadata_with_nullability and " - "number of null masks expected in table. Expected more values in passed metadata"); - per_column_nullability.emplace_back(null_it, null_it + depth); - null_it += depth; + linked_column_view(linked_column_view *parent, column_view const &col) + : column_view(col), parent(parent) + { + for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) { + children.push_back(std::make_shared(this, *child_it)); } } - CUDF_EXPECTS( - null_it == col_nullable.end(), - "Mismatch between size of column nullability passed in user_metadata_with_nullability and " - "number of null masks expected in table. Too many values in passed metadata"); - return per_column_nullability; -} + + linked_column_view *parent; //!< Pointer to parent of this column. Nullptr if root + LinkedColVector children; +}; /** - * @brief Get the leaf column + * @brief Converts all column_views of a table into linked_column_views * - * Returns the dtype of the leaf column when `col` is a list column. + * @param table table of columns to convert + * @return Vector of converted linked_column_views */ -column_view get_leaf_col(column_view col) +LinkedColVector input_table_to_linked_columns(table_view const &table) { - column_view curr_col = col; - while (curr_col.type().id() == type_id::LIST) { curr_col = lists_column_view{curr_col}.child(); } - return curr_col; -} + LinkedColVector result; + for (column_view const &col : table) { + result.emplace_back(std::make_shared(col)); + } -} // namespace + return result; +} /** - * @brief Helper kernel for converting string data/offsets into nvstrdesc - * REMOVEME: Once we eliminate the legacy readers/writers, the kernels could be - * made to use the native offset+data layout. + * @brief Extends SchemaElement to add members required in constructing parquet_column_view + * + * Added members are: + * 1. leaf_column: Pointer to leaf linked_column_view which points to the corresponding data stream + * of a leaf schema node. For non-leaf struct node, this is nullptr. + * 2. stats_dtype: datatype for statistics calculation required for the data stream of a leaf node. + * 3. ts_scale: scale to multiply or divide timestamp by in order to convert timestamp to parquet + * supported types */ -__global__ void stringdata_to_nvstrdesc(gpu::nvstrdesc_s *dst, - const size_type *offsets, - const char *strdata, - const uint32_t *nulls, - size_type column_size) -{ - size_type row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < column_size) { - uint32_t is_valid = (nulls) ? 
(nulls[row >> 5] >> (row & 0x1f)) & 1 : 1; - size_t count; - const char *ptr; - if (is_valid) { - size_type cur = offsets[row]; - size_type next = offsets[row + 1]; - ptr = strdata + cur; - count = (next > cur) ? next - cur : 0; +struct schema_tree_node : public SchemaElement { + LinkedColPtr leaf_column; + statistics_dtype stats_dtype; + int32_t ts_scale; + + // TODO(fut): Think about making schema a class that holds a vector of schema_tree_nodes. The + // function construct_schema_tree could be its constructor. It can have method to get the per + // column nullability given a schema node index corresponding to a leaf schema. Much easier than + // that is a method to get path in schema, given a leaf node +}; + +struct leaf_schema_fn { + schema_tree_node &col_schema; + LinkedColPtr const &col; + column_in_metadata const &col_meta; + bool timestamp_is_int96; + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::BOOLEAN; + col_schema.stats_dtype = statistics_dtype::dtype_bool; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::INT_8; + col_schema.stats_dtype = statistics_dtype::dtype_int8; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::INT_16; + col_schema.stats_dtype = statistics_dtype::dtype_int16; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::UINT_8; + col_schema.stats_dtype = statistics_dtype::dtype_int8; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::UINT_16; + col_schema.stats_dtype = statistics_dtype::dtype_int16; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::UINT_32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::UINT_64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::FLOAT; + col_schema.stats_dtype = statistics_dtype::dtype_float32; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::DOUBLE; + col_schema.stats_dtype = statistics_dtype::dtype_float64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::BYTE_ARRAY; + col_schema.converted_type = ConvertedType::UTF8; + col_schema.stats_dtype = statistics_dtype::dtype_string; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::DATE; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.converted_type = + (timestamp_is_int96) ? 
ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.ts_scale = 1000; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.converted_type = + (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.converted_type = + (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; + col_schema.converted_type = + (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + col_schema.ts_scale = -1000; // negative value indicates division by absolute value + } + + // unsupported outside cudf for parquet 1.0. + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.ts_scale = 1000; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } + + template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } + + // unsupported outside cudf for parquet 1.0. 
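Throughout these handlers, `ts_scale` packs a unit conversion into one signed integer: a positive value multiplies the stored ticks, and a negative one, per the inline comment, divides by its absolute value. A tiny sketch of applying that convention (`apply_ts_scale` is a made-up helper, not a cudf function):

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helper applying the ts_scale convention used by the schema
// handlers above: positive multiplies, negative divides by |scale|, and
// zero or one leaves the tick count unchanged.
constexpr int64_t apply_ts_scale(int64_t ticks, int32_t ts_scale)
{
  if (ts_scale > 1) { return ticks * ts_scale; }
  if (ts_scale < 0) { return ticks / -ts_scale; }
  return ticks;
}

int main()
{
  // TIMESTAMP_SECONDS stored as TIMESTAMP_MILLIS: ts_scale = 1000
  assert(apply_ts_scale(3, 1000) == 3000);
  // TIMESTAMP_NANOSECONDS stored as TIMESTAMP_MICROS: ts_scale = -1000
  assert(apply_ts_scale(5000000, -1000) == 5000);
}
```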
+ template + std::enable_if_t::value, void> operator()() + { + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.ts_scale = -1000; // negative value indicates division by absolute value + } + + template + std::enable_if_t(), void> operator()() + { + if (std::is_same::value) { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + } else if (std::is_same::value) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; } else { - ptr = nullptr; - count = 0; + CUDF_FAIL("Unsupported fixed point type for parquet writer"); } - dst[row].ptr = ptr; - dst[row].count = count; + col_schema.converted_type = ConvertedType::DECIMAL; + col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs + CUDF_EXPECTS(col_meta.is_decimal_precision_set(), + "Precision must be specified for decimal columns"); + CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale, + "Precision must be equal to or greater than scale!"); + col_schema.decimal_precision = col_meta.get_decimal_precision(); } -} -/** - * @brief Helper class that adds parquet-specific column info - */ -class parquet_column_view { - public: - /** - * @brief Constructor that extracts out the string position + length pairs - * for building dictionaries for string columns - */ - explicit parquet_column_view(size_t id, - column_view const &col, - std::vector const &nullability, - const table_metadata *metadata, - bool int96_timestamps, - std::vector const &decimal_precision, - uint &decimal_precision_idx, - rmm::cuda_stream_view stream) - : _col(col), - _leaf_col(get_leaf_col(col)), - _id(id), - _string_type(_leaf_col.type().id() == type_id::STRING), - _list_type(col.type().id() == type_id::LIST), - _type_width((_string_type || _list_type) ? 0 : cudf::size_of(col.type())), - _row_count(col.size()), - _null_count(_leaf_col.null_count()), - _data(col.head() + col.offset() * _type_width), - _nulls(_leaf_col.nullable() ? 
_leaf_col.null_mask() : nullptr), - _offset(col.offset()), - _converted_type(ConvertedType::UNKNOWN), - _ts_scale(0), - _dremel_offsets(0, stream), - _rep_level(0, stream), - _def_level(0, stream), - _nullability(nullability) + template + std::enable_if_t(), void> operator()() { - switch (_leaf_col.type().id()) { - case cudf::type_id::INT8: - _physical_type = Type::INT32; - _converted_type = ConvertedType::INT_8; - _stats_dtype = statistics_dtype::dtype_int8; - break; - case cudf::type_id::INT16: - _physical_type = Type::INT32; - _converted_type = ConvertedType::INT_16; - _stats_dtype = statistics_dtype::dtype_int16; - break; - case cudf::type_id::INT32: - _physical_type = Type::INT32; - _stats_dtype = statistics_dtype::dtype_int32; - break; - case cudf::type_id::INT64: - _physical_type = Type::INT64; - _stats_dtype = statistics_dtype::dtype_int64; - break; - case cudf::type_id::UINT8: - _physical_type = Type::INT32; - _converted_type = ConvertedType::UINT_8; - _stats_dtype = statistics_dtype::dtype_int8; - break; - case cudf::type_id::UINT16: - _physical_type = Type::INT32; - _converted_type = ConvertedType::UINT_16; - _stats_dtype = statistics_dtype::dtype_int16; - break; - case cudf::type_id::UINT32: - _physical_type = Type::INT32; - _converted_type = ConvertedType::UINT_32; - _stats_dtype = statistics_dtype::dtype_int32; - break; - case cudf::type_id::UINT64: - _physical_type = Type::INT64; - _converted_type = ConvertedType::UINT_64; - _stats_dtype = statistics_dtype::dtype_int64; - break; - case cudf::type_id::FLOAT32: - _physical_type = Type::FLOAT; - _stats_dtype = statistics_dtype::dtype_float32; - break; - case cudf::type_id::FLOAT64: - _physical_type = Type::DOUBLE; - _stats_dtype = statistics_dtype::dtype_float64; - break; - case cudf::type_id::BOOL8: - _physical_type = Type::BOOLEAN; - _stats_dtype = statistics_dtype::dtype_bool; - break; - // unsupported outside cudf for parquet 1.0. - case cudf::type_id::DURATION_DAYS: - _physical_type = Type::INT32; - _converted_type = ConvertedType::TIME_MILLIS; - _stats_dtype = statistics_dtype::dtype_int64; - break; - case cudf::type_id::DURATION_SECONDS: - _physical_type = Type::INT64; - _converted_type = ConvertedType::TIME_MILLIS; - _stats_dtype = statistics_dtype::dtype_int64; - _ts_scale = 1000; - break; - case cudf::type_id::DURATION_MILLISECONDS: - _physical_type = Type::INT64; - _converted_type = ConvertedType::TIME_MILLIS; - _stats_dtype = statistics_dtype::dtype_int64; - break; - case cudf::type_id::DURATION_MICROSECONDS: - _physical_type = Type::INT64; - _converted_type = ConvertedType::TIME_MICROS; - _stats_dtype = statistics_dtype::dtype_int64; - break; - // unsupported outside cudf for parquet 1.0. - case cudf::type_id::DURATION_NANOSECONDS: - _physical_type = Type::INT64; - _converted_type = ConvertedType::TIME_MICROS; - _stats_dtype = statistics_dtype::dtype_int64; - _ts_scale = -1000; // negative value indicates division by absolute value - break; - case cudf::type_id::TIMESTAMP_DAYS: - _physical_type = Type::INT32; - _converted_type = ConvertedType::DATE; - _stats_dtype = statistics_dtype::dtype_int32; - break; - case cudf::type_id::TIMESTAMP_SECONDS: - _physical_type = int96_timestamps ? Type::INT96 : Type::INT64; - _converted_type = ConvertedType::TIMESTAMP_MILLIS; - _stats_dtype = statistics_dtype::dtype_timestamp64; - _ts_scale = 1000; - break; - case cudf::type_id::TIMESTAMP_MILLISECONDS: - _physical_type = int96_timestamps ? 
Type::INT96 : Type::INT64; - _converted_type = ConvertedType::TIMESTAMP_MILLIS; - _stats_dtype = statistics_dtype::dtype_timestamp64; - break; - case cudf::type_id::TIMESTAMP_MICROSECONDS: - _physical_type = int96_timestamps ? Type::INT96 : Type::INT64; - _converted_type = ConvertedType::TIMESTAMP_MICROS; - _stats_dtype = statistics_dtype::dtype_timestamp64; - break; - case cudf::type_id::TIMESTAMP_NANOSECONDS: - _physical_type = int96_timestamps ? Type::INT96 : Type::INT64; - _converted_type = ConvertedType::TIMESTAMP_MICROS; - _stats_dtype = statistics_dtype::dtype_timestamp64; - _ts_scale = -1000; // negative value indicates division by absolute value - break; - case cudf::type_id::STRING: - _physical_type = Type::BYTE_ARRAY; - _converted_type = ConvertedType::UTF8; - _stats_dtype = statistics_dtype::dtype_string; - break; - case cudf::type_id::DECIMAL32: - _physical_type = Type::INT32; - _converted_type = ConvertedType::DECIMAL; - _stats_dtype = statistics_dtype::dtype_int32; - _decimal_scale = -_leaf_col.type().scale(); // parquet and cudf disagree about scale signs - CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx, - "Not enough decimal precision values passed for data!"); - CUDF_EXPECTS(decimal_precision[decimal_precision_idx] >= _decimal_scale, - "Precision must be equal to or greater than scale!"); - _decimal_precision = decimal_precision[decimal_precision_idx++]; - break; - case cudf::type_id::DECIMAL64: - _physical_type = Type::INT64; - _converted_type = ConvertedType::DECIMAL; - _stats_dtype = statistics_dtype::dtype_decimal64; - _decimal_scale = -_leaf_col.type().scale(); // parquet and cudf disagree about scale signs - CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx, - "Not enough decimal precision values passed for data!"); - CUDF_EXPECTS(decimal_precision[decimal_precision_idx] >= _decimal_scale, - "Precision must be equal to or greater than scale!"); - _decimal_precision = decimal_precision[decimal_precision_idx++]; - break; - default: - _physical_type = UNDEFINED_TYPE; - _stats_dtype = dtype_none; - break; - } - size_type leaf_col_offset = col.offset(); - _data_count = col.size(); - if (_list_type) { - // Top level column's offsets are not applied to all children. Get the effective offset and - // size of the leaf column - // Calculate row offset into dremel data (repetition/definition values) and the respective - // definition and repetition levels - gpu::dremel_data dremel = gpu::get_dremel_data(col, _nullability, stream); - _dremel_offsets = std::move(dremel.dremel_offsets); - _rep_level = std::move(dremel.rep_level); - _def_level = std::move(dremel.def_level); - leaf_col_offset = dremel.leaf_col_offset; - _data_count = dremel.leaf_data_size; - _max_def_level = dremel.max_def_level; - - _type_width = (is_fixed_width(_leaf_col.type())) ? cudf::size_of(_leaf_col.type()) : 0; - _data = (is_fixed_width(_leaf_col.type())) - ? _leaf_col.head() + leaf_col_offset * _type_width - : nullptr; - - // Calculate nesting levels - column_view curr_col = col; - _nesting_levels = 0; - while (curr_col.type().id() == type_id::LIST) { - lists_column_view list_col(curr_col); - _nesting_levels++; - curr_col = list_col.child(); - } + CUDF_FAIL("This functor is only meant for physical data types"); + } - // Update level nullability if no nullability was passed in. 
- curr_col = col; - if (_nullability.empty()) { - while (curr_col.type().id() == type_id::LIST) { - lists_column_view list_col(curr_col); - _nullability.push_back(list_col.null_mask() != nullptr); - curr_col = list_col.child(); + template + std::enable_if_t(), void> operator()() + { + CUDF_FAIL("Dictionary columns are not supported for writing"); + } +}; + +/** + * @brief Construct schema from input columns and per-column input options + * + * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. + * The resulting schema tree is stored in a vector in pre-order traversal order. + */ +std::vector construct_schema_tree(LinkedColVector const &linked_columns, + table_input_metadata const &metadata, + bool single_write_mode, + bool int96_timestamps) +{ + std::vector schema; + schema_tree_node root{}; + root.type = UNDEFINED_TYPE; + root.repetition_type = NO_REPETITION_TYPE; + root.name = "schema"; + root.num_children = linked_columns.size(); + root.parent_idx = -1; // root schema has no parent + schema.push_back(std::move(root)); + + std::function add_schema = + [&](LinkedColPtr const &col, column_in_metadata const &col_meta, size_t parent_idx) { + bool col_nullable = [&]() { + if (single_write_mode) { + return col->nullable(); + } else { + if (col_meta.is_nullability_defined()) { + if (col_meta.nullable() == false) { + CUDF_EXPECTS( + col->nullable() == false, + "Mismatch in metadata prescribed nullability and input column nullability. " + "Metadata for nullable input column cannot prescribe nullability = false"); + } + return col_meta.nullable(); + } else { + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return true; + } + } + }(); + + if (col->type().id() == type_id::STRUCT) { + // if struct, add current and recursively call for all children + schema_tree_node struct_schema{}; + struct_schema.repetition_type = + col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED; + + struct_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); + struct_schema.num_children = col->num_children(); + struct_schema.parent_idx = parent_idx; + schema.push_back(std::move(struct_schema)); + + auto struct_node_index = schema.size() - 1; + // for (auto child_it = col->children.begin(); child_it < col->children.end(); child_it++) { + // add_schema(*child_it, struct_node_index); + // } + CUDF_EXPECTS(col->num_children() == static_cast(col_meta.num_children()), + "Mismatch in number of child columns between input table and metadata"); + for (size_t i = 0; i < col->children.size(); ++i) { + add_schema(col->children[i], col_meta.child(i), struct_node_index); + } + } else if (col->type().id() == type_id::LIST) { + // List schema is denoted by two levels for each nesting level and one final level for leaf. + // The top level is the same name as the column name. + // So e.g. List> is denoted in the schema by + // "col_name" : { "list" : { "element" : { "list" : { "element" } } } } + + schema_tree_node list_schema_1{}; + list_schema_1.converted_type = ConvertedType::LIST; + list_schema_1.repetition_type = + col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED; + list_schema_1.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); + list_schema_1.num_children = 1; + list_schema_1.parent_idx = parent_idx; + schema.push_back(std::move(list_schema_1)); + + schema_tree_node list_schema_2{}; + list_schema_2.repetition_type = FieldRepetitionType::REPEATED; + list_schema_2.name = "list"; + list_schema_2.num_children = 1; + list_schema_2.parent_idx = schema.size() - 1; // Parent is list_schema_1, last added. + schema.push_back(std::move(list_schema_2)); + + CUDF_EXPECTS(col_meta.num_children() == 2, + "List column's metadata should have exactly two children"); + + add_schema(col->children[lists_column_view::child_column_index], + col_meta.child(lists_column_view::child_column_index), + schema.size() - 1); + } else { + // if leaf, add current + if (col->type().id() == type_id::STRING) { + CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, + "String column's corresponding metadata should have zero or two children"); + } else { + CUDF_EXPECTS(col_meta.num_children() == 0, + "Leaf column's corresponding metadata cannot have children"); } - _nullability.push_back(curr_col.null_mask() != nullptr); - } - stream.synchronize(); - } else { - if (_nullability.empty()) { _nullability = {col.nullable()}; } - _max_def_level = (_nullability[0]) ? 1 : 0; - } - if (_string_type && _data_count > 0) { - strings_column_view view{_leaf_col}; - _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( - reinterpret_cast(_indexes.data()), - view.offsets().data() + leaf_col_offset, - view.chars().data(), - _nulls, - _data_count); - _data = _indexes.data(); - - stream.synchronize(); - } + schema_tree_node col_schema{}; - // Generating default name if name isn't present in metadata - if (metadata && _id < metadata->column_names.size()) { - _name = metadata->column_names[_id]; - } else { - _name = "_col" + std::to_string(_id); - } - _path_in_schema.push_back(_name); + bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); + + cudf::type_dispatcher(col->type(), + leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96}); + + col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; + col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); + col_schema.parent_idx = parent_idx; + col_schema.leaf_column = col; + schema.push_back(col_schema); + } + }; + + CUDF_EXPECTS(metadata.column_metadata.size() == linked_columns.size(), + "Mismatch in the number of columns and the corresponding metadata elements"); + // Add all linked_columns to schema using parent_idx = 0 (root) + for (size_t i = 0; i < linked_columns.size(); ++i) { + add_schema(linked_columns[i], metadata.column_metadata[i], 0); } - auto is_string() const noexcept { return _string_type; } - auto is_list() const noexcept { return _list_type; } - size_t type_width() const noexcept { return _type_width; } - size_t row_count() const noexcept { return _row_count; } - size_t data_count() const noexcept { return _data_count; } - size_t null_count() const noexcept { return _null_count; } - bool nullable() const { return _nullability.back(); } - void const *data() const noexcept { return _data; } - uint32_t const *nulls() const noexcept { return _nulls; } - size_type offset() const noexcept { return _offset; } - bool level_nullable(size_t level) const { return _nullability[level]; } - int32_t decimal_scale() const noexcept { return _decimal_scale; } - uint8_t decimal_precision() const noexcept { return _decimal_precision; } - - // List related data - column_view cudf_col() const noexcept { return _col; } - column_view leaf_col() const noexcept { return _leaf_col; } - size_type nesting_levels() const noexcept { return _nesting_levels; } - size_type const *level_offsets() const noexcept { return _dremel_offsets.data(); } - uint8_t const *repetition_levels() const noexcept { return _rep_level.data(); } - uint8_t const *definition_levels() const noexcept { return _def_level.data(); } - uint16_t max_def_level() const noexcept { return _max_def_level; } - void set_def_level(uint16_t def_level) { _max_def_level = def_level; } - - auto name() const noexcept { return _name; } - auto physical_type() const noexcept { return _physical_type; } - auto converted_type() const noexcept { return _converted_type; } - auto stats_type() const noexcept { return _stats_dtype; } - int32_t ts_scale() const noexcept { return _ts_scale; } - void set_path_in_schema(std::vector path) { _path_in_schema = std::move(path); } - auto get_path_in_schema() const noexcept { return _path_in_schema; } - - // Dictionary management + return schema; +} + +/** + * @brief Class to store parquet specific information for one data stream. + * + * Contains information about a single data stream. In case of struct columns, a data stream is one + * of the child leaf columns that contains data. + * e.g. A column Struct> contains 2 data streams: + * - Struct + * - Struct> + * + */ +struct parquet_column_view { + parquet_column_view(schema_tree_node const &schema_node, + std::vector const &schema_tree, + rmm::cuda_stream_view stream); + + column_view leaf_column_view() const; + gpu::parquet_column_device_view get_device_view(); + + column_view cudf_column_view() const { return cudf_col; } + parquet::Type physical_type() const { return schema_node.type; } + + std::vector const &get_path_in_schema() { return path_in_schema; } + + // LIST related member functions + uint8_t max_def_level() const noexcept { return _max_def_level; } + uint8_t max_rep_level() const noexcept { return _max_rep_level; } + bool is_list() const noexcept { return _is_list; } + + // Dictionary related member functions uint32_t *get_dict_data() { return (_dict_data.size()) ? 
_dict_data.data().get() : nullptr; } uint32_t *get_dict_index() { return (_dict_index.size()) ? _dict_index.data().get() : nullptr; } void use_dictionary(bool use_dict) { _dictionary_used = use_dict; } @@ -448,56 +537,185 @@ class parquet_column_view { } private: - // cudf data column - column_view _col; - column_view _leaf_col; - - // Identifier within set of columns - size_t _id = 0; - bool _string_type = false; - bool _list_type = false; - - size_t _type_width = 0; - size_t _row_count = 0; - size_t _data_count = 0; - size_t _null_count = 0; - void const *_data = nullptr; - uint32_t const *_nulls = nullptr; - size_type _offset = 0; - - // parquet-related members - std::string _name{}; - Type _physical_type; - ConvertedType _converted_type; - statistics_dtype _stats_dtype; - int32_t _ts_scale; - std::vector _path_in_schema; - - // Dictionary-related members - bool _dictionary_used = false; - rmm::device_vector _dict_data; - rmm::device_vector _dict_index; + // Schema related members + schema_tree_node schema_node; + std::vector path_in_schema; + uint8_t _max_def_level = 0; + uint8_t _max_rep_level = 0; + rmm::device_uvector _d_nullability; + + column_view cudf_col; // List-related members + bool _is_list; rmm::device_uvector _dremel_offsets; ///< For each row, the absolute offset into the repetition and definition ///< level vectors. O(num rows) rmm::device_uvector _rep_level; rmm::device_uvector _def_level; - std::vector _nullability; - size_type _max_def_level = -1; - size_type _nesting_levels = 0; + std::vector _nullability; + size_type _data_count = 0; - // String-related members - rmm::device_buffer _indexes; - - // Decimal-related members - int32_t _decimal_scale = 0; - uint8_t _decimal_precision = 0; + // Dictionary related members + bool _dictionary_used = false; + rmm::device_vector _dict_data; + rmm::device_vector _dict_index; }; +parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, + std::vector const &schema_tree, + rmm::cuda_stream_view stream) + : schema_node(schema_node), + _d_nullability(0, stream), + _dremel_offsets(0, stream), + _rep_level(0, stream), + _def_level(0, stream) +{ + // Construct single inheritance column_view from linked_column_view + auto curr_col = schema_node.leaf_column.get(); + column_view single_inheritance_cudf_col = *curr_col; + while (curr_col->parent) { + auto const &parent = *curr_col->parent; + + // For list columns, we still need to retain the offset child column. + auto children = + (parent.type().id() == type_id::LIST) + ? 
std::vector{parent.child(lists_column_view::offsets_column_index), + single_inheritance_cudf_col} + : std::vector{single_inheritance_cudf_col}; + + single_inheritance_cudf_col = column_view(parent.type(), + parent.size(), + parent.head(), + parent.null_mask(), + UNKNOWN_NULL_COUNT, + parent.offset(), + children); + + curr_col = curr_col->parent; + } + cudf_col = single_inheritance_cudf_col; + + // Construct path_in_schema by traversing up the schema_tree + std::vector path; + auto curr_schema_node = schema_node; + do { + path.push_back(curr_schema_node.name); + if (curr_schema_node.parent_idx != -1) { + curr_schema_node = schema_tree[curr_schema_node.parent_idx]; + } + } while (curr_schema_node.parent_idx != -1); + path_in_schema = std::vector(path.crbegin(), path.crend()); + + // Calculate max definition level by counting the number of levels that are optional (nullable) + // and max repetition level by counting the number of REPEATED levels in this column's hierarchy + uint16_t max_def_level = 0; + uint16_t max_rep_level = 0; + curr_schema_node = schema_node; + while (curr_schema_node.parent_idx != -1) { + if (curr_schema_node.repetition_type == parquet::REPEATED or + curr_schema_node.repetition_type == parquet::OPTIONAL) { + ++max_def_level; + } + if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + curr_schema_node = schema_tree[curr_schema_node.parent_idx]; + } + CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); + CUDF_EXPECTS(max_rep_level < 256, "Repetition levels above 255 are not supported"); + + _max_def_level = max_def_level; + _max_rep_level = max_rep_level; + + // Construct nullability vector using repetition_type from schema. + std::vector r_nullability; + curr_schema_node = schema_node; + while (curr_schema_node.parent_idx != -1) { + if (not curr_schema_node.is_stub()) { + r_nullability.push_back(curr_schema_node.repetition_type == FieldRepetitionType::OPTIONAL); + } + curr_schema_node = schema_tree[curr_schema_node.parent_idx]; + } + _nullability = std::vector(r_nullability.crbegin(), r_nullability.crend()); + // TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using + // hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column. + _d_nullability = rmm::device_uvector(_nullability.size(), stream); + CUDA_TRY(cudaMemcpyAsync(_d_nullability.data(), + _nullability.data(), + _nullability.size() * sizeof(uint8_t), + cudaMemcpyHostToDevice, + stream.value())); + + _is_list = (_max_rep_level > 0); + + if (cudf_col.size() == 0) { return; } + + if (_is_list) { + // Top level column's offsets are not applied to all children.
Get the effective offset and + // size of the leaf column + // Calculate row offset into dremel data (repetition/definition values) and the respective + // definition and repetition levels + gpu::dremel_data dremel = gpu::get_dremel_data(cudf_col, _d_nullability, _nullability, stream); + _dremel_offsets = std::move(dremel.dremel_offsets); + _rep_level = std::move(dremel.rep_level); + _def_level = std::move(dremel.def_level); + _data_count = dremel.leaf_data_size; // Needed for knowing what size dictionary to allocate + + stream.synchronize(); + } else { + // For non-list struct, the size of the root column is the same as the size of the leaf column + _data_count = cudf_col.size(); + } +} + +column_view parquet_column_view::leaf_column_view() const +{ + auto col = cudf_col; + while (cudf::is_nested(col.type())) { + if (col.type().id() == type_id::LIST) { + col = col.child(lists_column_view::child_column_index); + } else if (col.type().id() == type_id::STRUCT) { + col = col.child(0); // Stored cudf_col has only one child if struct + } + } + return col; +} + +gpu::parquet_column_device_view parquet_column_view::get_device_view() +{ + column_view col = leaf_column_view(); + auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + desc.stats_dtype = schema_node.stats_dtype; + desc.ts_scale = schema_node.ts_scale; + + // TODO (dm): Enable dictionary for list after refactor + if (physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE && !is_list()) { + alloc_dictionary(_data_count); + desc.dict_index = get_dict_index(); + desc.dict_data = get_dict_data(); + } + + if (is_list()) { + desc.level_offsets = _dremel_offsets.data(); + desc.rep_values = _rep_level.data(); + desc.def_values = _def_level.data(); + } + desc.num_rows = cudf_col.size(); + desc.physical_type = static_cast(physical_type()); + auto count_bits = [](uint16_t number) { + int16_t nbits = 0; + while (number > 0) { + nbits++; + number >>= 1; + } + return nbits; + }; + desc.level_bits = count_bits(max_rep_level()) << 4 | count_bits(max_def_level()); + desc.nullability = _d_nullability.data(); + return desc; +} + void writer::impl::init_page_fragments(hostdevice_vector &frag, - hostdevice_vector &col_desc, + hostdevice_vector &col_desc, uint32_t num_columns, uint32_t num_fragments, uint32_t num_rows, @@ -513,12 +731,13 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra frag.device_to_host(stream, true); } -void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk, - hostdevice_vector &frag, - hostdevice_vector &col_desc, - uint32_t num_columns, - uint32_t num_fragments, - uint32_t fragment_size) +void writer::impl::gather_fragment_statistics( + statistics_chunk *frag_stats_chunk, + hostdevice_vector &frag, + hostdevice_vector &col_desc, + uint32_t num_columns, + uint32_t num_fragments, + uint32_t fragment_size) { rmm::device_vector frag_stats_group(num_fragments * num_columns); @@ -534,11 +753,12 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk stream.synchronize(); } -void writer::impl::build_chunk_dictionaries(hostdevice_vector &chunks, - hostdevice_vector &col_desc, - uint32_t num_rowgroups, - uint32_t num_columns, - uint32_t num_dictionaries) +void writer::impl::build_chunk_dictionaries( + hostdevice_vector &chunks, + hostdevice_vector &col_desc, + uint32_t num_rowgroups, + uint32_t num_columns, + uint32_t num_dictionaries) { size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize; rmm::device_vector 
dict_scratch(dict_scratch_size / sizeof(uint32_t)); @@ -560,7 +780,7 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector &chunks, - hostdevice_vector &col_desc, + hostdevice_vector &col_desc, gpu::EncPage *pages, statistics_chunk *page_stats, statistics_chunk *frag_stats, @@ -651,10 +871,11 @@ writer::impl::impl(std::unique_ptr sink, stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), out_sink_(std::move(sink)), - decimal_precision(options.get_decimal_precision()), - single_write_mode(mode == SingleWriteMode::YES), - user_metadata(options.get_metadata()) + single_write_mode(mode == SingleWriteMode::YES) { + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } init_state(); } @@ -668,15 +889,12 @@ writer::impl::impl(std::unique_ptr sink, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), - decimal_precision(options.get_decimal_precision()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { - if (options.get_nullable_metadata() != nullptr) { - user_metadata_with_nullability = *options.get_nullable_metadata(); - user_metadata = &user_metadata_with_nullability; + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } @@ -695,148 +913,51 @@ void writer::impl::write(table_view const &table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); - size_type num_columns = table.num_columns(); - size_type num_rows = table.num_rows(); + size_type num_rows = table.num_rows(); - // Wrapper around cudf columns to attach parquet-specific type info. - // Note : I wish we could do this in the begin() function but since the - // metadata is optional we would have no way of knowing how many columns - // we actually have. - std::vector parquet_columns; - parquet_columns.reserve(num_columns); // Avoids unnecessary re-allocation - - // because the repetition type is global (in the sense of, not per-rowgroup or per write_chunk() - // call) we cannot know up front if the user is going to end up passing tables with nulls/no nulls - // in the multiple write_chunk() case. so we'll do some special handling. - // The user can pass in information about the nullability of a column to be enforced across - // write_chunk() calls, in a flattened bool vector. Figure out that per column. - auto per_column_nullability = - (single_write_mode) - ? std::vector>{} - : get_per_column_nullability(table, user_metadata_with_nullability.column_nullable); - - uint decimal_precision_idx = 0; - - for (auto it = table.begin(); it < table.end(); ++it) { - const auto col = *it; - const auto current_id = parquet_columns.size(); - - // if the user is explicitly saying "I am only calling this once", assume the columns in this - // one table tell us everything we need to know about their nullability. - // Empty nullability means the writer figures out the nullability from the cudf columns. - auto const &this_column_nullability = - (single_write_mode) ? 
std::vector{} : per_column_nullability[current_id]; - - parquet_columns.emplace_back(current_id, - col, - this_column_nullability, - user_metadata, - int96_timestamps, - decimal_precision, - decimal_precision_idx, - stream); - } + if (not table_meta) { table_meta = std::make_unique(table); } - CUDF_EXPECTS(decimal_precision_idx == decimal_precision.size(), - "Too many decimal precision values!"); + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata &col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { + add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + } - // first call. setup metadata. num_rows will get incremented as write_chunk is - // called multiple times. - // Calculate the sum of depths of all list columns - size_type const list_col_depths = std::accumulate( - parquet_columns.cbegin(), parquet_columns.cend(), 0, [](size_type sum, auto const &col) { - return sum + col.nesting_levels(); - }); + auto vec = input_table_to_linked_columns(table); + auto schema_tree = construct_schema_tree(vec, *table_meta, single_write_mode, int96_timestamps); + // Construct parquet_column_views from the schema tree leaf nodes. + std::vector parquet_columns; - // Make schema with current table - std::vector this_table_schema; - { - // Each level of nesting requires two levels of Schema. The leaf level needs one schema element - this_table_schema.reserve(1 + num_columns + list_col_depths * 2); - SchemaElement root{}; - root.type = UNDEFINED_TYPE; - root.repetition_type = NO_REPETITION_TYPE; - root.name = "schema"; - root.num_children = num_columns; - this_table_schema.push_back(std::move(root)); - for (auto i = 0; i < num_columns; i++) { - auto &col = parquet_columns[i]; - if (col.is_list()) { - size_type nesting_depth = col.nesting_levels(); - // Each level of nesting requires two levels of Schema. The leaf level needs one schema - // element - std::vector list_schema(nesting_depth * 2 + 1); - for (size_type j = 0; j < nesting_depth; j++) { - // List schema is denoted by two levels for each nesting level and one final level for - // leaf. The top level is the same name as the column name. - // So e.g. List> is denoted in the schema by - // "col_name" : { "list" : { "element" : { "list" : { "element" } } } } - auto const group_idx = 2 * j; - auto const list_idx = 2 * j + 1; - - list_schema[group_idx].name = (j == 0) ? col.name() : "element"; - list_schema[group_idx].repetition_type = (col.level_nullable(j)) ? OPTIONAL : REQUIRED; - list_schema[group_idx].converted_type = ConvertedType::LIST; - list_schema[group_idx].num_children = 1; - - list_schema[list_idx].name = "list"; - list_schema[list_idx].repetition_type = REPEATED; - list_schema[list_idx].num_children = 1; - } - list_schema[nesting_depth * 2].name = "element"; - list_schema[nesting_depth * 2].repetition_type = - col.level_nullable(nesting_depth) ? OPTIONAL : REQUIRED; - auto const &physical_type = col.physical_type(); - list_schema[nesting_depth * 2].type = physical_type; - list_schema[nesting_depth * 2].converted_type = - physical_type == parquet::Type::INT96 ? 
ConvertedType::UNKNOWN : col.converted_type(); - list_schema[nesting_depth * 2].num_children = 0; - list_schema[nesting_depth * 2].decimal_precision = col.decimal_precision(); - list_schema[nesting_depth * 2].decimal_scale = col.decimal_scale(); - - std::vector path_in_schema; - std::transform( - list_schema.cbegin(), list_schema.cend(), std::back_inserter(path_in_schema), [](auto s) { - return s.name; - }); - col.set_path_in_schema(path_in_schema); - this_table_schema.insert(this_table_schema.end(), list_schema.begin(), list_schema.end()); - } else { - SchemaElement col_schema{}; - // Column metadata - auto const &physical_type = col.physical_type(); - col_schema.type = physical_type; - col_schema.converted_type = - physical_type == parquet::Type::INT96 ? ConvertedType::UNKNOWN : col.converted_type(); - - col_schema.repetition_type = - (col.max_def_level() == 1 || (single_write_mode && col.row_count() < (size_t)num_rows)) - ? OPTIONAL - : REQUIRED; - - col_schema.name = col.name(); - col_schema.num_children = 0; // Leaf node - col_schema.decimal_precision = col.decimal_precision(); - col_schema.decimal_scale = col.decimal_scale(); - - this_table_schema.push_back(std::move(col_schema)); - } - } + for (schema_tree_node const &schema_node : schema_tree) { + if (schema_node.leaf_column) { parquet_columns.emplace_back(schema_node, schema_tree, stream); } } + // Mass allocation of column_device_views for each parquet_column_view + std::vector cudf_cols; + cudf_cols.reserve(parquet_columns.size()); + for (auto const &parq_col : parquet_columns) { cudf_cols.push_back(parq_col.cudf_column_view()); } + table_view single_streams_table(cudf_cols); + size_type num_columns = single_streams_table.num_columns(); + + std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); + if (md.version == 0) { md.version = 1; md.num_rows = num_rows; md.column_order_listsize = (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_columns : 0; - if (user_metadata != nullptr) { - std::transform(user_metadata->user_data.begin(), - user_metadata->user_data.end(), - std::back_inserter(md.key_value_metadata), - [](auto const &kv) { - return KeyValue{kv.first, kv.second}; - }); - } + std::transform(table_meta->user_data.begin(), + table_meta->user_data.end(), + std::back_inserter(md.key_value_metadata), + [](auto const &kv) { + return KeyValue{kv.first, kv.second}; + }); md.schema = this_table_schema; } else { // verify the user isn't passing mismatched tables @@ -848,49 +969,17 @@ void writer::impl::write(table_view const &table) } // Create table_device_view so that corresponding column_device_view data // can be written into col_desc members - auto parent_column_table_device_view = table_device_view::create(table); + auto parent_column_table_device_view = table_device_view::create(single_streams_table); rmm::device_uvector leaf_column_views(0, stream); // Initialize column description - hostdevice_vector col_desc(num_columns, stream); - - // setup gpu column description. 
- // applicable to only this _write_chunk() call - for (auto i = 0; i < num_columns; i++) { - auto &col = parquet_columns[i]; - // GPU column description - auto *desc = &col_desc[i]; - *desc = gpu::EncColumnDesc{}; // Zero out all fields - desc->column_data_base = col.data(); - desc->valid_map_base = col.nulls(); - desc->column_offset = col.offset(); - desc->stats_dtype = col.stats_type(); - desc->ts_scale = col.ts_scale(); - // TODO (dm): Enable dictionary for list after refactor - if (col.physical_type() != BOOLEAN && col.physical_type() != UNDEFINED_TYPE && !col.is_list()) { - col.alloc_dictionary(col.data_count()); - desc->dict_index = col.get_dict_index(); - desc->dict_data = col.get_dict_data(); - } - if (col.is_list()) { - desc->level_offsets = col.level_offsets(); - desc->rep_values = col.repetition_levels(); - desc->def_values = col.definition_levels(); - } - desc->num_values = col.data_count(); - desc->num_rows = col.row_count(); - desc->physical_type = static_cast(col.physical_type()); - desc->converted_type = static_cast(col.converted_type()); - auto count_bits = [](uint16_t number) { - int16_t nbits = 0; - while (number > 0) { - nbits++; - number >>= 1; - } - return nbits; - }; - desc->level_bits = count_bits(col.nesting_levels()) << 4 | count_bits(col.max_def_level()); - } + hostdevice_vector col_desc(parquet_columns.size(), stream); + // This should've been `auto const&` but isn't since dictionary space is allocated when calling + // get_device_view(). Fix during dictionary refactor. + std::transform( + parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [](auto &pcol) { + return pcol.get_device_view(); + }); // Init page fragments // 5000 is good enough for up to ~200-character strings. Longer strings will start producing @@ -909,7 +998,7 @@ void writer::impl::write(table_view const &table) if (fragments.size() != 0) { // Move column info to device col_desc.host_to_device(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_page_fragments(fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size); @@ -1108,19 +1197,7 @@ void writer::impl::write(table_view const &table) num_stats_bfr); } - auto host_bfr = [&]() { - // if the writer supports device_write(), we don't need this scratch space - if (out_sink_->supports_device_write()) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_chunk_bfr_size), - cudaFreeHost}; - } - }(); + pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); @@ -1155,7 +1232,7 @@ void writer::impl::write(table_view const &table) dev_bfr = ck->uncompressed_bfr; } - if (out_sink_->supports_device_write()) { + if (out_sink_->is_device_write_preferred(ck->compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); // we still need to do a (much smaller) memcpy for the statistics. 
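[Editor's sketch, not part of the diff: the hunk above replaces the writer's capability check, out_sink_->supports_device_write(), with the size-aware out_sink_->is_device_write_preferred(ck->compressed_size), so small chunks keep taking the pinned-host path even when GDS is available. The snippet below illustrates that dispatch pattern against the data_sink API shown in this diff; write_one_chunk and pinned_scratch are hypothetical names.]

#include <cudf/io/data_sink.hpp>
#include <cudf/utilities/error.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <cuda_runtime.h>

// Illustrative only: route a finished chunk either directly from device memory
// (e.g. through cuFile) or via a device-to-host copy into pinned scratch memory.
void write_one_chunk(cudf::io::data_sink* sink,
                     uint8_t const* dev_bfr,
                     size_t size,
                     uint8_t* pinned_scratch,
                     rmm::cuda_stream_view stream)
{
  if (sink->is_device_write_preferred(size)) {
    // Large enough chunk: let the sink consume the device pointer directly.
    sink->device_write(dev_bfr, size, stream);
  } else {
    // Small chunk: a D2H copy plus host write beats a device-direct round trip.
    CUDA_TRY(cudaMemcpyAsync(
      pinned_scratch, dev_bfr, size, cudaMemcpyDeviceToHost, stream.value()));
    stream.synchronize();
    sink->host_write(pinned_scratch, size);
  }
}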
@@ -1170,6 +1247,14 @@ void writer::impl::write(table_view const &table) stream.synchronize(); } } else { + if (!host_bfr) { + host_bfr = pinned_buffer{[](size_t size) { + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_chunk_bfr_size), + cudaFreeHost}; + } // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index f5e0f7408c5..b8532d755eb 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -44,7 +44,7 @@ namespace io { namespace detail { namespace parquet { // Forward internal classes -class parquet_column_view; +struct parquet_column_view; using namespace cudf::io::parquet; using namespace cudf::io; @@ -130,7 +130,7 @@ class writer::impl { * @param fragment_size Number of rows per fragment */ void init_page_fragments(hostdevice_vector& frag, - hostdevice_vector& col_desc, + hostdevice_vector& col_desc, uint32_t num_columns, uint32_t num_fragments, uint32_t num_rows, @@ -148,7 +148,7 @@ class writer::impl { */ void gather_fragment_statistics(statistics_chunk* dst_stats, hostdevice_vector& frag, - hostdevice_vector& col_desc, + hostdevice_vector& col_desc, uint32_t num_columns, uint32_t num_fragments, uint32_t fragment_size); @@ -162,7 +162,7 @@ class writer::impl { * @param num_dictionaries Total number of dictionaries */ void build_chunk_dictionaries(hostdevice_vector& chunks, - hostdevice_vector& col_desc, + hostdevice_vector& col_desc, uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_dictionaries); @@ -178,7 +178,7 @@ class writer::impl { * @param num_stats_bfr Number of statistics buffers */ void init_encoder_pages(hostdevice_vector& chunks, - hostdevice_vector& col_desc, + hostdevice_vector& col_desc, gpu::EncPage* pages, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -228,15 +228,9 @@ class writer::impl { // Overall file metadata. Filled in during the process and written during write_chunked_end() cudf::io::parquet::FileMetaData md; // optional user metadata - table_metadata_with_nullability user_metadata_with_nullability; - // only used in the write_chunked() case. copied from the (optionally) user supplied - // argument to write() - table_metadata const* user_metadata = nullptr; + std::unique_ptr table_meta; // to track if the output has been written to sink bool closed = false; - // vector of precision values for decimal writing. Exactly one entry - // per decimal column. - std::vector decimal_precision; // current write position for rowgroups/chunks std::size_t current_chunk_offset; // special parameter only used by detail::write() to indicate that we are guaranteeing diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index b4e26c042fb..88444d41206 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -32,7 +32,7 @@ #include #include -#include +#include namespace cudf { namespace io { @@ -108,7 +108,10 @@ struct column_buffer { size = _size; switch (type.id()) { - case type_id::STRING: _strings.resize(size); break; + case type_id::STRING: + _strings = std::make_unique>(size, stream); + cudaMemsetAsync(_strings->data(), 0, size * sizeof(str_pair), stream.value()); + break; // list columns store a buffer of int32's as offsets to represent // their individual rows @@ -125,8 +128,8 @@ struct column_buffer { } } - auto data() { return _strings.size() ? 
_strings.data().get() : _data.data(); } - auto data_size() { return std::max(_data.size(), _strings.size() * sizeof(str_pair)); } + auto data() { return _strings ? _strings->data() : _data.data(); } + auto data_size() const { return _strings ? _strings->size() : _data.size(); } template auto null_mask() @@ -137,7 +140,7 @@ struct column_buffer { auto& null_count() { return _null_count; } - rmm::device_vector _strings; + std::unique_ptr> _strings; rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; @@ -178,7 +181,7 @@ std::unique_ptr make_column( schema_info->children.push_back(column_name_info{"offsets"}); schema_info->children.push_back(column_name_info{"chars"}); } - return make_strings_column(buffer._strings, stream, mr); + return make_strings_column(*buffer._strings, stream, mr); case type_id::LIST: { // make offsets column diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh index 4f41e846631..c08f42583ef 100644 --- a/cpp/src/io/utilities/column_utils.cuh +++ b/cpp/src/io/utilities/column_utils.cuh @@ -57,27 +57,24 @@ rmm::device_uvector create_leaf_column_device_views( auto leaf_columns = cudf::device_span{leaf_column_views}; auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), - iter, - iter + parent_table_device_view.num_columns(), - [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__( - size_type index) mutable { - column_device_view col = parent_col_view.column(index); - - if (col.type().id() == type_id::LIST) { - col_desc[index].parent_column = parent_col_view.begin() + index; - } else { - col_desc[index].parent_column = nullptr; - } - // traverse till leaf column - while (col.type().id() == type_id::LIST) { - col = col.child(lists_column_view::child_column_index); - } - // Store leaf_column to device storage - column_device_view *leaf_col_ptr = leaf_columns.begin() + index; - *leaf_col_ptr = col; - col_desc[index].leaf_column = leaf_col_ptr; - }); + thrust::for_each( + rmm::exec_policy(stream), + iter, + iter + parent_table_device_view.num_columns(), + [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__( + size_type index) mutable { + col_desc[index].parent_column = parent_col_view.begin() + index; + column_device_view col = parent_col_view.column(index); + // traverse till leaf column + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + col = (col.type().id() == type_id::LIST) ? col.child(lists_column_view::child_column_index) + : col.child(0); + } + // Store leaf_column to device storage + column_device_view *leaf_col_ptr = leaf_columns.begin() + index; + *leaf_col_ptr = col; + col_desc[index].leaf_column = leaf_col_ptr; + }); return leaf_column_views; } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 48558005303..10af7bcb0bd 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
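[Editor's sketch, not part of the diff: the column_utils.cuh hunk above extends the device-side traversal to walk through both LIST and STRUCT levels before storing the leaf column_device_view. Below is a host-side analogue of the same walk; leaf_of is a hypothetical name, and the child(0) step assumes the single-child struct layout this writer constructs.]

#include <cudf/column/column_view.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/utilities/traits.hpp>

// Illustrative only: descend through nested levels until a non-nested leaf remains.
cudf::column_view leaf_of(cudf::column_view col)
{
  while (cudf::is_nested(col.type())) {
    col = (col.type().id() == cudf::type_id::LIST)
            ? col.child(cudf::lists_column_view::child_column_index)
            : col.child(0);  // STRUCT: one child per level in this writer's layout
  }
  return col;
}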
@@ -18,6 +18,7 @@ #include #include +#include #include @@ -29,24 +30,44 @@ namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) + : _cufile_out(detail::make_cufile_output(filepath)) { - outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); - CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); + _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); + CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); } virtual ~file_sink() { flush(); } void host_write(void const* data, size_t size) override { - outfile_.write(static_cast(data), size); + _output_stream.seekp(_bytes_written); + _output_stream.write(static_cast(data), size); + _bytes_written += size; } - void flush() override { outfile_.flush(); } + void flush() override { _output_stream.flush(); } - size_t bytes_written() override { return outfile_.tellp(); } + size_t bytes_written() override { return _bytes_written; } + + bool supports_device_write() const override { return _cufile_out != nullptr; } + + bool is_device_write_preferred(size_t size) const override + { + return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + + _cufile_out->write(gpu_data, _bytes_written, size); + _bytes_written += size; + } private: - std::ofstream outfile_; + std::ofstream _output_stream; + size_t _bytes_written = 0; + std::unique_ptr _cufile_out; }; /** @@ -77,25 +98,25 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : bytes_written_(0) {} + explicit void_sink() : _bytes_written(0) {} virtual ~void_sink() {} - void host_write(void const* data, size_t size) override { bytes_written_ += size; } + void host_write(void const* data, size_t size) override { _bytes_written += size; } bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - bytes_written_ += size; + _bytes_written += size; } void flush() override {} - size_t bytes_written() override { return bytes_written_; } + size_t bytes_written() override { return _bytes_written; } private: - size_t bytes_written_; + size_t _bytes_written; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 74163d023be..3f2884d5b7d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,14 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include -#include #include +#include namespace cudf { namespace io { @@ -34,12 +34,6 @@ namespace io { * mapping a subset of the file where the starting offset may not be zero. 
*/ class memory_mapped_source : public datasource { - struct file_wrapper { - const int fd = -1; - explicit file_wrapper(const char *filepath) : fd(open(filepath, O_RDONLY)) {} - ~file_wrapper() { close(fd); } - }; - class memory_mapped_buffer : public buffer { size_t _size = 0; uint8_t *_data = nullptr; @@ -52,77 +46,99 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) + : _cufile_in(detail::make_cufile_input(filepath)) { - auto const file = file_wrapper(filepath); - CUDF_EXPECTS(file.fd != -1, "Cannot open file"); - - struct stat st; - CUDF_EXPECTS(fstat(file.fd, &st) != -1, "Cannot query file size"); - file_size_ = static_cast(st.st_size); - - if (file_size_ != 0) { map(file.fd, offset, size); } + auto const file = detail::file_wrapper(filepath, O_RDONLY); + _file_size = file.size(); + if (_file_size != 0) { map(file.desc(), offset, size); } } virtual ~memory_mapped_source() { - if (map_addr_ != nullptr) { munmap(map_addr_, map_size_); } + if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(map_addr_) + (offset - map_offset_), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } size_t host_read(size_t offset, size_t size, uint8_t *dst) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(map_addr_) + (offset - map_offset_); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } - size_t size() const override { return file_size_; } + bool supports_device_read() const override { return _cufile_in != nullptr; } + + bool is_device_read_preferred(size_t size) const + { + return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, stream); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, dst, stream); + } + + size_t size() const override { return _file_size; } private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < file_size_, "Offset is past end of file"); + CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); // Offset for `mmap()` must be page 
aligned - auto const map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); + _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); // Clamp length to available data in the file if (size == 0) { - size = file_size_ - offset; + size = _file_size - offset; } else { - if ((offset + size) > file_size_) { size = file_size_ - offset; } + if ((offset + size) > _file_size) { size = _file_size - offset; } } // Size for `mmap()` needs to include the page padding - const auto map_size = size + (offset - map_offset); + _map_size = size + (offset - _map_offset); // Check if accessing a region within already mapped area - map_addr_ = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset); - CUDF_EXPECTS(map_addr_ != MAP_FAILED, "Cannot create memory mapping"); - map_offset_ = map_offset; - map_size_ = map_size; + _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); + CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } private: - size_t file_size_ = 0; - void *map_addr_ = nullptr; - size_t map_size_ = 0; - size_t map_offset_ = 0; + size_t _file_size = 0; + void *_map_addr = nullptr; + size_t _map_size = 0; + size_t _map_offset = 0; + std::unique_ptr _cufile_in; }; /** @@ -148,14 +164,19 @@ class user_datasource_wrapper : public datasource { bool supports_device_read() const override { return source->supports_device_read(); } - size_t device_read(size_t offset, size_t size, uint8_t *dst) override + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size, dst); + return source->device_read(offset, size, dst, stream); } - std::unique_ptr device_read(size_t offset, size_t size) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size); + return source->device_read(offset, size, stream); } size_t size() const override { return source->size(); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp new file mode 100644 index 00000000000..22ff057cbc1 --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +file_wrapper::file_wrapper(std::string const &filepath, int flags) + : fd(open(filepath.c_str(), flags)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) + : fd(open(filepath.c_str(), flags, mode)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::~file_wrapper() { close(fd); } + +long file_wrapper::size() const +{ + if (_size < 0) { + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + _size = static_cast(st.st_size); + } + return _size; +} + +#ifdef CUFILE_FOUND + +/** + * @brief Class that manages cuFile configuration. + */ +class cufile_config { + std::string const default_policy = "OFF"; + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + + std::string const policy = default_policy; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + std::string getenv_or(std::string const &env_var_name, std::string const &default_val) + { + auto const env_val = std::getenv(env_var_name.c_str()); + return (env_val == nullptr) ? default_val : std::string(env_val); + } + + cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} + { + if (is_enabled()) { + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value + // Enable compatibility mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } + } + + public: + /** + * @brief Returns true when cuFile use is enabled. + */ + bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } + + /** + * @brief Returns true when cuDF should not fall back to host IO. + */ + bool is_required() const { return policy == "ALWAYS"; } + + static cufile_config const *instance() + { + static cufile_config _instance; + return &_instance; + } +}; + +/** + * @brief Class that dynamically loads the cuFile library and manages the cuFile driver.
+ */ +class cufile_shim { + private: + cufile_shim(); + + void *cf_lib = nullptr; + decltype(cuFileDriverOpen) *driver_open = nullptr; + decltype(cuFileDriverClose) *driver_close = nullptr; + + std::unique_ptr init_error; + auto is_valid() const noexcept { return init_error == nullptr; } + + public: + cufile_shim(cufile_shim const &) = delete; + cufile_shim &operator=(cufile_shim const &) = delete; + + static cufile_shim const *instance(); + + ~cufile_shim() + { + if (driver_close != nullptr) driver_close(); + if (cf_lib != nullptr) dlclose(cf_lib); + } + + decltype(cuFileHandleRegister) *handle_register = nullptr; + decltype(cuFileHandleDeregister) *handle_deregister = nullptr; + decltype(cuFileRead) *read = nullptr; + decltype(cuFileWrite) *write = nullptr; +}; + +cufile_shim::cufile_shim() +{ + try { + cf_lib = dlopen("libcufile.so", RTLD_NOW); + CUDF_EXPECTS(cf_lib != nullptr, "could not load libcufile.so"); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, + "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); + + CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + } catch (cudf::logic_error const &err) { + init_error = std::make_unique(err); + } +} + +cufile_shim const *cufile_shim::instance() +{ + static cufile_shim _instance; + // Defer throwing to avoid repeated attempts to load the library + if (!_instance.is_valid()) CUDF_FAIL("" + std::string(_instance.init_error->what())); + + return &_instance; +} + +void cufile_registered_file::register_handle() +{ + CUfileDescr_t cufile_desc{}; + cufile_desc.handle.fd = _file.desc(); + cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(shim->handle_register(&cf_handle, &cufile_desc).err == CU_FILE_SUCCESS, + "Cannot register file handle with cuFile"); +} + +cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } + +cufile_input_impl::cufile_input_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) +{ +} + +std::unique_ptr cufile_input_impl::read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) +{ + rmm::device_buffer out_data(size, stream); + CUDF_EXPECTS(shim->read(cf_file.handle(), out_data.data(), size, offset, 0) != -1, + "cuFile error reading from a file"); + + return datasource::buffer::create(std::move(out_data)); +} + +size_t cufile_input_impl::read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, + "cuFile error reading from a file"); + // always read the requested size for now + return size; +} + +cufile_output_impl::cufile_output_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) +{ +} + +void cufile_output_impl::write(void const *data, size_t offset, size_t size) +{ + CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, + "cuFile error writing to a file"); +} +#endif + +std::unique_ptr make_cufile_input(std::string const &filepath) +{ +#ifdef CUFILE_FOUND + if (cufile_config::instance()->is_enabled()) { + try { + return std::make_unique(filepath); + } catch (...) { + if (cufile_config::instance()->is_required()) throw; + } + } +#endif + return nullptr; +} + +std::unique_ptr make_cufile_output(std::string const &filepath) +{ +#ifdef CUFILE_FOUND + if (cufile_config::instance()->is_enabled()) { + try { + return std::make_unique(filepath); + } catch (...) { + if (cufile_config::instance()->is_required()) throw; + } + } +#endif + return nullptr; +} + +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp new file mode 100644 index 00000000000..85399bdd44d --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef CUFILE_FOUND +#include +#endif + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +/** + * @brief Class that provides RAII for file handling. + */ +class file_wrapper { + int const fd = -1; + long mutable _size = -1; + + public: + explicit file_wrapper(std::string const &filepath, int flags); + explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); + ~file_wrapper(); + long size() const; + auto desc() const { return fd; } +}; + +/** + * @brief Base class for cuFile input/output. + * + * Contains the common API for cuFile input and output classes. + */ +class cufile_io_base { + public: + /** + * @brief Returns an estimate of whether the cuFile operation is the optimal option. + * + * @param size Read/write operation size, in bytes. + * @return Whether a cuFile operation with the given size is expected to be faster than a host + * read + H2D copy + */ + static bool is_cufile_io_preferred(size_t size) { return size > op_size_threshold; } + + protected: + /** + * @brief The read/write size above which cuFile is faster than host read + copy + * + * This may not be the optimal threshold for all systems. Derived `is_cufile_io_preferred` + * implementations can use different logic. + */ + static constexpr size_t op_size_threshold = 128 << 10; +}; + +/** + * @brief Interface class for cufile input. + */ +class cufile_input : public cufile_io_base { + public: + /** + * @brief Reads into a new device buffer.
+ * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use + * + * @return The data buffer in the device memory + */ + virtual std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) = 0; + + /** + * @brief Reads into existing device memory. + * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use + * + * @return The number of bytes read + */ + virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; +}; + +/** + * @brief Interface class for cufile output. + */ +class cufile_output : public cufile_io_base { + public: + /** + * @brief Writes the data from a device buffer into a file. + * + * @throws cudf::logic_error on cuFile error + * + * @param data Pointer to the buffer to be written into the output file + * @param offset Number of bytes from the start + * @param size Number of bytes to write + */ + virtual void write(void const *data, size_t offset, size_t size) = 0; +}; + +#ifdef CUFILE_FOUND + +class cufile_shim; + +/** + * @brief Class that provides RAII for cuFile file registration. + */ +struct cufile_registered_file { + void register_handle(); + + public: + cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + : _file(filepath, flags), shim{shim} + { + register_handle(); + } + + cufile_registered_file(cufile_shim const *shim, + std::string const &filepath, + int flags, + mode_t mode) + : _file(filepath, flags, mode), shim{shim} + { + register_handle(); + } + + auto const &handle() const noexcept { return cf_handle; } + + ~cufile_registered_file(); + + private: + file_wrapper const _file; + CUfileHandle_t cf_handle = nullptr; + cufile_shim const *shim = nullptr; +}; + +/** + * @brief Adapter for the `cuFileRead` API. + * + * Exposes APIs to read directly from a file into device memory. + */ +class cufile_input_impl final : public cufile_input { + public: + cufile_input_impl(std::string const &filepath); + + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; + +/** + * @brief Adapter for the `cuFileWrite` API. + * + * Exposes an API to write directly into a file from device memory. 
+ */ +class cufile_output_impl final : public cufile_output { + public: + cufile_output_impl(std::string const &filepath); + + void write(void const *data, size_t offset, size_t size) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; +#else + +class cufile_input_impl final : public cufile_input { + public: + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; + +class cufile_output_impl final : public cufile_output { + public: + void write(void const *data, size_t offset, size_t size) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; +#endif + +/** + * @brief Creates a `cufile_input_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_input(std::string const &filepath); + +/** + * @brief Creates a `cufile_output_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_output(std::string const &filepath); + +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index c2c32d4165a..b64e91c18bd 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -287,7 +287,7 @@ std::pair, rmm::device_vector> probe_jo constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(probe_table.num_rows(), block_size); - write_index.set_value(0, stream); + write_index.set_value_zero(stream); row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 712d771bd73..b37f228f6d3 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -120,7 +120,7 @@ size_type estimate_join_output_size(table_device_view build_table, do { sample_probe_num_rows = std::min(sample_probe_num_rows, probe_table_num_rows); - size_estimate.set_value(0, stream); + size_estimate.set_value_zero(stream); row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; diff --git a/cpp/src/join/nested_loop_join.cuh b/cpp/src/join/nested_loop_join.cuh index 03d684f91d4..580017a6704 100644 --- a/cpp/src/join/nested_loop_join.cuh +++ b/cpp/src/join/nested_loop_join.cuh @@ -89,7 +89,7 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, int num_sms{-1}; CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - size_estimate.set_value(0, stream); + size_estimate.set_value_zero(stream); row_equality equality{left, right, compare_nulls == null_equality::EQUAL}; // Determine number of output rows without actually building the output to simply @@ -163,7 +163,7 @@ get_base_nested_loop_join_indices(table_view const& left, constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(left_table->num_rows(), block_size); - write_index.set_value(0); + write_index.set_value_zero(stream); row_equality 
equality{*left_table, *right_table, compare_nulls == null_equality::EQUAL}; const auto& join_output_l = @@ -182,7 +182,7 @@ get_base_nested_loop_join_indices(table_view const& left, CHECK_CUDA(stream.value()); - join_size = write_index.value(); + join_size = write_index.value(stream); current_estimated_size = estimated_size; estimated_size *= 2; } while ((current_estimated_size < join_size)); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index c1ebc9f3f9f..b8cb5e45fec 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -110,8 +110,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, using sort_groupby_helper = cudf::groupby::detail::sort::sort_groupby_helper; sort_groupby_helper helper{group_keys, cudf::null_policy::INCLUDE, cudf::sorted::YES}; - auto const& group_offsets{helper.group_offsets()}; - auto const& group_labels{helper.group_labels()}; + auto const& group_offsets{helper.group_offsets(stream)}; + auto const& group_labels{helper.group_labels(stream)}; // `group_offsets` are interpreted in adjacent pairs, each pair representing the offsets // of the first, and one past the last elements in a group. @@ -127,8 +127,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, // groups.) // 3. [0, 500, 1000] indicates two equal-sized groups: [0,500), and [500,1000). - assert(group_offsets.size() >= 2 && group_offsets[0] == 0 && - group_offsets[group_offsets.size() - 1] == input.size() && + assert(group_offsets.size() >= 2 && group_offsets.element(0, stream) == 0 && + group_offsets.element(group_offsets.size() - 1, stream) == input.size() && "Must have at least one group."); auto preceding_calculator = [d_group_offsets = group_offsets.data(), diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
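The join and rolling-window changes above all follow one pattern: every device scalar access now names the stream it runs on (set_value_zero(stream), value(stream), group_offsets(stream)) instead of implicitly touching the default stream. A condensed sketch of the idiom, assuming the rmm::device_scalar API this diff targets (count_matches and the kernel launch are illustrative placeholders):

#include <cudf/types.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

cudf::size_type count_matches(rmm::cuda_stream_view stream)
{
  rmm::device_scalar<cudf::size_type> write_index(stream);
  write_index.set_value_zero(stream);  // async zero-fill, ordered on `stream`
  // ... launch kernels on `stream` that atomicAdd into write_index.data() ...
  return write_index.value(stream);    // D2H copy + sync on the same stream
}

Keeping the reset, the kernels, and the read-back on one stream is what makes the earlier `write_index.set_value(0)` call (which omitted the stream) a correctness hazard that this diff removes.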
@@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - -std::unique_ptr 
is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 4e61d4d8c41..f9b8b9e0ea3 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -31,7 +32,7 @@ #include #include -#include +#include #include #include @@ -50,7 +51,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto num_columns = strings_columns.num_columns(); + auto const num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), @@ -59,7 +60,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, "All columns must be of type string"); if (num_columns == 1) // single strings column returns a copy return std::make_unique(*(strings_columns.begin()), stream, mr); - auto strings_count = strings_columns.num_rows(); + auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -88,12 +89,12 @@ std::unique_ptr concatenate(table_view const& strings_columns, // build offsets column by computing sizes of each string in the output auto offsets_transformer = [d_table, d_separator, d_narep] __device__(size_type row_idx) { // for this row (idx), iterate over each column and add up the bytes - bool null_element = + bool const null_element = thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [row_idx](auto const& d_column) { return d_column.is_null(row_idx); }); if (null_element && !d_narep.is_valid()) return 0; - size_type bytes = thrust::transform_reduce( + size_type const bytes = thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), @@ -105,9 +106,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, 0, thrust::plus()); // separator goes only in between elements - if (bytes > 0) // if not null - bytes -= d_separator.size_bytes(); // remove the last separator - return bytes; + return bytes == 0 ? 
0 : (bytes - d_separator.size_bytes()); // remove the last separator }; auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); @@ -116,7 +115,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto d_results_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // fill the chars column @@ -127,18 +127,17 @@ std::unique_ptr concatenate(table_view const& strings_columns, strings_count, [d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__( size_type idx) { - bool null_element = thrust::any_of( + bool const null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](column_device_view const& col) { return col.is_null(idx); }); if (null_element && !d_narep.is_valid()) return; // do not write to buffer at all if any column element for this row is null - size_type offset = d_results_offsets[idx]; - char* d_buffer = d_results_chars + offset; + char* d_buffer = d_results_chars + d_results_offsets[idx]; // write out each column's entry for this row for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) { - auto d_column = d_table.column(col_idx); - string_view d_str = + auto const d_column = d_table.column(col_idx); + string_view const d_str = d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); d_buffer = detail::copy_string(d_buffer, d_str); // separator goes only in between elements @@ -173,8 +172,8 @@ std::unique_ptr join_strings(strings_column_view const& strings, auto d_strings = *strings_column; // create an offsets array for building the output memory layout - rmm::device_vector output_offsets(strings_count + 1); - auto d_output_offsets = output_offsets.data().get(); + rmm::device_uvector output_offsets(strings_count + 1, stream); + auto d_output_offsets = output_offsets.data(); // using inclusive-scan to compute last entry which is the total size thrust::transform_inclusive_scan( rmm::exec_policy(stream), @@ -192,9 +191,12 @@ std::unique_ptr join_strings(strings_column_view const& strings, return bytes; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream.value())); + size_type const zero = 0; + output_offsets.set_element_async(0, zero, stream); // total size is the last entry - size_type bytes = output_offsets.back(); + // Note this call does a synchronize on the stream and thereby also protects the + // set_element_async parameter from going out of scope before it is used. 
+ size_type const bytes = output_offsets.back_element(stream); // build offsets column (only 1 string so 2 offset entries) auto offsets_column = @@ -254,7 +256,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto num_columns = strings_columns.num_columns(); + auto const num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); // Check if all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), @@ -262,7 +264,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - auto strings_count = strings_columns.num_rows(); + auto const strings_count = strings_columns.num_rows(); CUDF_EXPECTS(strings_count == separators.size(), "Separators column should be the same size as the strings columns"); if (strings_count == 0) // Empty begets empty @@ -277,7 +279,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, if (num_columns == 1) { // Shallow copy of the resultant strings - rmm::device_vector out_col_strings(strings_count); + rmm::device_uvector out_col_strings(strings_count, stream); // Device view of the only column in the table view auto const col0_ptr = column_device_view::create(strings_columns.column(0), stream); @@ -288,7 +290,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - out_col_strings.data().get(), + out_col_strings.begin(), // Output depends on the separator [col0, invalid_str, separator_col_view, separator_rep, col_rep] __device__(auto ridx) { if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return invalid_str; @@ -334,7 +336,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return 0; // For this row (idx), iterate over each column and add up the bytes - bool all_nulls = + bool const all_nulls = thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& d_column) { return d_column.is_null(ridx); }); @@ -343,11 +345,11 @@ std::unique_ptr concatenate(table_view const& strings_columns, if (all_nulls && !col_rep.is_valid()) return 0; // There is at least one non-null column value (it can still be empty though) - auto separator_str = separator_col_view.is_valid(ridx) - ? separator_col_view.element(ridx) - : separator_rep.value(); + auto const separator_str = separator_col_view.is_valid(ridx) + ? 
separator_col_view.element(ridx) + : separator_rep.value(); - size_type bytes = thrust::transform_reduce( + size_type const bytes = thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), @@ -395,7 +397,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // to replace, do not write anything for this row if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return; - bool all_nulls = thrust::all_of( + bool const all_nulls = thrust::all_of( thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { return col.is_null(ridx); }); @@ -404,29 +406,27 @@ std::unique_ptr concatenate(table_view const& strings_columns, // skip this row if (all_nulls && !col_rep.is_valid()) return; - size_type offset = d_results_offsets[ridx]; - char* d_buffer = d_results_chars + offset; + char* d_buffer = d_results_chars + d_results_offsets[ridx]; bool colval_written = false; // There is at least one non-null column value (it can still be empty though) - auto separator_str = separator_col_view.is_valid(ridx) - ? separator_col_view.element(ridx) - : separator_rep.value(); + auto const separator_str = separator_col_view.is_valid(ridx) + ? separator_col_view.element(ridx) + : separator_rep.value(); // Write out each column's entry for this row for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) { - auto d_column = d_table.column(col_idx); - // If the column isn't valid and if there isn't a replacement for it, skip - // it + auto const d_column = d_table.column(col_idx); + // If the row is null and if there is no replacement, skip it if (d_column.is_null(ridx) && !col_rep.is_valid()) continue; // Separator goes only in between elements if (colval_written) d_buffer = detail::copy_string(d_buffer, separator_str); - string_view d_str = d_column.is_null(ridx) - ? col_rep.value() - : d_column.element(ridx); + string_view const d_str = d_column.is_null(ridx) + ? 
col_rep.value() + : d_column.element(ridx); d_buffer = detail::copy_string(d_buffer, d_str); colval_written = true; } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
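Net effect of the relocations above: cudf::strings::is_integer and cudf::strings::is_float now live beside the conversion kernels they validate for, while the public API is unchanged. A usage sketch (hypothetical caller, assuming this branch's public headers):

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Returns a BOOL8 column: true where the row parses as an integer.
// Null input rows stay null (the bitmask is copied from the input).
std::unique_ptr<cudf::column> validate_ints(cudf::strings_column_view const& input)
{
  return cudf::strings::is_integer(input);
}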
@@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 68080c0eb89..f712b0cb6aa 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,17 +43,25 @@ namespace { * using the provided start, stop, and step parameters. */ struct substring_fn { - const column_device_view d_column; - numeric_scalar_device_view d_start, d_stop, d_step; - const int32_t* d_offsets{}; + column_device_view const d_column; + numeric_scalar_device_view const d_start; + numeric_scalar_device_view const d_stop; + numeric_scalar_device_view const d_step; + int32_t* d_offsets{}; char* d_chars{}; - __device__ cudf::size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); auto const length = d_str.length(); - if (length == 0) return 0; // empty string + if (length == 0) { + if (!d_chars) d_offsets[idx] = 0; + return; + } size_type const step = d_step.is_valid() ? 
d_step.value() : 1; auto const begin = [&] { // always inclusive // when invalid, default depends on step @@ -88,7 +96,7 @@ struct substring_fn { if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer); itr += step; } - return bytes; + if (!d_chars) d_offsets[idx] = bytes; } }; @@ -103,42 +111,26 @@ std::unique_ptr slice_strings( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto d_start = get_scalar_device_view(const_cast&>(start)); - auto d_stop = get_scalar_device_view(const_cast&>(stop)); - auto d_step = get_scalar_device_view(const_cast&>(step)); - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_new_offsets = offsets_column->view().data(); - - // build chars column - auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars}); + auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_start = get_scalar_device_view(const_cast&>(start)); + auto const d_stop = get_scalar_device_view(const_cast&>(stop)); + auto const d_step = get_scalar_device_view(const_cast&>(step)); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step}, + strings.size(), + strings.null_count(), + stream, + mr); + + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -166,25 +158,33 @@ namespace { * This both calculates the output size and executes the substring. 
*/ struct substring_from_fn { - const column_device_view d_column; - const cudf::detail::input_indexalator starts; - const cudf::detail::input_indexalator stops; - const int32_t* d_offsets{}; + column_device_view const d_column; + cudf::detail::input_indexalator const starts; + cudf::detail::input_indexalator const stops; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); auto const length = d_str.length(); auto const start = starts[idx]; - if (start >= length) return 0; // empty string + if (start >= length) { + if (!d_chars) d_offsets[idx] = 0; + return; + } auto const stop = stops[idx]; auto const end = (((stop < 0) || (stop > length)) ? length : stop); - string_view d_substr = d_str.substr(start, end - start); - if (d_chars) memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); - return d_substr.size_bytes(); + auto const d_substr = d_str.substr(start, end - start); + if (d_chars) + memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); + else + d_offsets[idx] = d_substr.size_bytes(); } }; @@ -212,32 +212,18 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c auto strings_count = d_column.size(); // Copy the null mask - rmm::device_buffer null_mask{0, stream, mr}; - if (d_column.nullable()) - null_mask = rmm::device_buffer( - d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - - // Build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_new_offsets = offsets_column->view().data(); - - // Build chars column - auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); + rmm::device_buffer null_mask = + !d_column.nullable() + ? 
rmm::device_buffer{0, stream, mr} + : rmm::device_buffer( + d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); + + auto children = make_strings_children( + substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr); return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + std::move(children.first), + std::move(children.second), null_count, std::move(null_mask), stream, diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 420bdc3e3ba..995ee94472f 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -100,6 +100,25 @@ std::unique_ptr create_compressible_fixed_table(cudf::size_type num return create_fixed_table(num_columns, num_rows, include_validity, compressible_elements); } +void compare_metadata_equality(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta) +{ + std::function compare_names = + [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { + if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } + EXPECT_EQ(out_col.children.size(), in_col.num_children()); + for (size_t i = 0; i < out_col.children.size(); ++i) { + compare_names(out_col.children[i], in_col.child(i)); + } + }; + + EXPECT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); + + for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { + compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); + } +} + // Base test fixture for tests struct ParquetWriterTest : public cudf::test::BaseFixture { }; @@ -308,16 +327,6 @@ TEST_F(ParquetWriterTest, MultiColumn) column_wrapper col6{col6_data, col6_data + num_rows, validity}; column_wrapper col7{col7_data, col7_data + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - // expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal32s"); - expected_metadata.column_names.emplace_back("decimal64s"); - std::vector> cols; // cols.push_back(col0.release()); cols.push_back(col1.release()); @@ -330,12 +339,20 @@ TEST_F(ParquetWriterTest, MultiColumn) auto expected = std::make_unique
(std::move(cols)); EXPECT_EQ(7, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(*expected); + // expected_metadata.column_metadata[0].set_name( "bools"); + expected_metadata.column_metadata[0].set_name("int8s"); + expected_metadata.column_metadata[1].set_name("int16s"); + expected_metadata.column_metadata[2].set_name("int32s"); + expected_metadata.column_metadata[3].set_name("floats"); + expected_metadata.column_metadata[4].set_name("doubles"); + expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(10); + expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); - std::vector precisions = {10, 20}; - out_opts.set_decimal_precision(precisions); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -343,7 +360,7 @@ TEST_F(ParquetWriterTest, MultiColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + compare_metadata_equality(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiColumnWithNulls) @@ -390,16 +407,6 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; - cudf_io::table_metadata expected_metadata; - // expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal32s"); - expected_metadata.column_names.emplace_back("decimal64s"); - std::vector> cols; // cols.push_back(col0.release()); cols.push_back(col1.release()); @@ -412,12 +419,20 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto expected = std::make_unique
(std::move(cols));
   EXPECT_EQ(7, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  // expected_metadata.column_metadata[0].set_name("bools");
+  expected_metadata.column_metadata[0].set_name("int8s");
+  expected_metadata.column_metadata[1].set_name("int16s");
+  expected_metadata.column_metadata[2].set_name("int32s");
+  expected_metadata.column_metadata[3].set_name("floats");
+  expected_metadata.column_metadata[4].set_name("doubles");
+  expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(9);
+  expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20);
+
   auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
       .metadata(&expected_metadata);
-  std::vector precisions = {9, 20};
-  out_opts.set_decimal_precision(precisions);
 
   cudf_io::write_parquet(out_opts);
 
@@ -426,7 +441,10 @@
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  // TODO: Need to be able to return metadata in tree form from reader so they can be compared.
+  // Unfortunately the closest thing to a hierarchical schema is column_name_info, which does not
+  // have any tests for it in C++ or Python.
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, Strings)
@@ -443,11 +461,6 @@ TEST_F(ParquetWriterTest, Strings)
   column_wrapper col1{strings.begin(), strings.end()};
   column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("col_other");
-  expected_metadata.column_names.emplace_back("col_string");
-  expected_metadata.column_names.emplace_back("col_another");
-
   std::vector<std::unique_ptr<column>> cols;
   cols.push_back(col0.release());
   cols.push_back(col1.release());
   cols.push_back(col2.release());
   auto expected = std::make_unique<table>(std::move(cols));
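The rewritten tests above illustrate the new metadata API: table_input_metadata is constructed from the table itself and mirrors its column tree, so names and per-column options are set through column_metadata and child() instead of pushing into a flat column_names vector. A condensed sketch of the idiom (column names are illustrative):

#include <cudf/io/types.hpp>
#include <cudf/table/table_view.hpp>

void name_columns(cudf::table_view const& tbl)
{
  cudf::io::table_input_metadata meta(tbl);  // one entry per top-level column
  meta.column_metadata[0].set_name("id");
  // Nested children are addressed through the tree; the setters chain.
  meta.column_metadata[1].set_name("person").child(0).set_name("age");
  // `meta` is then passed to the writer via .metadata(&meta) on the builder.
}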
   EXPECT_EQ(3, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  expected_metadata.column_metadata[0].set_name("col_other");
+  expected_metadata.column_metadata[1].set_name("col_string");
+  expected_metadata.column_metadata[2].set_name("col_another");
+
   auto filepath = temp_env->get_temp_filepath("Strings.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
       .metadata(&expected_metadata);
@@ -466,7 +484,7 @@
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, SlicedTable)
@@ -479,7 +497,8 @@
   auto seq_col0 = random_values(num_rows);
   auto seq_col2 = random_values(num_rows);
-  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+  auto validity =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; });
 
   column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity};
   column_wrapper col1{strings.begin(), strings.end()};
   column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity};
@@ -510,17 +529,64 @@
   },
   valids2};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("col_other");
-  expected_metadata.column_names.emplace_back("col_string");
-  expected_metadata.column_names.emplace_back("col_another");
-  expected_metadata.column_names.emplace_back("col_list");
-  expected_metadata.column_names.emplace_back("col_multi_level_list");
+  // Struct column
+  auto ages_col = cudf::test::fixed_width_column_wrapper<int>{
+    {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}};
 
-  auto expected = table_view({col0, col1, col2, col3, col4});
+  auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}};
+
+  // Struct/List mixed column
+
+  // []
+  // [NULL, 2, NULL]
+  // [4, 5]
+  // NULL
+  // []
+  // [7, 8, 9]
+  // [10]
+  // [11, 12]
+  lcw land{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}, {10}, {11, 12}}, valids2};
+
+  // []
+  // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]]
+  // [[7, 8], []]
+  // [[]]
+  // [[]]
+  // [[], [], []]
+  // [[10]]
+  // [[13, 14], [15]]
+  lcw flats{lcw{},
+            {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}},
+            {{7, 8}, {}},
+            lcw{lcw{}},
+            lcw{lcw{}},
+            lcw{lcw{}, lcw{}, lcw{}},
+            {lcw{10}},
+            {{13, 14}, {15}}};
+
+  auto struct_1 = cudf::test::structs_column_wrapper{land, flats};
+  auto is_human = cudf::test::fixed_width_column_wrapper<bool>{
+    {true, true, false, false, true, false, true, false}};
+  auto col6 = cudf::test::structs_column_wrapper{{is_human, struct_1}};
+
+  auto expected = table_view({col0, col1, col2, col3, col4, col5, col6});
+
+  // auto expected_slice = expected;
   auto expected_slice = cudf::slice(expected, {2, static_cast<cudf::size_type>(num_rows) - 1});
 
+  cudf_io::table_input_metadata expected_metadata(expected_slice);
+  expected_metadata.column_metadata[0].set_name("col_other");
+  expected_metadata.column_metadata[1].set_name("col_string");
+  expected_metadata.column_metadata[2].set_name("col_another");
+  expected_metadata.column_metadata[3].set_name("col_list");
+  expected_metadata.column_metadata[4].set_name("col_multi_level_list");
+  expected_metadata.column_metadata[5].set_name("col_struct");
+  expected_metadata.column_metadata[6].set_name("col_struct_list");
+
expected_metadata.column_metadata[6].child(0).set_name("human?"); + expected_metadata.column_metadata[6].child(1).set_name("particulars"); + expected_metadata.column_metadata[6].child(1).child(0).set_name("land"); + expected_metadata.column_metadata[6].child(1).child(1).set_name("flats"); + auto filepath = temp_env->get_temp_filepath("SlicedTable.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected_slice) @@ -532,7 +598,7 @@ TEST_F(ParquetWriterTest, SlicedTable) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + compare_metadata_equality(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListColumn) @@ -607,18 +673,18 @@ TEST_F(ParquetWriterTest, ListColumn) }, valids2}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_list_int_0"); - expected_metadata.column_names.emplace_back("col_list_list_int_1"); - expected_metadata.column_names.emplace_back("col_list_list_int_nullable_2"); - expected_metadata.column_names.emplace_back("col_list_list_nullable_double_nullable_3"); - // expected_metadata.column_names.emplace_back("col_list_list_uint16_4"); - expected_metadata.column_names.emplace_back("col_list_nullable_list_nullable_int_nullable_5"); - expected_metadata.column_names.emplace_back("col_list_list_string_6"); - expected_metadata.column_names.emplace_back("col_list_list_list_7"); - table_view expected({col0, col1, col2, col3, /* col4, */ col5, col6, col7}); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_list_int_0"); + expected_metadata.column_metadata[1].set_name("col_list_list_int_1"); + expected_metadata.column_metadata[2].set_name("col_list_list_int_nullable_2"); + expected_metadata.column_metadata[3].set_name("col_list_list_nullable_double_nullable_3"); + // expected_metadata.column_metadata[0].set_name("col_list_list_uint16_4"); + expected_metadata.column_metadata[4].set_name("col_list_nullable_list_nullable_int_nullable_5"); + expected_metadata.column_metadata[5].set_name("col_list_list_string_6"); + expected_metadata.column_metadata[6].set_name("col_list_list_list_7"); + auto filepath = temp_env->get_temp_filepath("ListColumn.parquet"); auto out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata) @@ -630,7 +696,7 @@ TEST_F(ParquetWriterTest, ListColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + compare_metadata_equality(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiIndex) @@ -650,15 +716,6 @@ TEST_F(ParquetWriterTest, MultiIndex) column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.user_data.insert( - {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"}); - 
std::vector> cols; cols.push_back(col1.release()); cols.push_back(col2.release()); @@ -668,6 +725,15 @@ TEST_F(ParquetWriterTest, MultiIndex) auto expected = std::make_unique
(std::move(cols)); EXPECT_EQ(5, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(*expected); + expected_metadata.column_metadata[0].set_name("int8s"); + expected_metadata.column_metadata[1].set_name("int16s"); + expected_metadata.column_metadata[2].set_name("int32s"); + expected_metadata.column_metadata[3].set_name("floats"); + expected_metadata.column_metadata[4].set_name("doubles"); + expected_metadata.user_data.insert( + {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"}); + auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) @@ -681,7 +747,7 @@ TEST_F(ParquetWriterTest, MultiIndex) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + compare_metadata_equality(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, HostBuffer) @@ -692,14 +758,14 @@ TEST_F(ParquetWriterTest, HostBuffer) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper col{seq_col.begin(), seq_col.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - std::vector> cols; cols.push_back(col.release()); const auto expected = std::make_unique
(std::move(cols)); EXPECT_EQ(1, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(*expected); + expected_metadata.column_metadata[0].set_name("col_other"); + std::vector out_buffer; cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), expected->view()) @@ -710,7 +776,7 @@ TEST_F(ParquetWriterTest, HostBuffer) const auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + compare_metadata_equality(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, NonNullable) @@ -730,6 +796,175 @@ TEST_F(ParquetWriterTest, NonNullable) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } +TEST_F(ParquetWriterTest, Struct) +{ + // Struct> + + auto names = {"Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Uberwald", + "Cheery Littlebottom", + "Detritus", + "Mr Slant"}; + + // `Name` column has all valid values. + auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{names_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto expected = table_view({*struct_2}); + + auto filepath = temp_env->get_temp_filepath("Struct.parquet"); + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected); + cudf_io::write_parquet(args); + + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)); + cudf_io::read_parquet(read_args); +} + +TEST_F(ParquetWriterTest, StructOfList) +{ + // Struct>, + // flats:List> + // > + // > + + auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // [] + // [NULL, 2, NULL] + // [4, 5] + // NULL + // [] + // [7, 8, 9] + lcw land_unit{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}}, valids2}; + + // [] + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8], []] + // [[]] + // [[]] + // [[], [], []] + lcw flats{lcw{}, + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, + {{7, 8}, {}}, + lcw{lcw{}}, + lcw{lcw{}}, + lcw{lcw{}, lcw{}, lcw{}}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col, land_unit, flats}, + {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + // cudf::test::print(struct_2->child(1).child(2)); + + auto expected = table_view({*struct_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + 
expected_metadata.column_metadata[0].child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); + expected_metadata.column_metadata[0].child(1).child(2).set_name("land_unit"); + expected_metadata.column_metadata[0].child(1).child(3).set_name("flats"); + + auto filepath = temp_env->get_temp_filepath("StructOfList.parquet"); + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected) + .metadata(&expected_metadata); + cudf_io::write_parquet(args); + + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)); + const auto result = cudf_io::read_parquet(read_args); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); +} + +TEST_F(ParquetWriterTest, ListOfStruct) +{ + // List + // > + // > + + auto weight_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weight_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 2, 5, 5, 6}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto list_col = cudf::make_lists_column(num_list_rows, + std::move(list_offsets_column), + std::move(struct_2), + cudf::UNKNOWN_NULL_COUNT, + {}); + + auto expected = table_view({*list_col}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("family"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ListOfStruct.parquet"); + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected) + .metadata(&expected_metadata); + cudf_io::write_parquet(args); + + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)); + const auto result = cudf_io::read_parquet(read_args); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); +} + // custom data sink that supports device writes. uses plain file io. 
class custom_test_data_sink : public cudf::io::data_sink { public: @@ -1055,6 +1290,168 @@ TEST_F(ParquetChunkedWriterTest, ListColumn) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } +TEST_F(ParquetChunkedWriterTest, ListOfStruct) +{ + // Table 1 + auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + + auto list_offsets_column_1 = + cudf::test::fixed_width_column_wrapper{0, 2, 3, 3}.release(); + auto num_list_rows_1 = list_offsets_column_1->size() - 1; + + auto list_col_1 = cudf::make_lists_column(num_list_rows_1, + std::move(list_offsets_column_1), + struct_2_1.release(), + cudf::UNKNOWN_NULL_COUNT, + {}); + + auto table_1 = table_view({*list_col_1}); + + // Table 2 + auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + + auto list_offsets_column_2 = + cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}.release(); + auto num_list_rows_2 = list_offsets_column_2->size() - 1; + + auto list_col_2 = cudf::make_lists_column(num_list_rows_2, + std::move(list_offsets_column_2), + struct_2_2.release(), + cudf::UNKNOWN_NULL_COUNT, + {}); + + auto table_2 = table_view({*list_col_2}); + + auto full_table = cudf::concatenate({table_1, table_2}); + + cudf_io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("family"); + expected_metadata.column_metadata[0].child(1).set_nullability(false); + expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ChunkedListOfStruct.parquet"); + cudf_io::chunked_parquet_writer_options args = + cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + args.set_metadata(&expected_metadata); + cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf_io::parquet_reader_options read_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + compare_metadata_equality(expected_metadata, result.metadata); +} + +TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) +{ + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // Table 1 =========================== + + // [] + // [NULL, 2, NULL] + // [4, 5] + // NULL + lcw land_1{{{}, {{1, 2, 3}, valids}, {4, 5}, {}}, valids2}; + + // [] + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 
8], []] + // [[]] + lcw flats_1{lcw{}, {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}, {}}, lcw{lcw{}}}; + + auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3, 1.1}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5, 31}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1, land_1, flats_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + + auto list_offsets_column_1 = + cudf::test::fixed_width_column_wrapper{0, 2, 3, 4}.release(); + auto num_list_rows_1 = list_offsets_column_1->size() - 1; + + auto list_col_1 = cudf::make_lists_column(num_list_rows_1, + std::move(list_offsets_column_1), + struct_2_1.release(), + cudf::UNKNOWN_NULL_COUNT, + {}); + + auto table_1 = table_view({*list_col_1}); + + // Table 2 =========================== + + // [] + // [7, 8, 9] + lcw land_2{{}, {7, 8, 9}}; + + // [[]] + // [[], [], []] + lcw flats_2{lcw{lcw{}}, lcw{lcw{}, lcw{}, lcw{}}}; + + auto weight_2 = cudf::test::fixed_width_column_wrapper{{-1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{351, 351}, {1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2, land_2, flats_2}, {0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false}, {1, 0}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + + auto list_offsets_column_2 = + cudf::test::fixed_width_column_wrapper{0, 1, 2}.release(); + auto num_list_rows_2 = list_offsets_column_2->size() - 1; + + auto list_col_2 = cudf::make_lists_column(num_list_rows_2, + std::move(list_offsets_column_2), + struct_2_2.release(), + cudf::UNKNOWN_NULL_COUNT, + {}); + + auto table_2 = table_view({*list_col_2}); + + auto full_table = cudf::concatenate({table_1, table_2}); + + cudf_io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("family"); + expected_metadata.column_metadata[0].child(1).set_nullability(false); + expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); + expected_metadata.column_metadata[0].child(1).child(1).child(2).set_name("land_unit"); + expected_metadata.column_metadata[0].child(1).child(1).child(3).set_name("flats"); + + auto filepath = temp_env->get_temp_filepath("ListOfStructOfStructOfListOfList.parquet"); + cudf_io::chunked_parquet_writer_options args = + cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + args.set_metadata(&expected_metadata); + cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf_io::parquet_reader_options read_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + compare_metadata_equality(expected_metadata, result.metadata); + + // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls. 
+ auto result_parent_list = result.tbl->get_column(0); + auto result_struct_2 = result_parent_list.child(cudf::lists_column_view::child_column_index); + EXPECT_EQ(result_struct_2.nullable(), false); +} + TEST_F(ParquetChunkedWriterTest, MismatchedTypes) { srand(31337); @@ -1150,8 +1547,7 @@ TEST_F(ParquetChunkedWriterTest, MismatchedStructureList) cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); cudf_io::parquet_chunked_writer writer(args); writer.write(tbl0); - CUDF_EXPECT_THROW_MESSAGE(writer.write(tbl1), - "Mismatch in schema between multiple calls to write_chunk"); + EXPECT_THROW(writer.write(tbl1), cudf::logic_error); } TEST_F(ParquetChunkedWriterTest, DifferentNullability) @@ -1174,6 +1570,54 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullability) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); } +TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) +{ + // Struct, + // age:int + // > (nullable) + // > (non-nullable) + + // Table 1: is_human and struct_1 are non-nullable but should be nullable when read back. + auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + auto table_1 = cudf::table_view({struct_2_1}); + + // Table 2: struct_1 and is_human are nullable now so if we hadn't assumed worst case (nullable) + // when writing table_1, we would have wrong pages for it. + auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + auto table_2 = cudf::table_view({struct_2_2}); + + auto full_table = cudf::concatenate({table_1, table_2}); + + cudf_io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet"); + cudf_io::chunked_parquet_writer_options args = + cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + args.set_metadata(&expected_metadata); + cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf_io::parquet_reader_options read_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + compare_metadata_equality(expected_metadata, result.metadata); +} + TEST_F(ParquetChunkedWriterTest, ForcedNullability) { srand(31337); @@ -1184,17 +1628,17 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullability) auto filepath = temp_env->get_temp_filepath("ChunkedNoNullable.parquet"); - cudf::io::table_metadata_with_nullability nullable_metadata; + 
cudf_io::table_input_metadata metadata(*table1); // In the absence of prescribed per-column nullability in metadata, the writer assumes the worst // and considers all columns nullable. However cudf::concatenate will not force nulls in case no // columns are nullable. To get the expected result, we tell the writer the nullability of all // columns in advance. - nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 5, false); + for (auto& col_meta : metadata.column_metadata) { col_meta.set_nullability(false); } cudf_io::chunked_parquet_writer_options args = cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}) - .nullable_metadata(&nullable_metadata); + .metadata(&metadata); cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2); cudf_io::parquet_reader_options read_opts = @@ -1213,8 +1657,6 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) using lcw = cudf::test::lists_column_wrapper; - cudf::io::table_metadata_with_nullability nullable_metadata; - // COL0 ==================== // [1, 2, 3] // [] @@ -1228,9 +1670,6 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) // NULL lcw col01{{{7}, {}, {8, 9, 10, 11}, {}}, valids2}; - nullable_metadata.column_nullable.push_back(true); // List is nullable at first (root) level - nullable_metadata.column_nullable.push_back(false); // non-nullable at second (leaf) level - // COL1 (non-nested columns to test proper schema construction) size_t num_rows = static_cast(col00).size(); auto seq_col0 = random_values(num_rows); @@ -1239,18 +1678,22 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) column_wrapper col10{seq_col0.begin(), seq_col0.end(), valids}; column_wrapper col11{seq_col1.begin(), seq_col1.end(), valids2}; - nullable_metadata.column_nullable.push_back(true); - auto table1 = table_view({col00, col10}); auto table2 = table_view({col01, col11}); auto full_table = cudf::concatenate({table1, table2}); + cudf_io::table_input_metadata metadata(table1); + metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level + metadata.column_metadata[0].child(1).set_nullability( + false); // non-nullable at second (leaf) level + metadata.column_metadata[1].set_nullability(true); + auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet"); cudf_io::chunked_parquet_writer_options args = cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}) - .nullable_metadata(&nullable_metadata); + .metadata(&metadata); cudf_io::parquet_chunked_writer(args).write(table1).write(table2); cudf_io::parquet_reader_options read_opts = @@ -1260,30 +1703,50 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); } -TEST_F(ParquetChunkedWriterTest, WrongNullability) +TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) { - srand(31337); - auto table1 = create_random_fixed_table(5, 5, false); + // Struct, + // age:int + // > (nullable) + // > (non-nullable) + + // Table 1: is_human and struct_2 are non-nullable and should stay that way when read back. 
+ auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + auto table_1 = cudf::table_view({struct_2_1}); + + auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + auto table_2 = cudf::table_view({struct_2_2}); + + auto full_table = cudf::concatenate({table_1, table_2}); + + cudf_io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("being").set_nullability(false); + expected_metadata.column_metadata[0].child(0).set_name("human?").set_nullability(false); + expected_metadata.column_metadata[0].child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet"); + cudf_io::chunked_parquet_writer_options args = + cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + args.set_metadata(&expected_metadata); + cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2); - auto filepath = temp_env->get_temp_filepath("ChunkedWrongNullable.parquet"); + cudf_io::parquet_reader_options read_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_parquet(read_opts); - cudf::io::table_metadata_with_nullability nullable_metadata; - // Number of columns with mask in table (i.e 5) and size of column nullability (i.e 6), are - // mismatching. - nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 6, true); - cudf_io::chunked_parquet_writer_options args = - cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}) - .nullable_metadata(&nullable_metadata); - EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(*table1), cudf::logic_error); - - nullable_metadata.column_nullable.clear(); - // Number of columns with mask in table (i.e 5) and size of column nullability (i.e 4), are - // mismatching. 
- nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 4, true); - cudf_io::chunked_parquet_writer_options args2 = - cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}) - .nullable_metadata(&nullable_metadata); - EXPECT_THROW(cudf_io::parquet_chunked_writer(args2).write(*table1), cudf::logic_error); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); + compare_metadata_equality(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ReadRowGroups) @@ -1328,7 +1791,7 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError) EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error); } -TEST_F(ParquetChunkedWriterTest, DecimalWrite) +TEST_F(ParquetWriterTest, DecimalWrite) { constexpr cudf::size_type num_rows = 500; auto seq_col0 = random_values(num_rows); @@ -1345,36 +1808,25 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite) auto table = table_view({col0, col1}); auto filepath = temp_env->get_temp_filepath("DecimalWrite.parquet"); - cudf_io::chunked_parquet_writer_options args = - cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, table); // verify failure if no decimal precision given - EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error); + EXPECT_THROW(cudf_io::write_parquet(args), cudf::logic_error); + + cudf_io::table_input_metadata expected_metadata(table); // verify failure if too small a precision is given - std::vector precisions{7, 1}; - args.set_decimal_precision_data(precisions); - EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error); - - // verify failure if too few precisions given - precisions.pop_back(); - args.set_decimal_precision_data(precisions); - EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error); - - // verify sucess if equal precision is given - precisions = {7, 9}; - args.set_decimal_precision_data(precisions); - cudf_io::parquet_chunked_writer(args).write(table); - - // verify failure if too many precisions given - precisions = {7, 14, 11}; - args.set_decimal_precision_data(precisions); - EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error); - - // write correctly - precisions.pop_back(); - args.set_decimal_precision_data(precisions); - cudf_io::parquet_chunked_writer(args).write(table); + expected_metadata.column_metadata[0].set_decimal_precision(7); + expected_metadata.column_metadata[1].set_decimal_precision(1); + args.set_metadata(&expected_metadata); + EXPECT_THROW(cudf_io::write_parquet(args), cudf::logic_error); + + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision(7); + expected_metadata.column_metadata[1].set_decimal_precision(9); + args.set_metadata(&expected_metadata); + cudf_io::write_parquet(args); cudf_io::parquet_reader_options read_opts = cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); @@ -1744,9 +2196,9 @@ TEST_F(ParquetReaderTest, ReorderedColumns) cudf::table_view tbl{{a, b}}; auto filepath = temp_env->get_temp_filepath("ReorderedColumns.parquet"); - cudf_io::table_metadata md; - md.column_names.push_back("a"); - md.column_names.push_back("b"); + cudf_io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); cudf_io::parquet_writer_options opts = 
cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl).metadata(&md); cudf_io::write_parquet(opts); @@ -1766,9 +2218,9 @@ TEST_F(ParquetReaderTest, ReorderedColumns) cudf::table_view tbl{{a, b}}; auto filepath = temp_env->get_temp_filepath("ReorderedColumns2.parquet"); - cudf_io::table_metadata md; - md.column_names.push_back("a"); - md.column_names.push_back("b"); + cudf_io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); cudf_io::parquet_writer_options opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl).metadata(&md); cudf_io::write_parquet(opts); @@ -1791,11 +2243,11 @@ TEST_F(ParquetReaderTest, ReorderedColumns) cudf::table_view tbl{{a, b, c, d}}; auto filepath = temp_env->get_temp_filepath("ReorderedColumns3.parquet"); - cudf_io::table_metadata md; - md.column_names.push_back("a"); - md.column_names.push_back("b"); - md.column_names.push_back("c"); - md.column_names.push_back("d"); + cudf_io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); + md.column_metadata[2].set_name("c"); + md.column_metadata[3].set_name("d"); cudf_io::parquet_writer_options opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl).metadata(&md); cudf_io::write_parquet(opts); @@ -2205,4 +2657,5 @@ TEST_F(ParquetReaderTest, DecimalRead) EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error); } } + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/jit/jit-cache-multiprocess-test.cpp b/cpp/tests/jit/jit-cache-multiprocess-test.cpp index 3dbf5b59c88..2f0b353673e 100644 --- a/cpp/tests/jit/jit-cache-multiprocess-test.cpp +++ b/cpp/tests/jit/jit-cache-multiprocess-test.cpp @@ -49,8 +49,10 @@ TEST_F(JitCacheMultiProcessTest, MultiProcessTest) // Brand new cache object that has nothing in in-memory cache cudf::jit::cudfJitCache cache; - input->set_value(4); - output->set_value(1); + auto const in{4}; + auto const out{1}; + input->set_value(in); + output->set_value(out); // make program auto program = cache.getProgram("FileCacheTestProg3", program3_source); diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index fa3bde8cb52..451fa82d5a3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -729,4 +729,36 @@ TEST_F(MergeTest, KeysWithNulls) } } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, FixedPointMerge) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + + auto const a = fp_wrapper{{4, 22, 33, 44, 55}, scale_type{-1}}; + auto const b = fp_wrapper{{5, 7, 10}, scale_type{-1}}; + auto const table_a = cudf::table_view(std::vector{a}); + auto const table_b = cudf::table_view(std::vector{b}); + auto const tables = std::vector{table_a, table_b}; + + auto const key_cols = std::vector{0}; + auto const order = std::vector{cudf::order::ASCENDING}; + + auto const exp = fp_wrapper{{4, 5, 7, 10, 22, 33, 44, 55}, scale_type{-1}}; + auto const exp_table = cudf::table_view(std::vector{exp}); + + auto const result = cudf::merge(tables, key_cols, order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(exp_table.column(0), result->view().column(0)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git 
a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", - "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); 
EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index f904c404251..bd463a7ab0d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -19,12 +19,18 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include +#include + #include #include @@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets) } } } + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) +{ + cudf::test::strings_column_wrapper data( + {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1}); + + auto d_column = cudf::column_device_view::create(data); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + auto result = cudf::make_strings_column(pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); +} diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7 index 7e0e1c70d72..cbf7e22b229 100644 --- a/java/ci/Dockerfile.centos7 +++ b/java/ci/Dockerfile.centos7 @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 ### Install basic requirements RUN yum install -y centos-release-scl -RUN yum install -y devtoolset-7 epel-release +RUN yum install -y devtoolset-8 epel-release RUN yum install -y git zlib-devel maven tar wget patch ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins @@ -34,7 +34,7 @@ RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && c RUN cd /rapids/ && wget https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.gz && \ tar zxf boost_1_72_0.tar.gz && \ cd boost_1_72_0 && \ - scl enable devtoolset-7 "./bootstrap.sh --prefix=/usr && ./b2 install --with-filesystem threading=multi link=static cxxflags=-fPIC; exit 0" + scl enable devtoolset-8 "./bootstrap.sh --prefix=/usr && ./b2 install --with-filesystem threading=multi link=static cxxflags=-fPIC; exit 0" RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/download/v3.19.0/cmake-3.19.0-Linux-x86_64.tar.gz && \ tar zxf cmake-3.19.0-Linux-x86_64.tar.gz diff --git a/java/ci/README.md b/java/ci/README.md index 865ed7fd083..3ffed71b27c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -39,12 +39,12 @@ Here I choose to download again in the container. 
git clone --recursive https://github.com/rapidsai/cudf.git -b branch-0.19 ``` -### Build cuDF jar +### Build cuDF jar with devtoolset ```bash cd cudf export WORKSPACE=`pwd` -scl enable devtoolset-7 "java/ci/build-in-docker.sh" +scl enable devtoolset-8 "java/ci/build-in-docker.sh" ``` ### The output diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 7e51a150ebc..eee943cde38 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -31,12 +31,7 @@ SIGN_FILE=$1 OUT_PATH=$WORKSPACE/$OUT # set on Jenkins parameter -if [ -z $RMM_VERSION ] -then -RMM_VERSION=`git describe --tags | grep -o -E '([0-9]+\.[0-9]+)'` -fi -echo "RMM_VERSION: $RMM_VERSION,\ - SIGN_FILE: $SIGN_FILE,\ +echo "SIGN_FILE: $SIGN_FILE,\ SKIP_JAVA_TESTS: $SKIP_JAVA_TESTS,\ BUILD_CPP_TESTS: $BUILD_CPP_TESTS,\ ENABLED_PTDS: $ENABLE_PTDS,\ @@ -47,30 +42,11 @@ INSTALL_PREFIX=/usr/local/rapids export GIT_COMMITTER_NAME="ci" export GIT_COMMITTER_EMAIL="ci@nvidia.com" export CUDACXX=/usr/local/cuda/bin/nvcc -export RMM_ROOT=$INSTALL_PREFIX -export DLPACK_ROOT=$INSTALL_PREFIX export LIBCUDF_KERNEL_CACHE_PATH=/rapids # add cmake 3.19 to PATH export PATH=/usr/local/cmake-3.19.0-Linux-x86_64/bin:$PATH -cd /rapids/ -git clone --recurse-submodules https://github.com/rapidsai/rmm.git -b branch-$RMM_VERSION -git clone --recurse-submodules https://github.com/rapidsai/dlpack.git -b cudf - -###### Build rmm/dlpack ###### -mkdir -p /rapids/rmm/build -cd /rapids/rmm/build -echo "RMM SHA: `git rev-parse HEAD`" -cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DBUILD_TESTS=$BUILD_CPP_TESTS -make -j$PARALLEL_LEVEL install - -mkdir -p /rapids/dlpack/build -cd /rapids/dlpack/build -echo "DLPACK SHA: `git rev-parse HEAD`" -cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DBUILD_TESTS=$BUILD_CPP_TESTS -make -j$PARALLEL_LEVEL install - ###### Build libcudf ###### rm -rf $WORKSPACE/cpp/build mkdir -p $WORKSPACE/cpp/build diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 298e99b059d..ee5ae094b29 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -67,6 +67,64 @@ public Builder withCallback(DoneOnGpu callback) { return this; } + /** + * Add the name(s) for nullable column(s). + * + * Please note the column names of the nested struct columns should be flattened in sequence. + * For examples, + *
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
+     *   output:
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
+     *
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
+     *   output:
+     *                   ["int_col", "list_col"]
+     *
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
+     *   output:
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
+     * 
+ * + * @param columnNames The column names corresponding to the written table(s). + */ + @Override + public Builder withColumnNames(String... columnNames) { + return super.withColumnNames(columnNames); + } + + /** + * Add the name(s) for non-nullable column(s). + * + * Please note the column names of the nested struct columns should be flattened in sequence. + * For examples, + *
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
+     *   output:
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
+     *
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
+     *   output:
+     *                   ["int_col", "list_col"]
+     *
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
+     *   output:
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
+     * 
+ * + * @param columnNames The column names corresponding to the written table(s). + */ + @Override + public Builder withNotNullableColumnNames(String... columnNames) { + return super.withNotNullableColumnNames(columnNames); + } + public ArrowIPCWriterOptions build() { return new ArrowIPCWriterOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 2f3f2bf80cf..e50a9e86ead 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -256,6 +256,15 @@ public final ColumnVector getByteCount() { return new ColumnVector(byteCount(getNativeView())); } + /** + * Get the number of elements for each list. Null lists will have a value of null. + * @return the number of elements in each list as an INT32 value. + */ + public final ColumnVector countElements() { + assert DType.LIST.equals(type) : "Only lists are supported"; + return new ColumnVector(countElements(getNativeView())); + } + /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is not null, and FALSE for any null entry (as per the validity mask) @@ -2749,6 +2758,8 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long binaryOpVV(long lhs, long rhs, int op, int dtype, int scale); + private static native long countElements(long viewHandle); + private static native long byteCount(long viewHandle) throws CudfException; private static native long extractListElement(long nativeView, int index); diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index b521bc5c42c..eb31edd8222 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,8 @@ * Hash algorithm identifiers, mirroring native enum cudf::hash_id */ public enum HashType { - // TODO IDENTITY(0), - // TODO MURMUR3(1), + IDENTITY(0), + MURMUR3(1), HASH_MD5(2), HASH_SERIAL_MURMUR3(3), HASH_SPARK_MURMUR3(4); diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index bf49fb59d52..6c52b8fe798 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -353,6 +353,50 @@ static SerializedColumnHeader readFrom(DataInputStream din, long rowCount) throw } } + /** Class to hold the header and buffer pair result from host-side concatenation */ + public static final class HostConcatResult implements AutoCloseable { + private final SerializedTableHeader tableHeader; + private final HostMemoryBuffer hostBuffer; + + public HostConcatResult(SerializedTableHeader tableHeader, HostMemoryBuffer tableBuffer) { + this.tableHeader = tableHeader; + this.hostBuffer = tableBuffer; + } + + public SerializedTableHeader getTableHeader() { + return tableHeader; + } + + public HostMemoryBuffer getHostBuffer() { + return hostBuffer; + } + + /** Build a contiguous table in device memory from this host-concatenated result */ + public ContiguousTable toContiguousTable() { + DeviceMemoryBuffer devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); + try { + if (hostBuffer.length > 0) { + devBuffer.copyFromHostBuffer(hostBuffer); + } + Table table = sliceUpColumnVectors(tableHeader, devBuffer, hostBuffer); + try { + return new ContiguousTable(table, devBuffer); + } catch (Exception e) { + table.close(); + throw e; + } + } catch (Exception e) { + devBuffer.close(); + throw e; + } + } + + @Override + public void close() { + hostBuffer.close(); + } + } + /** * Visible for testing */ @@ -1681,15 +1725,32 @@ public static Table readAndConcat(SerializedTableHeader[] headers, return ct.getTable(); } + /** + * Concatenate multiple tables in host memory into a contiguous table in device memory. + * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return contiguous table in device memory + */ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] headers, HostMemoryBuffer[] dataBuffers) throws IOException { + try (HostConcatResult concatResult = concatToHostBuffer(headers, dataBuffers)) { + return concatResult.toContiguousTable(); + } + } + + /** + * Concatenate multiple tables in host memory into a single host table buffer. 
+ * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return host table header and buffer + */ + public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] headers, + HostMemoryBuffer[] dataBuffers) throws IOException { ColumnBufferProvider[][] providersPerColumn = providersFrom(headers, dataBuffers); - DeviceMemoryBuffer devBuffer = null; - Table table = null; try { SerializedTableHeader combined = calcConcatHeader(providersPerColumn); - - try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen)) { + HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen); + try { try (NvtxRange range = new NvtxRange("Concat Host Side", NvtxColor.GREEN)) { DataWriter writer = writerFrom(hostBuffer); int numColumns = combined.getNumColumns(); @@ -1697,27 +1758,14 @@ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] he writeConcat(writer, combined.getColumnHeader(columnIdx), providersPerColumn[columnIdx]); } } - - devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); - if (hostBuffer.length > 0) { - try (NvtxRange range = new NvtxRange("Copy Data To Device", NvtxColor.WHITE)) { - devBuffer.copyFromHostBuffer(hostBuffer); - } - } - table = sliceUpColumnVectors(combined, devBuffer, hostBuffer); - ContiguousTable result = new ContiguousTable(table, devBuffer); - table = null; - devBuffer = null; - return result; + } catch (Exception e) { + hostBuffer.close(); + throw e; } + + return new HostConcatResult(combined, hostBuffer); } finally { closeAll(providersPerColumn); - if (table != null) { - table.close(); - } - if (devBuffer != null) { - devBuffer.close(); - } } } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 1203fc25931..2e793494b7b 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -58,25 +58,12 @@ public Builder withTimestampInt96(boolean int96) { } /** - * Overwrite flattened precision values for all decimal columns that are expected to be in - * this Table. The list of precisions should be an in-order traversal of all Decimal columns, - * including nested columns. Please look at the example below. - * - * NOTE: The number of `precisionValues` should be equal to the numbers of Decimal columns - * otherwise a CudfException will be thrown. Also note that the values will be overwritten - * every time this method is called - * - * Example: - * Table0 : c0[type: INT32] - * c1[type: Decimal32(3, 1)] - * c2[type: Struct[col0[type: Decimal(2, 1)], - * col1[type: INT64], - * col2[type: Decimal(8, 6)]] - * c3[type: Decimal64(12, 5)] - * - * Flattened list of precision from the above example will be {3, 2, 8, 12} + * This is a temporary hack to make things work. This API will go away once we can update the + * parquet APIs properly. + * @param precisionValues a value for each column, non-decimal columns are ignored. + * @return this for chaining. */ - public Builder withPrecisionValues(int... precisionValues) { + public Builder withDecimalPrecisions(int ... 
precisionValues) { this.precisionValues = precisionValues; return this; } @@ -86,8 +73,6 @@ public ParquetWriterOptions build() { } } - public static final ParquetWriterOptions DEFAULT = new ParquetWriterOptions(new Builder()); - public static Builder builder() { return new Builder(); } @@ -107,7 +92,7 @@ public StatisticsFrequency getStatisticsFrequency() { /** * Return the flattened list of precisions if set otherwise empty array will be returned. - * For a definition of what `flattened` means please look at {@link Builder#withPrecisionValues} + * For a definition of what `flattened` means please look at {@link Builder#withDecimalPrecisions} */ public int[] getPrecisions() { return precisions; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index fcc23777d69..4da99d811f2 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -183,8 +183,12 @@ public long getDeviceMemorySize() { private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); + private static native long[] partition(long inputTable, long partitionView, + int numberOfPartitions, int[] outputOffsets); + private static native long[] hashPartition(long inputTable, int[] columnsToHash, + int hashTypeId, int numberOfPartitions, int[] outputOffsets) throws CudfException; @@ -515,6 +519,10 @@ private static native long[] repeatColumnCount(long tableHandle, private static native long[] explodePosition(long tableHandle, int index); + private static native long[] explodeOuter(long tableHandle, int index); + + private static native long[] explodeOuterPosition(long tableHandle, int index); + private static native long createCudfTableView(long[] nativeColumnViewHandles); private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress); @@ -795,6 +803,12 @@ private static class ParquetTableWriter implements TableWriter { HostBufferConsumer consumer; private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { + int numColumns = options.getColumnNames().length; + assert (numColumns == options.getColumnNullability().length); + int[] precisions = options.getPrecisions(); + if (precisions != null) { + assert (numColumns >= options.getPrecisions().length); + } this.consumer = null; this.handle = writeParquetFileBegin(options.getColumnNames(), options.getColumnNullability(), @@ -863,17 +877,6 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, return new ParquetTableWriter(options, consumer); } - /** - * Writes this table to a Parquet file on the host - * - * @param outputFile file to write the table to - * @deprecated please use writeParquetChunked instead - */ - @Deprecated - public void writeParquet(File outputFile) { - writeParquet(ParquetWriterOptions.DEFAULT, outputFile); - } - /** * Writes this table to a Parquet file on the host * @@ -1252,6 +1255,24 @@ public Table repeat(ColumnVector counts, boolean checkCount) { return new Table(repeatColumnCount(this.nativeHandle, counts.getNativeView(), checkCount)); } + /** + * Partition this table using the mapping in partitionMap. partitionMap must be an integer + * column. The number of rows in partitionMap must be the same as this table. Each row + * in the map will indicate which partition the rows in the table belong to. + * @param partitionMap the partitions for each row. 
+ * @param numberOfPartitions number of partitions + * @return {@link PartitionedTable} Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable partition(ColumnView partitionMap, int numberOfPartitions) { + int[] partitionOffsets = new int[numberOfPartitions]; + return new PartitionedTable(new Table(partition( + getNativeView(), + partitionMap.getNativeView(), + partitionOffsets.length, + partitionOffsets)), partitionOffsets); + } + /** * Find smallest indices in a sorted table where values should be inserted to maintain order. *
@@ -1724,7 +1745,7 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    * Example:
    * input:  [[5,10,15], 100],
    *         [[20,25],   200],
-   *         [[30],      300],
+   *         [[30],      300]
    * index: 0
    * output: [5,         100],
    *         [10,        100],
@@ -1736,12 +1757,12 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    *
    * Nulls propagate in different ways depending on what is null.
    * 
-   *     [[5,null,15], 100],
-   *     [null,        200]
-   * returns:
-   *     [5,           100],
-   *     [null,        100],
-   *     [15,          100]
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [5,           100],
+   *         [null,        100],
+   *         [15,          100]
    * 
    * Note that null lists are completely removed from the output
    * and nulls inside lists are pulled out and remain.
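For illustration (not part of this change), a minimal Java usage sketch of `explode`, assuming `input` is an existing `Table` whose column 0 is a `LIST<INT32>` shaped like the example above:

```java
// Sketch only; assumes import ai.rapids.cudf.Table; and a hypothetical
// `input` Table with rows [[5,10,15],100], [[20,25],200], [[30],300].
try (Table exploded = input.explode(0)) {
  // One output row per list element; column 1 is duplicated per element.
  assert exploded.getRowCount() == 6;
}
```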
@@ -1762,27 +1783,26 @@ public Table explode(int index) {
    * in the output. The corresponding rows for other columns in the input are duplicated. A position
    * column is added that has the index inside the original list for each row. Example:
    * 
-   * [[5,10,15], 100],
-   * [[20,25],   200],
-   * [[30],      300],
-   * returns
-   * [0,   5,    100],
-   * [1,   10,   100],
-   * [2,   15,    100],
-   * [0,   20,    200],
-   * [1,   25,    200],
-   * [0,   30,    300],
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0,   5,    100],
+   *         [1,   10,   100],
+   *         [2,   15,   100],
+   *         [0,   20,   200],
+   *         [1,   25,   200],
+   *         [0,   30,   300]
    * 
    *
    * Nulls and empty lists propagate in different ways depending on what is null or empty.
    * 
-   * [[5,null,15], 100],
-   * [null,        200],
-   * [[],          300],
-   * returns
-   * [0,    5,     100],
-   * [1,    null,  100],
-   * [2,    15,    100],
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [0,    5,     100],
+   *         [1,    null,  100],
+   *         [2,    15,    100]
    * 
    *
    * Note that null lists are not included in the resulting table, but nulls inside
@@ -1798,6 +1818,96 @@ public Table explodePosition(int index) {
     return new Table(explodePosition(nativeHandle, index));
   }
 
+  /**
+   * Explodes a list column's elements, retaining null entries and empty lists as null rows.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded
+   * into new rows in the output. The corresponding rows for other columns in the input
+   * are duplicated.
+   *
+   * 
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [5,         100],
+   *         [10,        100],
+   *         [15,        100],
+   *         [20,        200],
+   *         [25,        200],
+   *         [30,        300]
+   * 
+   *
+   * Nulls propagate in different ways depending on what is null.
+   * 
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [5,           100],
+   *         [null,        100],
+   *         [15,          100],
+   *         [null,        200]
+   * 
+   * Note that null lists are not removed from the output; each null list
+   * produces a null entry, and nulls inside lists are retained.
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with explode_col exploded.
+   */
+  public Table explodeOuter(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuter(nativeHandle, index));
+  }
+
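A hedged usage sketch contrasting `explodeOuter` with `explode`, assuming the same hypothetical `input` Table as in the javadoc example:

```java
// Sketch only; `input` column 0 holds [[5,null,15],100], [null,200].
try (Table exploded = input.explodeOuter(0)) {
  // explode(0) would drop the null list; explodeOuter(0) keeps it as a null row.
  assert exploded.getRowCount() == 4; // 5, null, 15, plus one null row
}
```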
+  /**
+   * Explodes a list column's elements, retaining any null entries or empty lists, and includes a
+   * position column.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+   * in the output. The corresponding rows for other columns in the input are duplicated. A position
+   * column is added that has the index inside the original list for each row. Example:
+   *
+   * 
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0,   5,    100],
+   *         [1,   10,   100],
+   *         [2,   15,   100],
+   *         [0,   20,   200],
+   *         [1,   25,   200],
+   *         [0,   30,   300]
+   * 
+   *
+   * Nulls and empty lists propagate as null entries in the result.
+   * 
+   * input:  [[5,null,15], 100],
+   *         [null,        200],
+   *         [[],          300]
+   * index: 0
+   * output: [0,     5,    100],
+   *         [1,  null,    100],
+   *         [2,    15,    100],
+   *         [0,  null,    200],
+   *         [0,  null,    300]
+   * 
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with exploded value and position. The column order of return table is
+   *         [cols before explode_input, explode_position, explode_value, cols after explode_input].
+   */
+  public Table explodeOuterPosition(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuterPosition(nativeHandle, index));
+  }
+
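A hedged sketch of `explodeOuterPosition`, again assuming a hypothetical `input` Table matching the javadoc example:

```java
// Sketch only; `input` column 0 holds [[5,null,15],100], [null,200], [[],300].
try (Table exploded = input.explodeOuterPosition(0)) {
  // Column order: [explode_position, explode_value, original column 1].
  assert exploded.getNumberOfColumns() == 3;
  assert exploded.getRowCount() == 5; // null and empty lists each yield one null row
}
```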
   /**
    * Gathers the rows of this table according to `gatherMap` such that row "i"
    * in the resulting table's columns will contain row "gatherMap[i]" from this table.
@@ -2587,15 +2697,31 @@ public Table leftAntiJoin(TableOperation rightJoinIndices) {
     }
 
     /**
-     * Hash partition a table into the specified number of partitions.
+     * Hash partition a table into the specified number of partitions. Uses the default MURMUR3
+     * hashing.
      * @param numberOfPartitions - number of partitions to use
      * @return - {@link PartitionedTable} - Table that exposes a limited functionality of the
      * {@link Table} class
      */
     public PartitionedTable hashPartition(int numberOfPartitions) {
+      return hashPartition(HashType.MURMUR3, numberOfPartitions);
+    }
+
+    /**
+     * Hash partition a table into the specified number of partitions.
+     * @param type the type of hash to use. Depending on the type of hash, different restrictions
+     *             on the hash column(s) may apply. Only IDENTITY and MURMUR3 are guaranteed to
+     *             be supported.
+     * @param numberOfPartitions - number of partitions to use
+     * @return {@link PartitionedTable} - Table that exposes a limited functionality of the
+     * {@link Table} class
+     */
+    public PartitionedTable hashPartition(HashType type, int numberOfPartitions) {
       int[] partitionOffsets = new int[numberOfPartitions];
-      return new PartitionedTable(new Table(Table.hashPartition(operation.table.nativeHandle,
+      return new PartitionedTable(new Table(Table.hashPartition(
+          operation.table.nativeHandle,
           operation.indices,
+          type.nativeId,
           partitionOffsets.length,
           partitionOffsets)), partitionOffsets);
     }
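A hedged sketch tying the two partitioning entry points together; `table` and `partitionMap` are assumed to exist (`partitionMap` an INT32 `ColumnVector` with one partition id per row):

```java
// Sketch only; assumes import ai.rapids.cudf.*;
try (PartitionedTable byMap = table.partition(partitionMap, 4);
     // Explicitly selecting MURMUR3, the same hash the no-argument
     // hashPartition(int) overload now defaults to.
     PartitionedTable byHash = table.onColumns(0).hashPartition(HashType.MURMUR3, 4)) {
  Table grouped = byHash.getTable(); // rows reordered so partitions are contiguous
}
```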
diff --git a/java/src/main/java/ai/rapids/cudf/WriterOptions.java b/java/src/main/java/ai/rapids/cudf/WriterOptions.java
index 60f7fb03459..5d5af3006a3 100644
--- a/java/src/main/java/ai/rapids/cudf/WriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/WriterOptions.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ protected static class WriterBuilder {
     final List<Boolean> columnNullability = new ArrayList<>();
 
     /**
-     * Add column name
+     * Add column name(s). For Parquet, column names are not optional.
      * @param columnNames
      */
     public T withColumnNames(String... columnNames) {
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index ceafc75f840..46b3f0c5a53 100755
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -125,7 +125,8 @@ find_path(LIBCUDACXX_INCLUDE "cuda"
           "${CUDF_CPP_BUILD_DIR}/_deps/libcudacxx-src/include")
 
 find_path(SPDLOG_INCLUDE "spdlog"
-    HINTS "$ENV{RMM_ROOT}/_deps/spdlog-src/include"
+    HINTS "${CUDF_CPP_BUILD_DIR}/_deps/spdlog-src/include"
+          "$ENV{RMM_ROOT}/_deps/spdlog-src/include"
           "$ENV{RMM_ROOT}/include"
           "$ENV{CONDA_PREFIX}/include")
 
@@ -147,7 +148,8 @@ find_library(CUDF_LIB "cudf" REQUIRED HINTS ${CUDF_LIB_HINTS})
 # - RMM -------------------------------------------------------------------------------------------
 
 find_path(RMM_INCLUDE "rmm"
-          HINTS "$ENV{RMM_ROOT}/include"
+          HINTS "${CUDF_CPP_BUILD_DIR}/_deps/rmm-src/include"
+                "$ENV{RMM_ROOT}/include"
                 "$ENV{RMM_HOME}/include"
                 "$ENV{CONDA_PREFIX}/include/rmm"
                 "$ENV{CONDA_PREFIX}/include")
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 0ce9d6303e4..73db5ee4df3 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -20,16 +20,17 @@
 #include 
 #include 
 #include 
-#include 
-#include 
-#include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -37,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -431,6 +431,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, j
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_countElements(JNIEnv *env, jclass clazz,
+                                                                     jlong view_handle) {
+  JNI_NULL_CHECK(env, view_handle, "input column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::column_view *n_column = reinterpret_cast<cudf::column_view *>(view_handle);
+    std::unique_ptr<cudf::column> result =
+        cudf::lists::count_elements(cudf::lists_column_view(*n_column));
+    return reinterpret_cast<jlong>(result.release());
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_charLengths(JNIEnv *env, jclass clazz,
                                                                    jlong view_handle) {
   JNI_NULL_CHECK(env, view_handle, "input column is null", 0);
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index e051f68be4e..6beedf54f5a 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -211,12 +212,15 @@ class native_arrow_ipc_writer_handle final {
                                           const std::shared_ptr<arrow::io::OutputStream> &sink)
       : initialized(false), column_names(col_names), file_name(""), sink(sink) {}
 
+private:
   bool initialized;
   std::vector<std::string> column_names;
+  std::vector<cudf::column_metadata> columns_meta;
   std::string file_name;
   std::shared_ptr<arrow::io::OutputStream> sink;
   std::shared_ptr<arrow::ipc::RecordBatchWriter> writer;
 
+public:
   void write(std::shared_ptr<arrow::Table> &arrow_tab, int64_t max_chunk) {
     if (!initialized) {
       if (!sink) {
@@ -245,6 +249,59 @@ class native_arrow_ipc_writer_handle final {
     }
     initialized = false;
   }
+
+  std::vector<cudf::column_metadata> get_column_metadata(const cudf::table_view& tview) {
+    if (!column_names.empty() && columns_meta.empty()) {
+      // Build the column metadata structure from the table schema.
+      // All tables written by this writer should share the same schema,
+      // so the column metadata is built only once.
+      columns_meta.reserve(tview.num_columns());
+      size_t idx = 0;
+      for (auto itr = tview.begin(); itr < tview.end(); ++itr) {
+        // A column name is consumed only when the column is
+        //   - a struct, or
+        //   - not a child of another column.
+        columns_meta.push_back(build_one_column_meta(*itr, idx));
+      }
+      if (idx < column_names.size()) {
+        throw cudf::jni::jni_exception("Too many column names are provided.");
+      }
+    }
+    return columns_meta;
+  }
+
+private:
+  cudf::column_metadata build_one_column_meta(const cudf::column_view& cview, size_t& idx,
+                                              const bool consume_name = true) {
+    auto col_meta = cudf::column_metadata{};
+    if (consume_name) {
+      col_meta.name = get_column_name(idx++);
+    }
+    // Process children
+    if (cview.type().id() == cudf::type_id::LIST) {
+      // list type:
+      //   - requires stub metadata for the offsets column (index 0).
+      //   - does not require a name for the child column (index 1).
+      col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), idx, false)};
+    } else if (cview.type().id() == cudf::type_id::STRUCT) {
+      // struct type: consumes a name for the struct itself and for each child.
+      col_meta.children_meta.reserve(cview.num_children());
+      for (auto itr = cview.child_begin(); itr < cview.child_end(); ++itr) {
+        col_meta.children_meta.push_back(build_one_column_meta(*itr, idx));
+      }
+    } else if (cview.type().id() == cudf::type_id::DICTIONARY32) {
+      // Dictionary columns are not yet supported by the JNI layer.
+      throw cudf::jni::jni_exception("Unsupported type 'DICTIONARY32'");
+    }
+    return col_meta;
+  }
+
+  std::string& get_column_name(const size_t idx) {
+    if (idx >= column_names.size()) {
+      throw cudf::jni::jni_exception("Missing names for columns or nested struct columns");
+    }
+    return column_names[idx];
+  }
 };
 
 class jni_arrow_output_stream final : public arrow::io::OutputStream {
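The name-consumption rules above mean callers flatten nested column names in depth-first order: a struct consumes one name for itself plus one per child, while a list consumes only its own name. A sketch mirroring the updated Arrow IPC test later in this diff:

    ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder()
        // seven top-level primitive columns
        .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh")
        // a struct consumes a name for itself and one per child
        .withColumnNames("eighth", "eighth_id", "eighth_name")
        // a list of a primitive consumes only the list's own name
        .withColumnNames("ninth")
        // a list of structs: the list's name, then the struct children's names
        .withColumnNames("tenth", "child_id", "child_name")
        .build();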
@@ -914,12 +971,24 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
     cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability);
     cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys);
     cudf::jni::native_jstringArray meta_values(env, j_metadata_values);
+    cudf::jni::native_jintArray precisions(env, j_precisions);
+
+    auto cpp_names = col_names.as_cpp_vector();
+    table_input_metadata metadata;
+    metadata.column_metadata.resize(col_nullability.size());
+    for (int i = 0; i < col_nullability.size(); i++) {
+       metadata.column_metadata[i]
+           .set_name(cpp_names[i])
+           .set_nullability(col_nullability[i])
+           .set_int96_timestamps(j_isInt96);
+    }
+
+    // Precision values are not always set
+    for (int i = 0; i < precisions.size(); i++) {
+       metadata.column_metadata[i]
+           .set_decimal_precision(precisions[i]);
+    }
 
-    auto d = col_nullability.data();
-    std::vector<bool> nullability(d, d + col_nullability.size());
-    table_metadata_with_nullability metadata;
-    metadata.column_nullable = nullability;
-    metadata.column_names = col_names.as_cpp_vector();
     for (auto i = 0; i < meta_keys.size(); ++i) {
       metadata.user_data[meta_keys[i].get()] = meta_values[i].get();
     }
@@ -927,16 +996,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
     std::unique_ptr<cudf::jni::jni_writer_data_sink> data_sink(
         new cudf::jni::jni_writer_data_sink(env, consumer));
     sink_info sink{data_sink.get()};
-    cudf::jni::native_jintArray precisions(env, j_precisions);
     std::vector const v_precisions(
         precisions.data(), precisions.data() + precisions.size());
     chunked_parquet_writer_options opts =
         chunked_parquet_writer_options::builder(sink)
-            .nullable_metadata(&metadata)
+            .metadata(&metadata)
             .compression(static_cast(j_compression))
             .stats_level(static_cast(j_stats_freq))
-            .int96_timestamps(static_cast(j_isInt96))
-            .decimal_precision(v_precisions)
             .build();
 
     auto writer_ptr = std::make_unique(opts);
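On the Java side, the per-table int96 flag and decimal precision vector move into the same builder as the column names; a sketch based on the updated tests later in this diff (zeros mark non-decimal columns):

    ParquetWriterOptions options = ParquetWriterOptions.builder()
        .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")
        .withTimestampInt96(true)
        .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 5, 5)  // only _c8/_c9 are decimal
        .build();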
@@ -964,27 +1030,34 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin(
     cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys);
     cudf::jni::native_jstringArray meta_values(env, j_metadata_values);
     cudf::jni::native_jstring output_path(env, j_output_path);
+    cudf::jni::native_jintArray precisions(env, j_precisions);
 
-    auto d = col_nullability.data();
-    std::vector<bool> nullability(d, d + col_nullability.size());
-    table_metadata_with_nullability metadata;
-    metadata.column_nullable = nullability;
-    metadata.column_names = col_names.as_cpp_vector();
-    for (int i = 0; i < meta_keys.size(); ++i) {
+    auto cpp_names = col_names.as_cpp_vector();
+    table_input_metadata metadata;
+    metadata.column_metadata.resize(col_nullability.size());
+    for (int i = 0; i < col_nullability.size(); i++) {
+       metadata.column_metadata[i]
+           .set_name(cpp_names[i])
+           .set_nullability(col_nullability[i])
+           .set_int96_timestamps(j_isInt96);
+    }
+
+    // Precision values are not always set
+    for (int i = 0; i < precisions.size(); i++) {
+       metadata.column_metadata[i]
+           .set_decimal_precision(precisions[i]);
+    }
+
+    for (auto i = 0; i < meta_keys.size(); ++i) {
       metadata.user_data[meta_keys[i].get()] = meta_values[i].get();
     }
-    cudf::jni::native_jintArray precisions(env, j_precisions);
-    std::vector v_precisions(
-        precisions.data(), precisions.data() + precisions.size());
- 
+
     sink_info sink{output_path.get()};
     chunked_parquet_writer_options opts =
         chunked_parquet_writer_options::builder(sink)
-            .nullable_metadata(&metadata)
+            .metadata(&metadata)
             .compression(static_cast(j_compression))
             .stats_level(static_cast(j_stats_freq))
-            .int96_timestamps(static_cast(j_isInt96))
-            .decimal_precision(v_precisions)
             .build();
 
     auto writer_ptr = std::make_unique(opts);
@@ -1245,12 +1318,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv
     cudf::jni::auto_set_device(env);
     std::unique_ptr> result(
         new std::shared_ptr(nullptr));
-    auto column_metadata = std::vector{};
-    column_metadata.reserve(state->column_names.size());
-    std::transform(std::begin(state->column_names), std::end(state->column_names),
-                   std::back_inserter(column_metadata),
-                   [](auto const &column_name) { return cudf::column_metadata{column_name}; });
-    *result = cudf::to_arrow(*tview, column_metadata);
+    *result = cudf::to_arrow(*tview, state->get_column_metadata(*tview));
     if (!result->get()) {
       return 0;
     }
@@ -1613,9 +1681,43 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+                                                                 jlong input_table,
+                                                                 jlong partition_column,
+                                                                 jint number_of_partitions,
+                                                                 jintArray output_offsets) {
+
+  JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
+  JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
+  JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
+  JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions must be positive", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::column_view *n_part_column = reinterpret_cast<cudf::column_view *>(partition_column);
+    cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
+
+    auto result = cudf::partition(*n_input_table,
+                                  *n_part_column,
+                                  number_of_partitions);
+
+    for (size_t i = 0; i < result.second.size() - 1; i++) {
+      // For whatever reason cudf::partition appends the total row count to the
+      // offsets while hash partition/round robin do not, so skip the last
+      // entry for consistency.
+      n_output_offsets[i] = result.second[i];
+    }
+
+    return cudf::jni::convert_table_for_return(env, result.first);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass,
                                                                      jlong input_table,
                                                                      jintArray columns_to_hash,
+                                                                     jint hash_function,
                                                                      jint number_of_partitions,
                                                                      jintArray output_offsets) {
 
@@ -1626,6 +1728,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env
 
   try {
     cudf::jni::auto_set_device(env);
+    cudf::hash_id hash_func = static_cast<cudf::hash_id>(hash_function);
     cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
     cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash);
     cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
@@ -1638,7 +1741,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env
     }
 
     std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> result =
-        cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions);
+        cudf::hash_partition(*n_input_table,
+                             columns_to_hash_vec,
+                             number_of_partitions,
+                             hash_func);
 
     for (size_t i = 0; i < result.second.size(); i++) {
       n_output_offsets[i] = result.second[i];
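A sketch of the resulting Java-level split, assuming an existing Table t: explicit partitioning takes a caller-provided map column, while hash partitioning now accepts a hash function (usage mirrors the updated tests below):

    // Explicit partition: parts[i] gives the partition id of row i.
    try (ColumnVector parts = ColumnVector.fromInts(1, 2, 1, 2, 1, 2);
         PartitionedTable pt = t.partition(parts, 3)) {
      int[] offsets = pt.getPartitions();  // row offsets of each partition
    }

    // Hash partition on column 0 with a chosen hash function.
    try (PartitionedTable out = t.onColumns(0).hashPartition(HashType.IDENTITY, 5)) {
      int[] offsets = out.getPartitions();
    }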
@@ -2046,4 +2152,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass,
+                                                                    jlong input_jtable,
+                                                                    jint column_index) {
+  JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input_table = reinterpret_cast<cudf::table_view *>(input_jtable);
+    cudf::size_type col_index = static_cast<cudf::size_type>(column_index);
+    std::unique_ptr<cudf::table> exploded = cudf::explode_outer(*input_table, col_index);
+    return cudf::jni::convert_table_for_return(env, exploded);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass,
+                                                                            jlong input_jtable,
+                                                                            jint column_index) {
+  JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input_table = reinterpret_cast<cudf::table_view *>(input_jtable);
+    cudf::size_type col_index = static_cast<cudf::size_type>(column_index);
+    std::unique_ptr<cudf::table> exploded = cudf::explode_outer_position(*input_table, col_index);
+    return cudf::jni::convert_table_for_return(env, exploded);
+  }
+  CATCH_STD(env, 0);
+}
+
 } // extern "C"
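A sketch of the new outer-explode calls which, unlike plain explode, keep rows whose list is null or empty by emitting nulls (usage mirrors the tests below; input is any Table with a LIST column at index 0):

    try (Table exploded = input.explodeOuter(0)) {
      // null/empty lists produce a row with a null value
    }
    try (Table withPos = input.explodeOuterPosition(0)) {
      // adds a position column alongside the exploded values
    }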
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index d224543e574..420e176efe2 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -1666,6 +1666,18 @@ void testAppendStrings() {
     }
   }
 
+  @Test
+  void testCountElements() {
+    DataType dt = new ListType(true, new BasicType(true, DType.INT32));
+    try (ColumnVector cv = ColumnVector.fromLists(dt, Arrays.asList(1),
+        Arrays.asList(1, 2), null, Arrays.asList(null, null),
+        Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 3, 4));
+         ColumnVector lengths = cv.countElements();
+         ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 2, 3, 4)) {
+      TableTest.assertColumnsAreEqual(expected, lengths);
+    }
+  }
+
   @Test
   void testStringLengths() {
     try (ColumnVector cv = ColumnVector.fromStrings("1", "12", null, "123", "1234");
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 88196a4112a..4eee3e97e6e 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1742,7 +1742,7 @@ void testPartStability() {
     final int PARTS = 5;
     int expectedPart = -1;
     try (Table start = new Table.TestBuilder().column(0).build();
-         PartitionedTable out = start.onColumns(0).partition(PARTS)) {
+         PartitionedTable out = start.onColumns(0).hashPartition(PARTS)) {
       // Lets figure out what partitions this is a part of.
       int[] parts = out.getPartitions();
       for (int i = 0; i < parts.length; i++) {
@@ -1755,7 +1755,7 @@ void testPartStability() {
     for (int numEntries = 1; numEntries < COUNT; numEntries++) {
       try (ColumnVector data = ColumnVector.build(DType.INT32, numEntries, Range.appendInts(0, numEntries));
            Table t = new Table(data);
-           PartitionedTable out = t.onColumns(0).partition(PARTS);
+           PartitionedTable out = t.onColumns(0).hashPartition(PARTS);
            HostColumnVector tmp = out.getColumn(0).copyToHost()) {
         // Now we need to get the range out for the partition we expect
         int[] parts = out.getPartitions();
@@ -1775,6 +1775,73 @@ void testPartStability() {
 
   @Test
   void testPartition() {
+    try (Table t = new Table.TestBuilder()
+        .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+        .build();
+         ColumnVector parts = ColumnVector
+             .fromInts(1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+         PartitionedTable pt = t.partition(parts, 3);
+         Table expected = new Table.TestBuilder()
+             .column(1, 3, 5, 7, 9, 2, 4, 6, 8, 10)
+             .build()) {
+      int[] partCutoffs = pt.getPartitions();
+      assertArrayEquals(new int[]{0, 0, 5}, partCutoffs);
+      assertTablesAreEqual(expected, pt.getTable());
+    }
+  }
+
+  @Test
+  void testIdentityHashPartition() {
+    final int count = 1024 * 1024;
+    try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count));
+         ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> {
+           for (int i = 0; i < count; i++) {
+             b.append(i / 2);
+           }
+         });
+         ColumnVector cIn = ColumnVector.build(DType.STRING, count, (b) -> {
+           for (int i = 0; i < count; i++) {
+             b.appendUTF8String(String.valueOf(i).getBytes());
+           }
+         })) {
+
+      HashSet<Long> expected = new HashSet<>();
+      for (long i = 0; i < count; i++) {
+        expected.add(i);
+      }
+      try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn});
+           PartitionedTable output = input.onColumns(0).hashPartition(HashType.IDENTITY, 5)) {
+        int[] parts = output.getPartitions();
+        assertEquals(5, parts.length);
+        assertEquals(0, parts[0]);
+        int previous = 0;
+        long rows = 0;
+        for (int i = 1; i < parts.length; i++) {
+          assertTrue(parts[i] >= previous);
+          rows += parts[i] - previous;
+          previous = parts[i];
+        }
+        assertTrue(rows <= count);
+        try (HostColumnVector aOut = output.getColumn(0).copyToHost();
+             HostColumnVector bOut = output.getColumn(1).copyToHost();
+             HostColumnVector cOut = output.getColumn(2).copyToHost()) {
+
+          for (int i = 0; i < count; i++) {
+            long fromA = aOut.getLong(i);
+            long fromB = bOut.getInt(i);
+            String fromC = cOut.getJavaString(i);
+            assertTrue(expected.remove(fromA));
+            assertEquals(fromA / 2, fromB);
+            assertEquals(String.valueOf(fromA), fromC, "At Index " + i);
+          }
+          assertTrue(expected.isEmpty());
+        }
+      }
+    }
+  }
+
+  @Test
+  void testHashPartition() {
     final int count = 1024 * 1024;
     try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count));
          ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> {
@@ -1793,7 +1860,7 @@ void testPartition() {
         expected.add(i);
       }
       try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn});
-           PartitionedTable output = input.onColumns(0).partition(5)) {
+           PartitionedTable output = input.onColumns(0).hashPartition(5)) {
         int[] parts = output.getPartitions();
         assertEquals(5, parts.length);
         assertEquals(0, parts[0]);
@@ -4056,15 +4123,38 @@ void testTableBasedFilter() {
   }
 
   private Table getExpectedFileTable() {
-    return new TestBuilder()
-        .column(true, false, false, true, false)
-        .column(5, 1, 0, 2, 7)
-        .column(new Byte[]{2, 3, 4, 5, 9})
-        .column(3l, 9l, 4l, 2l, 20l)
-        .column("this", "is", "a", "test", "string")
-        .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f)
-        .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d)
-        .build();
+    return getExpectedFileTable(false);
+  }
+
+  private Table getExpectedFileTable(boolean withNestedColumns) {
+    TestBuilder tb = new TestBuilder()
+            .column(true, false, false, true, false)
+            .column(5, 1, 0, 2, 7)
+            .column(new Byte[]{2, 3, 4, 5, 9})
+            .column(3l, 9l, 4l, 2l, 20l)
+            .column("this", "is", "a", "test", "string")
+            .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f)
+            .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d);
+    if (withNestedColumns) {
+      StructType nestedType = new StructType(true,
+              new BasicType(false, DType.INT32), new BasicType(false, DType.STRING));
+      tb.column(nestedType,
+            struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
+            struct(4, "k4"), new HostColumnVector.StructData((List) null))
+        .column(new ListType(false, new BasicType(false, DType.INT32)),
+                Arrays.asList(1, 2),
+                Arrays.asList(3, 4),
+                Arrays.asList(5),
+                Arrays.asList(6, 7),
+                Arrays.asList(8, 9, 10))
+        .column(new ListType(false, nestedType),
+            Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")),
+            Arrays.asList(struct(4, "k4"), struct(5, "k5")),
+            Arrays.asList(struct(6, "k6")),
+            Arrays.asList(new HostColumnVector.StructData((List) null)),
+            Arrays.asList());
+    }
+    return tb.build();
   }
 
   private Table getExpectedFileTableWithDecimals() {
@@ -4081,19 +4171,6 @@ private Table getExpectedFileTableWithDecimals() {
         .build();
   }
 
-  @Test
-  void testParquetWriteToFileNoNames() throws IOException {
-    File tempFile = File.createTempFile("test-nonames", ".parquet");
-    try (Table table0 = getExpectedFileTable()) {
-      table0.writeParquet(tempFile.getAbsoluteFile());
-      try (Table table1 = Table.readParquet(tempFile.getAbsoluteFile())) {
-        assertTablesAreEqual(table0, table1);
-      }
-    } finally {
-      tempFile.delete();
-    }
-  }
-
   private final class MyBufferConsumer implements HostBufferConsumer, AutoCloseable {
     public final HostMemoryBuffer buffer;
     long offset = 0;
@@ -4143,8 +4220,9 @@ void testParquetWriteToBufferChunkedInt96() {
     try (Table table0 = getExpectedFileTableWithDecimals();
          MyBufferConsumer consumer = new MyBufferConsumer()) {
       ParquetWriterOptions options = ParquetWriterOptions.builder()
+          .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")
           .withTimestampInt96(true)
-          .withPrecisionValues(5, 5)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 5, 5)
           .build();
 
       try (TableWriter writer = Table.writeParquetChunked(options, consumer)) {
@@ -4161,9 +4239,13 @@ void testParquetWriteToBufferChunkedInt96() {
 
   @Test
   void testParquetWriteToBufferChunked() {
+    ParquetWriterOptions options = ParquetWriterOptions.builder()
+        .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7")
+        .withTimestampInt96(true)
+        .build();
     try (Table table0 = getExpectedFileTable();
          MyBufferConsumer consumer = new MyBufferConsumer()) {
-         try (TableWriter writer = Table.writeParquetChunked(ParquetWriterOptions.DEFAULT, consumer)) {
+         try (TableWriter writer = Table.writeParquetChunked(options, consumer)) {
            writer.write(table0);
            writer.write(table0);
            writer.write(table0);
@@ -4184,7 +4266,7 @@ void testParquetWriteToFileWithNames() throws IOException {
               "eighth", "nineth")
           .withCompressionType(CompressionType.NONE)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
-          .withPrecisionValues(5, 6)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 5, 6)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
         writer.write(table0);
@@ -4207,7 +4289,7 @@ void testParquetWriteToFileWithNamesAndMetadata() throws IOException {
           .withMetadata("somekey", "somevalue")
           .withCompressionType(CompressionType.NONE)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
-          .withPrecisionValues(6, 8)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 6, 8)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
         writer.write(table0);
@@ -4225,9 +4307,10 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException {
     File tempFile = File.createTempFile("test-uncompressed", ".parquet");
     try (Table table0 = getExpectedFileTableWithDecimals()) {
       ParquetWriterOptions options = ParquetWriterOptions.builder()
+          .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")
           .withCompressionType(CompressionType.NONE)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
-          .withPrecisionValues(4, 6)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 4, 6)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
         writer.write(table0);
@@ -4272,10 +4355,13 @@ void testArrowIPCWriteToFileWithNamesAndMetadata() throws IOException {
 
   @Test
   void testArrowIPCWriteToBufferChunked() {
-    try (Table table0 = getExpectedFileTable();
+    try (Table table0 = getExpectedFileTable(true);
          MyBufferConsumer consumer = new MyBufferConsumer()) {
       ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder()
               .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh")
+              .withColumnNames("eighth", "eighth_id", "eighth_name")
+              .withColumnNames("ninth")
+              .withColumnNames("tenth", "child_id", "child_name")
               .build();
       try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) {
         writer.write(table0);
@@ -4585,7 +4671,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out
     }
   }
 
-  private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) {
+  private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) {
     StructType nestedType = new StructType(true,
         new BasicType(false, DType.INT32), new BasicType(false, DType.STRING));
     try (Table input = new Table.TestBuilder()
@@ -4594,23 +4680,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) {
             Arrays.asList(struct(4, "k4"), struct(5, "k5")),
             Arrays.asList(struct(6, "k6")),
             Arrays.asList(new HostColumnVector.StructData((List) null)),
-            Arrays.asList())
+            null)
         .column("s1", "s2", "s3", "s4", "s5")
         .column(1, 3, 5, 7, 9)
         .column(12.0, 14.0, 13.0, 11.0, 15.0)
         .build()) {
       Table.TestBuilder expectedBuilder = new Table.TestBuilder();
       if (pos) {
-        expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
+        if (!outer)
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
+        else
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0);
       }
-      try (Table expected = expectedBuilder
-          .column(nestedType,
+      List expectedData = new ArrayList(){{
+        if (!outer) {
+          this.add(new HostColumnVector.StructData[]{
+              struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
+              struct(4, "k4"), struct(5, "k5"), struct(6, "k6"),
+              new HostColumnVector.StructData((List) null)});
+          this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"});
+          this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7});
+          this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0});
+        } else {
+          this.add(new HostColumnVector.StructData[]{
               struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
               struct(4, "k4"), struct(5, "k5"), struct(6, "k6"),
-              new HostColumnVector.StructData((List) null))
-          .column("s1", "s1", "s1", "s2", "s2", "s3", "s4")
-          .column(1, 1, 1, 3, 3, 5, 7)
-          .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0)
+              new HostColumnVector.StructData((List) null), null});
+          this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"});
+          this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9});
+          this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0});
+        }
+      }};
+      try (Table expected = expectedBuilder
+          .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0))
+          .column((String[]) expectedData.get(1))
+          .column((Integer[]) expectedData.get(2))
+          .column((Double[]) expectedData.get(3))
           .build()) {
         return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())};
       }
@@ -4629,7 +4734,7 @@ void testExplode() {
     }
 
     // Child is nested type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false);
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false);
     try (Table input = testTables2[0];
          Table expected = testTables2[1]) {
       try (Table exploded = input.explode(0)) {
@@ -4639,7 +4744,7 @@ void testExplode() {
   }
 
   @Test
-  void testPosExplode() {
+  void testExplodePosition() {
     // Child is primitive type
     Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false);
     try (Table input = testTables[0];
@@ -4649,8 +4754,8 @@ void testPosExplode() {
       }
     }
 
-    // Child is primitive type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true);
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false);
     try (Table input = testTables2[0];
          Table expected = testTables2[1]) {
       try (Table exploded = input.explodePosition(0)) {
@@ -4659,4 +4764,45 @@ void testPosExplode() {
     }
   }
 
+  @Test
+  void testExplodeOuter() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
+
+  @Test
+  void testExplodeOuterPosition() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
 }
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index ad798a73ed2..e5501428624 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from libcpp cimport bool
-from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared
 from libcpp.vector cimport vector
 from libcpp.utility cimport move
 from libc.stdint cimport int32_t, int64_t
@@ -24,6 +24,10 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.cpp.lists.gather cimport (
+    segmented_gather as cpp_segmented_gather
+)
 cimport cudf._lib.cpp.copying as cpp_copying
 
 # workaround for https://github.com/cython/cython/issues/3885
@@ -704,3 +708,22 @@ def sample(Table input, size_type n,
             else input._index_names
         )
     )
+
+
+def segmented_gather(Column source_column, Column gather_map):
+    cdef shared_ptr[lists_column_view] source_LCV = (
+        make_shared[lists_column_view](source_column.view())
+    )
+    cdef shared_ptr[lists_column_view] gather_map_LCV = (
+        make_shared[lists_column_view](gather_map.view())
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_segmented_gather(
+                source_LCV.get()[0], gather_map_LCV.get()[0])
+        )
+
+    result = Column.from_unique_ptr(move(c_result))
+    return result
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index f7f094834e6..519565fa48c 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -3,6 +3,7 @@
 from libcpp cimport bool
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libc.stdint cimport uint8_t
 
@@ -64,17 +65,35 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
     cdef cudf_io_types.table_with_metadata read_parquet(
         parquet_reader_options args) except +
 
+    cdef cppclass column_in_metadata:
+        column_in_metadata& set_name(const string& name)
+        column_in_metadata& set_nullability(bool nullable)
+        column_in_metadata& set_list_column_as_map()
+        column_in_metadata& set_int96_timestamps(bool req)
+        column_in_metadata& child(size_type i)
+
+    cdef cppclass table_input_metadata:
+        table_input_metadata() except +
+        table_input_metadata(const cudf_table_view.table_view& table) except +
+        table_input_metadata(
+            const cudf_table_view.table_view& table,
+            map[string, string] user_data
+        ) except +
+
+        vector[column_in_metadata] column_metadata
+        map[string, string] user_data
+
     cdef cppclass parquet_writer_options:
         parquet_writer_options() except +
         cudf_io_types.sink_info get_sink_info() except +
         cudf_io_types.compression_type get_compression() except +
         cudf_io_types.statistics_freq get_stats_level() except +
         cudf_table_view.table_view get_table() except +
-        const cudf_io_types.table_metadata get_metadata() except +
+        const table_input_metadata get_metadata() except +
         string get_column_chunks_file_path() except+
 
         void set_metadata(
-            cudf_io_types.table_metadata *m
+            table_input_metadata *m
         ) except +
         void set_stats_level(
             cudf_io_types.statistics_freq sf
@@ -100,7 +119,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             cudf_table_view.table_view table_
         ) except +
         parquet_writer_options_builder& metadata(
-            cudf_io_types.table_metadata *m
+            table_input_metadata *m
         ) except +
         parquet_writer_options_builder& stats_level(
             cudf_io_types.statistics_freq sf
@@ -126,11 +145,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         cudf_io_types.sink_info get_sink() except +
         cudf_io_types.compression_type get_compression() except +
         cudf_io_types.statistics_freq get_stats_level() except +
-        cudf_io_types.table_metadata_with_nullability* get_nullable_metadata(
+        table_input_metadata* get_metadata(
         ) except+
 
-        void set_nullable_metadata(
-            cudf_io_types.table_metadata_with_nullability *m
+        void set_metadata(
+            table_input_metadata *m
         ) except +
         void set_stats_level(
             cudf_io_types.statistics_freq sf
@@ -149,8 +168,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         chunked_parquet_writer_options_builder(
             cudf_io_types.sink_info sink_,
         ) except +
-        chunked_parquet_writer_options_builder& nullable_metadata(
-            cudf_io_types.table_metadata_with_nullability *m
+        chunked_parquet_writer_options_builder& metadata(
+            table_input_metadata *m
         ) except +
         chunked_parquet_writer_options_builder& stats_level(
             cudf_io_types.statistics_freq sf
diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd
new file mode 100644
index 00000000000..cd2d44d2e42
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/explode.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
+
+cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
+    cdef unique_ptr[table] explode_outer(
+        const table_view,
+        size_type explode_column_idx,
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
new file mode 100644
index 00000000000..ea664eee82e
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+
+cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] segmented_gather(
+        const lists_column_view source_column,
+        const lists_column_view gather_map_list
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
index ad675027c10..934269c6f25 100644
--- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
         string_character_types types_to_remove,
         string_scalar replacement,
         string_character_types types_to_keep) except +
-
-    cdef unique_ptr[column] is_integer(
-        column_view source_strings
-    ) except +
-
-    cdef unique_ptr[column] is_float(
-        column_view source_strings
-    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
index baee01b8f99..55a84b60efd 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
 
     cdef unique_ptr[column] from_floats(
         column_view input_col) except +
+
+    cdef unique_ptr[column] is_float(
+        column_view source_strings
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
index 92f99a2f5cb..6e45d4ba869 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
     cdef unique_ptr[column] from_integers(
         column_view input_col) except +
 
+    cdef unique_ptr[column] is_integer(
+        column_view source_strings
+    ) except +
+
     cdef unique_ptr[column] hex_to_integers(
         column_view input_col,
         data_type output_type) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index aba13580912..0f0ee35556a 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -1,17 +1,25 @@
 # Copyright (c) 2021, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr, shared_ptr, make_shared
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.lists.count_elements cimport (
     count_elements as cpp_count_elements
 )
+from cudf._lib.cpp.lists.explode cimport (
+    explode_outer as cpp_explode_outer
+)
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.column.column cimport column
 
-from cudf._lib.column cimport Column
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
 
+from cudf._lib.column cimport Column
+from cudf._lib.table cimport Table
 
 from cudf.core.dtypes import ListDtype
 
@@ -32,3 +40,21 @@ def count_elements(Column col):
 
     result = Column.from_unique_ptr(move(c_result))
     return result
+
+
+def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
+    cdef table_view c_table_view = (
+        tbl.data_view() if ignore_index else tbl.view()
+    )
+    cdef size_type c_explode_column_idx = explode_column_idx
+
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx))
+
+    return Table.from_unique_ptr(
+        move(c_result),
+        column_names=tbl._column_names,
+        index_names=None if ignore_index else tbl._index_names
+    )
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index a9739a02283..87179c02fe2 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -48,6 +48,8 @@ from cudf._lib.cpp.table.table_view cimport (
 from cudf._lib.cpp.io.parquet cimport (
     read_parquet as parquet_reader,
     parquet_reader_options,
+    table_input_metadata,
+    column_in_metadata,
     parquet_writer_options,
     write_parquet as parquet_writer,
     parquet_chunked_writer as cpp_parquet_chunked_writer,
@@ -284,10 +286,8 @@ cpdef write_parquet(
     """
 
     # Create the write options
-    cdef unique_ptr[cudf_io_types.table_metadata] tbl_meta = \
-        make_unique[cudf_io_types.table_metadata]()
+    cdef unique_ptr[table_input_metadata] tbl_meta
 
-    cdef vector[string] column_names
     cdef map[string, string] user_data
     cdef table_view tv
     cdef unique_ptr[cudf_io_types.data_sink] _data_sink
@@ -295,23 +295,29 @@ cpdef write_parquet(
 
     if index is not False and not isinstance(table._index, cudf.RangeIndex):
         tv = table.view()
+        tbl_meta = make_unique[table_input_metadata](tv)
         for level, idx_name in enumerate(table._index.names):
-            column_names.push_back(
+            tbl_meta.get().column_metadata[level].set_name(
                 str.encode(
                     _index_level_name(idx_name, level, table._column_names)
                 )
             )
+        num_index_cols_meta = len(table._index.names)
     else:
         tv = table.data_view()
+        tbl_meta = make_unique[table_input_metadata](tv)
+        num_index_cols_meta = 0
 
-    for col_name in table._column_names:
-        column_names.push_back(str.encode(col_name))
+    for i, name in enumerate(table._column_names, num_index_cols_meta):
+        tbl_meta.get().column_metadata[i].set_name(name.encode())
+        _set_col_children_names(
+            table[name]._column, tbl_meta.get().column_metadata[i]
+        )
 
     pandas_metadata = generate_pandas_metadata(table, index)
     user_data[str.encode("pandas")] = str.encode(pandas_metadata)
 
     # Set the table_metadata
-    tbl_meta.get().column_names = column_names
     tbl_meta.get().user_data = user_data
 
     cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression)
@@ -357,6 +363,7 @@ cdef class ParquetWriter:
     """
     cdef bool initialized
     cdef unique_ptr[cpp_parquet_chunked_writer] writer
+    cdef unique_ptr[table_input_metadata] tbl_meta
     cdef cudf_io_types.sink_info sink
     cdef unique_ptr[cudf_io_types.data_sink] _data_sink
     cdef cudf_io_types.statistics_freq stat_freq
@@ -416,20 +423,44 @@ cdef class ParquetWriter:
     def _initialize_chunked_state(self, Table table):
         """ Prepares all the values required to build the
         chunked_parquet_writer_options and creates a writer"""
-        cdef unique_ptr[cudf_io_types.table_metadata_with_nullability] tbl_meta
-        tbl_meta = make_unique[cudf_io_types.table_metadata_with_nullability]()
+        cdef table_view tv
 
         # Set the table_metadata
-        tbl_meta.get().column_names = get_column_names(table, self.index)
+        num_index_cols_meta = 0
+        self.tbl_meta = make_unique[table_input_metadata](table.data_view())
+        if self.index is not False:
+            if isinstance(table._index, cudf.core.multiindex.MultiIndex):
+                tv = table.view()
+                self.tbl_meta = make_unique[table_input_metadata](tv)
+                for level, idx_name in enumerate(table._index.names):
+                    self.tbl_meta.get().column_metadata[level].set_name(
+                        (str.encode(idx_name))
+                    )
+                num_index_cols_meta = len(table._index.names)
+            else:
+                if table._index.name is not None:
+                    tv = table.view()
+                    self.tbl_meta = make_unique[table_input_metadata](tv)
+                    self.tbl_meta.get().column_metadata[0].set_name(
+                        str.encode(table._index.name)
+                    )
+                    num_index_cols_meta = 1
+
+        for i, name in enumerate(table._column_names, num_index_cols_meta):
+            self.tbl_meta.get().column_metadata[i].set_name(name.encode())
+            _set_col_children_names(
+                table[name]._column, self.tbl_meta.get().column_metadata[i]
+            )
+
         pandas_metadata = generate_pandas_metadata(table, self.index)
-        tbl_meta.get().user_data[str.encode("pandas")] = \
+        self.tbl_meta.get().user_data[str.encode("pandas")] = \
             str.encode(pandas_metadata)
 
         cdef chunked_parquet_writer_options args
         with nogil:
             args = move(
                 chunked_parquet_writer_options.builder(self.sink)
-                .nullable_metadata(tbl_meta.get())
+                .metadata(self.tbl_meta.get())
                 .compression(self.comp_type)
                 .stats_level(self.stat_freq)
                 .build()
@@ -514,3 +545,15 @@ cdef Column _update_column_struct_field_names(
             )
         col.set_base_children(tuple(children))
     return col
+
+cdef _set_col_children_names(Column col, column_in_metadata& col_meta):
+    if is_struct_dtype(col):
+        for i, (child_col, name) in enumerate(
+            zip(col.children, list(col.dtype.fields))
+        ):
+            col_meta.child(i).set_name(name.encode())
+            _set_col_children_names(child_col, col_meta.child(i))
+    elif is_list_dtype(col):
+        _set_col_children_names(col.children[1], col_meta.child(1))
+    else:
+        return
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 6a78be000c9..a5945bc72f0 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -47,7 +47,7 @@ cdef class DeviceScalar:
 
     def __init__(self, value, dtype):
         """
-        cudf.Scalar: Type representing a scalar value on the device
+        Type representing an *immutable* scalar value on the device
 
         Parameters
         ----------
@@ -63,6 +63,7 @@ cdef class DeviceScalar:
         self._set_value(value, dtype)
 
     def _set_value(self, value, dtype):
+        # IMPORTANT: this should only ever be called from __init__
         valid = not _is_null_host_scalar(value)
 
         if pd.api.types.is_string_dtype(dtype):
@@ -128,9 +129,12 @@ cdef class DeviceScalar:
 
     def __repr__(self):
         if self.value is cudf.NA:
-            return f"Scalar({self.value}, {self.dtype.__repr__()})"
+            return (
+                f"{self.__class__.__name__}"
+                f"({self.value}, {self.dtype.__repr__()})"
+            )
         else:
-            return f"Scalar({self.value.__repr__()})"
+            return f"{self.__class__.__name__}({self.value.__repr__()})"
 
     @staticmethod
     cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr):
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 5d8d1522418..1890e98f956 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport (
     all_characters_of_type as cpp_all_characters_of_type,
     filter_characters_of_type as cpp_filter_characters_of_type,
     string_character_types as string_character_types,
-    is_integer as cpp_is_integer,
-    is_float as cpp_is_float,
 )
 
 
@@ -191,35 +189,3 @@ def is_space(Column source_strings):
         ))
 
     return Column.from_unique_ptr(move(c_result))
-
-
-def is_integer(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have intergers.
-    """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_integer(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-def is_float(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have floats.
-    """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_float(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
new file mode 100644
index 00000000000..195d9b71f6e
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.convert.convert_floats cimport (
+    is_float as cpp_is_float,
+)
+
+
+def is_float(Column source_strings):
+    """
+    Returns a Column of boolean values with True for `source_strings`
+    that can be converted to floats.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    with nogil:
+        c_result = move(cpp_is_float(
+            source_view
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
new file mode 100644
index 00000000000..d1bae1edd37
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.convert.convert_integers cimport (
+    is_integer as cpp_is_integer,
+)
+
+
+def is_integer(Column source_strings):
+    """
+    Returns a Column of boolean values with True for `source_strings`
+    that can be converted to integers.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    with nogil:
+        c_result = move(cpp_is_integer(
+            source_view
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 4c4ef17c6b9..6698a47b416 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -22,6 +22,7 @@ from cudf.utils.dtypes import (
     np_to_pa_dtype,
     is_categorical_dtype,
     is_list_dtype,
+    is_struct_dtype,
 )
 
 
@@ -79,7 +80,7 @@ cpdef generate_pandas_metadata(Table table, index):
                 "'category' column dtypes are currently not "
                 + "supported by the gpu accelerated parquet writer"
             )
-        elif is_list_dtype(col):
+        elif is_list_dtype(col) or is_struct_dtype(col):
             types.append(col.dtype.to_arrow())
         else:
             types.append(np_to_pa_dtype(col.dtype))
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index a60fe627acb..1d3f73822a9 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,14 +2,16 @@
 
 import pickle
 
+import numpy as np
 import pyarrow as pa
 
 import cudf
+from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import count_elements
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, column
+from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
-from cudf.utils.dtypes import is_list_dtype
+from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype
 
 
 class ListColumn(ColumnBase):
@@ -228,3 +230,58 @@ def len(self):
         dtype: int32
         """
         return self._return_or_inplace(count_elements(self._column))
+
+    def take(self, lists_indices):
+        """
+        Collect list elements based on given indices.
+
+        Parameters
+        ----------
+        lists_indices : list type array
+            Indices of the elements to collect from each row
+
+        Returns
+        -------
+        ListColumn
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1         None
+        2       [4, 5]
+        dtype: list
+        >>> s.list.take([[0, 1], [], []])
+        0    [1, 2]
+        1      None
+        2        []
+        dtype: list
+        """
+
+        lists_indices_col = as_column(lists_indices)
+        if not isinstance(lists_indices_col, ListColumn):
+            raise ValueError("lists_indices should be a list type array.")
+        if lists_indices_col.size != self._column.size:
+            raise ValueError(
+                "lists_indices and list column are of different sizes."
+            )
+        if not is_numerical_dtype(
+            lists_indices_col.children[1].dtype
+        ) or not np.issubdtype(
+            lists_indices_col.children[1].dtype, np.integer
+        ):
+            raise TypeError(
+                "lists_indices should be a column of integer index values."
+            )
+
+        try:
+            res = self._return_or_inplace(
+                segmented_gather(self._column, lists_indices_col)
+            )
+        except RuntimeError as e:
+            if "contains nulls" in str(e):
+                raise ValueError("lists_indices contains null.") from e
+            raise
+        else:
+            return res
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ea01aa07b91..11dd7556812 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -70,13 +70,15 @@
     is_alpha as cpp_is_alpha,
     is_decimal as cpp_is_decimal,
     is_digit as cpp_is_digit,
-    is_float as cpp_is_float,
-    is_integer as cpp_is_integer,
     is_lower as cpp_is_lower,
     is_numeric as cpp_is_numeric,
     is_space as cpp_isspace,
     is_upper as cpp_is_upper,
 )
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
+from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 25f57748765..812a20cba45 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1518,11 +1518,7 @@ def fallback(col, fn):
                 else:
                     if col not in df_cols:
                         r_opr = other_cols[col]
-                        l_opr = Series(
-                            column_empty(
-                                len(self), masked=True, dtype=other.dtype
-                            )
-                        )
+                        l_opr = Series(as_column(np.nan, length=len(self)))
                     if col not in other_cols_keys:
                         r_opr = None
                         l_opr = self[col]
@@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None):
         return self._apply_op("rpow", other, fill_value)
 
     def __rpow__(self, other):
-        return self._apply_op("__pow__", other)
+        return self._apply_op("__rpow__", other)
 
     def floordiv(self, other, axis="columns", level=None, fill_value=None):
         """
@@ -7322,15 +7318,6 @@ def to_parquet(self, path, *args, **kwargs):
         """{docstring}"""
         from cudf.io import parquet as pq
 
-        if any(
-            isinstance(col, cudf.core.column.StructColumn)
-            for col in self._data.columns
-        ):
-            raise NotImplementedError(
-                "Writing to parquet format is not yet supported "
-                "with Struct columns."
-            )
-
         return pq.to_parquet(self, path, *args, **kwargs)
 
     @ioutils.doc_to_feather()
@@ -7709,6 +7696,52 @@ def equals(self, other):
                 return False
         return super().equals(other)
 
+    def explode(self, column, ignore_index=False):
+        """
+        Transform each element of a list-like to a row, replicating index
+        values.
+
+        Parameters
+        ----------
+        column : str or tuple
+            Column to explode.
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame(
+        ...     {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]})
+        >>> df
+                   a   b
+        0  [1, 2, 3]  11
+        1         []  22
+        2       None  33
+        3     [4, 5]  44
+        >>> df.explode('a')
+              a   b
+        0     1  11
+        0     2  11
+        0     3  11
+        1  <NA>  22
+        2  <NA>  33
+        3     4  44
+        3     5  44
+        """
+        if column not in self._column_names:
+            raise KeyError(column)
+
+        if not is_list_dtype(self._data[column].dtype):
+            data = self._data.copy(deep=True)
+            idx = None if ignore_index else self._index.copy(deep=True)
+            return self.__class__._from_data(data, index=idx)
+
+        return super()._explode(column, ignore_index)
+
     _accessors = set()  # type: Set[Any]
 
 
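The one-line `__rpow__` fix above is behavior-changing: previously `other ** df` was silently evaluated as `df ** other`. A small illustration (output formatting assumed):

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3]})
    # With the fix, the reflected power dispatches correctly:
    # 2 ** df computes 2**x per element -> 2, 4, 8
    # (previously it was evaluated as df ** 2 -> 1, 4, 9)
    print(2 ** df)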
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index fab5936f94d..bfcc2d125db 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -573,6 +573,28 @@ def equals(self, other, **kwargs):
         else:
             return self._index.equals(other._index)
 
+    def _explode(self, explode_column: Any, ignore_index: bool):
+        """Helper function for `explode` in `Series` and `Dataframe`, explodes
+        a specified nested column. Other columns' corresponding rows are
+        duplicated. If ignore_index is set, the original index is not exploded
+        and will be replaced with a `RangeIndex`.
+        """
+        explode_column_num = self._column_names.index(explode_column)
+        if not ignore_index and self._index is not None:
+            explode_column_num += self._index.nlevels
+
+        res_tbl = libcudf.lists.explode_outer(
+            self, explode_column_num, ignore_index
+        )
+        res = self.__class__._from_table(res_tbl)
+
+        res._data.multiindex = self._data.multiindex
+        res._data._level_names = self._data._level_names
+
+        if not ignore_index and self._index is not None:
+            res.index.names = self._index.names
+        return res
+
     def _get_columns_by_label(self, labels, downcast):
         """
         Returns columns of the Frame specified by `labels`
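The column-number arithmetic in `_explode` reflects how the frame is flattened for libcudf: when the index is kept, its levels occupy the leading columns, shifting the data column's position right. A plain-Python sketch of just that offset logic (function name hypothetical):

    def explode_column_position(column_names, index_nlevels, column, ignore_index):
        # Position of `column` in the flattened table handed to libcudf.
        pos = column_names.index(column)
        if not ignore_index:
            # Index levels come first, so data columns shift right.
            pos += index_nlevels
        return pos

    # A 2-level index pushes data column "b" from position 1 to 3:
    assert explode_column_position(["a", "b"], 2, "b", ignore_index=False) == 3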
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index a3467e6fbe0..1e998ae37e2 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -56,7 +56,14 @@ def __init__(self, value, dtype=None):
         self._host_value = None
         self._host_dtype = None
         self._device_value = None
-        if isinstance(value, DeviceScalar):
+
+        if isinstance(value, Scalar):
+            if value._is_host_value_current:
+                self._host_value = value._host_value
+                self._host_dtype = value._host_dtype
+            else:
+                self._device_value = value._device_value
+        elif isinstance(value, DeviceScalar):
             self._device_value = value
         else:
             self._host_value, self._host_dtype = self._preprocess_host_value(
@@ -248,7 +255,10 @@ def __neg__(self):
     def __repr__(self):
         # str() fixes a numpy bug with NaT
         # https://github.com/numpy/numpy/issues/17552
-        return f"Scalar({str(self.value)}, dtype={self.dtype})"
+        return (
+            f"{self.__class__.__name__}"
+            f"({str(self.value)}, dtype={self.dtype})"
+        )
 
     def _binop_result_dtype_or_error(self, other, op):
         if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 5e7121c0488..7ed2157277c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1501,9 +1501,7 @@ def _binaryop(
         If ``reflect`` is ``True``, swap the order of the operands.
         """
         if isinstance(other, cudf.DataFrame):
-            # TODO: fn is not the same as arg expected by _apply_op
-            # e.g. for fn = 'and', _apply_op equivalent is '__and__'
-            return other._apply_op(self, fn)
+            return NotImplemented
 
         result_name = utils.get_result_name(self, other)
         if isinstance(other, Series):
@@ -6364,6 +6362,47 @@ def keys(self):
         """
         return self.index
 
+    def explode(self, ignore_index=False):
+        """
+        Transform each element of a list-like to a row, replicating index
+        values.
+
+        Parameters
+        ----------
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1           []
+        2         None
+        3       [4, 5]
+        dtype: list
+        >>> s.explode()
+        0       1
+        0       2
+        0       3
+        1    <NA>
+        2    <NA>
+        3       4
+        3       5
+        dtype: int64
+        """
+        if not is_list_dtype(self._column.dtype):
+            data = self._data.copy(deep=True)
+            idx = None if ignore_index else self._index.copy(deep=True)
+            return self.__class__._from_data(data, index=idx)
+
+        return super()._explode(self._column_names[0], ignore_index)
+
     _accessors = set()  # type: Set[Any]
 
 
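Returning `NotImplemented` from `Series._binaryop` defers to Python's operator protocol: the interpreter then tries the reflected method on the `DataFrame` operand, which knows how to align on columns. The mechanism in miniature:

    class A:
        def __add__(self, other):
            if isinstance(other, B):
                return NotImplemented  # decline; let B handle it
            return "A.__add__"

    class B:
        def __radd__(self, other):
            return "B.__radd__"

    print(A() + B())  # "B.__radd__": Python falls back to the reflected op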
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 4e5e4ce1987..535e497e8dc 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -8,7 +8,9 @@
 from pandas.core.tools.datetimes import _unit_map
 
 import cudf
-from cudf._lib.strings.char_types import is_integer as cpp_is_integer
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
 from cudf.core import column
 from cudf.core.index import as_index
 from cudf.utils.dtypes import is_scalar
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 77548b95277..b3ba439cb15 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4996,13 +4996,13 @@ def test_cov_nans():
 @pytest.mark.parametrize(
     "gsr",
     [
-        cudf.Series([1, 2, 3]),
-        cudf.Series([1, 2, 3], index=["a", "b", "c"]),
-        cudf.Series([1, 2, 3], index=["a", "b", "d"]),
-        cudf.Series([1, 2], index=["a", "b"]),
-        cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)),
+        cudf.Series([4, 2, 3]),
+        cudf.Series([4, 2, 3], index=["a", "b", "c"]),
+        cudf.Series([4, 2, 3], index=["a", "b", "d"]),
+        cudf.Series([4, 2], index=["a", "b"]),
+        cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)),
         pytest.param(
-            cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]),
+            cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]),
             marks=pytest.mark.xfail,
         ),
     ],
@@ -5017,32 +5017,32 @@ def test_cov_nans():
         operator.truediv,
         operator.mod,
         operator.pow,
-        # comparison ops will temporarily XFAIL
-        # see PR  https://github.com/rapidsai/cudf/pull/7491
-        pytest.param(operator.eq, marks=pytest.mark.xfail()),
-        pytest.param(operator.lt, marks=pytest.mark.xfail()),
-        pytest.param(operator.le, marks=pytest.mark.xfail()),
-        pytest.param(operator.gt, marks=pytest.mark.xfail()),
-        pytest.param(operator.ge, marks=pytest.mark.xfail()),
-        pytest.param(operator.ne, marks=pytest.mark.xfail()),
+        operator.eq,
+        operator.lt,
+        operator.le,
+        operator.gt,
+        operator.ge,
+        operator.ne,
     ],
 )
 def test_df_sr_binop(gsr, colnames, op):
-    data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]]
+    data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]]
     data = dict(zip(colnames, data))
 
+    gsr = gsr.astype("float64")
+
     gdf = cudf.DataFrame(data)
-    pdf = pd.DataFrame.from_dict(data)
+    pdf = gdf.to_pandas(nullable=True)
 
-    psr = gsr.to_pandas()
+    psr = gsr.to_pandas(nullable=True)
 
     expect = op(pdf, psr)
-    got = op(gdf, gsr)
-    assert_eq(expect.astype(float), got.astype(float))
+    got = op(gdf, gsr).to_pandas(nullable=True)
+    assert_eq(expect, got, check_dtype=False)
 
     expect = op(psr, pdf)
-    got = op(psr, pdf)
-    assert_eq(expect.astype(float), got.astype(float))
+    got = op(gsr, gdf).to_pandas(nullable=True)
+    assert_eq(expect, got, check_dtype=False)
 
 
 @pytest.mark.parametrize(
@@ -8442,3 +8442,56 @@ def test_rename_for_level_is_None_MC():
     got = gdf.rename(columns={"a": "f"}, level=None)
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            [[1, 2, 3], 11, "a"],
+            [None, 22, "e"],
+            [[4], 33, "i"],
+            [[], 44, "o"],
+            [[5, 6], 55, "u"],
+        ],  # nested
+        [
+            [1, 11, "a"],
+            [2, 22, "e"],
+            [3, 33, "i"],
+            [4, 44, "o"],
+            [5, 55, "u"],
+        ],  # non-nested
+    ],
+)
+@pytest.mark.parametrize(
+    ("labels", "label_to_explode"),
+    [
+        (None, 0),
+        (pd.Index(["a", "b", "c"]), "a"),
+        (
+            pd.MultiIndex.from_tuples(
+                [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
+            ),
+            (0, "a"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize(
+    "p_index",
+    [
+        None,
+        ["ia", "ib", "ic", "id", "ie"],
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
+        ),
+    ],
+)
+def test_explode(data, labels, ignore_index, p_index, label_to_explode):
+    pdf = pd.DataFrame(data, index=p_index, columns=labels)
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.explode(label_to_explode, ignore_index)
+    got = gdf.explode(label_to_explode, ignore_index)
+
+    assert_eq(expect, got, check_dtype=False)
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 195d8749ec6..33812cfa7a7 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -112,3 +112,50 @@ def test_len(data):
     got = gsr.list.len()
 
     assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("data", "idx"),
+    [
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]),
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]),
+        ([[1, 2, 3], []], [[0, 1], []]),
+        ([[1, 2, 3], [None]], [[0, 1], []]),
+        ([[1, None, 3], None], [[0, 1], []]),
+    ],
+)
+def test_take(data, idx):
+    ps = pd.Series(data)
+    gs = cudf.from_pandas(ps)
+
+    expected = pd.Series(zip(ps, idx)).map(
+        lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None
+    )
+    got = gs.list.take(idx)
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    ("invalid", "exception"),
+    [
+        ([[0]], pytest.raises(ValueError, match="different size")),
+        ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")),
+        (
+            [["a", "b"], ["c"]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        (
+            [[[1], [0]], [[0]]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        ([[0, 1], None], pytest.raises(ValueError, match="contains null")),
+    ],
+)
+def test_take_invalid(invalid, exception):
+    gs = cudf.Series([[0, 1], [2, 3]])
+    with exception:
+        gs.list.take(invalid)
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index b50ac06e0d0..faad489da86 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -754,3 +754,19 @@ def test_empty_dataframe():
 
     assert_eq(expected, got_df)
     assert_eq(expected_pdf, got_df)
+
+
+@pytest.mark.parametrize(
+    "data", [[None, ""], ["", None], [None, None], ["", ""]]
+)
+def test_empty_string_columns(data):
+    buffer = BytesIO()
+
+    expected = cudf.DataFrame({"string": data}, dtype="str")
+    expected.to_orc(buffer)
+
+    expected_pdf = pd.read_orc(buffer)
+    got_df = cudf.read_orc(buffer)
+
+    assert_eq(expected, got_df)
+    assert_eq(expected_pdf, got_df)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index dc4d0615a7f..6d50e4b6fee 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1834,3 +1834,89 @@ def test_parquet_writer_list_statistics(tmpdir):
             actual_max = cudf.Series(pd_slice[col].explode().explode()).max()
             stats_max = stats.max
             assert normalized_equals(actual_max, stats_max)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # Structs
+        {
+            "being": [
+                None,
+                {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}},
+                {"human?": None, "Deets": {"Name": "Angua", "Age": 25}},
+                {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}},
+                {"human?": False, "Deets": None},
+                {"human?": None, "Deets": {"Name": "Mr", "Age": None}},
+            ]
+        },
+        # List of Structs
+        pytest.param(
+            {
+                "family": [
+                    [
+                        None,
+                        {"human?": True, "deets": {"weight": 2.4, "age": 27}},
+                    ],
+                    [
+                        {"human?": None, "deets": {"weight": 5.3, "age": 25}},
+                        {"human?": False, "deets": {"weight": 8.0, "age": 31}},
+                        {"human?": False, "deets": None},
+                    ],
+                    [],
+                    [{"human?": None, "deets": {"weight": 6.9, "age": None}}],
+                ]
+            },
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/7561"
+            ),
+        ),
+        # Struct of Lists
+        pytest.param(
+            {
+                "Real estate records": [
+                    None,
+                    {
+                        "Status": "NRI",
+                        "Ownerships": {
+                            "land_unit": [None, 2, None],
+                            "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]],
+                        },
+                    },
+                    {
+                        "Status": None,
+                        "Ownerships": {
+                            "land_unit": [4, 5],
+                            "flats": [[7, 8], []],
+                        },
+                    },
+                    {
+                        "Status": "RI",
+                        "Ownerships": {"land_unit": None, "flats": [[]]},
+                    },
+                    {"Status": "RI", "Ownerships": None},
+                    {
+                        "Status": None,
+                        "Ownerships": {
+                            "land_unit": [7, 8, 9],
+                            "flats": [[], [], []],
+                        },
+                    },
+                ]
+            },
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/7562"
+            ),
+        ),
+    ],
+)
+def test_parquet_writer_nested(tmpdir, data):
+    expect = pd.DataFrame(data)
+    gdf = cudf.from_pandas(expect)
+
+    fname = tmpdir.join("test_parquet_writer_nested.parquet")
+    gdf.to_parquet(fname)
+    assert os.path.exists(fname)
+
+    got = pd.read_parquet(fname)
+    assert_eq(expect, got)
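With the struct guard removed from `DataFrame.to_parquet` (see the `dataframe.py` hunk above), struct columns now reach the parquet writer. A minimal round trip in the same shape as the passing test case (path and data illustrative):

    import pandas as pd
    import cudf

    pdf = pd.DataFrame({"being": [
        {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}},
        None,
    ]})
    gdf = cudf.from_pandas(pdf)        # struct column inferred via pyarrow
    gdf.to_parquet("structs.parquet")  # previously raised NotImplementedError
    print(pd.read_parquet("structs.parquet"))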
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 003e46c7e0d..58115cecee7 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -289,3 +289,17 @@ def test_device_scalar_direct_construction(value):
         assert s.dtype == "object"
     else:
         assert s.dtype == dtype
+
+
+@pytest.mark.parametrize("value", SCALAR_VALUES)
+def test_construct_from_scalar(value):
+    value = cudf.utils.utils.to_cudf_compatible_scalar(value)
+    x = cudf.Scalar(value, value.dtype)
+    y = cudf.Scalar(x)
+    assert x.value == y.value or (np.isnan(x.value) and np.isnan(y.value))
+
+    assert x._is_host_value_current == y._is_host_value_current
+    assert x._is_device_value_current == y._is_device_value_current
+
+    # check that the device value can be materialized:
+    y.device_value
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index a1b4236719d..beda14934ca 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1118,3 +1118,32 @@ def test_series_drop_raises():
     actual = gs.drop("p", errors="ignore")
 
     assert_eq(actual, expect)
+
+
+@pytest.mark.parametrize(
+    "data", [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]],
+)
+@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize(
+    "p_index",
+    [
+        None,
+        ["ia", "ib", "ic", "id", "ie"],
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
+        ),
+    ],
+)
+def test_explode(data, ignore_index, p_index):
+    pdf = pd.Series(data, index=p_index, name="someseries")
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.explode(ignore_index)
+    got = gdf.explode(ignore_index)
+
+    if data == [1, 2, 3, 4, 5] and ignore_index and p_index is not None:
+        # https://github.com/pandas-dev/pandas/issues/40487
+        with pytest.raises(AssertionError, match="different"):
+            assert_eq(expect, got, check_dtype=False)
+    else:
+        assert_eq(expect, got, check_dtype=False)