Rewriting row/column conversions for Spark <-> cudf data conversions (#…

…8444) Row to column and column to row conversions changed to support large numbers of columns and variable-width data. So far this is the column to row work and variable width work is not completed yet. This code is currently copied over to the cudf side for benchmarking, but will not remain there. Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - MithunR (https://github.com/mythrocks) - https://github.com/nvdbaranec URL: #8444
rapidsai · Jan 10, 2022 · dd390a2 · dd390a2
1 parent b7b87fb
commit dd390a2
Show file tree

Hide file tree

Showing 8 changed files with 1,758 additions and 263 deletions.
diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  *     Copyright 2019 Eyal Rozenberg <eyalroz@blazingdb.com>
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,12 +33,18 @@ namespace cudf {
 //! Utility functions
 namespace util {
 /**
- * Finds the smallest integer not less than `number_to_round` and modulo `S` is
- * zero. This function assumes that `number_to_round` is non-negative and
- * `modulus` is positive.
+ * @brief Rounds `number_to_round` up to the next multiple of modulus
+ *
+ * @tparam S type to return
+ * @param number_to_round number that is being rounded
+ * @param modulus value to which to round
+ * @return smallest integer greater than `number_to_round` and modulo `S` is zero.
+ *
+ * @note This function assumes that `number_to_round` is non-negative and
+ * `modulus` is positive. The safety is in regard to rollover.
  */
 template <typename S>
-inline S round_up_safe(S number_to_round, S modulus)
+S round_up_safe(S number_to_round, S modulus)
 {
   auto remainder = number_to_round % modulus;
   if (remainder == 0) { return number_to_round; }
@@ -50,18 +56,44 @@ inline S round_up_safe(S number_to_round, S modulus)
 }
 
 /**
- * Finds the largest integer not greater than `number_to_round` and modulo `S` is
- * zero. This function assumes that `number_to_round` is non-negative and
- * `modulus` is positive.
+ * @brief Rounds `number_to_round` down to the last multiple of modulus
+ *
+ * @tparam S type to return
+ * @param number_to_round number that is being rounded
+ * @param modulus value to which to round
+ * @return largest integer not greater than `number_to_round` and modulo `S` is zero.
+ *
+ * @note This function assumes that `number_to_round` is non-negative and
+ * `modulus` is positive and does not check for overflow.
  */
 template <typename S>
-inline S round_down_safe(S number_to_round, S modulus)
+S round_down_safe(S number_to_round, S modulus) noexcept
 {
   auto remainder    = number_to_round % modulus;
   auto rounded_down = number_to_round - remainder;
   return rounded_down;
 }
 
+/**
+ * @brief Rounds `number_to_round` up to the next multiple of modulus
+ *
+ * @tparam S type to return
+ * @param number_to_round number that is being rounded
+ * @param modulus value to which to round
+ * @return smallest integer greater than `number_to_round` and modulo `S` is zero.
+ *
+ * @note This function assumes that `number_to_round` is non-negative and
+ * `modulus` is positive and does not check for overflow.
+ */
+template <typename S>
+constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept
+{
+  auto remainder = number_to_round % modulus;
+  if (remainder == 0) { return number_to_round; }
+  auto rounded_up = number_to_round - remainder + modulus;
+  return rounded_up;
+}
+
 /**
  * Divides the left-hand-side by the right-hand-side, rounding up
  * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3.
@@ -75,26 +107,24 @@ inline S round_down_safe(S number_to_round, S modulus)
  * the result will be incorrect
  */
 template <typename S, typename T>
-constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept
+constexpr S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept
 {
   return (dividend + divisor - 1) / divisor;
 }
 
 namespace detail {
 template <typename I>
-constexpr inline I div_rounding_up_safe(std::integral_constant<bool, false>,
-                                        I dividend,
-                                        I divisor) noexcept
+constexpr I div_rounding_up_safe(std::integral_constant<bool, false>,
+                                 I dividend,
+                                 I divisor) noexcept
 {
   // TODO: This could probably be implemented faster
   return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
                               : (dividend > 0);
 }
 
 template <typename I>
-constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
-                                        I dividend,
-                                        I divisor) noexcept
+constexpr I div_rounding_up_safe(std::integral_constant<bool, true>, I dividend, I divisor) noexcept
 {
   auto quotient  = dividend / divisor;
   auto remainder = dividend % divisor;
@@ -116,14 +146,14 @@ constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
  * approach of using (dividend + divisor - 1) / divisor
  */
 template <typename I>
-constexpr inline I div_rounding_up_safe(I dividend, I divisor) noexcept
+constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept
 {
   using i_is_a_signed_type = std::integral_constant<bool, std::is_signed<I>::value>;
   return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor);
 }
 
 template <typename I>
-constexpr inline bool is_a_power_of_two(I val) noexcept
+constexpr bool is_a_power_of_two(I val) noexcept
 {
   static_assert(std::is_integral<I>::value, "This function only applies to integral types");
   return ((val - 1) & val) == 0;
@@ -153,7 +183,7 @@ constexpr inline bool is_a_power_of_two(I val) noexcept
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-constexpr inline auto absolute_value(T value) -> T
+constexpr auto absolute_value(T value) -> T
 {
   if constexpr (cuda::std::is_signed<T>()) return numeric::detail::abs(value);
   return value;

diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
@@ -40,13 +40,6 @@ namespace {
 // align all column size allocations to this boundary so that all output column buffers
 // start at that alignment.
 static constexpr std::size_t split_align = 64;
-inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::size_t modulus)
-{
-  auto remainder = number_to_round % modulus;
-  if (remainder == 0) { return number_to_round; }
-  auto rounded_up = number_to_round - remainder + modulus;
-  return rounded_up;
-}
 
 /**
  * @brief Struct which contains information on a source buffer.
@@ -960,7 +953,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
       std::size_t const bytes =
         static_cast<std::size_t>(num_elements) * static_cast<std::size_t>(element_size);
 
-      return dst_buf_info{_round_up_safe(bytes, 64),
+      return dst_buf_info{util::round_up_unsafe(bytes, 64ul),
                           num_elements,
                           element_size,
                           num_rows,

diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
@@ -393,7 +393,7 @@ public final void setInts(long offset, int[] data, long srcOffset, long len) {
    */
   public final long getLong(long offset) {
     long requestedAddress = this.address + offset;
-    addressOutOfBoundsCheck(requestedAddress, 8, "setLong");
+    addressOutOfBoundsCheck(requestedAddress, 8, "getLong");
     return UnsafeMemoryAccessor.getLong(requestedAddress);
   }
 
@@ -404,7 +404,7 @@ public final long getLong(long offset) {
    */
   public final void setLong(long offset, long value) {
     long requestedAddress = this.address + offset;
-    addressOutOfBoundsCheck(requestedAddress, 8, "getLong");
+    addressOutOfBoundsCheck(requestedAddress, 8, "setLong");
     UnsafeMemoryAccessor.setLong(requestedAddress, value);
   }
 

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -663,8 +663,12 @@ private static native long[] scatterScalars(long[] srcScalarHandles, long scatte
 
   private static native long[] convertToRows(long nativeHandle);
 
+  private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle);
+
   private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale);
 
+  private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale);
+
   private static native long[] repeatStaticCount(long tableHandle, int count);
 
   private static native long[] repeatColumnCount(long tableHandle,
@@ -2781,6 +2785,23 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable,
     return buildSemiJoinGatherMap(gatherMapData);
   }
 
+  /**
+   * For details about how this method functions refer to
+   * {@link #convertToRowsFixedWidthOptimized()}.
+   *
+   * The only thing different between this method and {@link #convertToRowsFixedWidthOptimized()}
+   * is that this can handle rougly 250M columns while {@link #convertToRowsFixedWidthOptimized()}
+   * can only handle columns less than 100
+   */
+  public ColumnVector[] convertToRows() {
+    long[] ptrs = convertToRows(nativeHandle);
+    ColumnVector[] ret = new ColumnVector[ptrs.length];
+    for (int i = 0; i < ptrs.length; i++) {
+      ret[i] = new ColumnVector(ptrs[i]);
+    }
+    return ret;
+  }
+
   /**
    * Convert this table of columns into a row major format that is useful for interacting with other
    * systems that do row major processing of the data. Currently only fixed-width column types are
@@ -2855,8 +2876,8 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable,
    * There are some limits on the size of a single row.  If the row is larger than 1KB this will
    * throw an exception.
    */
-  public ColumnVector[] convertToRows() {
-    long[] ptrs = convertToRows(nativeHandle);
+  public ColumnVector[] convertToRowsFixedWidthOptimized() {
+    long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle);
     ColumnVector[] ret = new ColumnVector[ptrs.length];
     for (int i = 0; i < ptrs.length; i++) {
       ret[i] = new ColumnVector(ptrs[i]);
@@ -2867,13 +2888,14 @@ public ColumnVector[] convertToRows() {
   /**
    * Convert a column of list of bytes that is formatted like the output from `convertToRows`
    * and convert it back to a table.
+   *
+   * NOTE: This method doesn't support nested types
+   *
    * @param vec the row data to process.
    * @param schema the types of each column.
    * @return the parsed table.
    */
   public static Table convertFromRows(ColumnView vec, DType ... schema) {
-    // TODO at some point we need a schema that support nesting so we can support nested types
-    // TODO we will need scale at some point very soon too
     int[] types = new int[schema.length];
     int[] scale = new int[schema.length];
     for (int i = 0; i < schema.length; i++) {
@@ -2884,6 +2906,27 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) {
     return new Table(convertFromRows(vec.getNativeView(), types, scale));
   }
 
+  /**
+   * Convert a column of list of bytes that is formatted like the output from `convertToRows`
+   * and convert it back to a table.
+   *
+   * NOTE: This method doesn't support nested types
+   *
+   * @param vec the row data to process.
+   * @param schema the types of each column.
+   * @return the parsed table.
+   */
+  public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) {
+    int[] types = new int[schema.length];
+    int[] scale = new int[schema.length];
+    for (int i = 0; i < schema.length; i++) {
+      types[i] = schema[i].typeId.nativeId;
+      scale[i] = schema[i].getScale();
+
+    }
+    return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale));
+  }
+
   /**
    * Construct a table from a packed representation.
    * @param metadata host-based metadata for the table

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
@@ -2861,6 +2861,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) {
+  JNI_NULL_CHECK(env, input_table, "input table is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    std::vector<std::unique_ptr<cudf::column>> cols =
+        cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table);
+    int num_columns = cols.size();
+    cudf::jni::native_jlongArray outcol_handles(env, num_columns);
+    for (int i = 0; i < num_columns; i++) {
+      outcol_handles[i] = reinterpret_cast<jlong>(cols[i].release());
+    }
+    return outcol_handles.get_jArray();
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, jclass,
                                                                     jlong j_input, jlong j_map,
                                                                     jlong j_target,
@@ -2908,7 +2927,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env
   try {
     cudf::jni::auto_set_device(env);
     cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
-    std::vector<std::unique_ptr<cudf::column>> cols = cudf::java::convert_to_rows(*n_input_table);
+    std::vector<std::unique_ptr<cudf::column>> cols = cudf::jni::convert_to_rows(*n_input_table);
     int num_columns = cols.size();
     cudf::jni::native_jlongArray outcol_handles(env, num_columns);
     for (int i = 0; i < num_columns; i++) {
@@ -2919,6 +2938,28 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(
+    JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) {
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+  JNI_NULL_CHECK(env, types, "types is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_column);
+    cudf::lists_column_view list_input(*input);
+    cudf::jni::native_jintArray n_types(env, types);
+    cudf::jni::native_jintArray n_scale(env, scale);
+    std::vector<cudf::data_type> types_vec;
+    for (int i = 0; i < n_types.size(); i++) {
+      types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i]));
+    }
+    std::unique_ptr<cudf::table> result =
+        cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec);
+    return cudf::jni::convert_table_for_return(env, result);
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass,
                                                                        jlong input_column,
                                                                        jintArray types,
@@ -2936,7 +2977,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e
     for (int i = 0; i < n_types.size(); i++) {
       types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i]));
     }
-    std::unique_ptr<cudf::table> result = cudf::java::convert_from_rows(list_input, types_vec);
+    std::unique_ptr<cudf::table> result = cudf::jni::convert_from_rows(list_input, types_vec);
     return cudf::jni::convert_table_for_return(env, result);
   }
   CATCH_STD(env, 0);