Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader (#13437)

Closes #12590 This PR adds support of reading `FIXED_LEN_BYTE_ARRAY` as lists of `INT8` in the parquet reader. Authors: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) - Robert (Bobby) Evans (https://github.com/revans2) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #13437
rapidsai · Aug 24, 2023 · 6095a92 · 6095a92
1 parent ff99f98
commit 6095a92
Show file tree

Hide file tree

Showing 7 changed files with 95 additions and 58 deletions.
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
@@ -48,7 +48,8 @@ inline __device__ void gpuOutputString(volatile page_state_s* s,
                                        void* dstv)
 {
   auto [ptr, len] = gpuGetStringData(s, sb, src_pos);
-  if (s->dtype_len == 4) {
+  // make sure to only hash `BYTE_ARRAY` when specified with the output type size
+  if (s->dtype_len == 4 and (s->col.data_type & 7) == BYTE_ARRAY) {
     // Output hash. This hash value is used if the option to convert strings to
     // categoricals is enabled. The seed value is chosen arbitrarily.
     uint32_t constexpr hash_seed = 33;
@@ -456,8 +457,12 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
   if (s->dict_base) {
     out_thread0 = (s->dict_bits > 0) ? 64 : 32;
   } else {
-    out_thread0 =
-      ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32;
+    switch (s->col.data_type & 7) {
+      case BOOLEAN: [[fallthrough]];
+      case BYTE_ARRAY: [[fallthrough]];
+      case FIXED_LEN_BYTE_ARRAY: out_thread0 = 64; break;
+      default: out_thread0 = 32;
+    }
   }
 
   PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
@@ -494,7 +499,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
         src_target_pos = gpuDecodeDictionaryIndices<false>(s, sb, src_target_pos, t & 0x1f).first;
       } else if ((s->col.data_type & 7) == BOOLEAN) {
         src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f);
-      } else if ((s->col.data_type & 7) == BYTE_ARRAY) {
+      } else if ((s->col.data_type & 7) == BYTE_ARRAY or
+                 (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
         gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
       }
       if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
@@ -564,6 +570,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData(
               }
               break;
           }
+        } else if (dtype == FIXED_LEN_BYTE_ARRAY) {
+          gpuOutputString(s, sb, val_src_pos, dst);
         } else if (dtype == INT96) {
           gpuOutputInt96Timestamp(s, sb, val_src_pos, static_cast<int64_t*>(dst));
         } else if (dtype_len == 8) {

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
@@ -421,13 +421,15 @@ __device__ size_type gpuInitStringDescriptors(page_state_s volatile* s,
     int k              = s->dict_val;
 
     while (pos < target_pos) {
-      int len;
-      if (k + 4 <= dict_size) {
-        len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24);
-        k += 4;
-        if (k + len > dict_size) { len = 0; }
+      int len = 0;
+      if ((s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
+        if (k < dict_size) { len = s->dtype_len_in; }
       } else {
-        len = 0;
+        if (k + 4 <= dict_size) {
+          len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24);
+          k += 4;
+          if (k + len > dict_size) { len = 0; }
+        }
       }
       if constexpr (!sizes_only) {
         sb->dict_idx[rolling_index<state_buf::dict_buf_size>(pos)] = k;
@@ -1154,16 +1156,20 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
       }
       // Special check for downconversions
       s->dtype_len_in = s->dtype_len;
-      if (s->col.converted_type == DECIMAL && data_type == FIXED_LEN_BYTE_ARRAY) {
-        s->dtype_len = [dtype_len = s->dtype_len]() {
-          if (dtype_len <= sizeof(int32_t)) {
-            return sizeof(int32_t);
-          } else if (dtype_len <= sizeof(int64_t)) {
-            return sizeof(int64_t);
-          } else {
-            return sizeof(__int128_t);
-          }
-        }();
+      if (data_type == FIXED_LEN_BYTE_ARRAY) {
+        if (s->col.converted_type == DECIMAL) {
+          s->dtype_len = [dtype_len = s->dtype_len]() {
+            if (dtype_len <= sizeof(int32_t)) {
+              return sizeof(int32_t);
+            } else if (dtype_len <= sizeof(int64_t)) {
+              return sizeof(int64_t);
+            } else {
+              return sizeof(__int128_t);
+            }
+          }();
+        } else {
+          s->dtype_len = sizeof(string_index_pair);
+        }
       } else if (data_type == INT32) {
         if (dtype_len_out == 1) {
           // INT8 output
@@ -1219,7 +1225,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
               uint32_t len = idx < max_depth - 1 ? sizeof(cudf::size_type) : s->dtype_len;
               // if this is a string column, then dtype_len is a lie. data will be offsets rather
               // than (ptr,len) tuples.
-              if (data_type == BYTE_ARRAY && s->dtype_len != 4) { len = sizeof(cudf::size_type); }
+              if (is_string_col(s->col)) { len = sizeof(cudf::size_type); }
               nesting_info->data_out += (output_offset * len);
             }
             if (nesting_info->string_out != nullptr) {

diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
@@ -521,39 +521,44 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz
     pp->num_valids = s->page.num_valids;
   }
 
-  // now process string info in the range [start_value, end_value)
-  // set up for decoding strings...can be either plain or dictionary
-  auto const& col          = s->col;
-  uint8_t const* data      = s->data_start;
-  uint8_t const* const end = s->data_end;
-  uint8_t const* dict_base = nullptr;
-  int dict_size            = 0;
-  size_t str_bytes         = 0;
-
-  switch (pp->encoding) {
-    case Encoding::PLAIN_DICTIONARY:
-    case Encoding::RLE_DICTIONARY:
-      // RLE-packed dictionary indices, first byte indicates index length in bits
-      if (col.str_dict_index) {
-        // String dictionary: use index
-        dict_base = reinterpret_cast<const uint8_t*>(col.str_dict_index);
-        dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair);
-      } else {
-        dict_base = col.page_info[0].page_data;  // dictionary is always stored in the first page
-        dict_size = col.page_info[0].uncompressed_page_size;
-      }
+  auto const& col  = s->col;
+  size_t str_bytes = 0;
+  // short circuit for FIXED_LEN_BYTE_ARRAY
+  if ((col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
+    str_bytes = pp->num_valids * s->dtype_len_in;
+  } else {
+    // now process string info in the range [start_value, end_value)
+    // set up for decoding strings...can be either plain or dictionary
+    uint8_t const* data      = s->data_start;
+    uint8_t const* const end = s->data_end;
+    uint8_t const* dict_base = nullptr;
+    int dict_size            = 0;
+
+    switch (pp->encoding) {
+      case Encoding::PLAIN_DICTIONARY:
+      case Encoding::RLE_DICTIONARY:
+        // RLE-packed dictionary indices, first byte indicates index length in bits
+        if (col.str_dict_index) {
+          // String dictionary: use index
+          dict_base = reinterpret_cast<const uint8_t*>(col.str_dict_index);
+          dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair);
+        } else {
+          dict_base = col.page_info[0].page_data;  // dictionary is always stored in the first page
+          dict_size = col.page_info[0].uncompressed_page_size;
+        }
 
-      // FIXME: need to return an error condition...this won't actually do anything
-      if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); }
-
-      str_bytes = totalDictEntriesSize(
-        data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value);
-      break;
-    case Encoding::PLAIN:
-      dict_size = static_cast<int32_t>(end - data);
-      str_bytes = is_bounds_pg ? totalPlainEntriesSize(data, dict_size, start_value, end_value)
-                               : dict_size - sizeof(int) * (pp->num_input_values - pp->num_nulls);
-      break;
+        // FIXME: need to return an error condition...this won't actually do anything
+        if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); }
+
+        str_bytes = totalDictEntriesSize(
+          data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value);
+        break;
+      case Encoding::PLAIN:
+        dict_size = static_cast<int32_t>(end - data);
+        str_bytes = is_bounds_pg ? totalPlainEntriesSize(data, dict_size, start_value, end_value)
+                                 : dict_size - sizeof(int) * pp->num_valids;
+        break;
+    }
   }
 
   if (t == 0) {

diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -452,8 +452,11 @@ struct EncPage {
  */
 constexpr bool is_string_col(ColumnChunkDesc const& chunk)
 {
-  return (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4 and
-         chunk.converted_type != DECIMAL;
+  auto const not_converted_to_decimal = chunk.converted_type != DECIMAL;
+  auto const non_hashed_byte_array =
+    (chunk.data_type & 7) == BYTE_ARRAY and (chunk.data_type >> 3) != 4;
+  auto const fixed_len_byte_array = (chunk.data_type & 7) == FIXED_LEN_BYTE_ARRAY;
+  return not_converted_to_decimal and (non_hashed_byte_array or fixed_len_byte_array);
 }
 
 /**

diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
@@ -401,9 +401,15 @@ table_with_metadata reader::impl::read_chunk_internal(
 
   // Create the final output cudf columns.
   for (size_t i = 0; i < _output_buffers.size(); ++i) {
-    auto const metadata = _reader_column_schema.has_value()
-                            ? std::make_optional<reader_column_schema>((*_reader_column_schema)[i])
-                            : std::nullopt;
+    auto metadata      = _reader_column_schema.has_value()
+                           ? std::make_optional<reader_column_schema>((*_reader_column_schema)[i])
+                           : std::nullopt;
+    auto const& schema = _metadata->get_schema(_output_column_schemas[i]);
+    // FIXED_LEN_BYTE_ARRAY never read as string
+    if (schema.type == FIXED_LEN_BYTE_ARRAY and schema.converted_type != DECIMAL) {
+      metadata = std::make_optional<reader_column_schema>();
+      metadata->set_convert_binary_to_strings(false);
+    }
     // Only construct `out_metadata` if `_output_metadata` has not been cached.
     if (!_output_metadata) {
       column_name_info& col_name = out_metadata.schema_info[i];

diff --git a/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet b/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -2568,6 +2568,15 @@ def test_parquet_reader_binary_decimal(datadir):
     assert_eq(expect, got)
 
 
+def test_parquet_reader_fixed_bin(datadir):
+    fname = datadir / "fixed_len_byte_array.parquet"
+
+    expect = pd.read_parquet(fname)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got)
+
+
 def test_parquet_reader_rle_boolean(datadir):
     fname = datadir / "rle_boolean_encoding.parquet"