rapidsai · rapids-bot · Jun 23, 2021 · May 28, 2021 · Jun 7, 2021 · Jun 10, 2021
@@ -109,6 +109,8 @@ class csv_reader_options {
 
   // Conversion settings
 
+  // Per-column types; if provided, this takes precedence over _dtypes
+  std::vector<data_type> _data_types;
   // Per-column types; disables type inference on those columns
   std::vector<std::string> _dtypes;
   // Additional values to recognize as boolean true values
@@ -286,6 +288,11 @@ class csv_reader_options {
    */
   std::vector<int> const& get_infer_date_indexes() const { return _infer_date_indexes; }
 
+  /**
+   * @brief Returns the per-column target data types.
+   *
+   * When this vector is non-empty it takes precedence over the string-based dtypes.
+   *
+   * @return Read-only reference to the stored vector of column data types.
+   */
+  std::vector<data_type> const& get_data_types() const
+  {
+    return _data_types;
+  }
+
   /**
    * @brief Returns per-column types.
    */
@@ -558,6 +565,13 @@ class csv_reader_options {
     _infer_date_indexes = std::move(col_ind);
   }
 
+  /**
+   * @brief Sets the per-column target data types.
+   *
+   * A non-empty vector set here takes precedence over the string-based dtypes.
+   *
+   * @param types Vector specifying the columns' target data types; taken by value
+   *              and moved into the options object.
+   */
+  void set_data_types(std::vector<data_type> types)
+  {
+    _data_types = std::move(types);
+  }
+
   /**
    * @brief Sets per-column types.
    *
@@ -962,6 +976,18 @@ class csv_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets the per-column target data types on the options being built.
+   *
+   * @param types Vector of data types, one per column, in which the columns are to be read.
+   * @return this for chaining.
+   */
+  csv_reader_options_builder& data_types(std::vector<data_type> types)
+  {
+    // forward to the options' own setter, which moves the vector into place
+    options.set_data_types(std::move(types));
+    return *this;
+  }
+
   /**
    * @brief Sets per-column types.
    *

@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <thrust/optional.h>
+#include <thrust/pair.h>
+
+#include <cstdint>
+#include <limits>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+/**
+ * @brief Parse the digits of a decimal string up to (not including) any exponent marker.
+ *
+ * Consumes digit characters and at most one decimal point character, stopping at the
+ * first other character or at `iter_end`. Digits that can no longer be accumulated
+ * without risking a `uint64_t` overflow are dropped; dropped integer digits are
+ * accounted for in the returned exponent offset instead.
+ *
+ * @param[in,out] iter Start of characters to parse; on return, points at the first
+ *                     character that was not consumed (or at `iter_end`)
+ * @param[in] iter_end End of characters to parse
+ * @param[in] decimal_pt_char Character recognized as the decimal point
+ * @return Pair of the accumulated integer digits and the base-10 exponent offset.
+ */
+__device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& iter,
+                                                                char const* iter_end,
+                                                                const char decimal_pt_char = '.')
+{
+  // largest accumulator value to which any further digit can still be appended safely;
+  // this preserves the most digits when scaling the final result
+  constexpr uint64_t decimal_max = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
+
+  uint64_t digits      = 0;
+  int32_t offset       = 0;
+  bool seen_decimal_pt = false;
+
+  for (; iter < iter_end; ++iter) {
+    auto const ch = *iter;
+    if (!seen_decimal_pt && ch == decimal_pt_char) {
+      seen_decimal_pt = true;
+      continue;
+    }
+    // stop on the first non-digit, leaving iter pointing at it
+    if (ch < '0' || ch > '9') { break; }
+
+    if (digits <= decimal_max) {
+      digits = (digits * 10) + static_cast<uint64_t>(ch - '0');
+      // each fractional digit that is kept lowers the exponent by one
+      if (seen_decimal_pt) { --offset; }
+    } else if (!seen_decimal_pt) {
+      // an integer digit was dropped: compensate with a higher exponent
+      ++offset;
+    }
+  }
+  return {digits, offset};
+}
+
+/**
+ * @brief Return the exponent of a decimal string.
+ *
+ * This should only be called after the exponent 'e' notation was detected.
+ * An optional leading '+' or '-' is accepted, followed by decimal digits.
+ *
+ * When `check_only == true`, `thrust::nullopt` is returned if a non-digit character
+ * is found or if the exponent magnitude does not fit in an int32.
+ * When `check_only == false`, parsing stops at the first non-digit character and an
+ * exponent magnitude that does not fit in an int32 saturates at the int32 maximum,
+ * so a value is always returned.
+ *
+ * An empty range yields an exponent of zero.
+ *
+ * @tparam check_only Set to true to verify the characters are valid and the
+ *         exponent value in the decimal string does not overflow int32
+ * @param[in] iter Start of characters to parse
+ *                 (points to the character after the 'E' or 'e')
+ * @param[in] iter_end End of characters to parse
+ * @return Integer value of the exponent
+ */
+template <bool check_only = false>
+__device__ thrust::optional<int32_t> parse_exponent(char const* iter, char const* iter_end)
+{
+  constexpr uint64_t exponent_max = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
+
+  // nothing to read: avoid dereferencing past the end of the input
+  if (iter >= iter_end) { return int32_t{0}; }
+
+  // get optional exponent sign
+  int32_t const exp_sign = [&iter] {
+    auto const ch = *iter;
+    if (ch != '-' && ch != '+') { return 1; }
+    ++iter;
+    return (ch == '-' ? -1 : 1);
+  }();
+
+  // parse exponent integer; accumulate in 64 bits so that the overflow check itself
+  // cannot overflow (the accumulator never exceeds exponent_max between iterations)
+  uint64_t exp_ten = 0;
+  while (iter < iter_end) {
+    auto const ch = *iter++;
+    if (ch < '0' || ch > '9') {
+      if (check_only) { return thrust::nullopt; }
+      break;
+    }
+
+    exp_ten = (exp_ten * 10) + static_cast<uint64_t>(ch - '0');
+    if (exp_ten > exponent_max) {
+      if (check_only) { return thrust::nullopt; }  // report overflow
+      exp_ten = exponent_max;                      // saturate instead of overflowing
+    }
+  }
+
+  return static_cast<int32_t>(exp_ten) * exp_sign;
+}
+
+/**
+ * @brief Converts the string in the range [iter, iter_end) into a decimal.
+ *
+ * An optional leading sign is accepted, followed by digits with an optional decimal
+ * point and an optional exponent introduced by 'e' or 'E'. The parsed digits are
+ * shifted by powers of ten so that the result is expressed at the requested scale;
+ * digits below the scale are truncated.
+ *
+ * @tparam DecimalType The decimal type to be returned
+ * @param iter The beginning of the string. Unless iter >= iter_end, iter is dereferenced
+ * @param iter_end The end of the characters to parse
+ * @param scale The scale to be applied
+ * @return The parsed value, shifted to `scale` and cast to `DecimalType`;
+ *         zero if no non-zero digits were parsed
+ */
+template <typename DecimalType>
+__device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int32_t scale)
+{
+  auto const sign = [&] {
+    if (iter_end <= iter) { return 0; }
+    if (*iter == '-') { return -1; }
+    if (*iter == '+') { return 1; }
+    return 0;
+  }();
+
+  // if string begins with a sign, continue with next character
+  if (sign != 0) ++iter;
+
+  auto [value, exp_offset] = parse_integer(iter, iter_end);
+  if (value == 0) { return DecimalType{0}; }
+
+  // check for exponent
+  int32_t exp_ten = 0;
+  if ((iter < iter_end) && (*iter == 'e' || *iter == 'E')) {
+    ++iter;
+    if (iter < iter_end) { exp_ten = parse_exponent<false>(iter, iter_end).value(); }
+  }
+
+  // number of decimal places to shift the value by; computed in 64 bits so that
+  // combining the exponent, the offset and the scale cannot overflow
+  auto shift = static_cast<int64_t>(exp_ten) + static_cast<int64_t>(exp_offset) -
+               static_cast<int64_t>(scale);
+
+  // shift the output value using integer arithmetic only; both loops stop as soon as
+  // the value reaches zero, which bounds the iteration count for any shift
+  if (shift < 0) {
+    for (; shift < 0 && value != 0; ++shift) { value /= 10; }
+  } else {
+    for (; shift > 0 && value != 0; --shift) { value *= 10; }
+  }
+
+  return static_cast<DecimalType>(value) * (sign == 0 ? 1 : sign);
+}
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
@@ -25,6 +25,7 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/lists/list_view.cuh>
 #include <cudf/null_mask.hpp>
+#include <cudf/strings/detail/convert/fixed_point.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -410,26 +411,6 @@ __inline__ __device__ cudf::list_view decode_value(char const *begin,
   return cudf::list_view{};
 }
 
-// The purpose of this is merely to allow compilation ONLY
-// TODO : make this work for csv
-template <>
-__inline__ __device__ numeric::decimal32 decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
-{
-  return numeric::decimal32{};
-}
-
-// The purpose of this is merely to allow compilation ONLY
-// TODO : make this work for csv
-template <>
-__inline__ __device__ numeric::decimal64 decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
-{
-  return numeric::decimal64{};
-}
-
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
@@ -451,11 +432,13 @@ struct decode_op {
    *
    * @return bool Whether the parsed value is valid.
    */
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value and !std::is_same<T, bool>::value>
-              * = nullptr>
+  template <
+    typename T,
+    typename std::enable_if_t<std::is_integral<T>::value and !std::is_same<T, bool>::value and
+                              !cudf::is_fixed_point<T>()> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type output_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -473,12 +456,37 @@ struct decode_op {
     return true;
   }
 
+  /**
+   * @brief Dispatch for fixed point types.
+   *
+   * Parses the field in [begin, end) as a decimal at the scale of `output_type` and
+   * stores the result in `out_buffer` at index `row`, using the device storage type
+   * of `T`. The `opts` and `flags` arguments are not used by this overload.
+   *
+   * @return bool Whether the parsed value is valid; this overload always returns true.
+   */
+  template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()> * = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+                                                      size_t row,
+                                                      const data_type output_type,
+                                                      char const *begin,
+                                                      char const *end,
+                                                      parse_options_view const &opts,
+                                                      column_parse::flags flags)
+  {
+    using storage_type = device_storage_type_t<T>;
+
+    static_cast<storage_type *>(out_buffer)[row] =
+      strings::detail::parse_decimal<storage_type>(begin, end, output_type.scale());
+
+    return true;
+  }
+
   /**
    * @brief Dispatch for boolean type types.
    */
   template <typename T, typename std::enable_if_t<std::is_same<T, bool>::value> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type output_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -502,6 +510,7 @@ struct decode_op {
   template <typename T, typename std::enable_if_t<std::is_floating_point<T>::value> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type output_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -516,11 +525,13 @@ struct decode_op {
   /**
    * @brief Dispatch for all other types.
    */
-  template <typename T,
-            typename std::enable_if_t<!std::is_integral<T>::value and
-                                      !std::is_floating_point<T>::value> * = nullptr>
+  template <
+    typename T,
+    typename std::enable_if_t<!std::is_integral<T>::value and !std::is_floating_point<T>::value and
+                              !cudf::is_fixed_point<T>()> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type output_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -605,6 +616,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
                                     decode_op{},
                                     columns[actual_col],
                                     rec_id,
+                                    dtypes[actual_col],
                                     field_start,
                                     field_end,
                                     options,

@@ -566,6 +566,9 @@ std::vector<data_type> reader::impl::gather_column_types(device_span<char const>
                                                          device_span<uint64_t const> row_offsets,
                                                          rmm::cuda_stream_view stream)
 {
+  // if a vector of data_types was provided, this takes precedence over other options
+  if (!opts_.get_data_types().empty()) { return opts_.get_data_types(); }
+
   std::vector<data_type> dtypes;
 
   if (opts_.get_dtypes().empty()) {