Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decimal support csv reader #8511

Merged
merged 12 commits into from
Jun 23, 2021
26 changes: 26 additions & 0 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ class csv_reader_options {

// Conversion settings

// Per-column types; if provided, this takes precedence over _dtypes
std::vector<data_type> _data_types;
// Per-column types; disables type inference on those columns
std::vector<std::string> _dtypes;
// Additional values to recognize as boolean true values
Expand Down Expand Up @@ -286,6 +288,11 @@ class csv_reader_options {
*/
std::vector<int> const& get_infer_date_indexes() const { return _infer_date_indexes; }

/**
* @brief Returns the per-column types given as `data_type` objects.
*
* This is the vector stored by `set_data_types()`; when it is not empty it
* takes precedence over the string-based `dtypes`.
*
* @return Const reference to the vector of per-column `data_type`s.
*/
std::vector<data_type> const& get_data_types() const { return _data_types; }
vuule marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Returns per-column types.
*/
Expand Down Expand Up @@ -558,6 +565,13 @@ class csv_reader_options {
_infer_date_indexes = std::move(col_ind);
}

/**
* @brief Sets per-column types. If this is set, this takes precedence over dtypes.
*
* The vector is taken by value and moved into the options object, so callers
* may pass a temporary or `std::move` an existing vector to avoid a copy.
*
* @param types Vector specifying the columns' target data types.
*/
void set_data_types(std::vector<data_type> types) { _data_types = std::move(types); }

/**
* @brief Sets per-column types.
*
Expand Down Expand Up @@ -962,6 +976,18 @@ class csv_reader_options_builder {
return *this;
}

/**
* @brief Sets per-column types as `data_type` objects.
*
* When a non-empty vector is provided here, it takes precedence over the
* string-based `dtypes` option.
*
* @param types Vector of data types in which the column needs to be read.
* @return this for chaining.
*/
csv_reader_options_builder& data_types(std::vector<data_type> types)
{
// take ownership of the caller's vector instead of copying it
options._data_types = std::move(types);
return *this;
}

/**
* @brief Sets per-column types.
*
Expand Down
155 changes: 155 additions & 0 deletions cpp/include/cudf/strings/detail/convert/fixed_point.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <thrust/optional.h>
#include <thrust/pair.h>

namespace cudf {
namespace strings {
namespace detail {

/**
 * @brief Return the integer component of a decimal string.
 *
 * Reads digits (and at most one decimal point) up to the first character that
 * is neither, e.g. the exponent marker 'e'. The decimal point is dropped and
 * its position is folded into the returned exponent offset. Digits that no
 * longer fit into the 64-bit accumulator are discarded; discarded digits in
 * front of the decimal point increase the exponent offset instead.
 *
 * @param[in,out] iter Start of characters to parse; on return points at the
 *                     first character that was not consumed (or `iter_end`)
 * @param[in] iter_end End of characters to parse
 * @param[in] decimal_pt_char Character recognized as the decimal point
 * @return Integer component and exponent offset.
 */
__device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& iter,
                                                                char const* iter_end,
                                                                const char decimal_pt_char = '.')
{
  // largest accumulator value that can still take one more decimal digit (0-9)
  // without wrapping; keeps as many significant digits as possible
  constexpr uint64_t decimal_max = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;

  uint64_t digits       = 0;
  int32_t exponent_bias = 0;
  bool seen_decimal_pt  = false;

  for (; iter < iter_end; ++iter) {
    char const ch = *iter;

    // only the first decimal point is consumed; a second one ends the number
    if (!seen_decimal_pt && ch == decimal_pt_char) {
      seen_decimal_pt = true;
      continue;
    }

    bool const is_digit = (ch >= '0') && (ch <= '9');
    if (!is_digit) { break; }  // leave iter on the first unparsed character

    if (digits <= decimal_max) {
      digits = (digits * 10) + static_cast<uint64_t>(ch - '0');
      // every fractional digit kept shifts the decimal point one place left
      if (seen_decimal_pt) { --exponent_bias; }
    } else if (!seen_decimal_pt) {
      // integer digit dropped for lack of room: account for it in the exponent
      ++exponent_bias;
    }
  }

  return {digits, exponent_bias};
}

/**
 * @brief Return the exponent of a decimal string.
 *
 * This should only be called after the exponent 'e' notation was detected.
 * An optional leading '+' or '-' is accepted, followed by decimal digits.
 *
 * The result can only be invalid (`thrust::nullopt`) if `check_only == true`
 * and either a non-digit character is found or the exponent magnitude does not
 * fit in an int32. If `check_only == false`, parsing stops at the first
 * non-digit character and a magnitude that does not fit is saturated to
 * `std::numeric_limits<int32_t>::max()`.
 *
 * An empty range, or a sign with no digits after it, yields an exponent of 0.
 *
 * @tparam check_only Set to true to verify the characters are valid and the
 *         exponent value in the decimal string does not overflow int32
 * @param[in] iter Start of characters to parse
 *                 (points to the character after the 'E' or 'e')
 * @param[in] iter_end End of characters to parse
 * @return Integer value of the exponent
 */
template <bool check_only = false>
__device__ thrust::optional<int32_t> parse_exponent(char const* iter, char const* iter_end)
{
  constexpr int64_t exponent_max = std::numeric_limits<int32_t>::max();

  // nothing after the exponent marker: do not dereference past the end
  if (iter >= iter_end) { return 0; }

  // get optional exponent sign
  int32_t exp_sign = 1;
  if (*iter == '-' || *iter == '+') {
    exp_sign = (*iter == '-') ? -1 : 1;
    ++iter;
  }

  // parse exponent integer; accumulate in 64 bits so that the overflow check
  // is performed on the true value rather than on an already-wrapped one
  int64_t exp_ten = 0;
  while (iter < iter_end) {
    auto const ch = *iter++;
    if (ch < '0' || ch > '9') {
      if (check_only) { return thrust::nullopt; }
      break;
    }

    // exp_ten <= exponent_max holds here, so this cannot overflow an int64
    exp_ten = (exp_ten * 10) + static_cast<int64_t>(ch - '0');
    if (exp_ten > exponent_max) {
      if (check_only) { return thrust::nullopt; }  // exponent overflows int32
      exp_ten = exponent_max;                      // saturate instead of wrapping
    }
  }

  // magnitude is at most INT32_MAX, so applying the sign cannot overflow
  return static_cast<int32_t>(exp_ten) * exp_sign;
}

/**
 * @brief Converts the string in the range [iter, iter_end) into a decimal.
 *
 * The string may start with an optional '+' or '-', followed by digits with an
 * optional decimal point, followed by an optional exponent ('e' or 'E' and an
 * optionally signed integer). Parsing stops at the first character that does
 * not fit this pattern.
 *
 * The parsed number is rescaled so that `number ~= result * 10^scale`.
 * Digits below the requested scale are truncated (not rounded). Values whose
 * rescaled magnitude does not fit in 64 bits are not detected and wrap around.
 *
 * @tparam DecimalType The decimal type to be returned
 * @param iter The beginning of the string. Unless iter >= iter_end, iter is dereferenced
 * @param iter_end The end of the characters to parse
 * @param scale The scale to be applied
 * @return The parsed value, expressed as an integer in units of 10^scale
 */
template <typename DecimalType>
__device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int32_t scale)
{
  // -1 / +1 if an explicit sign character is present, 0 if there is none
  auto const sign = [&] {
    if (iter_end <= iter) { return 0; }
    if (*iter == '-') { return -1; }
    if (*iter == '+') { return 1; }
    return 0;
  }();

  // if string begins with a sign, continue with next character
  if (sign != 0) { ++iter; }

  auto [value, exp_offset] = parse_integer(iter, iter_end);
  if (value == 0) { return DecimalType{0}; }

  // combine the explicit exponent (if any) with the offset produced by the
  // decimal point; use 64 bits so that extreme exponents cannot overflow
  int64_t exp_ten = exp_offset;
  if ((iter < iter_end) && (*iter == 'e' || *iter == 'E')) {
    ++iter;
    if (iter < iter_end) { exp_ten += parse_exponent<false>(iter, iter_end).value(); }
  }

  // shift the output value based on the exp_ten and the scale values.
  // Pure integer arithmetic is used: a floating-point power of ten may be
  // inexact and is not representable as uint64 for shifts above 19.
  // Both loops stop once value reaches zero, which bounds the iteration count
  // (at most 20 divisions, at most 64 multiplications) for any shift.
  int64_t shift = exp_ten - static_cast<int64_t>(scale);
  for (; shift < 0 && value != 0; ++shift) { value /= 10; }
  for (; shift > 0 && value != 0; --shift) { value *= 10; }

  return static_cast<DecimalType>(value) * (sign == 0 ? 1 : sign);
}
} // namespace detail
} // namespace strings
} // namespace cudf
64 changes: 38 additions & 26 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/strings/detail/convert/fixed_point.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/structs/struct_view.hpp>
#include <cudf/utilities/bit.hpp>
Expand Down Expand Up @@ -410,26 +411,6 @@ __inline__ __device__ cudf::list_view decode_value(char const *begin,
return cudf::list_view{};
}

// Placeholder specialization for decimal32: it ignores all of its arguments
// and returns a value-initialized decimal32.
// The purpose of this is merely to allow compilation ONLY
// TODO : make this work for csv
template <>
__inline__ __device__ numeric::decimal32 decode_value(char const *begin,
char const *end,
parse_options_view const &opts)
{
return numeric::decimal32{};
}

// Placeholder specialization for decimal64: it ignores all of its arguments
// and returns a value-initialized decimal64.
// The purpose of this is merely to allow compilation ONLY
// TODO : make this work for csv
template <>
__inline__ __device__ numeric::decimal64 decode_value(char const *begin,
char const *end,
parse_options_view const &opts)
{
return numeric::decimal64{};
}

// The purpose of this is merely to allow compilation ONLY
// TODO : make this work for csv
template <>
Expand All @@ -451,11 +432,13 @@ struct decode_op {
*
* @return bool Whether the parsed value is valid.
*/
template <typename T,
typename std::enable_if_t<std::is_integral<T>::value and !std::is_same<T, bool>::value>
* = nullptr>
template <
typename T,
typename std::enable_if_t<std::is_integral<T>::value and !std::is_same<T, bool>::value and
!cudf::is_fixed_point<T>()> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type output_type,
elstehle marked this conversation as resolved.
Show resolved Hide resolved
char const *begin,
char const *end,
parse_options_view const &opts,
Expand All @@ -473,12 +456,37 @@ struct decode_op {
return true;
}

/**
 * @brief Dispatch for fixed point types.
 *
 * Parses the field [begin, end) as a decimal number, rescaled to the scale
 * carried by `output_type`, and writes the resulting integer representation
 * to element `row` of `out_buffer`, which is accessed as an array of
 * `device_storage_type_t<T>`. `opts` and `flags` are not used by this overload.
 *
 * @return bool Whether the parsed value is valid. This overload always
 * returns true.
 */
template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                    size_t row,
                                                    const data_type output_type,
                                                    char const *begin,
                                                    char const *end,
                                                    parse_options_view const &opts,
                                                    column_parse::flags flags)
{
  using storage_type = device_storage_type_t<T>;

  static_cast<storage_type *>(out_buffer)[row] =
    strings::detail::parse_decimal<storage_type>(begin, end, output_type.scale());

  return true;
}

/**
* @brief Dispatch for boolean types.
*/
template <typename T, typename std::enable_if_t<std::is_same<T, bool>::value> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type output_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand All @@ -502,6 +510,7 @@ struct decode_op {
template <typename T, typename std::enable_if_t<std::is_floating_point<T>::value> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type output_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand All @@ -516,11 +525,13 @@ struct decode_op {
/**
* @brief Dispatch for all other types.
*/
template <typename T,
typename std::enable_if_t<!std::is_integral<T>::value and
!std::is_floating_point<T>::value> * = nullptr>
template <
typename T,
typename std::enable_if_t<!std::is_integral<T>::value and !std::is_floating_point<T>::value and
elstehle marked this conversation as resolved.
Show resolved Hide resolved
!cudf::is_fixed_point<T>()> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type output_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand Down Expand Up @@ -605,6 +616,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
decode_op{},
columns[actual_col],
rec_id,
dtypes[actual_col],
field_start,
field_end,
options,
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,9 @@ std::vector<data_type> reader::impl::gather_column_types(device_span<char const>
device_span<uint64_t const> row_offsets,
rmm::cuda_stream_view stream)
{
// if vector of data_types was provided this takes precedence over other options
if (!opts_.get_data_types().empty()) { return opts_.get_data_types(); }

std::vector<data_type> dtypes;

if (opts_.get_dtypes().empty()) {
Expand Down
Loading