From a2e892917028c4dd86aa4d564259cb0af46ba94b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 2 Sep 2021 14:41:36 -0400 Subject: [PATCH 1/4] Add support for single-digits in cudf::to_timestamps --- cpp/src/strings/convert/convert_datetime.cu | 250 ++++++++++++-------- cpp/tests/strings/datetime_tests.cpp | 197 ++++++++------- 2 files changed, 265 insertions(+), 182 deletions(-) diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index ce5eb015039..e3f2270f5f7 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -159,6 +159,26 @@ struct format_compiler { int8_t subsecond_precision() const { return specifiers.at('f'); } }; +/** + * @brief Specialized function to return the value reading upto the specified + * bytes or until an invalid character is encountered. + * + * @param str Beginning of characters to read. + * @param bytes Number of bytes in str to read. + * @return Integer value of valid characters read and how bytes were not read. + */ +__device__ thrust::pair str2int2(const char* str, size_type bytes) +{ + // const char* ptr = str; + int32_t value = 0; + while (bytes-- > 0) { + char chr = *str++; + if (chr < '0' || chr > '9') break; + value = (value * 10) + static_cast(chr - '0'); + } + return thrust::make_pair(value, bytes + 1); +} + /** * @brief This parses date/time characters into a timestamp integer * @@ -168,35 +188,22 @@ template struct parse_datetime { column_device_view const d_strings; device_span const d_format_items; - int8_t subsecond_precision; + int8_t const subsecond_precision; /** * @brief Return power of ten value given an exponent. * * @return `1x10^exponent` for `0 <= exponent <= 9` */ - __device__ constexpr int64_t power_of_ten(int32_t exponent) + __device__ constexpr int64_t power_of_ten(int32_t exponent) const { constexpr int64_t powers_of_ten[] = { 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; return powers_of_ten[exponent]; } - // - __device__ int32_t str2int(const char* str, size_type bytes) - { - const char* ptr = str; - int32_t value = 0; - for (size_type idx = 0; idx < bytes; ++idx) { - char chr = *ptr++; - if (chr < '0' || chr > '9') break; - value = (value * 10) + static_cast(chr - '0'); - } - return value; - } - // Walk the format_items to parse the string into date/time components - __device__ timestamp_components parse_into_parts(string_view const& d_string) + __device__ timestamp_components parse_into_parts(string_view const& d_string) const { timestamp_components timeparts = {1970, 1, 1, 0}; // init to epoch time @@ -208,32 +215,71 @@ struct parse_datetime { if (item.item_type == format_char_type::literal) { // static character we'll just skip; - // consume item.length bytes from string + // consume item.length bytes from the input string ptr += item.length; length -= item.length; continue; } + size_type copied = item.length; // number of bytes processed // special logic for each specifier switch (item.value) { - case 'Y': timeparts.year = static_cast(str2int(ptr, item.length)); break; + case 'Y': { + auto const [year, left] = str2int2(ptr, item.length); + timeparts.year = static_cast(year); + copied -= left; + break; + } case 'y': { - auto const year = str2int(ptr, item.length); - timeparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); + auto const [year, left] = str2int2(ptr, item.length); + timeparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); + copied -= left; + break; + } + case 'm': { + auto const [month, left] = str2int2(ptr, item.length); + timeparts.month = static_cast(month); + copied -= left; + break; + } + case 'd': { + auto const [day, left] = str2int2(ptr, item.length); + timeparts.day = static_cast(day); + copied -= left; + break; + } + case 'j': { + auto const [day, left] = str2int2(ptr, item.length); + timeparts.day_of_year = static_cast(day); + copied -= left; break; } - case 'm': timeparts.month = static_cast(str2int(ptr, item.length)); break; - case 'd': timeparts.day = static_cast(str2int(ptr, item.length)); break; - case 'j': timeparts.day_of_year = static_cast(str2int(ptr, item.length)); break; case 'H': - case 'I': timeparts.hour = static_cast(str2int(ptr, item.length)); break; - case 'M': timeparts.minute = static_cast(str2int(ptr, item.length)); break; - case 'S': timeparts.second = static_cast(str2int(ptr, item.length)); break; + case 'I': { + auto const [hour, left] = str2int2(ptr, item.length); + timeparts.hour = static_cast(hour); + copied -= left; + break; + } + case 'M': { + auto const [minute, left] = str2int2(ptr, item.length); + timeparts.minute = static_cast(minute); + copied -= left; + break; + } + case 'S': { + auto const [second, left] = str2int2(ptr, item.length); + timeparts.second = static_cast(second); + copied -= left; + break; + } case 'f': { int32_t const read_size = std::min(static_cast(item.length), static_cast(length)); - int64_t const fraction = str2int(ptr, read_size) * power_of_ten(item.length - read_size); - timeparts.subsecond = static_cast(fraction); + auto const [fraction, left] = str2int2(ptr, read_size); + timeparts.subsecond = + static_cast(fraction * power_of_ten(item.length - read_size - left)); + copied = read_size - left; break; } case 'p': { @@ -247,24 +293,23 @@ struct parse_datetime { break; } case 'z': { - auto const sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC - auto const hh = str2int(ptr + 1, 2); - auto const mm = str2int(ptr + 3, 2); - // ignoring the rest for now - // item.length has how many chars we should read + auto const sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC + auto const [hh, lh] = str2int2(ptr + 1, 2); + auto const [mm, lm] = str2int2(ptr + 3, 2); timeparts.tz_minutes = sign * ((hh * 60) + mm); + copied -= lh + lm; break; } case 'Z': break; // skip default: break; } - ptr += item.length; - length -= item.length; + ptr += copied; + length -= copied; } return timeparts; } - __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) + __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) const { auto const ymd = // convenient chrono class handles the leap year calculations for us cuda::std::chrono::year_month_day( @@ -290,7 +335,7 @@ struct parse_datetime { return timestamp; } - __device__ T operator()(size_type idx) + __device__ T operator()(size_type idx) const { T epoch_time{typename T::duration{0}}; if (d_strings.is_null(idx)) return epoch_time; @@ -385,29 +430,6 @@ struct check_datetime_format { }); } - /** - * @brief Specialized function to return the value and check for non-decimal characters. - * - * If non-decimal characters are found within `str` and `str + bytes` then - * the returned result is `thrust::nullopt` (_does not contain a value_). - * Otherwise, the parsed integer result is returned. - * - * @param str Beginning of characters to read/check. - * @param bytes Number of bytes in str to read/check. - * @return Integer value if characters are valid. - */ - __device__ thrust::optional str2int(const char* str, size_type bytes) - { - const char* ptr = str; - int32_t value = 0; - for (size_type idx = 0; idx < bytes; ++idx) { - char chr = *ptr++; - if (chr < '0' || chr > '9') return thrust::nullopt; - value = (value * 10) + static_cast(chr - '0'); - } - return value; - } - /** * @brief Check the specified characters are between ['0','9'] * and the resulting integer is within [`min_value`, `max_value`]. @@ -416,18 +438,23 @@ struct check_datetime_format { * @param bytes Number of bytes to check. * @param min_value Inclusive minimum value * @param max_value Inclusive maximum value - * @return true if parsed value is between `min_value` and `max_value`. + * @return number of bytes not successfully read */ - __device__ bool check_value(const char* str, size_type bytes, int min_value, int max_value) + __device__ size_type check_value(const char* str, + size_type const bytes, + int const min_value, + int const max_value) { - const char* ptr = str; + // const char* ptr = str; + if (*str < '0' || *str > '9') return bytes; int32_t value = 0; - for (size_type idx = 0; idx < bytes; ++idx) { - char chr = *ptr++; - if (chr < '0' || chr > '9') return false; + size_type count = bytes; + while (count-- > 0) { + char chr = *str++; + if (chr < '0' || chr > '9') break; // return false; value = (value * 10) + static_cast(chr - '0'); } - return value >= min_value && value <= max_value; + return (value >= min_value && value <= max_value) ? (count + 1) : bytes; } /** @@ -459,44 +486,72 @@ struct check_datetime_format { // special logic for each specifier // reference: https://man7.org/linux/man-pages/man3/strptime.3.html - bool result = false; + bool result = false; + size_type copied = item.length; switch (item.value) { case 'Y': { - if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts.year = static_cast(value.value()); - } + auto const [year, left] = str2int2(ptr, item.length); + result = (left < item.length); + dateparts.year = static_cast(year); + copied -= left; break; } case 'y': { - if (auto value = str2int(ptr, item.length)) { - result = true; - auto const year = value.value(); - dateparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); - } + auto const [year, left] = str2int2(ptr, item.length); + result = (left < item.length); + dateparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); + copied -= left; break; } case 'm': { - if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts.month = static_cast(value.value()); - } + auto const [month, left] = str2int2(ptr, item.length); + result = (left < item.length); + dateparts.month = static_cast(month); + copied -= left; break; } case 'd': { - if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts.day = static_cast(value.value()); - } + auto const [day, left] = str2int2(ptr, item.length); + result = (left < item.length); + dateparts.day = static_cast(day); // value.value() + copied -= left; + break; + } + case 'j': { + auto const left = check_value(ptr, item.length, 1, 366); + result = left < item.length; + copied -= left; + break; + } + case 'H': { + auto const left = check_value(ptr, item.length, 0, 23); + result = left < item.length; + copied -= left; + break; + } + case 'I': { + auto const left = check_value(ptr, item.length, 1, 12); + result = left < item.length; + copied -= left; + break; + } + case 'M': { + auto const left = check_value(ptr, item.length, 0, 59); + result = left < item.length; + copied -= left; + break; + } + case 'S': { + auto const left = check_value(ptr, item.length, 0, 60); + result = left < item.length; + copied -= left; break; } - case 'j': result = check_value(ptr, item.length, 1, 366); break; - case 'H': result = check_value(ptr, item.length, 0, 23); break; - case 'I': result = check_value(ptr, item.length, 1, 12); break; - case 'M': result = check_value(ptr, item.length, 0, 59); break; - case 'S': result = check_value(ptr, item.length, 0, 60); break; case 'f': { - result = check_digits(ptr, std::min(static_cast(item.length), length)); + int32_t const read_size = + std::min(static_cast(item.length), static_cast(length)); + result = check_digits(ptr, read_size); + copied = read_size; break; } case 'p': { @@ -509,9 +564,10 @@ struct check_datetime_format { } case 'z': { // timezone offset if (item.length == 5) { - result = (*ptr == '-' || *ptr == '+') && // sign - check_value(ptr + 1, 2, 0, 23) && // hour - check_value(ptr + 3, 2, 0, 59); // minute + auto const lh = check_value(ptr + 1, 2, 0, 23); + auto const lm = check_value(ptr + 3, 2, 0, 59); + result = (*ptr == '-' || *ptr == '+') && (lh < 2) && (lm < 2); + copied -= lh + lm; } break; } @@ -519,8 +575,8 @@ struct check_datetime_format { default: break; } if (!result) return thrust::nullopt; - ptr += item.length; - length -= item.length; + ptr += copied; + length -= copied; } return dateparts; } @@ -867,7 +923,7 @@ struct datetime_formatter : public from_timestamp_base { } // Value to use for int2str call at the end of the switch-statement. - // This simplifies the case statements and prevents alot of extra inlining. + // This simplifies the case statements and prevents a lot of extra inlining. int32_t copy_value = -1; // default set for non-int2str usage cases // special logic for each specifier diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index 1a814ea707e..cf166d9b0f9 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -155,6 +155,118 @@ TEST_F(StringsDatetimeTest, ToTimestampTimezone) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); } +TEST_F(StringsDatetimeTest, ToTimestampSingleSpecifier) +{ + cudf::test::strings_column_wrapper strings{"12", "10", "09", "05"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d"); + cudf::test::fixed_width_column_wrapper expected_days{ + 11, 9, 8, 4}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_days); + + results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%m"); + cudf::test::fixed_width_column_wrapper expected_months{ + 334, 273, 243, 120}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_months); + + results = cudf::strings::is_timestamp(strings_view, "%m"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, + cudf::test::fixed_width_column_wrapper{1, 1, 1, 1}); +} + +TEST_F(StringsDatetimeTest, ToTimestampVariableFractions) +{ + cudf::test::strings_column_wrapper strings{"01:02:03.000001000", + "01:02:03.000001", + "01:02:03.1", + "01:02:03.01", + "01:02:03.0098700", + "01:02:03.0023456"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}, "%H:%M:%S.%9f"); + auto durations = + cudf::cast(results->view(), cudf::data_type{cudf::type_id::DURATION_NANOSECONDS}); + + cudf::test::fixed_width_column_wrapper expected{ + cudf::duration_ns{3723000001000}, + cudf::duration_ns{3723000001000}, + cudf::duration_ns{3723100000000}, + cudf::duration_ns{3723010000000}, + cudf::duration_ns{3723009870000}, + cudf::duration_ns{3723002345600}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*durations, expected); + + results = cudf::strings::is_timestamp(strings_view, "%H:%M:%S.%f"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, + cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1}); +} + +TEST_F(StringsDatetimeTest, ToTimestampYear) +{ + cudf::test::strings_column_wrapper strings{ + "28/02/74", "17/07/68", "20/03/19", "29/02/20", "07/02/69"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d/%m/%y"); + cudf::test::fixed_width_column_wrapper expected{ + 1519, 35992, 17975, 18321, -328}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_timestamp(strings_view, "%d/%m/%y"); + cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); +} + +TEST_F(StringsDatetimeTest, ToTimestampSingleDigits) +{ + cudf::test::strings_column_wrapper strings{"1974-2-28 01:23:45.987000123", + "2019-7-17 2:34:56.001234567", + "2019-3-20 12:34:56.100100100", + "2020-02-2 00:00:00.555777999", + "1969-12-1 00:00:01.000055000", + "1944-07-21 11:15:09.333444000"}; + auto strings_view = cudf::strings_column_view(strings); + + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}, "%Y-%m-%d %H:%M:%S.%9f"); + cudf::test::fixed_width_column_wrapper expected_ns{ + 131246625987000123, + 1563330896001234567, + 1553085296100100100, + 1580601600555777999, + -2678398999945000, + -803047490666556000}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns); + + results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S.%6f"); + cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); +} + +TEST_F(StringsDatetimeTest, IsTimestamp) +{ + cudf::test::strings_column_wrapper strings{"2020-10-07 13:02:03 1PM +0130", + "2020:10:07 01-02-03 1AM +0130", + "2020-10-7 11:02:03 11AM -1025", + "2020-13-07 01:02:03 1AM +0000", + "2020-10-32 01:32:03 1AM +0000", + "2020-10-07 25:02:03 1AM +0000", + "2020-10-07 01:62:03 1AM +0000", + "2020-10-07 01:02:63 1AM +0000", + "2020-02-29 01:32:03 1AM +0000", + "2020-02-30 01:32:03 01AM +0000", + "2020-00-31 01:32:03 1AM +0000", + "2020-02-00 02:32:03 2AM +0000", + "2020-2-9 9:12:13 9AM +1111"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S %I%p %z"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *results, cudf::test::fixed_width_column_wrapper{1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1}); +} + TEST_F(StringsDatetimeTest, FromTimestamp) { std::vector h_timestamps{ @@ -465,88 +577,3 @@ TEST_F(StringsDatetimeTest, Errors) EXPECT_THROW(cudf::strings::from_timestamps(timestamps, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%A %B", view), cudf::logic_error); } - -TEST_F(StringsDatetimeTest, ToTimestampSingleSpecifier) -{ - cudf::test::strings_column_wrapper strings{"12", "10", "09", "05"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d"); - cudf::test::fixed_width_column_wrapper expected_days{ - 11, 9, 8, 4}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_days); - - results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%m"); - cudf::test::fixed_width_column_wrapper expected_months{ - 334, 273, 243, 120}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_months); - - results = cudf::strings::is_timestamp(strings_view, "%m"); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, - cudf::test::fixed_width_column_wrapper{1, 1, 1, 1}); -} - -TEST_F(StringsDatetimeTest, ToTimestampVariableFractions) -{ - cudf::test::strings_column_wrapper strings{"01:02:03.000001000", - "01:02:03.000001", - "01:02:03.1", - "01:02:03.01", - "01:02:03.0098700", - "01:02:03.0023456"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}, "%H:%M:%S.%9f"); - auto durations = - cudf::cast(results->view(), cudf::data_type{cudf::type_id::DURATION_NANOSECONDS}); - - cudf::test::fixed_width_column_wrapper expected{ - cudf::duration_ns{3723000001000}, - cudf::duration_ns{3723000001000}, - cudf::duration_ns{3723100000000}, - cudf::duration_ns{3723010000000}, - cudf::duration_ns{3723009870000}, - cudf::duration_ns{3723002345600}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*durations, expected); - - results = cudf::strings::is_timestamp(strings_view, "%H:%M:%S.%f"); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, - cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1}); -} - -TEST_F(StringsDatetimeTest, IsTimestamp) -{ - cudf::test::strings_column_wrapper strings{"2020-10-07 01:02:03", - "2020:10:07 01-02-03", - "2020-10-7 01:02:03", - "2020-13-07 01:02:03", - "2020-10-32 01:32:03", - "2020-10-07 25:02:03", - "2020-10-07 01:62:03", - "2020-10-07 01:02:63", - "2020-02-29 01:32:03", - "2020-02-30 01:32:03", - "2020-00-31 01:32:03", - "2020-02-00 01:32:03"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S"); - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *results, cudf::test::fixed_width_column_wrapper{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}); -} - -TEST_F(StringsDatetimeTest, ToTimestampYear) -{ - cudf::test::strings_column_wrapper strings{ - "28/02/74", "17/07/68", "20/03/19", "29/02/20", "07/02/69"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d/%m/%y"); - cudf::test::fixed_width_column_wrapper expected{ - 1519, 35992, 17975, 18321, -328}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - results = cudf::strings::is_timestamp(strings_view, "%d/%m/%y"); - cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); -} From e01eed867d25ceba85c7122e70e327c3de1d8db7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 7 Sep 2021 19:34:31 -0400 Subject: [PATCH 2/4] change temp fn str2int2 to parse_int --- cpp/src/strings/convert/convert_datetime.cu | 40 ++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index e55ce422e62..2506b08f87b 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -160,16 +160,15 @@ struct format_compiler { }; /** - * @brief Specialized function to return the value reading upto the specified + * @brief Specialized function to return the value reading up to the specified * bytes or until an invalid character is encountered. * * @param str Beginning of characters to read. * @param bytes Number of bytes in str to read. - * @return Integer value of valid characters read and how bytes were not read. + * @return Integer value of valid characters read and how many bytes were not read. */ -__device__ thrust::pair str2int2(const char* str, size_type bytes) +__device__ thrust::pair parse_int(const char* str, size_type bytes) { - // const char* ptr = str; int32_t value = 0; while (bytes-- > 0) { char chr = *str++; @@ -225,50 +224,50 @@ struct parse_datetime { // special logic for each specifier switch (item.value) { case 'Y': { - auto const [year, left] = str2int2(ptr, item.length); + auto const [year, left] = parse_int(ptr, item.length); timeparts.year = static_cast(year); copied -= left; break; } case 'y': { - auto const [year, left] = str2int2(ptr, item.length); + auto const [year, left] = parse_int(ptr, item.length); timeparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); copied -= left; break; } case 'm': { - auto const [month, left] = str2int2(ptr, item.length); + auto const [month, left] = parse_int(ptr, item.length); timeparts.month = static_cast(month); copied -= left; break; } case 'd': { - auto const [day, left] = str2int2(ptr, item.length); + auto const [day, left] = parse_int(ptr, item.length); timeparts.day = static_cast(day); copied -= left; break; } case 'j': { - auto const [day, left] = str2int2(ptr, item.length); + auto const [day, left] = parse_int(ptr, item.length); timeparts.day_of_year = static_cast(day); copied -= left; break; } case 'H': case 'I': { - auto const [hour, left] = str2int2(ptr, item.length); + auto const [hour, left] = parse_int(ptr, item.length); timeparts.hour = static_cast(hour); copied -= left; break; } case 'M': { - auto const [minute, left] = str2int2(ptr, item.length); + auto const [minute, left] = parse_int(ptr, item.length); timeparts.minute = static_cast(minute); copied -= left; break; } case 'S': { - auto const [second, left] = str2int2(ptr, item.length); + auto const [second, left] = parse_int(ptr, item.length); timeparts.second = static_cast(second); copied -= left; break; @@ -276,7 +275,7 @@ struct parse_datetime { case 'f': { int32_t const read_size = std::min(static_cast(item.length), static_cast(length)); - auto const [fraction, left] = str2int2(ptr, read_size); + auto const [fraction, left] = parse_int(ptr, read_size); timeparts.subsecond = static_cast(fraction * power_of_ten(item.length - read_size - left)); copied = read_size - left; @@ -293,9 +292,10 @@ struct parse_datetime { break; } case 'z': { + // 'z' format is +hh:mm -- single sign char and 2 chars each for hour and minute auto const sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC - auto const [hh, lh] = str2int2(ptr + 1, 2); - auto const [mm, lm] = str2int2(ptr + 3, 2); + auto const [hh, lh] = parse_int(ptr + 1, 2); + auto const [mm, lm] = parse_int(ptr + 3, 2); timeparts.tz_minutes = sign * ((hh * 60) + mm); copied -= lh + lm; break; @@ -438,7 +438,7 @@ struct check_datetime_format { * @param bytes Number of bytes to check. * @param min_value Inclusive minimum value * @param max_value Inclusive maximum value - * @return number of bytes not successfully read + * @return number of bytes not successfully processed */ __device__ size_type check_value(const char* str, size_type const bytes, @@ -490,28 +490,28 @@ struct check_datetime_format { size_type copied = item.length; switch (item.value) { case 'Y': { - auto const [year, left] = str2int2(ptr, item.length); + auto const [year, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.year = static_cast(year); copied -= left; break; } case 'y': { - auto const [year, left] = str2int2(ptr, item.length); + auto const [year, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); copied -= left; break; } case 'm': { - auto const [month, left] = str2int2(ptr, item.length); + auto const [month, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.month = static_cast(month); copied -= left; break; } case 'd': { - auto const [day, left] = str2int2(ptr, item.length); + auto const [day, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.day = static_cast(day); // value.value() copied -= left; From 6052a708f867d64e0abae969df56e375174502f3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 8 Sep 2021 08:12:01 -0400 Subject: [PATCH 3/4] add single-digit month and day test --- cpp/src/strings/convert/convert_datetime.cu | 67 +++++++++++---------- cpp/tests/strings/datetime_tests.cpp | 8 ++- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2506b08f87b..22e1b77d9ef 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -160,7 +160,7 @@ struct format_compiler { }; /** - * @brief Specialized function to return the value reading up to the specified + * @brief Specialized function to return the integer value reading up to the specified * bytes or until an invalid character is encountered. * * @param str Beginning of characters to read. @@ -220,56 +220,56 @@ struct parse_datetime { continue; } - size_type copied = item.length; // number of bytes processed + size_type bytes_read = item.length; // number of bytes processed // special logic for each specifier switch (item.value) { case 'Y': { auto const [year, left] = parse_int(ptr, item.length); timeparts.year = static_cast(year); - copied -= left; + bytes_read -= left; break; } case 'y': { auto const [year, left] = parse_int(ptr, item.length); timeparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); - copied -= left; + bytes_read -= left; break; } case 'm': { auto const [month, left] = parse_int(ptr, item.length); timeparts.month = static_cast(month); - copied -= left; + bytes_read -= left; break; } case 'd': { auto const [day, left] = parse_int(ptr, item.length); timeparts.day = static_cast(day); - copied -= left; + bytes_read -= left; break; } case 'j': { auto const [day, left] = parse_int(ptr, item.length); timeparts.day_of_year = static_cast(day); - copied -= left; + bytes_read -= left; break; } case 'H': case 'I': { auto const [hour, left] = parse_int(ptr, item.length); timeparts.hour = static_cast(hour); - copied -= left; + bytes_read -= left; break; } case 'M': { auto const [minute, left] = parse_int(ptr, item.length); timeparts.minute = static_cast(minute); - copied -= left; + bytes_read -= left; break; } case 'S': { auto const [second, left] = parse_int(ptr, item.length); timeparts.second = static_cast(second); - copied -= left; + bytes_read -= left; break; } case 'f': { @@ -278,7 +278,7 @@ struct parse_datetime { auto const [fraction, left] = parse_int(ptr, read_size); timeparts.subsecond = static_cast(fraction * power_of_ten(item.length - read_size - left)); - copied = read_size - left; + bytes_read = read_size - left; break; } case 'p': { @@ -293,18 +293,19 @@ struct parse_datetime { } case 'z': { // 'z' format is +hh:mm -- single sign char and 2 chars each for hour and minute - auto const sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC - auto const [hh, lh] = parse_int(ptr + 1, 2); - auto const [mm, lm] = parse_int(ptr + 3, 2); + auto const sign = *ptr == '-' ? 1 : -1; + auto const [hh, lh] = parse_int(ptr + 1, 2); + auto const [mm, lm] = parse_int(ptr + 3, 2); + // revert timezone back to UTC timeparts.tz_minutes = sign * ((hh * 60) + mm); - copied -= lh + lm; + bytes_read -= lh + lm; break; } case 'Z': break; // skip default: break; } - ptr += copied; - length -= copied; + ptr += bytes_read; + length -= bytes_read; } return timeparts; } @@ -486,72 +487,72 @@ struct check_datetime_format { // special logic for each specifier // reference: https://man7.org/linux/man-pages/man3/strptime.3.html - bool result = false; - size_type copied = item.length; + bool result = false; + size_type bytes_read = item.length; switch (item.value) { case 'Y': { auto const [year, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.year = static_cast(year); - copied -= left; + bytes_read -= left; break; } case 'y': { auto const [year, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); - copied -= left; + bytes_read -= left; break; } case 'm': { auto const [month, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.month = static_cast(month); - copied -= left; + bytes_read -= left; break; } case 'd': { auto const [day, left] = parse_int(ptr, item.length); result = (left < item.length); dateparts.day = static_cast(day); // value.value() - copied -= left; + bytes_read -= left; break; } case 'j': { auto const left = check_value(ptr, item.length, 1, 366); result = left < item.length; - copied -= left; + bytes_read -= left; break; } case 'H': { auto const left = check_value(ptr, item.length, 0, 23); result = left < item.length; - copied -= left; + bytes_read -= left; break; } case 'I': { auto const left = check_value(ptr, item.length, 1, 12); result = left < item.length; - copied -= left; + bytes_read -= left; break; } case 'M': { auto const left = check_value(ptr, item.length, 0, 59); result = left < item.length; - copied -= left; + bytes_read -= left; break; } case 'S': { auto const left = check_value(ptr, item.length, 0, 60); result = left < item.length; - copied -= left; + bytes_read -= left; break; } case 'f': { int32_t const read_size = std::min(static_cast(item.length), static_cast(length)); - result = check_digits(ptr, read_size); - copied = read_size; + result = check_digits(ptr, read_size); + bytes_read = read_size; break; } case 'p': { @@ -567,7 +568,7 @@ struct check_datetime_format { auto const lh = check_value(ptr + 1, 2, 0, 23); auto const lm = check_value(ptr + 3, 2, 0, 59); result = (*ptr == '-' || *ptr == '+') && (lh < 2) && (lm < 2); - copied -= lh + lm; + bytes_read -= lh + lm; } break; } @@ -575,8 +576,8 @@ struct check_datetime_format { default: break; } if (!result) return thrust::nullopt; - ptr += copied; - length -= copied; + ptr += bytes_read; + length -= bytes_read; } return dateparts; } diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index cf166d9b0f9..4543607614f 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -227,7 +227,8 @@ TEST_F(StringsDatetimeTest, ToTimestampSingleDigits) "2019-3-20 12:34:56.100100100", "2020-02-2 00:00:00.555777999", "1969-12-1 00:00:01.000055000", - "1944-07-21 11:15:09.333444000"}; + "1944-07-21 11:15:09.333444000", + "2021-9-8 12:07:30.000000000"}; auto strings_view = cudf::strings_column_view(strings); auto results = cudf::strings::to_timestamps( @@ -238,11 +239,12 @@ TEST_F(StringsDatetimeTest, ToTimestampSingleDigits) 1553085296100100100, 1580601600555777999, -2678398999945000, - -803047490666556000}; + -803047490666556000, + 1631102850000000000}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns); results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S.%6f"); - cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); } From a50f67965cbb930111341d7a2495b27db13828dd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 13 Sep 2021 14:02:07 -0400 Subject: [PATCH 4/4] change check_value to return pair --- cpp/src/strings/convert/convert_datetime.cu | 60 ++++++++++----------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 22e1b77d9ef..c0a20e1e47e 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -167,7 +167,7 @@ struct format_compiler { * @param bytes Number of bytes in str to read. * @return Integer value of valid characters read and how many bytes were not read. */ -__device__ thrust::pair parse_int(const char* str, size_type bytes) +__device__ thrust::pair parse_int(char const* str, size_type bytes) { int32_t value = 0; while (bytes-- > 0) { @@ -194,7 +194,7 @@ struct parse_datetime { * * @return `1x10^exponent` for `0 <= exponent <= 9` */ - __device__ constexpr int64_t power_of_ten(int32_t exponent) const + __device__ constexpr int64_t power_of_ten(int32_t const exponent) const { constexpr int64_t powers_of_ten[] = { 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; @@ -439,23 +439,23 @@ struct check_datetime_format { * @param bytes Number of bytes to check. * @param min_value Inclusive minimum value * @param max_value Inclusive maximum value - * @return number of bytes not successfully processed + * @return If value is valid and number of bytes not successfully processed */ - __device__ size_type check_value(const char* str, - size_type const bytes, - int const min_value, - int const max_value) + __device__ thrust::pair check_value(char const* str, + size_type const bytes, + int const min_value, + int const max_value) { - // const char* ptr = str; - if (*str < '0' || *str > '9') return bytes; + if (*str < '0' || *str > '9') { return thrust::make_pair(false, bytes); } int32_t value = 0; size_type count = bytes; while (count-- > 0) { char chr = *str++; - if (chr < '0' || chr > '9') break; // return false; + if (chr < '0' || chr > '9') break; value = (value * 10) + static_cast(chr - '0'); } - return (value >= min_value && value <= max_value) ? (count + 1) : bytes; + return (value >= min_value && value <= max_value) ? thrust::make_pair(true, count + 1) + : thrust::make_pair(false, bytes); } /** @@ -519,33 +519,33 @@ struct check_datetime_format { break; } case 'j': { - auto const left = check_value(ptr, item.length, 1, 366); - result = left < item.length; - bytes_read -= left; + auto const cv = check_value(ptr, item.length, 1, 366); + result = cv.first; + bytes_read -= cv.second; break; } case 'H': { - auto const left = check_value(ptr, item.length, 0, 23); - result = left < item.length; - bytes_read -= left; + auto const cv = check_value(ptr, item.length, 0, 23); + result = cv.first; + bytes_read -= cv.second; break; } case 'I': { - auto const left = check_value(ptr, item.length, 1, 12); - result = left < item.length; - bytes_read -= left; + auto const cv = check_value(ptr, item.length, 1, 12); + result = cv.first; + bytes_read -= cv.second; break; } case 'M': { - auto const left = check_value(ptr, item.length, 0, 59); - result = left < item.length; - bytes_read -= left; + auto const cv = check_value(ptr, item.length, 0, 59); + result = cv.first; + bytes_read -= cv.second; break; } case 'S': { - auto const left = check_value(ptr, item.length, 0, 60); - result = left < item.length; - bytes_read -= left; + auto const cv = check_value(ptr, item.length, 0, 60); + result = cv.first; + bytes_read -= cv.second; break; } case 'f': { @@ -565,10 +565,10 @@ struct check_datetime_format { } case 'z': { // timezone offset if (item.length == 5) { - auto const lh = check_value(ptr + 1, 2, 0, 23); - auto const lm = check_value(ptr + 3, 2, 0, 59); - result = (*ptr == '-' || *ptr == '+') && (lh < 2) && (lm < 2); - bytes_read -= lh + lm; + auto const cvh = check_value(ptr + 1, 2, 0, 23); + auto const cvm = check_value(ptr + 3, 2, 0, 59); + result = (*ptr == '-' || *ptr == '+') && cvh.first && cvm.first; + bytes_read -= cvh.second + cvm.second; } break; }