Skip to content

Commit

Permalink
Fix casting of varchar to integral types by trimming unicode and cont…
Browse files Browse the repository at this point in the history
…rol characters (facebookincubator#10957)

Summary:
Presto ignores whitespace (both unicode and ascii) along with control chars when trimming a varchar to cast to integer. However Velox which uses folly underneath doesnt do this. This PR brings velox to use similar behavior.

For e.g the following works in Presto but fails in Prestissimo:

```
select cast((from_utf8(from_hex('2002')) || x || (from_utf8(from_hex('0F')))) as integer) from (values '123') t(x);
```

Pull Request resolved: facebookincubator#10957

Reviewed By: kevinwilfong

Differential Revision: D62408635

Pulled By: kgpai
  • Loading branch information
kgpai authored and facebook-github-bot committed Sep 12, 2024
1 parent baaf559 commit a67cc61
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 17 deletions.
81 changes: 81 additions & 0 deletions velox/type/Conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include "velox/type/Conversions.h"
#include "velox/functions/lib/string/StringImpl.h"

DEFINE_bool(
experimental_enable_legacy_cast,
Expand All @@ -23,3 +24,83 @@ DEFINE_bool(
" format of type conversions used for casting. This is a temporary solution"
" that aims to facilitate a seamless transition for users who rely on the"
" legacy behavior and hence can change in the future.");

namespace facebook::velox::util {

/// folly's tryTo doesn't ignore control characters or other unicode whitespace.
/// We trim the string for control and unicode whitespace
/// from both directions and return a StringView of the result.
StringView trimWhiteSpace(const char* data, size_t length) {
if (length == 0) {
return StringView(data, 0);
}

size_t startIndex = 0;
size_t endIndex = length - 1;
const auto end = data + length;
int size;

// We need to trim unicode chars and control chars
// from left side of the string.
for (auto i = 0; i < length;) {
size = 0;
auto isWhiteSpaceOrControlChar = false;

if (data[i] & 0x80) {
// Unicode - only check for whitespace.
auto codePoint = utf8proc_codepoint(data + i, end, size);
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isUnicodeWhiteSpace(codePoint);
} else {
// Ascii - Check for both whitespace and control chars
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isAsciiWhiteSpace(data[i]) ||
(data[i] > 0 && data[i] < 32);
}

if (!isWhiteSpaceOrControlChar) {
startIndex = i;
break;
}

i += size > 0 ? size : 1;
}

// Trim whitespace from right side.
for (auto i = length - 1; i > startIndex;) {
size = 0;
auto isWhiteSpaceOrControlChar = false;

if (data[i] & 0x80) {
// Unicode - only check for whitespace.
utf8proc_int32_t codePoint;
// Find the right codepoint
while ((codePoint = utf8proc_codepoint(data + i, end, size)) < 0 &&
i > startIndex) {
i--;
}
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isUnicodeWhiteSpace(codePoint);
} else {
// Ascii - check if control char or whitespace
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isAsciiWhiteSpace(data[i]) ||
(data[i] > 0 && data[i] < 32);
}

if (!isWhiteSpaceOrControlChar) {
endIndex = i;
break;
}

if (i > 0) {
i--;
}
}

// If we end on a unicode char make sure we add that to the end.
auto charSize = size > 0 ? size : 1;
return StringView(data + startIndex, endIndex - startIndex + charSize);
}

} // namespace facebook::velox::util
14 changes: 11 additions & 3 deletions velox/type/Conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,11 @@ struct Converter<TypeKind::BOOLEAN, void, TPolicy> {
}
};

/// Presto compatible trim of whitespace. This also trims
/// control characters from both front and back and returns
/// a StringView of the trimmed string.
StringView trimWhiteSpace(const char* data, size_t length);

/// To TINYINT, SMALLINT, INTEGER, BIGINT, and HUGEINT converter.
template <TypeKind KIND, typename TPolicy>
struct Converter<
Expand Down Expand Up @@ -317,23 +322,26 @@ struct Converter<
if constexpr (TPolicy::truncate) {
return convertStringToInt(v);
} else {
return detail::callFollyTo<T>(v);
auto trimmed = trimWhiteSpace(v.data(), v.size());
return detail::callFollyTo<T>(trimmed);
}
}

static Expected<T> tryCast(const StringView& v) {
if constexpr (TPolicy::truncate) {
return convertStringToInt(folly::StringPiece(v));
} else {
return detail::callFollyTo<T>(folly::StringPiece(v));
auto trimmed = trimWhiteSpace(v.data(), v.size());
return detail::callFollyTo<T>(trimmed);
}
}

static Expected<T> tryCast(const std::string& v) {
if constexpr (TPolicy::truncate) {
return convertStringToInt(v);
} else {
return detail::callFollyTo<T>(v);
auto trimmed = trimWhiteSpace(v.data(), v.length());
return detail::callFollyTo<T>(trimmed);
}
}

Expand Down
34 changes: 20 additions & 14 deletions velox/type/tests/ConversionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,12 +438,15 @@ TEST_F(ConversionsTest, toIntegeralTypes) {
"1",
"+1",
"-100",
},
{
1,
1,
-100,
},
"\u000f100",
"\u000f100\u000f",
// unicode space's interspaced with control chars
" -100\u000f",
"\u000f+100",
"\u001f101\u000e",
"\u001f-102\u000f",
},
{1, 1, -100, 100, 100, -100, 100, 101, -102},
/*truncate*/ false);

// When TRUNCATE = false, invalid cases.
Expand All @@ -466,14 +469,17 @@ TEST_F(ConversionsTest, toIntegeralTypes) {
testConversion<std::string, int8_t>(
{"1234567"}, {}, /*truncate*/ false, false, /*expectError*/ true);
testConversion<std::string, int64_t>(
{
"1a",
"",
"1'234'567",
"1,234,567",
"infinity",
"nan",
},
{"1a",
"",
"1'234'567",
"1,234,567",
"infinity",
"nan",
// Unicode spaces and control chars and unicode at the end
// Should fail conversion since we do not support unicode digits
"\u001f-103٢\u000f",
// All spaces
"   \u000f"},
{},
/*truncate*/ false,
false,
Expand Down

0 comments on commit a67cc61

Please sign in to comment.