Skip to content

Commit

Permalink
Add support to trim whitespace.
Browse files Browse the repository at this point in the history
  • Loading branch information
kgpai committed Sep 12, 2024
1 parent 485329e commit a41f98d
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 17 deletions.
79 changes: 79 additions & 0 deletions velox/type/Conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include "velox/type/Conversions.h"
#include "velox/functions/lib/string/StringImpl.h"

DEFINE_bool(
experimental_enable_legacy_cast,
Expand All @@ -23,3 +24,81 @@ DEFINE_bool(
" format of type conversions used for casting. This is a temporary solution"
" that aims to facilitate a seamless transition for users who rely on the"
" legacy behavior and hence can change in the future.");

namespace facebook::velox::util {

/// folly's tryTo doesn't ignore control characters or other unicode whitespace.
/// We trim the string for control and unicode whitespace
/// from both directions and return a StringView of the result.
StringView trimWhiteSpace(const char* data, size_t length) {
if (length == 0) {
return StringView(data, 0);
}

size_t startIndex = 0;
size_t endIndex = length - 1;
const auto end = data + length;
int size;

// We need to trim unicode chars and control chars
// from left side of the string.
for (auto i = 0; i < length;) {
size = 0;
auto isWhiteSpaceOrControlChar = false;

if (data[i] & 0x80) {
// Unicode - only check for whitespace.
auto codePoint = utf8proc_codepoint(data + i, end, size);
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isUnicodeWhiteSpace(codePoint);
} else {
// Ascii - Check for both whitespace and control chars
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isAsciiWhiteSpace(data[i]) ||
(data[i] > 0 && data[i] < 32);
}

if (!isWhiteSpaceOrControlChar) {
startIndex = i;
break;
}

i += size > 0 ? size : 1;
}

// Trim whitespace from right side.
for (auto i = length - 1; i > startIndex;) {
size = 0;
auto isWhiteSpaceOrControlChar = false;

if (data[i] & 0x80) {
// Unicode - only check for whitespace.
utf8proc_int32_t codePoint;
// Find the right codepoint
while ((codePoint = utf8proc_codepoint(data + i, end, size)) < 0 &&
i > startIndex) {
i--;
}
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isUnicodeWhiteSpace(codePoint);
} else {
// Ascii - check if control char or whitespace
isWhiteSpaceOrControlChar =
velox::functions::stringImpl::isAsciiWhiteSpace(data[i]) ||
(data[i] > 0 && data[i] < 32);
}

if (!isWhiteSpaceOrControlChar) {
endIndex = i;
break;
}

i--;
}

// If we end on a unicode char make sure we add that to the end.
auto charSize = size > 0 ? size : 1;
return StringView(data + startIndex, endIndex - startIndex + charSize);
}

} // namespace facebook::velox::util
14 changes: 11 additions & 3 deletions velox/type/Conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,11 @@ struct Converter<TypeKind::BOOLEAN, void, TPolicy> {
}
};

/// Presto compatible trim of whitespace. This also trims
/// control characters from both front and back and returns
/// a StringView of the trimmed string.
StringView trimWhiteSpace(const char* data, size_t length);

/// To TINYINT, SMALLINT, INTEGER, BIGINT, and HUGEINT converter.
template <TypeKind KIND, typename TPolicy>
struct Converter<
Expand Down Expand Up @@ -317,23 +322,26 @@ struct Converter<
if constexpr (TPolicy::truncate) {
return convertStringToInt(v);
} else {
return detail::callFollyTo<T>(v);
auto trimmed = trimWhiteSpace(v.data(), v.size());
return detail::callFollyTo<T>(trimmed);
}
}

static Expected<T> tryCast(const StringView& v) {
if constexpr (TPolicy::truncate) {
return convertStringToInt(folly::StringPiece(v));
} else {
return detail::callFollyTo<T>(folly::StringPiece(v));
auto trimmed = trimWhiteSpace(v.data(), v.size());
return detail::callFollyTo<T>(trimmed);
}
}

static Expected<T> tryCast(const std::string& v) {
if constexpr (TPolicy::truncate) {
return convertStringToInt(v);
} else {
return detail::callFollyTo<T>(v);
auto trimmed = trimWhiteSpace(v.data(), v.length());
return detail::callFollyTo<T>(trimmed);
}
}

Expand Down
34 changes: 20 additions & 14 deletions velox/type/tests/ConversionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -438,12 +438,15 @@ TEST_F(ConversionsTest, toIntegeralTypes) {
"1",
"+1",
"-100",
},
{
1,
1,
-100,
},
"\u000f100",
"\u000f100\u000f",
// unicode space's interspaced with control chars
" -100\u000f",
"\u000f+100",
"\u001f101\u000e",
"\u001f-102\u000f",
},
{1, 1, -100, 100, 100, -100, 100, 101, -102},
/*truncate*/ false);

// When TRUNCATE = false, invalid cases.
Expand All @@ -466,14 +469,17 @@ TEST_F(ConversionsTest, toIntegeralTypes) {
testConversion<std::string, int8_t>(
{"1234567"}, {}, /*truncate*/ false, false, /*expectError*/ true);
testConversion<std::string, int64_t>(
{
"1a",
"",
"1'234'567",
"1,234,567",
"infinity",
"nan",
},
{"1a",
"",
"1'234'567",
"1,234,567",
"infinity",
"nan",
// Unicode spaces and control chars and unicode at the end
// Should fail conversion since we do not support unicode digits
"\u001f-103٢\u000f",
// All spaces
"   \u000f"},
{},
/*truncate*/ false,
false,
Expand Down

0 comments on commit a41f98d

Please sign in to comment.