Skip to content

Commit

Permalink
Ensure we only use subgroups if they are matched in url_extract_* pre…
Browse files Browse the repository at this point in the history
…sto functions (#7866)

Summary:
Pull Request resolved: #7866

It is possible to trigger address sanitizer when passed garbage strings in matchAuthorityAndPath function. We need to ensure that subgroups are actually matched, since accessing an unmatched string can lead to asan errors.

Reviewed By: mbasmanova

Differential Revision: D51822902

fbshipit-source-id: 6214c333859c56321572480e2231a7d2340d8d85
  • Loading branch information
Krishna Pai authored and facebook-github-bot committed Dec 7, 2023
1 parent db64f6f commit 61ab60d
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 70 deletions.
2 changes: 1 addition & 1 deletion velox/docs/functions/presto/url.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Extraction Functions
.. function:: url_extract_port(url) -> bigint
Returns the port number from ``url``.
Returns the port number from ``url``. Returns NULL if port is missing.
.. function:: url_extract_protocol(url) -> varchar
Expand Down
21 changes: 11 additions & 10 deletions velox/functions/prestosql/URLFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,24 @@
*/

#include "URLFunctions.h"
#include <optional>
#include "velox/type/Type.h"

namespace facebook::velox::functions {

bool matchAuthorityAndPath(
const boost::cmatch& urlMatch,
boost::cmatch& authAndPathMatch,
std::optional<StringView> matchAuthorityAndPath(
StringView authorityAndPath,
boost::cmatch& authorityMatch,
bool& hasAuthority) {
int subGroup) {
boost::cmatch authAndPathMatch;
static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?");
auto authorityAndPath = submatch(urlMatch, 3);
if (!boost::regex_match(
authorityAndPath.begin(),
authorityAndPath.end(),
authAndPathMatch,
kAuthorityAndPathRegex)) {
// Does not start with //, doesn't have authority.
hasAuthority = false;
return true;
return std::nullopt;
}

static const boost::regex kAuthorityRegex(
Expand All @@ -45,12 +44,14 @@ bool matchAuthorityAndPath(
const auto authority = authAndPathMatch[1];
if (!boost::regex_match(
authority.first, authority.second, authorityMatch, kAuthorityRegex)) {
return false; // Invalid URI Authority.
return std::nullopt; // Invalid URI Authority.
}

hasAuthority = true;
if (authorityMatch[subGroup].matched) {
return submatch(authorityMatch, subGroup);
}

return true;
return std::nullopt;
}

} // namespace facebook::velox::functions
137 changes: 78 additions & 59 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,33 @@

#include <boost/regex.hpp>
#include <cctype>
#include <optional>
#include "velox/functions/Macros.h"
#include "velox/functions/lib/string/StringImpl.h"

namespace facebook::velox::functions {

namespace {

const auto kScheme = 2;
const auto kAuthority = 3;
const auto kPath = 5;
const auto kQuery = 7;
const auto kFragment = 9;
const auto kHost = 3; // From the authority and path regex.
const auto kPort = 4; // From the authority and path regex.

// Returns true if the given character is valid in a domain name.
bool isDomainChar(char ch) {
return std::isalnum(ch) || ch == '-' || ch == '.';
}

FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
const auto& sub = match[idx];
return StringView(sub.first, sub.length());
}

template <typename TInString>
bool parse(const TInString& rawUrl, boost::cmatch& match) {
bool parse(const char* rawUrlData, size_t rawUrlsize, boost::cmatch& match) {
/// This regex is taken from RFC - 3986.
/// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B
/// The basic groups are:
Expand Down Expand Up @@ -59,7 +73,24 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) {
"(#(.*))?"); // #fragment

return boost::regex_match(
rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex);
rawUrlData, rawUrlData + rawUrlsize, match, kUriRegex);
}

/// Parses the url and returns the matching subgroup if the particular sub group
/// is matched by the call to parse call above.
std::optional<StringView> parse(StringView rawUrl, int subGroup) {
boost::cmatch match;
if (!parse(rawUrl.data(), rawUrl.size(), match)) {
return std::nullopt;
}

VELOX_CHECK_LT(subGroup, match.size());

if (match[subGroup].matched) {
return submatch(match, subGroup);
}

return std::nullopt;
}

FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
Expand Down Expand Up @@ -110,7 +141,7 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
/// Performs initial validation of the URI.
/// Checks if the URI contains ascii whitespaces or
/// unescaped '%' chars.
bool isValidURI(const StringView& input) {
bool isValidURI(StringView input) {
const char* p = input.data();
const char* end = p + input.size();
char buf[3];
Expand Down Expand Up @@ -178,11 +209,13 @@ FOLLY_ALWAYS_INLINE void urlUnescape(

} // namespace

bool matchAuthorityAndPath(
const boost::cmatch& urlMatch,
boost::cmatch& authAndPathMatch,
/// Matches the authority (i.e host[:port], ipaddress), and path from a string
/// representing the authority and path. Returns true if the regex matches, and
/// sets the appropriate groups matching authority in authorityMatch.
std::optional<StringView> matchAuthorityAndPath(
StringView authorityAndPath,
boost::cmatch& authorityMatch,
bool& hasAuthority);
int subGroup);

template <typename T>
struct UrlExtractProtocolFunction {
Expand All @@ -198,14 +231,13 @@ struct UrlExtractProtocolFunction {
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();

if (auto protocol = parse(url, kScheme)) {
result.setNoCopy(protocol.value());
} else {
result.setNoCopy(submatch(match, 2));
result.setEmpty();
}
return true;
}
Expand All @@ -225,14 +257,13 @@ struct UrlExtractFragmentFunction {
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();

if (auto fragment = parse(url, kFragment)) {
result.setNoCopy(fragment.value());
} else {
result.setNoCopy(submatch(match, 9));
result.setEmpty();
}
return true;
}
Expand All @@ -252,22 +283,19 @@ struct UrlExtractHostFunction {
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

auto authAndPath = parse(url, kAuthority);
if (!authAndPath) {
result.setEmpty();
return true;
}
boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;

if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
result.setNoCopy(submatch(authorityMatch, 3));
if (auto host =
matchAuthorityAndPath(authAndPath.value(), authorityMatch, kHost)) {
result.setNoCopy(host.value());
} else {
result.setEmpty();
}
Expand All @@ -283,21 +311,18 @@ struct UrlExtractPortFunction {
if (!isValidURI(url)) {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

auto authAndPath = parse(url, kAuthority);
if (!authAndPath) {
return false;
}

boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;
if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
auto port = submatch(authorityMatch, 4);
if (!port.empty()) {
if (auto port =
matchAuthorityAndPath(authAndPath.value(), authorityMatch, kPort)) {
if (!port.value().empty()) {
try {
result = to<int64_t>(port);
result = to<int64_t>(port.value());
return true;
} catch (folly::ConversionError const&) {
}
Expand All @@ -317,17 +342,13 @@ struct UrlExtractPathFunction {
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}

boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();
return false;
}

urlUnescape(result, submatch(match, 5));
auto path = parse(url, kPath);
VELOX_USER_CHECK(
path.has_value(), "Unable to determine path for URL: {}", url);
urlUnescape(result, path.value());

return true;
}
Expand All @@ -347,17 +368,15 @@ struct UrlExtractQueryFunction {
out_type<Varchar>& result,
const arg_type<Varchar>& url) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

if (auto query = parse(url, kQuery)) {
result.setNoCopy(query.value());
} else {
result.setEmpty();
return true;
}

auto query = submatch(match, 7);
result.setNoCopy(query);
return true;
}
};
Expand All @@ -377,17 +396,15 @@ struct UrlExtractParameterFunction {
const arg_type<Varchar>& url,
const arg_type<Varchar>& param) {
if (!isValidURI(url)) {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();

auto query = parse(url, kQuery);
if (!query) {
return false;
}

auto query = submatch(match, 7);
if (!query.empty()) {
if (!query.value().empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
Expand All @@ -398,11 +415,13 @@ struct UrlExtractParameterFunction {
);

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
query.value().data(),
query.value().data() + query.value().size(),
kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0) { // key shouldnt be empty.
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = submatch((*it), 2);
if (param.compare(key) == 0) {
auto value = submatch((*it), 3);
Expand Down
9 changes: 9 additions & 0 deletions velox/functions/prestosql/tests/URLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ TEST_F(URLFunctionsTest, validateURL) {
std::nullopt,
std::nullopt,
std::nullopt);
validate(
"IC6S!8hGVRpo+!,yTaJEy/$RUZpqcr",
"",
"",
"IC6S!8hGVRpo !,yTaJEy/$RUZpqcr",
"",
"",
std::nullopt);
}

TEST_F(URLFunctionsTest, extractPath) {
Expand All @@ -133,6 +141,7 @@ TEST_F(URLFunctionsTest, extractPath) {
extractPath("https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%"));
EXPECT_EQ("foo", extractPath("foo"));
EXPECT_EQ(std::nullopt, extractPath("BAD URL!"));
EXPECT_EQ("", extractPath("http://www.yahoo.com"));
}

TEST_F(URLFunctionsTest, extractParameter) {
Expand Down

0 comments on commit 61ab60d

Please sign in to comment.