Skip to content

Commit

Permalink
[Extensions] Canonicalize URLPattern hosts
Browse files Browse the repository at this point in the history
Ensure that URLPattern hosts, when non-empty, are canonicalized.
Silently update non-canonical hosts (e.g. gOOgle.com) to be
canonical (google.com). Fail only if canonicalization fails.

Bug: 811932

Change-Id: I87dbea12b972af00a7648f6d715eccb836613366
Reviewed-on: https://chromium-review.googlesource.com/916942
Commit-Queue: Devlin <rdevlin.cronin@chromium.org>
Reviewed-by: Karan Bhatia <karandeepb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#538659}
  • Loading branch information
rdcronin authored and Commit Bot committed Feb 23, 2018
1 parent f00604c commit b32af8c
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 15 deletions.
16 changes: 15 additions & 1 deletion extensions/common/url_pattern.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "content/public/common/url_constants.h"
#include "extensions/common/constants.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/url_util.h"
#include "url/gurl.h"
#include "url/url_util.h"

Expand Down Expand Up @@ -305,6 +306,16 @@ URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern,
if (host_.find('*') != std::string::npos)
return PARSE_ERROR_INVALID_HOST_WILDCARD;

if (!host_.empty()) {
// If |host_| is present (i.e., isn't a wildcard), we need to canonicalize
// it.
url::CanonHostInfo host_info;
host_ = net::CanonicalizeHost(host_, &host_info);
// net::CanonicalizeHost() returns an empty string on failure.
if (host_.empty())
return PARSE_ERROR_INVALID_HOST;
}

// Null characters are not allowed in hosts.
if (host_.find('\0') != std::string::npos)
return PARSE_ERROR_INVALID_HOST;
Expand Down Expand Up @@ -437,7 +448,10 @@ bool URLPattern::MatchesScheme(base::StringPiece test) const {
}

bool URLPattern::MatchesHost(base::StringPiece host) const {
// TODO(devlin): This is a bit sad. Parsing urls is expensive.
// TODO(devlin): This is a bit sad. Parsing urls is expensive. However, it's
// important that we do this conversion to a GURL in order to canonicalize the
// host (the pattern's host_ already is canonicalized from Parse()). We can't
// just do string comparison.
return MatchesHost(
GURL(base::StringPrintf("%s%s%s/", url::kHttpScheme,
url::kStandardSchemeSeparator, host.data())));
Expand Down
67 changes: 53 additions & 14 deletions extensions/common/url_pattern_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <memory>

#include "base/macros.h"
#include "base/strings/stringprintf.h"
#include "extensions/common/constants.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/gurl.h"
Expand Down Expand Up @@ -190,7 +191,8 @@ TEST(ExtensionURLPatternTest, Match7) {
// allowed, but useless
EXPECT_EQ(URLPattern::PARSE_SUCCESS, pattern.Parse("http://*.0.0.1/*"));
EXPECT_EQ("http", pattern.scheme());
EXPECT_EQ("0.0.1", pattern.host());
// Canonicalization forces 0.0.1 to 0.0.0.1.
EXPECT_EQ("0.0.0.1", pattern.host());
EXPECT_TRUE(pattern.match_subdomains());
EXPECT_TRUE(pattern.match_effective_tld());
EXPECT_FALSE(pattern.match_all_urls());
Expand Down Expand Up @@ -489,19 +491,19 @@ TEST(URLPatternTest, EffectiveTldWildcard) {
static const struct GetAsStringPatterns {
const char* pattern;
} kGetAsStringTestCases[] = {
{ "http://www/" },
{ "http://*/*" },
{ "chrome://*/*" },
{ "chrome://newtab/" },
{ "about:*" },
{ "about:blank" },
{ "chrome-extension://*/*" },
{ "chrome-extension://FTW/" },
{ "data:*" },
{ "data:monkey" },
{ "javascript:*" },
{ "javascript:atemyhomework" },
{ "http://www.example.com:8080/foo" },
{"http://www/"},
{"http://*/*"},
{"chrome://*/*"},
{"chrome://newtab/"},
{"about:*"},
{"about:blank"},
{"chrome-extension://*/*"},
{"chrome-extension://ftw/"},
{"data:*"},
{"data:monkey"},
{"javascript:*"},
{"javascript:atemyhomework"},
{"http://www.example.com:8080/foo"},
};

TEST(ExtensionURLPatternTest, GetAsString) {
Expand Down Expand Up @@ -987,4 +989,41 @@ TEST(ExtensionURLPatternTest, MatchesEffectiveTLD) {
}
}

// Test that URLPattern properly canonicalizes uncanonicalized hosts.
TEST(ExtensionURLPatternTest, UncanonicalizedUrl) {
{
// Simple case: canonicalization should lowercase the host. This is
// important, since gOoGle.com would never be matched in practice.
const URLPattern pattern(URLPattern::SCHEME_ALL, "*://*.gOoGle.com/*");
EXPECT_TRUE(pattern.MatchesURL(GURL("https://google.com")));
EXPECT_TRUE(pattern.MatchesURL(GURL("https://maps.google.com")));
EXPECT_FALSE(pattern.MatchesURL(GURL("https://example.com")));
EXPECT_EQ("*://*.google.com/*", pattern.GetAsString());
}

{
// Trickier case: internationalization with UTF8 characters (the first 'g'
// isn't actually a 'g').
const URLPattern pattern(URLPattern::SCHEME_ALL, "https://*.ɡoogle.com/*");
constexpr char kCanonicalizedHost[] = "xn--oogle-qmc.com";
EXPECT_EQ(kCanonicalizedHost, pattern.host());
EXPECT_EQ(base::StringPrintf("https://*.%s/*", kCanonicalizedHost),
pattern.GetAsString());
EXPECT_FALSE(pattern.MatchesURL(GURL("https://google.com")));
// The pattern should match the canonicalized host, and the original
// UTF8 version.
EXPECT_TRUE(pattern.MatchesURL(
GURL(base::StringPrintf("https://%s/", kCanonicalizedHost))));
EXPECT_TRUE(pattern.MatchesHost("ɡoogle.com"));
}

{
// Sometimes, canonicalization can fail (such as here, where we have invalid
// unicode characters). In that case, URLPattern parsing should also fail.
URLPattern pattern(URLPattern::SCHEME_ALL);
EXPECT_EQ(URLPattern::PARSE_ERROR_INVALID_HOST,
pattern.Parse("https://\xef\xb7\x90zyx.com/*"));
}
}

} // namespace

0 comments on commit b32af8c

Please sign in to comment.