[Extensions] Canonicalize URLPattern hosts

Ensure that URLPattern hosts, when non-empty, are canonicalized. Silently update non-canonical hosts (e.g. gOOgle.com) to be canonical (google.com). Fail only if canonicalization fails. Bug: 811932 Change-Id: I87dbea12b972af00a7648f6d715eccb836613366 Reviewed-on: https://chromium-review.googlesource.com/916942 Commit-Queue: Devlin <rdevlin.cronin@chromium.org> Reviewed-by: Karan Bhatia <karandeepb@chromium.org> Cr-Commit-Position: refs/heads/master@{#538659}
Denger-Network · Feb 23, 2018 · b32af8c · b32af8c
1 parent f00604c
commit b32af8c
Show file tree

Hide file tree

Showing 2 changed files with 68 additions and 15 deletions.
diff --git a/extensions/common/url_pattern.cc b/extensions/common/url_pattern.cc
@@ -17,6 +17,7 @@
 #include "content/public/common/url_constants.h"
 #include "extensions/common/constants.h"
 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
+#include "net/base/url_util.h"
 #include "url/gurl.h"
 #include "url/url_util.h"
 
@@ -305,6 +306,16 @@ URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern,
   if (host_.find('*') != std::string::npos)
     return PARSE_ERROR_INVALID_HOST_WILDCARD;
 
+  if (!host_.empty()) {
+    // If |host_| is present (i.e., isn't a wildcard), we need to canonicalize
+    // it.
+    url::CanonHostInfo host_info;
+    host_ = net::CanonicalizeHost(host_, &host_info);
+    // net::CanonicalizeHost() returns an empty string on failure.
+    if (host_.empty())
+      return PARSE_ERROR_INVALID_HOST;
+  }
+
   // Null characters are not allowed in hosts.
   if (host_.find('\0') != std::string::npos)
     return PARSE_ERROR_INVALID_HOST;
@@ -437,7 +448,10 @@ bool URLPattern::MatchesScheme(base::StringPiece test) const {
 }
 
 bool URLPattern::MatchesHost(base::StringPiece host) const {
-  // TODO(devlin): This is a bit sad. Parsing urls is expensive.
+  // TODO(devlin): This is a bit sad. Parsing urls is expensive. However, it's
+  // important that we do this conversion to a GURL in order to canonicalize the
+  // host (the pattern's host_ already is canonicalized from Parse()). We can't
+  // just do string comparison.
   return MatchesHost(
       GURL(base::StringPrintf("%s%s%s/", url::kHttpScheme,
                               url::kStandardSchemeSeparator, host.data())));

diff --git a/extensions/common/url_pattern_unittest.cc b/extensions/common/url_pattern_unittest.cc
@@ -9,6 +9,7 @@
 #include <memory>
 
 #include "base/macros.h"
+#include "base/strings/stringprintf.h"
 #include "extensions/common/constants.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "url/gurl.h"
@@ -190,7 +191,8 @@ TEST(ExtensionURLPatternTest, Match7) {
   // allowed, but useless
   EXPECT_EQ(URLPattern::PARSE_SUCCESS, pattern.Parse("http://*.0.0.1/*"));
   EXPECT_EQ("http", pattern.scheme());
-  EXPECT_EQ("0.0.1", pattern.host());
+  // Canonicalization forces 0.0.1 to 0.0.0.1.
+  EXPECT_EQ("0.0.0.1", pattern.host());
   EXPECT_TRUE(pattern.match_subdomains());
   EXPECT_TRUE(pattern.match_effective_tld());
   EXPECT_FALSE(pattern.match_all_urls());
@@ -489,19 +491,19 @@ TEST(URLPatternTest, EffectiveTldWildcard) {
 static const struct GetAsStringPatterns {
   const char* pattern;
 } kGetAsStringTestCases[] = {
-  { "http://www/" },
-  { "http://*/*" },
-  { "chrome://*/*" },
-  { "chrome://newtab/" },
-  { "about:*" },
-  { "about:blank" },
-  { "chrome-extension://*/*" },
-  { "chrome-extension://FTW/" },
-  { "data:*" },
-  { "data:monkey" },
-  { "javascript:*" },
-  { "javascript:atemyhomework" },
-  { "http://www.example.com:8080/foo" },
+    {"http://www/"},
+    {"http://*/*"},
+    {"chrome://*/*"},
+    {"chrome://newtab/"},
+    {"about:*"},
+    {"about:blank"},
+    {"chrome-extension://*/*"},
+    {"chrome-extension://ftw/"},
+    {"data:*"},
+    {"data:monkey"},
+    {"javascript:*"},
+    {"javascript:atemyhomework"},
+    {"http://www.example.com:8080/foo"},
 };
 
 TEST(ExtensionURLPatternTest, GetAsString) {
@@ -987,4 +989,41 @@ TEST(ExtensionURLPatternTest, MatchesEffectiveTLD) {
   }
 }
 
+// Test that URLPattern properly canonicalizes uncanonicalized hosts.
+TEST(ExtensionURLPatternTest, UncanonicalizedUrl) {
+  {
+    // Simple case: canonicalization should lowercase the host. This is
+    // important, since gOoGle.com would never be matched in practice.
+    const URLPattern pattern(URLPattern::SCHEME_ALL, "*://*.gOoGle.com/*");
+    EXPECT_TRUE(pattern.MatchesURL(GURL("https://google.com")));
+    EXPECT_TRUE(pattern.MatchesURL(GURL("https://maps.google.com")));
+    EXPECT_FALSE(pattern.MatchesURL(GURL("https://example.com")));
+    EXPECT_EQ("*://*.google.com/*", pattern.GetAsString());
+  }
+
+  {
+    // Trickier case: internationalization with UTF8 characters (the first 'g'
+    // isn't actually a 'g').
+    const URLPattern pattern(URLPattern::SCHEME_ALL, "https://*.ɡoogle.com/*");
+    constexpr char kCanonicalizedHost[] = "xn--oogle-qmc.com";
+    EXPECT_EQ(kCanonicalizedHost, pattern.host());
+    EXPECT_EQ(base::StringPrintf("https://*.%s/*", kCanonicalizedHost),
+              pattern.GetAsString());
+    EXPECT_FALSE(pattern.MatchesURL(GURL("https://google.com")));
+    // The pattern should match the canonicalized host, and the original
+    // UTF8 version.
+    EXPECT_TRUE(pattern.MatchesURL(
+        GURL(base::StringPrintf("https://%s/", kCanonicalizedHost))));
+    EXPECT_TRUE(pattern.MatchesHost("ɡoogle.com"));
+  }
+
+  {
+    // Sometimes, canonicalization can fail (such as here, where we have invalid
+    // unicode characters). In that case, URLPattern parsing should also fail.
+    URLPattern pattern(URLPattern::SCHEME_ALL);
+    EXPECT_EQ(URLPattern::PARSE_ERROR_INVALID_HOST,
+              pattern.Parse("https://\xef\xb7\x90zyx.com/*"));
+  }
+}
+
 }  // namespace