// electron/chromium_src/extensions/common/url_pattern.cc

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "extensions/common/url_pattern.h"

#include <stddef.h>

#include <ostream>

#include "base/macros.h"
#include "base/strings/pattern.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "content/public/common/url_constants.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "url/gurl.h"
#include "url/url_util.h"

// The special pattern string that Parse() treats as "match every URL".
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
// Scheme name for extension URLs; it is one of the entries of kValidSchemes.
const char kExtensionScheme[] = "chrome-extension";

namespace {

// TODO(aa): What about more obscure schemes like data: and javascript: ?
// Scheme names consulted by URLPattern::IsValidSchemeForExtensions() and
// URLPattern::IsValidScheme().
// Note: keep this array in sync with kValidSchemeMasks; the entry at index i
// here is checked against the mask at index i there.
const char* const kValidSchemes[] = {
    url::kHttpScheme,       url::kHttpsScheme,        url::kFileScheme,
    url::kFtpScheme,        content::kChromeUIScheme, kExtensionScheme,
    url::kFileSystemScheme,
};

// URLPattern scheme mask for each entry of kValidSchemes, in the same order.
const int kValidSchemeMasks[] = {
    URLPattern::SCHEME_HTTP,       URLPattern::SCHEME_HTTPS,
    URLPattern::SCHEME_FILE,       URLPattern::SCHEME_FTP,
    URLPattern::SCHEME_CHROMEUI,   URLPattern::SCHEME_EXTENSION,
    URLPattern::SCHEME_FILESYSTEM,
};

// Compile-time guard: the two parallel arrays must have the same length.
static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
              "must keep these arrays in sync");

// Human-readable text for each parse outcome. These strings are returned
// verbatim by URLPattern::GetParseResultString().
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorInvalidPort[] = "Invalid port.";
const char kParseErrorInvalidHost[] = "Invalid host.";

// Message explaining each URLPattern::ParseResult. The table is indexed by the
// ParseResult value itself, so the order here must follow the enum's order.
const char* const kParseResultMessages[] = {
    kParseSuccess,
    kParseErrorMissingSchemeSeparator,
    kParseErrorInvalidScheme,
    kParseErrorWrongSchemeType,
    kParseErrorEmptyHost,
    kParseErrorInvalidHostWildcard,
    kParseErrorEmptyPath,
    kParseErrorInvalidPort,
    kParseErrorInvalidHost,
};

// Compile-time guard: every ParseResult needs exactly one message above.
static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
              "must add message for each parse result");

// Separator that URLPattern::Parse() searches for to find where the host ends
// and the path begins.
const char kPathSeparator[] = "/";

// Reports whether |scheme| is handled like a standard scheme. The wildcard
// scheme "*" always is; every other scheme is decided by url::IsStandard().
bool IsStandardScheme(base::StringPiece scheme) {
  // url::IsStandard() takes a buffer plus the component to inspect; here the
  // component spans the whole of |scheme|.
  const url::Component whole_scheme(0, static_cast<int>(scheme.length()));
  return scheme == "*" || url::IsStandard(scheme.data(), whole_scheme);
}

// Reports whether |port| is acceptable in a pattern that uses |scheme|.
// "*" is always accepted. A concrete port is accepted only when the scheme has
// a default port and |port| parses as an integer in [0, 65535].
bool IsValidPortForScheme(base::StringPiece scheme, base::StringPiece port) {
  // The wildcard is valid no matter what the scheme is.
  if (port == "*")
    return true;

  // Schemes without a default port do not take explicit port numbers.
  const bool scheme_uses_ports =
      url::DefaultPortForScheme(scheme.data(), scheme.length()) !=
      url::PORT_UNSPECIFIED;
  if (!scheme_uses_ports)
    return false;

  int port_number = url::PORT_UNSPECIFIED;
  return base::StringToInt(port, &port_number) && port_number >= 0 &&
         port_number < 65536;
}

// Returns |path| without its final character when that character is '*';
// otherwise returns |path| unchanged.
//
// OverlapsWith and Contains rely on this. They are only called for patterns
// held in a URLPatternSet, where the path is known to contain a single
// wildcard and only at the very end, which keeps the overlap computation
// simple. A general solution for arbitrary wildcards probably exists, but is
// not needed yet.
base::StringPiece StripTrailingWildcard(base::StringPiece path) {
  return path.ends_with("*") ? path.substr(0, path.size() - 1) : path;
}

// Returns |host_piece| with one trailing '.' dropped, if it has one, so that
// hosts written with and without the final dot compare equal.
base::StringPiece CanonicalizeHostForMatching(base::StringPiece host_piece) {
  const bool has_trailing_dot = host_piece.ends_with(".");
  if (!has_trailing_dot)
    return host_piece;
  return host_piece.substr(0, host_piece.size() - 1);
}

}  // namespace

// static
// Reports whether |scheme| is one of the names listed in kValidSchemes.
bool URLPattern::IsValidSchemeForExtensions(base::StringPiece scheme) {
  for (const char* candidate : kValidSchemes) {
    if (scheme == candidate)
      return true;
  }
  return false;
}

// Creates a pattern that allows no schemes (SCHEME_NONE). All other state is
// initialised exactly as by the URLPattern(int) constructor.
URLPattern::URLPattern() : URLPattern(SCHEME_NONE) {}

// Creates an unparsed pattern restricted to the |valid_schemes| mask. It starts
// out matching neither all URLs nor subdomains, with the wildcard port "*".
URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {
}

// Creates a pattern restricted to |valid_schemes| and immediately parses
// |pattern| into it.
//
// Error checking is strict here: this constructor is only appropriate when the
// caller already knows |pattern| is valid, so a parse failure hits
// NOTREACHED().
URLPattern::URLPattern(int valid_schemes, base::StringPiece pattern)
    : URLPattern(valid_schemes) {
  const ParseResult result = Parse(pattern);
  if (result != PARSE_SUCCESS) {
    NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
  }
}

// Copying is a plain member-wise copy.
URLPattern::URLPattern(const URLPattern& other) = default;

// Nothing needs explicit cleanup; the members destroy themselves.
URLPattern::~URLPattern() = default;

// Orders patterns lexicographically by their string form.
bool URLPattern::operator<(const URLPattern& other) const {
  return GetAsString().compare(other.GetAsString()) < 0;
}

// Reverse of operator<: compares the string forms lexicographically.
bool URLPattern::operator>(const URLPattern& other) const {
  return GetAsString().compare(other.GetAsString()) > 0;
}

// Two patterns are equal when their string forms are identical.
bool URLPattern::operator==(const URLPattern& other) const {
  return GetAsString().compare(other.GetAsString()) == 0;
}

// Streams |url_pattern| as its string form wrapped in double quotes.
std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
  out << '"';
  out << url_pattern.GetAsString();
  out << '"';
  return out;
}

// Parses |pattern| and replaces this object's scheme, host, port, path and
// matching flags with the result.
//
// Accepted inputs are the special kAllUrlsPattern string, patterns with a
// standard scheme ("<scheme>://<host>[:<port>]<path>", where the host may start
// with "*." or be just "*" to match subdomains), file patterns (any host is
// ignored and may be omitted) and patterns with a non-standard scheme
// ("<scheme>:<path>").
//
// Returns PARSE_SUCCESS, or the code of the first problem found. On failure the
// object may already have been partially updated.
URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern) {
  spec_.clear();
  // Forget any host from an earlier Parse()/SetHost() call. The file and
  // non-standard-scheme branches below never assign |host_|, so without this a
  // re-parsed pattern would silently keep the previous pattern's host.
  host_.clear();
  SetMatchAllURLs(false);
  SetMatchSubdomains(false);
  SetPort("*");

  // Special case pattern to match every valid URL.
  if (pattern == kAllUrlsPattern) {
    SetMatchAllURLs(true);
    return PARSE_SUCCESS;
  }

  // Parse out the scheme.
  size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
  bool has_standard_scheme_separator = true;

  // Some urls also use ':' alone as the scheme separator.
  if (scheme_end_pos == base::StringPiece::npos) {
    scheme_end_pos = pattern.find(':');
    has_standard_scheme_separator = false;
  }

  if (scheme_end_pos == base::StringPiece::npos)
    return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;

  if (!SetScheme(pattern.substr(0, scheme_end_pos)))
    return PARSE_ERROR_INVALID_SCHEME;

  // A standard scheme must use the standard separator, and a non-standard
  // scheme must use the bare ':'.
  bool standard_scheme = IsStandardScheme(scheme_);
  if (standard_scheme != has_standard_scheme_separator)
    return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;

  // Advance past the scheme separator.
  scheme_end_pos +=
      (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
  if (scheme_end_pos >= pattern.size())
    return PARSE_ERROR_EMPTY_HOST;

  // Parse out the host and path.
  size_t host_start_pos = scheme_end_pos;
  size_t path_start_pos = 0;

  if (!standard_scheme) {
    // Non-standard schemes have no host; everything after ':' is the path.
    path_start_pos = host_start_pos;
  } else if (scheme_ == url::kFileScheme) {
    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
    if (host_end_pos == base::StringPiece::npos) {
      // Allow hostname omission.
      // e.g. file://* is interpreted as file:///*,
      // file://foo* is interpreted as file:///foo*.
      // Backing up one character reuses the separator's last '/' as the
      // leading '/' of the path.
      path_start_pos = host_start_pos - 1;
    } else {
      // Ignore hostname if scheme is file://.
      // e.g. file://localhost/foo is equal to file:///foo.
      path_start_pos = host_end_pos;
    }
  } else {
    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);

    // Host is required.
    if (host_start_pos == host_end_pos)
      return PARSE_ERROR_EMPTY_HOST;

    if (host_end_pos == base::StringPiece::npos)
      return PARSE_ERROR_EMPTY_PATH;

    // TODO(devlin): This whole series is expensive. Luckily we don't do it
    // *too* often, but it could be optimized.
    pattern.substr(host_start_pos, host_end_pos - host_start_pos)
        .CopyToString(&host_);

    // The first component can optionally be '*' to match all subdomains.
    std::vector<std::string> host_components = base::SplitString(
        host_, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);

    // Could be empty if the host only consists of whitespace characters.
    if (host_components.empty() ||
        (host_components.size() == 1 && host_components[0].empty()))
      return PARSE_ERROR_EMPTY_HOST;

    if (host_components[0] == "*") {
      match_subdomains_ = true;
      host_components.erase(host_components.begin(),
                            host_components.begin() + 1);
    }
    host_ = base::JoinString(host_components, ".");

    path_start_pos = host_end_pos;
  }

  SetPath(pattern.substr(path_start_pos));

  // An explicit port, if present, is still attached to |host_| at this point;
  // split it off and validate it.
  size_t port_pos = host_.find(':');
  if (port_pos != std::string::npos) {
    if (!SetPort(host_.substr(port_pos + 1)))
      return PARSE_ERROR_INVALID_PORT;
    host_ = host_.substr(0, port_pos);
  }

  // No other '*' can occur in the host, though. This isn't necessary, but is
  // done as a convenience to developers who might otherwise be confused and
  // think '*' works as a glob in the host.
  if (host_.find('*') != std::string::npos)
    return PARSE_ERROR_INVALID_HOST_WILDCARD;

  // Null characters are not allowed in hosts.
  if (host_.find('\0') != std::string::npos)
    return PARSE_ERROR_INVALID_HOST;

  return PARSE_SUCCESS;
}

// Replaces the mask of schemes this pattern may match and drops the cached
// string form.
void URLPattern::SetValidSchemes(int valid_schemes) {
  valid_schemes_ = valid_schemes;
  spec_.clear();  // Invalidate the string cached by GetAsString().
}

// Stores a copy of |host| as the pattern's host and drops the cached string
// form. The value is taken as-is; no validation is done here.
void URLPattern::SetHost(base::StringPiece host) {
  host.CopyToString(&host_);
  spec_.clear();  // Invalidate the string cached by GetAsString().
}

// Turns "match every URL" mode on or off and drops the cached string form.
// Turning it on also resets the pattern to the broadest form: wildcard scheme,
// no host, subdomain matching enabled and the path "/*". Turning it off only
// clears the flag.
void URLPattern::SetMatchAllURLs(bool val) {
  spec_.clear();
  match_all_urls_ = val;
  if (!val)
    return;

  scheme_ = "*";
  host_.clear();
  match_subdomains_ = true;
  SetPath("/*");
}

// Sets whether subdomains of the host should match and drops the cached string
// form.
void URLPattern::SetMatchSubdomains(bool val) {
  match_subdomains_ = val;
  spec_.clear();  // Invalidate the string cached by GetAsString().
}

// Stores |scheme| as the pattern's scheme and drops the cached string form.
// The wildcard "*" is always accepted and narrows the valid-scheme mask to
// HTTP and HTTPS. Any other scheme is accepted only if IsValidScheme() allows
// it. Returns false on rejection; |scheme_| has been overwritten even then.
bool URLPattern::SetScheme(base::StringPiece scheme) {
  spec_.clear();
  scheme.CopyToString(&scheme_);

  if (scheme_ != "*")
    return IsValidScheme(scheme_);

  valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
  return true;
}

// Reports whether |scheme| is permitted by this pattern's valid-scheme mask.
// SCHEME_ALL permits anything. Otherwise |scheme| must be listed in
// kValidSchemes and its mask bit must be set in |valid_schemes_|.
bool URLPattern::IsValidScheme(base::StringPiece scheme) const {
  if (valid_schemes_ == SCHEME_ALL)
    return true;

  for (size_t index = 0; index < arraysize(kValidSchemes); ++index) {
    const bool mask_enabled = (valid_schemes_ & kValidSchemeMasks[index]) != 0;
    if (mask_enabled && scheme == kValidSchemes[index])
      return true;
  }
  return false;
}

// Stores |path| as the pattern's path, rebuilds the escaped copy that
// MatchesPath() matches against, and drops the cached string form.
void URLPattern::SetPath(base::StringPiece path) {
  spec_.clear();
  path.CopyToString(&path_);

  // Escape backslashes first so that the backslashes added in front of '?'
  // are not doubled afterwards. Presumably both characters are special to
  // base::MatchPattern(), which consumes |path_escaped_|.
  std::string escaped(path_);
  base::ReplaceSubstringsAfterOffset(&escaped, 0, "\\", "\\\\");
  base::ReplaceSubstringsAfterOffset(&escaped, 0, "?", "\\?");
  path_escaped_.swap(escaped);
}

// Stores |port| as the pattern's port if IsValidPortForScheme() accepts it for
// the current scheme. Returns whether the port was accepted; a rejected port
// leaves |port_| untouched. The cached string form is dropped either way.
bool URLPattern::SetPort(base::StringPiece port) {
  spec_.clear();
  if (!IsValidPortForScheme(scheme_, port))
    return false;

  port.CopyToString(&port_);
  return true;
}

// Reports whether |test| matches this pattern: scheme, then host and port,
// then path.
//
// A URL with an inner URL is only supported when it is a filesystem: URL. In
// that case the scheme, host and port are taken from the inner URL, and the
// matched path is the inner URL's path followed by the outer URL's request
// path.
bool URLPattern::MatchesURL(const GURL& test) const {
  const GURL* test_url = &test;
  const bool has_inner_url = test.inner_url() != nullptr;

  if (has_inner_url) {
    if (!test.SchemeIsFileSystem())
      return false;  // The only nested URLs we handle are filesystem URLs.
    test_url = test.inner_url();
  }

  if (!MatchesScheme(test_url->scheme_piece()))
    return false;

  if (match_all_urls_)
    return true;

  std::string path_for_request = test.PathForRequest();
  if (has_inner_url) {
    // path_piece() is only a view into the URL's spec and is not guaranteed to
    // be NUL-terminated where the path ends, so its data() must not be handed
    // to a "%s" format. Concatenate owned strings instead.
    path_for_request = test_url->path() + path_for_request;
  }

  return MatchesSecurityOriginHelper(*test_url) &&
         MatchesPath(path_for_request);
}

// Reports whether the scheme, host and port of |test| match this pattern; the
// path is not considered.
//
// A URL with an inner URL is only supported when it is a filesystem: URL, in
// which case the inner URL is the one that gets matched.
bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
  const GURL* test_url = &test;
  const bool has_inner_url = test.inner_url() != nullptr;

  if (has_inner_url) {
    if (!test.SchemeIsFileSystem())
      return false;  // The only nested URLs we handle are filesystem URLs.
    test_url = test.inner_url();
  }

  // Use the non-copying scheme_piece(), the same accessor MatchesURL() uses,
  // rather than building a temporary std::string with scheme().
  if (!MatchesScheme(test_url->scheme_piece()))
    return false;

  if (match_all_urls_)
    return true;

  return MatchesSecurityOriginHelper(*test_url);
}

// Reports whether the scheme |test| matches this pattern. It must be permitted
// by the valid-scheme mask, and must either equal the pattern's scheme or the
// pattern's scheme must be the wildcard "*".
bool URLPattern::MatchesScheme(base::StringPiece test) const {
  return IsValidScheme(test) && (scheme_ == "*" || test == scheme_);
}

// Reports whether the bare host name |host| matches this pattern's host. The
// host is wrapped into a synthetic "http://<host>/" URL and handed to the GURL
// overload.
bool URLPattern::MatchesHost(base::StringPiece host) const {
  // TODO(devlin): This is a bit sad. Parsing urls is expensive.
  // A StringPiece is not guaranteed to be NUL-terminated at its end, so copy
  // it into a std::string before passing it to a "%s" format. Using
  // host.data() directly could read past the end of the piece.
  return MatchesHost(
      GURL(base::StringPrintf("%s%s%s/", url::kHttpScheme,
                              url::kStandardSchemeSeparator,
                              host.as_string().c_str())));
}

bool URLPattern::MatchesHost(const GURL& test) const {
  const base::StringPiece test_host(
      CanonicalizeHostForMatching(test.host_piece()));
  const base::StringPiece pattern_host(CanonicalizeHostForMatching(host_));

  // If the hosts are exactly equal, we have a match.
  if (test_host == pattern_host)
    return true;

  // If we're matching subdomains, and we have no host in the match pattern,
  // that means that we're matching all hosts, which means we have a match no
  // matter what the test host is.
  if (match_subdomains_ && pattern_host.empty())
    return true;

  // Otherwise, we can only match if our match pattern matches subdomains.
  if (!match_subdomains_)
    return false;

  // We don't do subdomain matching against IP addresses, so we can give up now
  // if the test host is an IP address.
  if (test.HostIsIPAddress())
    return false;

  // Check if the test host is a subdomain of our host.
  if (test_host.length() <= (pattern_host.length() + 1))
    return false;

  if (!test_host.ends_with(pattern_host))
    return false;

  return test_host[test_host.length() - pattern_host.length() - 1] == '.';
}

bool URLPattern::ImpliesAllHosts() const {
  // Check if it matches all urls or is a pattern like http://*/*.
  if (match_all_urls_ ||
      (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
    return true;
  }

  // If this doesn't even match subdomains, it can't possibly imply all hosts.
  if (!match_subdomains_)
    return false;

  // If there was more than just a TLD in the host (e.g., *.foobar.com), it
  // doesn't imply all hosts. We don't include private TLDs, so that, e.g.,
  // *.appspot.com does not imply all hosts.
  if (net::registry_controlled_domains::HostHasRegistryControlledDomain(
          host_, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES))
    return false;

  // At this point the host could either be just a TLD ("com") or some unknown
  // TLD-like string ("notatld"). To disambiguate between them construct a
  // fake URL, and check the registry.
  //
  // If we recognized this TLD, then this is a pattern like *.com, and it
  // should imply all hosts.
  return net::registry_controlled_domains::HostHasRegistryControlledDomain(
      "notatld." + host_,
      net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
      net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
}

// Reports whether this pattern is limited to one origin: it does not imply all
// hosts, has a concrete scheme and does not match subdomains.
//
// Strictly speaking the port is part of an origin too, but URLPattern defaults
// it to "*" and it is not very interesting here, so it is left out.
bool URLPattern::MatchesSingleOrigin() const {
  return !(ImpliesAllHosts() || scheme_ == "*" || match_subdomains_);
}

// Reports whether the path |test| matches this pattern's (escaped) path.
bool URLPattern::MatchesPath(base::StringPiece test) const {
  // Keep OverlapsWith consistent with MatchesURL. This is needed so that
  // hosted apps matching e.g. 'google.com' also run on 'google.com/'.
  // The expression below is a no-copy way of testing
  // (test + "/*" == path_escaped_).
  const bool is_test_plus_slash_star =
      path_escaped_.length() == test.length() + 2 &&
      base::StartsWith(path_escaped_.c_str(), test,
                       base::CompareCase::SENSITIVE) &&
      base::EndsWith(path_escaped_, "/*", base::CompareCase::SENSITIVE);

  return is_test_plus_slash_star || base::MatchPattern(test, path_escaped_);
}

const std::string& URLPattern::GetAsString() const {
  if (!spec_.empty())
    return spec_;

  if (match_all_urls_) {
    spec_ = kAllUrlsPattern;
    return spec_;
  }

  bool standard_scheme = IsStandardScheme(scheme_);

  std::string spec =
      scheme_ + (standard_scheme ? url::kStandardSchemeSeparator : ":");

  if (scheme_ != url::kFileScheme && standard_scheme) {
    if (match_subdomains_) {
      spec += "*";
      if (!host_.empty())
        spec += ".";
    }

    if (!host_.empty())
      spec += host_;

    if (port_ != "*") {
      spec += ":";
      spec += port_;
    }
  }

  if (!path_.empty())
    spec += path_;

  spec_ = std::move(spec);
  return spec_;
}

// Reports whether this pattern and |other| can match a common URL. Each of
// scheme, host, port and path must overlap, checked in both directions. Paths
// are compared with a trailing wildcard stripped.
bool URLPattern::OverlapsWith(const URLPattern& other) const {
  if (match_all_urls() || other.match_all_urls())
    return true;

  const bool schemes_overlap = MatchesAnyScheme(other.GetExplicitSchemes()) ||
                               other.MatchesAnyScheme(GetExplicitSchemes());
  if (!schemes_overlap)
    return false;

  const bool hosts_overlap =
      MatchesHost(other.host()) || other.MatchesHost(host());
  if (!hosts_overlap)
    return false;

  const bool ports_overlap =
      MatchesPortPattern(other.port()) || other.MatchesPortPattern(port());
  if (!ports_overlap)
    return false;

  return MatchesPath(StripTrailingWildcard(other.path())) ||
         other.MatchesPath(StripTrailingWildcard(path()));
}

// Reports whether this pattern covers everything |other| covers: all of
// |other|'s explicit schemes, its host (including subdomain matching when
// |other| uses it), its port, and its path with a trailing wildcard stripped.
bool URLPattern::Contains(const URLPattern& other) const {
  if (match_all_urls())
    return true;

  if (!MatchesAllSchemes(other.GetExplicitSchemes()))
    return false;

  if (!MatchesHost(other.host()))
    return false;

  // A pattern without subdomain matching cannot contain one that has it.
  if (other.match_subdomains_ && !match_subdomains_)
    return false;

  if (!MatchesPortPattern(other.port()))
    return false;

  return MatchesPath(StripTrailingWildcard(other.path()));
}

// Reports whether at least one entry of |schemes| matches this pattern's
// scheme. Returns false for an empty list.
bool URLPattern::MatchesAnyScheme(
    const std::vector<std::string>& schemes) const {
  for (const std::string& candidate : schemes) {
    if (MatchesScheme(candidate))
      return true;
  }
  return false;
}

// Reports whether every entry of |schemes| matches this pattern's scheme.
// Returns true for an empty list.
bool URLPattern::MatchesAllSchemes(
    const std::vector<std::string>& schemes) const {
  for (const std::string& candidate : schemes) {
    if (!MatchesScheme(candidate))
      return false;
  }
  return true;
}

// Reports whether the host and port of |test| match this pattern. The host is
// not compared for file patterns; the port is the URL's effective port.
bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
  // Ignore hostname if scheme is file://.
  const bool host_ok = scheme_ == url::kFileScheme || MatchesHost(test);
  return host_ok &&
         MatchesPortPattern(base::IntToString(test.EffectiveIntPort()));
}

// Reports whether |port| matches this pattern's port. The wildcard port "*"
// matches anything; otherwise the strings must be equal.
bool URLPattern::MatchesPortPattern(base::StringPiece port) const {
  if (port_ == "*")
    return true;
  return port_ == port;
}

// Returns the concrete schemes this pattern can match. A pattern with a
// specific, permitted scheme yields just that scheme. Otherwise (wildcard
// scheme, match-all pattern, or a scheme the mask does not permit) the result
// is every entry of kValidSchemes that MatchesScheme() accepts.
std::vector<std::string> URLPattern::GetExplicitSchemes() const {
  const bool has_single_scheme =
      scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_);
  if (has_single_scheme)
    return std::vector<std::string>(1, scheme_);

  std::vector<std::string> schemes;
  for (const char* candidate : kValidSchemes) {
    if (MatchesScheme(candidate))
      schemes.push_back(candidate);
  }
  return schemes;
}

std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
  std::vector<std::string> explicit_schemes = GetExplicitSchemes();
  std::vector<URLPattern> result;

  for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
       i != explicit_schemes.end(); ++i) {
    URLPattern temp = *this;
    temp.SetScheme(*i);
    temp.SetMatchAllURLs(false);
    result.push_back(temp);
  }

  return result;
}

// static
// Returns the human-readable message for |parse_result|, looked up in
// kParseResultMessages. The value is used directly as the table index with no
// range check, so it must be a real ParseResult.
const char* URLPattern::GetParseResultString(
    URLPattern::ParseResult parse_result) {
  const char* const message = kParseResultMessages[parse_result];
  return message;
}