// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Functions to canonicalize non-special URLs.

#include "url/url_canon.h"
#include "url/url_canon_internal.h"

namespace url {

namespace {

// Canonicalizes a non-special URL that does not have an opaque path.
//
// Components are read from `source` using the ranges in `parsed`, the
// canonical text is appended to `output`, and the ranges of the canonical
// components are stored in `new_parsed`. Returns false when the scheme,
// userinfo, host, port or path canonicalizer reports a failure, or when an
// authority is present without a valid host. There are no early exits, so
// every component is still processed after a failure.
template <typename CHAR>
bool DoCanonicalizeNonSpecialUrl(const URLComponentSource<CHAR>& source,
                                 const Parsed& parsed,
                                 CharsetConverter* query_converter,
                                 CanonOutput& output,
                                 Parsed& new_parsed) {
  // This mirrors `DoCanonicalizeStandardURL()` in overall shape, but the two
  // differ in enough small ways that non-special URLs get their own function.
  //
  // url::ReplaceComponents() also funnels through here, which means an
  // invalid replacement has to be tolerated. For instance:
  //
  // > const url = "git:///";
  // > url.username = "x";
  // > url.href
  // "git:///" (and not "git://x@").

  GURL_DCHECK(!parsed.has_opaque_path);

  // Scheme. The scheme canonicalizer emits the trailing colon as well.
  bool ok =
      CanonicalizeScheme(parsed.scheme.maybe_as_string_view_on(source.scheme),
                         &output, &new_parsed.scheme);

  // Any valid authority component (even an empty one) counts.
  const bool has_authority =
      parsed.username.is_valid() || parsed.password.is_valid() ||
      parsed.host.is_valid() || parsed.port.is_valid();

  // Non-special URLs that need particular care:
  //
  // | URL      | parsed.user   | parsed.host   | has_authority | Valid URL? |
  // |----------+---------------+---------------+---------------+------------|
  // | git:/a   | invalid       | invalid       | false         | valid      |
  // | git://@/ | valid (empty) | invalid       | true          | invalid    |
  // | git:///  | invalid       | valid (empty) | true          | valid      |

  if (!has_authority) {
    // Nothing to emit; make sure no authority component is left set.
    new_parsed.host.reset();
    new_parsed.username.reset();
    new_parsed.password.reset();
    new_parsed.port.reset();
  } else {
    // The "//" separator is emitted only when there is a scheme.
    if (parsed.scheme.is_valid()) {
      output.push_back('/');
      output.push_back('/');
    }

    // Username and password. They are dropped unless the host is non-empty.
    //
    // URL Standard:
    // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
    // - https://url.spec.whatwg.org/#dom-url-username
    // - https://url.spec.whatwg.org/#dom-url-password
    if (!parsed.host.is_nonempty()) {
      new_parsed.username.reset();
      new_parsed.password.reset();
    } else {
      // The userinfo canonicalizer takes care of the ':' and '@' separators.
      const bool userinfo_ok = CanonicalizeUserInfo(
          parsed.username.maybe_as_string_view_on(source.username),
          parsed.password.maybe_as_string_view_on(source.password), &output,
          &new_parsed.username, &new_parsed.password);
      ok = ok && userinfo_ok;
    }

    // Host.
    if (!parsed.host.is_valid()) {
      // An authority without a valid host makes the URL invalid.
      // Example: "git://@/".
      new_parsed.host.reset();
      ok = false;
    } else {
      // The view starts at `source.host` and runs up to the end of the host
      // component; the host is known to be valid on this branch.
      const bool host_ok = CanonicalizeNonSpecialHost(
          std::basic_string_view<CHAR>(source.host, parsed.host.end()),
          parsed.host, output, new_parsed.host);
      ok = ok && host_ok;
    }

    // Port. Like the userinfo, it is dropped unless the host is non-empty.
    //
    // URL Standard:
    // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
    // - https://url.spec.whatwg.org/#dom-url-port
    if (!parsed.host.is_nonempty()) {
      new_parsed.port.reset();
    } else {
      const bool port_ok =
          CanonicalizePort(parsed.port.maybe_as_string_view_on(source.port),
                           PORT_UNSPECIFIED, &output, &new_parsed.port);
      ok = ok && port_ok;
    }
  }

  // Path.
  if (!parsed.path.is_valid()) {
    new_parsed.path.reset();
  } else if (!parsed.host.is_valid() && parsed.path.is_empty()) {
    // Edge case: the pathname of a path-only non-special URL was replaced
    // with an empty string. Such a URL cannot lose its path entirely.
    //
    // Example:
    //
    // > const url = new URL("git:/a");
    // > url.pathname = '';
    // > url.href
    // => Should be "git:/", not "git:".
    // > url.pathname
    // => Should be "/", not "".
    //
    // The relevant part of the URL Standard is
    // https://url.spec.whatwg.org/#dom-url-pathname, although seeing why
    // url.pathname ends up as "/" takes a careful reading of it.
    new_parsed.path.begin = output.length();
    output.push_back('/');
    new_parsed.path.len = output.length() - new_parsed.path.begin;
  } else {
    const bool path_ok =
        CanonicalizePath(parsed.path.as_string_view_on(source.path),
                         CanonMode::kNonSpecialURL, &output, &new_parsed.path);
    ok = ok && path_ok;

    const bool needs_dot_prefix =
        !parsed.host.is_valid() && new_parsed.path.is_valid() &&
        new_parsed.path.as_string_view_on(output.view().data())
            .starts_with("//");
    if (needs_dot_prefix) {
      // Without a host, a canonical path starting with "//" would be read
      // back as an authority, so "/." is inserted in front of the path.
      //
      // Examples:
      //
      // > const url = new URL("git:/.//a");
      // > url.href
      // => Should be "git:/.//a", not "git://a".
      //
      // > const url = new URL("git:/");
      // > url.pathname = "/.//a"
      // > url.href
      // => Should be "git:/.//a", not "git://a".
      //
      // URL Standard: https://url.spec.whatwg.org/#concept-url-serializer
      //
      // > 3. If url's host is null, url does not have an opaque path, url's
      // > path's size is greater than 1, and url's path[0] is the empty
      // > string, then append U+002F (/) followed by U+002E (.) to output.
      //
      // The canonical path is not known before it has been written, hence
      // the fix-up after the fact. This should be rare enough that the extra
      // insertion does not matter for performance.
      const size_t length_before_insert = output.length();
      output.Insert(new_parsed.path.begin, "/.");
      // Move the start of the path forward by the number of characters
      // inserted; the recorded path length is left as it was.
      new_parsed.path.begin += output.length() - length_before_insert;
    }
  }

  // Query.
  CanonicalizeQuery(parsed.query.maybe_as_string_view_on(source.query),
                    query_converter, &output, &new_parsed.query);

  // Ref. A failure here is deliberately ignored, since the page can probably
  // still be loaded.
  CanonicalizeRef(parsed.ref.maybe_as_string_view_on(source.ref), &output,
                  &new_parsed.ref);

  // Propagate the "potentially dangling markup" flag; it is only ever set
  // here, never cleared.
  if (parsed.potentially_dangling_markup) {
    new_parsed.potentially_dangling_markup = true;
  }

  return ok;
}

}  // namespace

// Canonicalizes the 8-bit non-special URL `spec`, whose components are
// described by `parsed`. `parsed.has_opaque_path` is copied to `new_parsed`
// and selects the code path: CanonicalizePathUrl() for opaque paths,
// DoCanonicalizeNonSpecialUrl() otherwise. Returns the result of whichever
// one was called.
bool CanonicalizeNonSpecialUrl(std::string_view spec,
                               const Parsed& parsed,
                               CharsetConverter* query_converter,
                               CanonOutput& output,
                               Parsed& new_parsed) {
  const bool opaque_path = parsed.has_opaque_path;
  new_parsed.has_opaque_path = opaque_path;

  if (!opaque_path) {
    return DoCanonicalizeNonSpecialUrl(URLComponentSource(spec.data()), parsed,
                                       query_converter, output, new_parsed);
  }
  return CanonicalizePathUrl(spec, parsed, &output, &new_parsed);
}

// Canonicalizes the 16-bit non-special URL `spec`, whose components are
// described by `parsed`. `parsed.has_opaque_path` is copied to `new_parsed`
// and selects the code path: CanonicalizePathUrl() for opaque paths,
// DoCanonicalizeNonSpecialUrl() otherwise. Returns the result of whichever
// one was called.
bool CanonicalizeNonSpecialUrl(std::u16string_view spec,
                               const Parsed& parsed,
                               CharsetConverter* query_converter,
                               CanonOutput& output,
                               Parsed& new_parsed) {
  const bool opaque_path = parsed.has_opaque_path;
  new_parsed.has_opaque_path = opaque_path;

  if (!opaque_path) {
    return DoCanonicalizeNonSpecialUrl(URLComponentSource(spec.data()), parsed,
                                       query_converter, output, new_parsed);
  }
  return CanonicalizePathUrl(spec, parsed, &output, &new_parsed);
}

bool ReplaceNonSpecialUrl(std::string_view base,
                          const Parsed& base_parsed,
                          const Replacements<char>& replacements,
                          CharsetConverter* query_converter,
                          CanonOutput& output,
                          Parsed& new_parsed) {
  // Carry over the flag.
  new_parsed.has_opaque_path = base_parsed.has_opaque_path;

  if (base_parsed.has_opaque_path) {
    return ReplacePathUrl(base, base_parsed, replacements, &output,
                          &new_parsed);
  }

  URLComponentSource<char> source(base.data());
  Parsed parsed(base_parsed);
  SetupOverrideComponents(base.data(), replacements, &source, &parsed);
  return DoCanonicalizeNonSpecialUrl(source, parsed, query_converter, output,
                                     new_parsed);
}

// For 16-bit replacements, we turn all the replacements into UTF-8 so the
// regular code path can be used.
bool ReplaceNonSpecialUrl(std::string_view base,
                          const Parsed& base_parsed,
                          const Replacements<char16_t>& replacements,
                          CharsetConverter* query_converter,
                          CanonOutput& output,
                          Parsed& new_parsed) {
  // Carry over the flag.
  new_parsed.has_opaque_path = base_parsed.has_opaque_path;

  if (base_parsed.has_opaque_path) {
    return ReplacePathUrl(base, base_parsed, replacements, &output,
                          &new_parsed);
  }

  RawCanonOutput<1024> utf8;
  URLComponentSource<char> source(base.data());
  Parsed parsed(base_parsed);
  SetupUTF16OverrideComponents(base.data(), replacements, &utf8, &source,
                               &parsed);
  return DoCanonicalizeNonSpecialUrl(source, parsed, query_converter, output,
                                     new_parsed);
}

}  // namespace url
