Update googleurl to the latest Chromium version
This updates googleurl to the Chromium upstream version
8948596515b3eaf06c0e3db256c9513b5dca52d8 from Tue Dec 13 16:51:45 2022 +0000
Change-Id: I16d52bfff8ebf049358821c8c8c0b11c377e6268
diff --git a/AUTHORS b/AUTHORS
index b32867a..0e64013 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -17,6 +17,7 @@
Aaron Randolph <aaron.randolph@gmail.com>
Aaryaman Vasishta <jem456.vasishta@gmail.com>
Abdu Ameen <abdu.ameen000@gmail.com>
+Abdullah Abu Tasneem <a.tasneem@samsung.com>
Abhijeet Kandalkar <abhijeet.k@samsung.com>
Abhinav Vij <abhinav.vij@samsung.com>
Abhishek Agarwal <abhishek.a21@samsung.com>
@@ -82,6 +83,7 @@
Alvaro Silva <alvaro.fagner@gmail.com>
Ambarish Rapte <ambarish.r@samsung.com>
Amey Jahagirdar <jahagird@amazon.com>
+Amit Paul <a.paul@samsung.com>
Amit Sarkar <amit.srkr@samsung.com>
Amogh Bihani <amogh.bihani@samsung.com>
Amos Lim <amoseui@gmail.com>
@@ -126,6 +128,7 @@
Anuj Kumar Sharma <anujk.sharma@samsung.com>
Ao Sun <ntusunao@gmail.com>
Ao Wang <wangao.james@bytedance.com>
+Aquibuzzaman Md. Sayem <md.sayem@samsung.com>
Arjun Karthik <arjunkar@amazon.com>
Arman Ghotb <armanghotb@gmail.com>
Armin Burgmeier <aburgmeier@bloomberg.net>
@@ -141,6 +144,7 @@
Arunoday Sarkar <a.sarkar.arun@gmail.com>
Arunprasad Rajkumar <ararunprasad@gmail.com>
Arunprasad Rajkumar <arurajku@cisco.com>
+Arup Barua <arup.barua@samsung.com>
Asami Doi <d0iasm.pub@gmail.com>
Ashish Kumar Gupta <guptaag@amazon.com>
Ashlin Joseph <ashlin.j@samsung.com>
@@ -170,12 +174,14 @@
Bhanukrushana Rout <b.rout@samsung.com>
Biljith Jayan <billy.jayan@samsung.com>
Bin Liao <bin.liao@intel.com>
+Bin Miao <bin.miao@intel.com>
Boaz Sender <boaz@bocoup.com>
Bobby Powers <bobbypowers@gmail.com>
Branden Archer <bma4@zips.uakron.edu>
Brendan Kirby <brendan.kirby@imgtec.com>
Brendan Long <self@brendanlong.com>
Brendon Tiszka <btiszka@gmail.com>
+Brett Lewis <brettlewis@brettlewis.us>
Brian Clifton <clifton@brave.com>
Brian Dunn <brian@theophil.us>
Brian G. Merrell <bgmerrell@gmail.com>
@@ -228,6 +234,7 @@
Cheng Yu <yuzichengcode@gmail.com>
Cheung Ho <uioptt24@gmail.com>
Choongwoo Han <cwhan.tunz@gmail.com>
+Choudhury M. Shamsujjoha <choudhury.s@samsung.com>
Chris Greene <cwgreene@amazon.com>
Chris Harrelson <chrishtr@gmail.com>
Chris Nardi <hichris123@gmail.com>
@@ -316,6 +323,7 @@
Dominic Farolino <domfarolino@gmail.com>
Dominic Jodoin <dominic.jodoin@gmail.com>
Dominik Röttsches <dominik.rottsches@intel.com>
+Dominik Schütz <do.sch.dev@gmail.com>
Don Woodward <woodward@adobe.com>
Donghee Na <corona10@gmail.com>
Dong-hee Na <donghee.na92@gmail.com>
@@ -432,6 +440,7 @@
Hari Singh <hari.singh1@samsung.com>
Harpreet Singh Khurana <harpreet.sk@samsung.com>
Harshikesh Kumar <harshikeshnobug@gmail.com>
+Harshit Pal <harshitp12345@gmail.com>
Hassan Salehe Matar <hassansalehe@gmail.com>
Hautio Kari <khautio@gmail.com>
Heejin R. Chung <heejin.r.chung@samsung.com>
@@ -460,6 +469,8 @@
Hyemi Shin <hyemi.sin@samsung.com>
HyeockJin Kim <kherootz@gmail.com>
Hyojeong Kim <42.4.hyojekim@gmail.com>
+Hyomin Kim <ajtwlsalsdl0@gmail.com>
+Hyomin Kim <hyoputer.kim@samsung.com>
Hyungchan Kim <inlinechan@gmail.com>
Hyungun Kim <khw3754@gmail.com>
Hyungwook Lee <hyungwook.lee@navercorp.com>
@@ -497,6 +508,7 @@
Jaemin Seo <jaemin86.seo@samsung.com>
Jaeseok Yoon <yjaeseok@gmail.com>
Jaewon Choi <jaewon.james.choi@gmail.com>
+Jaewon Jung <jw.jung@navercorp.com>
Jaeyong Bae <jdragon.bae@gmail.com>
Jagdish Chourasia <jagdish.c@samsung.com>
Jaime Soriano Pastor <jsorianopastor@gmail.com>
@@ -545,6 +557,7 @@
Jesus Sanchez-Palencia <jesus.sanchez-palencia.fernandez.fil@intel.com>
Jiadong Chen <chenjiadong@huawei.com>
Jiadong Zhu <jiadong.zhu@linaro.org>
+Jiahao Lu <lujjjh@gmail.com>
Jiahe Zhang <jiahe.zhang@intel.com>
Jiajia Qin <jiajia.qin@intel.com>
Jiajie Hu <jiajie.hu@intel.com>
@@ -811,6 +824,7 @@
Md. Hasanur Rashid <hasanur.r@samsung.com>
Md Jobed Hossain <jobed.h@samsung.com>
Md Raiyan bin Sayeed <mrbsayee@uwaterloo.ca>
+Md. Sadiqul Amin <sadiqul.amin@samsung.com>
Md Sami Uddin <md.sami@samsung.com>
Micha Hanselmann <micha.hanselmann@gmail.com>
Michael Cirone <mikecirone@gmail.com>
@@ -1241,6 +1255,7 @@
U. Artie Eoff <ullysses.a.eoff@intel.com>
Umar Hansa <umar.hansa@gmail.com>
Upendra Gowda <upendrag.gowda@gmail.com>
+Utzcoz <utzcoz@gmail.com>
UwU UwU <uwu7586@gmail.com>
Uzair Jaleel <uzair.jaleel@samsung.com>
Vadim Gorbachev <bmsdave@gmail.com>
@@ -1443,6 +1458,7 @@
Rakuten Kobo Inc. <*@kobo.com>
Rakuten Kobo Inc. <*@rakuten.com>
Red Hat Inc. <*@redhat.com>
+Sajeesh Sidharthan <sajeesh.sidharthan@amd.corp-partner.google.com>
Semihalf <*@semihalf.com>
Seznam.cz, a.s. <*@firma.seznam.cz>
Slack Technologies Inc. <*@slack-corp.com>
diff --git a/base/BUILD b/base/BUILD
index a1410fb..5f36123 100644
--- a/base/BUILD
+++ b/base/BUILD
@@ -18,6 +18,7 @@
"//conditions:default": [],
}),
hdrs = [
+ "bits.h",
"compiler_specific.h",
"containers/checked_iterators.h",
"containers/contains.h",
@@ -41,6 +42,7 @@
"numerics/safe_conversions.h",
"numerics/safe_conversions_arm_impl.h",
"numerics/safe_conversions_impl.h",
+ "numerics/safe_math.h",
"numerics/safe_math_arm_impl.h",
"numerics/safe_math_clang_gcc_impl.h",
"numerics/safe_math_shared_impl.h",
diff --git a/base/bits.h b/base/bits.h
new file mode 100644
index 0000000..ea011ad
--- /dev/null
+++ b/base/bits.h
@@ -0,0 +1,143 @@
+// Copyright 2013 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This file defines some bit utilities.
+
+#ifndef BASE_BITS_H_
+#define BASE_BITS_H_
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "polyfills/base/check.h"
+#include "base/compiler_specific.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+namespace bits {
+
+// Returns true iff |value| is a power of 2.
+//
+// TODO(pkasting): When C++20 is available, replace with std::has_single_bit().
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+constexpr bool IsPowerOfTwo(T value) {
+ // From "Hacker's Delight": Section 2.1 Manipulating Rightmost Bits.
+ //
+ // Only positive integers with a single bit set are powers of two. If only one
+ // bit is set in x (e.g. 0b00000100000000) then |x-1| will have that bit set
+ // to zero and all bits to its right set to 1 (e.g. 0b00000011111111). Hence
+ // |x & (x-1)| is 0 iff x is a power of two.
+ return value > 0 && (value & (value - 1)) == 0;
+}
+
+// Round down |size| to a multiple of alignment, which must be a power of two.
+template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+constexpr T AlignDown(T size, T alignment) {
+ GURL_DCHECK(IsPowerOfTwo(alignment));
+ return size & ~(alignment - 1);
+}
+
+// Move |ptr| back to the previous multiple of alignment, which must be a power
+// of two. Defined for types where sizeof(T) is one byte.
+template <typename T, typename = typename std::enable_if<sizeof(T) == 1>::type>
+inline T* AlignDown(T* ptr, uintptr_t alignment) {
+ return reinterpret_cast<T*>(
+ AlignDown(reinterpret_cast<uintptr_t>(ptr), alignment));
+}
+
+// Round up |size| to a multiple of alignment, which must be a power of two.
+template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+constexpr T AlignUp(T size, T alignment) {
+ GURL_DCHECK(IsPowerOfTwo(alignment));
+ return (size + alignment - 1) & ~(alignment - 1);
+}
+
+// Advance |ptr| to the next multiple of alignment, which must be a power of
+// two. Defined for types where sizeof(T) is one byte.
+template <typename T, typename = typename std::enable_if<sizeof(T) == 1>::type>
+inline T* AlignUp(T* ptr, uintptr_t alignment) {
+ return reinterpret_cast<T*>(
+ AlignUp(reinterpret_cast<uintptr_t>(ptr), alignment));
+}
+
+// CountLeadingZeroBits(value) returns the number of zero bits preceding the
+// most significant 1 bit in |value| if |value| is non-zero, otherwise it
+// returns {sizeof(T) * 8}.
+// Example: 00100010 -> 2
+//
+// CountTrailingZeroBits(value) returns the number of zero bits following the
+// least significant 1 bit in |value| if |value| is non-zero, otherwise it
+// returns {sizeof(T) * 8}.
+// Example: 00100010 -> 1
+//
+// C does not have an operator to do this, but fortunately the various
+// compilers have built-ins that map to fast underlying processor instructions.
+//
+// TODO(pkasting): When C++20 is available, replace with std::countl_zero() and
+// similar.
+
+// __builtin_clz has undefined behaviour for an input of 0, even though there's
+// clearly a return value that makes sense, and even though some processor clz
+// instructions have defined behaviour for 0. We could drop to raw __asm__ to
+// do better, but we'll avoid doing that unless we see proof that we need to.
+template <typename T, int bits = sizeof(T) * 8>
+ALWAYS_INLINE constexpr
+ typename std::enable_if<std::is_unsigned<T>::value && sizeof(T) <= 8,
+ int>::type
+ CountLeadingZeroBits(T value) {
+ static_assert(bits > 0, "invalid instantiation");
+ return LIKELY(value)
+ ? bits == 64
+ ? __builtin_clzll(static_cast<uint64_t>(value))
+ : __builtin_clz(static_cast<uint32_t>(value)) - (32 - bits)
+ : bits;
+}
+
+template <typename T, int bits = sizeof(T) * 8>
+ALWAYS_INLINE constexpr
+ typename std::enable_if<std::is_unsigned<T>::value && sizeof(T) <= 8,
+ int>::type
+ CountTrailingZeroBits(T value) {
+ return LIKELY(value) ? bits == 64
+ ? __builtin_ctzll(static_cast<uint64_t>(value))
+ : __builtin_ctz(static_cast<uint32_t>(value))
+ : bits;
+}
+
+// Returns the integer i such that 2^i <= n < 2^(i+1), or -1 if n == 0.
+//
+// There is a common `BitLength` function, which returns the number of bits
+// required to represent a value. Rather than implement that function,
+// use `Log2Floor` and add 1 to the result.
+//
+// TODO(pkasting): When C++20 is available, replace with std::bit_xxx().
+constexpr int Log2Floor(uint32_t n) {
+ return 31 - CountLeadingZeroBits(n);
+}
+
+// Returns the integer i such that 2^(i-1) < n <= 2^i.
+constexpr int Log2Ceiling(uint32_t n) {
+ // When n == 0, we want the function to return -1.
+ // When n == 0, (n - 1) will underflow to 0xFFFFFFFF, which is
+ // why the statement below starts with (n ? 32 : -1).
+ return (n ? 32 : -1) - CountLeadingZeroBits(n - 1);
+}
+
+// Returns a value of type T with a single bit set in the left-most position.
+// Can be used instead of manually shifting a 1 to the left.
+template <typename T>
+constexpr T LeftmostBit() {
+ static_assert(std::is_integral<T>::value,
+ "This function can only be used with integral types.");
+ T one(1u);
+ return one << ((CHAR_BIT * sizeof(T) - 1));
+}
+
+} // namespace bits
+} // namespace gurl_base
+
+#endif // BASE_BITS_H_
diff --git a/base/numerics/checked_math.h b/base/numerics/checked_math.h
index 4973f09..0e6ad4f 100644
--- a/base/numerics/checked_math.h
+++ b/base/numerics/checked_math.h
@@ -41,11 +41,10 @@
// This is not an explicit constructor because we implicitly upgrade regular
// numerics to CheckedNumerics to make them easier to use.
- template <typename Src>
+ template <typename Src,
+ typename = std::enable_if_t<std::is_arithmetic<Src>::value>>
// NOLINTNEXTLINE(google-explicit-constructor)
- constexpr CheckedNumeric(Src value) : state_(value) {
- static_assert(UnderlyingType<Src>::is_numeric, "Argument must be numeric.");
- }
+ constexpr CheckedNumeric(Src value) : state_(value) {}
// This is not an explicit constructor because we want a seamless conversion
// from StrictNumeric types.
diff --git a/base/numerics/safe_conversions.h b/base/numerics/safe_conversions.h
index 4a9494e..3e04bf4 100644
--- a/base/numerics/safe_conversions.h
+++ b/base/numerics/safe_conversions.h
@@ -20,10 +20,6 @@
#define BASE_HAS_OPTIMIZED_SAFE_CONVERSIONS (0)
#endif
-#if !BASE_NUMERICS_DISABLE_OSTREAM_OPERATORS
-#include <ostream>
-#endif
-
namespace gurl_base {
namespace internal {
diff --git a/base/strings/escape.cc b/base/strings/escape.cc
index d855c1c..867e04b 100644
--- a/base/strings/escape.cc
+++ b/base/strings/escape.cc
@@ -7,7 +7,7 @@
#include <ostream>
#include "polyfills/base/check_op.h"
-#include "base/feature_list.h"
+#include "polyfills/base/feature_list.h"
#include "base/features.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
diff --git a/base/strings/string_piece.h b/base/strings/string_piece.h
index a1db548..fe524df 100644
--- a/base/strings/string_piece.h
+++ b/base/strings/string_piece.h
@@ -34,6 +34,7 @@
#include "polyfills/base/check_op.h"
#include "base/compiler_specific.h"
#include "base/cxx20_is_constant_evaluated.h"
+#include "base/numerics/safe_math.h"
#include "base/strings/string_piece_forward.h" // IWYU pragma: export
#include "build/build_config.h"
@@ -117,8 +118,9 @@
constexpr BasicStringPiece(const BasicStringPiece& other) noexcept = default;
constexpr BasicStringPiece& operator=(const BasicStringPiece& view) noexcept =
default;
- constexpr BasicStringPiece(const CharT* s, size_type count)
- : ptr_(s), length_(count) {}
+ constexpr BasicStringPiece(const CharT* s, CheckedNumeric<size_t> count)
+ : ptr_(s), length_(count.ValueOrDie()) {}
+ // NOLINTNEXTLINE(google-explicit-constructor)
constexpr BasicStringPiece(const CharT* s)
: ptr_(s), length_(s ? traits_type::length(s) : 0) {
// Intentional STL deviation: Null-check instead of UB.
diff --git a/base/strings/string_piece_rust.h b/base/strings/string_piece_rust.h
index 0d89aa4..ff08c34 100644
--- a/base/strings/string_piece_rust.h
+++ b/base/strings/string_piece_rust.h
@@ -5,6 +5,10 @@
#ifndef BASE_STRINGS_STRING_PIECE_RUST_H_
#define BASE_STRINGS_STRING_PIECE_RUST_H_
+#include "build/rust/rust_buildflags.h"
+
+#if BUILDFLAG(TOOLCHAIN_HAS_RUST)
+
#include <stdint.h>
#include "base/strings/string_piece.h"
@@ -35,4 +39,6 @@
} // namespace base
+#endif // BUILDFLAG(TOOLCHAIN_HAS_RUST)
+
#endif // BASE_STRINGS_STRING_PIECE_RUST_H_
diff --git a/base/strings/string_piece_rust_unittest.cc b/base/strings/string_piece_rust_unittest.cc
index 38d50d4..2f8db6c 100644
--- a/base/strings/string_piece_rust_unittest.cc
+++ b/base/strings/string_piece_rust_unittest.cc
@@ -3,9 +3,12 @@
// found in the LICENSE file.
#include "base/strings/string_piece_rust.h"
+#include "build/rust/rust_buildflags.h"
#include "testing/gtest/include/gtest/gtest.h"
+#if BUILDFLAG(TOOLCHAIN_HAS_RUST)
+
namespace gurl_base {
namespace {
@@ -28,3 +31,5 @@
} // namespace
} // namespace base
+
+#endif // BUILDFLAG(TOOLCHAIN_HAS_RUST)
diff --git a/base/strings/string_piece_unittest.cc b/base/strings/string_piece_unittest.cc
index fb1be66..9cd9b3f 100644
--- a/base/strings/string_piece_unittest.cc
+++ b/base/strings/string_piece_unittest.cc
@@ -720,6 +720,11 @@
}
}
+TEST(StringPieceTest, InvalidLengthDeath) {
+ int length = -1;
+ ASSERT_DEATH_IF_SUPPORTED({ StringPiece piece("hello", length); }, "");
+}
+
TEST(StringPieceTest, ConstexprData) {
{
constexpr StringPiece piece;
diff --git a/base/strings/string_util.h b/base/strings/string_util.h
index b7bee67..48d3fac 100644
--- a/base/strings/string_util.h
+++ b/base/strings/string_util.h
@@ -135,16 +135,21 @@
return (c >= 'a' && c <= 'z') ? static_cast<CharT>(c + 'A' - 'a') : c;
}
-// Converts the given string to it's ASCII-lowercase equivalent.
+// Converts the given string to its ASCII-lowercase equivalent. Non-ASCII
+// bytes (or UTF-16 code units in `StringPiece16`) are permitted but will be
+// unmodified.
BASE_EXPORT std::string ToLowerASCII(StringPiece str);
BASE_EXPORT std::u16string ToLowerASCII(StringPiece16 str);
-// Converts the given string to it's ASCII-uppercase equivalent.
+// Converts the given string to its ASCII-uppercase equivalent. Non-ASCII
+// bytes (or UTF-16 code units in `StringPiece16`) are permitted but will be
+// unmodified.
BASE_EXPORT std::string ToUpperASCII(StringPiece str);
BASE_EXPORT std::u16string ToUpperASCII(StringPiece16 str);
-// Functor for case-insensitive ASCII comparisons for STL algorithms like
-// std::search.
+// Functor for ASCII case-insensitive comparisons for STL algorithms like
+// std::search. Non-ASCII bytes (or UTF-16 code units in `StringPiece16`) are
+// permitted but will be compared as-is.
//
// Note that a full Unicode version of this functor is not possible to write
// because case mappings might change the number of characters, depend on
@@ -158,13 +163,17 @@
}
};
-// Like strcasecmp for case-insensitive ASCII characters only. Returns:
+// Like strcasecmp for ASCII case-insensitive comparisons only. Returns:
// -1 (a < b)
// 0 (a == b)
// 1 (a > b)
-// (unlike strcasecmp which can return values greater or less than 1/-1). For
-// full Unicode support, use gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase
-// and then just call the normal string operators on the result.
+// (unlike strcasecmp which can return values greater or less than 1/-1). To
+// compare all Unicode code points case-insensitively, use gurl_base::i18n::ToLower
+// or gurl_base::i18n::FoldCase and then just call the normal string operators on the
+// result.
+//
+// Non-ASCII bytes (or UTF-16 code units in `StringPiece16`) are permitted but
+// will be compared unmodified.
BASE_EXPORT constexpr int CompareCaseInsensitiveASCII(StringPiece a,
StringPiece b) {
return internal::CompareCaseInsensitiveASCIIT(a, b);
@@ -174,9 +183,11 @@
return internal::CompareCaseInsensitiveASCIIT(a, b);
}
-// Equality for ASCII case-insensitive comparisons. For full Unicode support,
-// use gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase and then compare with either
-// == or !=.
+// Equality for ASCII case-insensitive comparisons. Non-ASCII bytes (or UTF-16
+// code units in `StringPiece16`) are permitted but will be compared unmodified.
+// To compare all Unicode code points case-insensitively, use
+// gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase and then compare with either ==
+// or !=.
inline bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) {
return internal::EqualsCaseInsensitiveASCIIT(a, b);
}
@@ -215,6 +226,9 @@
kWhitespaceNoCrLfUTF16[]; // Unicode w/o CR/LF.
BASE_EXPORT extern const char kWhitespaceASCII[];
BASE_EXPORT extern const char16_t kWhitespaceASCIIAs16[]; // No unicode.
+
+// https://infra.spec.whatwg.org/#ascii-whitespace
+BASE_EXPORT extern const char kInfraAsciiWhitespace[];
// Null-terminated string representing the UTF-8 byte order mark.
BASE_EXPORT extern const char kUtf8ByteOrderMark[];
diff --git a/base/strings/string_util_constants.cc b/base/strings/string_util_constants.cc
index fece0af..12a3c5e 100644
--- a/base/strings/string_util_constants.cc
+++ b/base/strings/string_util_constants.cc
@@ -49,6 +49,8 @@
const char kWhitespaceASCII[] = {WHITESPACE_ASCII, 0};
const char16_t kWhitespaceASCIIAs16[] = {WHITESPACE_ASCII, 0};
+const char kInfraAsciiWhitespace[] = {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0};
+
const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
} // namespace base
diff --git a/base/strings/string_util_impl_helpers.h b/base/strings/string_util_impl_helpers.h
index 970a912..4cd9a3b 100644
--- a/base/strings/string_util_impl_helpers.h
+++ b/base/strings/string_util_impl_helpers.h
@@ -228,10 +228,6 @@
case CompareCase::INSENSITIVE_ASCII:
return std::equal(search_for.begin(), search_for.end(), source.begin(),
CaseInsensitiveCompareASCII<CharT>());
-
- default:
- GURL_NOTREACHED();
- return false;
}
}
@@ -250,10 +246,6 @@
case CompareCase::INSENSITIVE_ASCII:
return std::equal(source.begin(), source.end(), search_for.begin(),
CaseInsensitiveCompareASCII<CharT>());
-
- default:
- GURL_NOTREACHED();
- return false;
}
}
diff --git a/base/strings/string_util_internal.h b/base/strings/string_util_internal.h
index 3a493dd..b05cb7a 100644
--- a/base/strings/string_util_internal.h
+++ b/base/strings/string_util_internal.h
@@ -5,6 +5,8 @@
#ifndef BASE_STRINGS_STRING_UTIL_INTERNAL_H_
#define BASE_STRINGS_STRING_UTIL_INTERNAL_H_
+#include <type_traits>
+
#include "base/ranges/algorithm.h"
#include "base/strings/string_piece.h"
@@ -18,15 +20,18 @@
return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
}
-template <typename T, typename CharT = typename T::value_type>
+template <typename T>
constexpr int CompareCaseInsensitiveASCIIT(T a, T b) {
// Find the first characters that aren't equal and compare them. If the end
// of one of the strings is found before a nonequal character, the lengths
- // of the strings are compared.
+ // of the strings are compared. Compare using the unsigned type so the sort
+ // order is independent of the signedness of `char`.
+ static_assert(std::is_integral_v<typename T::value_type>);
+ using UCharT = std::make_unsigned_t<typename T::value_type>;
size_t i = 0;
while (i < a.length() && i < b.length()) {
- CharT lower_a = ToLowerASCII(a[i]);
- CharT lower_b = ToLowerASCII(b[i]);
+ UCharT lower_a = static_cast<UCharT>(ToLowerASCII(a[i]));
+ UCharT lower_b = static_cast<UCharT>(ToLowerASCII(b[i]));
if (lower_a < lower_b)
return -1;
if (lower_a > lower_b)
diff --git a/base/strings/string_util_unittest.cc b/base/strings/string_util_unittest.cc
index 4109f8e..e15358f 100644
--- a/base/strings/string_util_unittest.cc
+++ b/base/strings/string_util_unittest.cc
@@ -648,6 +648,11 @@
EXPECT_EQ("cc2", ToLowerASCII("Cc2"));
EXPECT_EQ(u"cc2", ToLowerASCII(u"Cc2"));
+
+ // Non-ASCII characters are unmodified. U+00C4 is LATIN CAPITAL LETTER A WITH
+ // DIAERESIS.
+ EXPECT_EQ('\xc4', ToLowerASCII('\xc4'));
+ EXPECT_EQ(u'\x00c4', ToLowerASCII(u'\x00c4'));
}
TEST(StringUtilTest, ToUpperASCII) {
@@ -661,6 +666,11 @@
EXPECT_EQ("CC2", ToUpperASCII("Cc2"));
EXPECT_EQ(u"CC2", ToUpperASCII(u"Cc2"));
+
+ // Non-ASCII characters are unmodified. U+00E4 is LATIN SMALL LETTER A WITH
+ // DIAERESIS.
+ EXPECT_EQ('\xe4', ToUpperASCII('\xe4'));
+ EXPECT_EQ(u'\x00e4', ToUpperASCII(u'\x00e4'));
}
TEST(StringUtilTest, FormatBytesUnlocalized) {
@@ -1475,6 +1485,15 @@
EXPECT_EQ(-1, CompareCaseInsensitiveASCII("AsdfA", "aSDfb"));
EXPECT_EQ(1, CompareCaseInsensitiveASCII("Asdfb", "aSDfA"));
+ // Non-ASCII bytes are permitted, but they will be compared case-sensitively.
+ EXPECT_EQ(0, CompareCaseInsensitiveASCII("aaa \xc3\xa4", "AAA \xc3\xa4"));
+ EXPECT_EQ(-1, CompareCaseInsensitiveASCII("AAA \xc3\x84", "aaa \xc3\xa4"));
+ EXPECT_EQ(1, CompareCaseInsensitiveASCII("aaa \xc3\xa4", "AAA \xc3\x84"));
+
+ // ASCII bytes should sort before non-ASCII ones.
+ EXPECT_EQ(-1, CompareCaseInsensitiveASCII("a", "\xc3\xa4"));
+ EXPECT_EQ(1, CompareCaseInsensitiveASCII("\xc3\xa4", "a"));
+
// For constexpr.
static_assert(CompareCaseInsensitiveASCII("", "") == 0);
static_assert(CompareCaseInsensitiveASCII("Asdf", "aSDf") == 0);
@@ -1482,6 +1501,14 @@
static_assert(CompareCaseInsensitiveASCII("AsdfA", "aSDf") == 1);
static_assert(CompareCaseInsensitiveASCII("AsdfA", "aSDfb") == -1);
static_assert(CompareCaseInsensitiveASCII("Asdfb", "aSDfA") == 1);
+ static_assert(CompareCaseInsensitiveASCII("aaa \xc3\xa4", "AAA \xc3\xa4") ==
+ 0);
+ static_assert(CompareCaseInsensitiveASCII("AAA \xc3\x84", "aaa \xc3\xa4") ==
+ -1);
+ static_assert(CompareCaseInsensitiveASCII("aaa \xc3\xa4", "AAA \xc3\x84") ==
+ 1);
+ static_assert(CompareCaseInsensitiveASCII("a", "\xc3\xa4") == -1);
+ static_assert(CompareCaseInsensitiveASCII("\xc3\xa4", "a") == 1);
}
TEST(StringUtilTest, EqualsCaseInsensitiveASCII) {
@@ -1505,6 +1532,10 @@
EXPECT_FALSE(EqualsCaseInsensitiveASCII("bsdf", u"aSDF"));
EXPECT_FALSE(EqualsCaseInsensitiveASCII("Asdf", u"aSDFz"));
+ // Non-ASCII bytes are permitted, but they will be compared case-sensitively.
+ EXPECT_TRUE(EqualsCaseInsensitiveASCII("aaa \xc3\xa4", "AAA \xc3\xa4"));
+ EXPECT_FALSE(EqualsCaseInsensitiveASCII("aaa \xc3\x84", "AAA \xc3\xa4"));
+
// The `WStringPiece` overloads are only defined on Windows.
#if BUILDFLAG(IS_WIN)
EXPECT_TRUE(EqualsCaseInsensitiveASCII(L"", L""));
diff --git a/base/strings/sys_string_conversions_win.cc b/base/strings/sys_string_conversions_win.cc
index da19245..50b7c76 100644
--- a/base/strings/sys_string_conversions_win.cc
+++ b/base/strings/sys_string_conversions_win.cc
@@ -5,6 +5,7 @@
#include "base/strings/sys_string_conversions.h"
#include <windows.h>
+
#include <stdint.h>
#include "base/strings/string_piece.h"
diff --git a/copy.bara.sky b/copy.bara.sky
index 89f70ce..a586cea 100644
--- a/copy.bara.sky
+++ b/copy.bara.sky
@@ -12,6 +12,7 @@
include = [
"AUTHORS",
"LICENSE",
+ "base/bits.h",
"base/compiler_specific.h",
"base/containers/checked_iterators.h",
"base/containers/contains.h",
@@ -77,6 +78,7 @@
#"base/dcheck_is_on.h",
"base/debug/alias.h",
"base/export_template.h",
+ "base/feature_list.h",
"base/logging.h",
"base/memory/raw_ptr.h",
"base/notreached.h",
diff --git a/polyfills/BUILD b/polyfills/BUILD
index ea1b73e..7e887e3 100644
--- a/polyfills/BUILD
+++ b/polyfills/BUILD
@@ -15,6 +15,7 @@
"base/dcheck_is_on.h",
"base/debug/alias.h",
"base/export_template.h",
+ "base/feature_list.h",
"base/logging.h",
"base/memory/raw_ptr.h",
"base/metrics/histogram_macros.h",
diff --git a/polyfills/base/feature_list.h b/polyfills/base/feature_list.h
new file mode 100644
index 0000000..b687509
--- /dev/null
+++ b/polyfills/base/feature_list.h
@@ -0,0 +1,37 @@
+// Copyright 2022 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_FEATURE_LIST_H_
+#define POLYFILLS_BASE_FEATURE_LIST_H_
+
+#define BASE_DECLARE_FEATURE(feature) extern const gurl_base::Feature feature
+
+#define BASE_FEATURE(feature, name, default_value) \
+ const gurl_base::Feature feature(name, default_value)
+
+namespace gurl_base {
+
+enum FeatureState {
+ FEATURE_DISABLED_BY_DEFAULT,
+ FEATURE_ENABLED_BY_DEFAULT,
+};
+
+struct Feature {
+ constexpr Feature(const char* name, FeatureState default_state)
+ : name(name), default_state(default_state) {}
+
+ const char* const name;
+ const FeatureState default_state;
+};
+
+class FeatureList {
+ public:
+ static bool IsEnabled(const Feature& feature) {
+ return feature.default_state == FEATURE_ENABLED_BY_DEFAULT;
+ }
+};
+
+} // namespace gurl_base
+
+#endif // POLYFILLS_BASE_FEATURE_LIST_H_
diff --git a/url/BUILD b/url/BUILD
index c6b53ab..327c38a 100644
--- a/url/BUILD
+++ b/url/BUILD
@@ -31,6 +31,7 @@
"url_canon_stdstring.cc",
"url_canon_stdurl.cc",
"url_constants.cc",
+ "url_features.cc",
"url_parse_file.cc",
"url_parse_internal.h",
"url_util.cc",
@@ -44,6 +45,7 @@
"url_canon_ip.h",
"url_canon_stdstring.h",
"url_constants.h",
+ "url_features.h",
"url_file.h",
"url_util.h",
],
diff --git a/url/gurl.cc b/url/gurl.cc
index 6c0429e..c5e3f19 100644
--- a/url/gurl.cc
+++ b/url/gurl.cc
@@ -331,6 +331,15 @@
return Resolve(".");
}
+GURL GURL::GetWithoutRef() const {
+ if (!has_ref())
+ return GURL(*this);
+
+ Replacements replacements;
+ replacements.ClearRef();
+ return ReplaceComponents(replacements);
+}
+
bool GURL::IsStandard() const {
return url::IsStandard(spec_.data(), parsed_.scheme);
}
@@ -402,13 +411,13 @@
}
gurl_base::StringPiece GURL::PathForRequestPiece() const {
- GURL_DCHECK(parsed_.path.len > 0)
+ GURL_DCHECK(parsed_.path.is_nonempty())
<< "Canonical path for requests should be non-empty";
- if (parsed_.ref.len >= 0) {
+ if (parsed_.ref.is_valid()) {
// Clip off the reference when it exists. The reference starts after the
// #-sign, so we have to subtract one to also remove it.
- return gurl_base::StringPiece(&spec_[parsed_.path.begin],
- parsed_.ref.begin - parsed_.path.begin - 1);
+ return gurl_base::StringPiece(spec_).substr(
+ parsed_.path.begin, parsed_.ref.begin - parsed_.path.begin - 1);
}
// Compute the actual path length, rather than depending on the spec's
// terminator. If we're an inner_url, our spec continues on into our outer
@@ -417,7 +426,7 @@
if (parsed_.query.is_valid())
path_len = parsed_.query.end() - parsed_.path.begin;
- return gurl_base::StringPiece(&spec_[parsed_.path.begin], path_len);
+ return gurl_base::StringPiece(spec_).substr(parsed_.path.begin, path_len);
}
std::string GURL::PathForRequest() const {
@@ -446,7 +455,7 @@
if (!is_valid_)
return gurl_base::StringPiece();
url::Component content_component = parsed_.GetContent();
- if (!SchemeIs(url::kJavaScriptScheme) && parsed_.ref.len >= 0)
+ if (!SchemeIs(url::kJavaScriptScheme) && parsed_.ref.is_valid())
content_component.len -= parsed_.ref.len + 1;
return ComponentStringPiece(content_component);
}
diff --git a/url/gurl.h b/url/gurl.h
index 919ae5c..1b29989 100644
--- a/url/gurl.h
+++ b/url/gurl.h
@@ -189,6 +189,14 @@
// scheme, authority or path, it will return an empty, invalid GURL.
GURL GetWithoutFilename() const;
+ // A helper function to return a GURL without the Ref (also named Fragment
+ // Identifier). For example,
+ // GURL("https://www.foo.com/index.html#test").GetWithoutRef().spec()
+ // will return "https://www.foo.com/index.html".
+ // If the GURL is invalid or missing a
+ // scheme, authority or path, it will return an empty, invalid GURL.
+ GURL GetWithoutRef() const;
+
// A helper function to return a GURL containing just the scheme, host,
// and port from a URL. Equivalent to clearing any username and password,
// replacing the path with a slash, and clearing everything after that. If
@@ -285,9 +293,7 @@
bool HostIsIPAddress() const;
// Not including the colon. If you are comparing schemes, prefer SchemeIs.
- bool has_scheme() const {
- return parsed_.scheme.len >= 0;
- }
+ bool has_scheme() const { return parsed_.scheme.is_valid(); }
std::string scheme() const {
return ComponentString(parsed_.scheme);
}
@@ -295,9 +301,7 @@
return ComponentStringPiece(parsed_.scheme);
}
- bool has_username() const {
- return parsed_.username.len >= 0;
- }
+ bool has_username() const { return parsed_.username.is_valid(); }
std::string username() const {
return ComponentString(parsed_.username);
}
@@ -305,9 +309,7 @@
return ComponentStringPiece(parsed_.username);
}
- bool has_password() const {
- return parsed_.password.len >= 0;
- }
+ bool has_password() const { return parsed_.password.is_valid(); }
std::string password() const {
return ComponentString(parsed_.password);
}
@@ -320,7 +322,7 @@
// HostNoBrackets() below.
bool has_host() const {
// Note that hosts are special, absence of host means length 0.
- return parsed_.host.len > 0;
+ return parsed_.host.is_nonempty();
}
std::string host() const {
return ComponentString(parsed_.host);
@@ -332,9 +334,7 @@
// The port if one is explicitly specified. Most callers will want IntPort()
// or EffectiveIntPort() instead of these. The getters will not include the
// ':'.
- bool has_port() const {
- return parsed_.port.len >= 0;
- }
+ bool has_port() const { return parsed_.port.is_valid(); }
std::string port() const {
return ComponentString(parsed_.port);
}
@@ -344,9 +344,7 @@
// Including first slash following host, up to the query. The URL
// "http://www.google.com/" has a path of "/".
- bool has_path() const {
- return parsed_.path.len >= 0;
- }
+ bool has_path() const { return parsed_.path.is_valid(); }
std::string path() const {
return ComponentString(parsed_.path);
}
@@ -355,9 +353,7 @@
}
// Stuff following '?' up to the ref. The getters will not include the '?'.
- bool has_query() const {
- return parsed_.query.len >= 0;
- }
+ bool has_query() const { return parsed_.query.is_valid(); }
std::string query() const {
return ComponentString(parsed_.query);
}
@@ -367,9 +363,7 @@
// Stuff following '#' to the end of the string. This will be %-escaped UTF-8.
// The getters will not include the '#'.
- bool has_ref() const {
- return parsed_.ref.len >= 0;
- }
+ bool has_ref() const { return parsed_.ref.is_valid(); }
std::string ref() const {
return ComponentString(parsed_.ref);
}
@@ -470,16 +464,13 @@
// Returns the substring of the input identified by the given component.
std::string ComponentString(const url::Component& comp) const {
- if (!comp.is_nonempty())
- return std::string();
- return std::string(spec_, static_cast<size_t>(comp.begin),
- static_cast<size_t>(comp.len));
+ return std::string(ComponentStringPiece(comp));
}
gurl_base::StringPiece ComponentStringPiece(const url::Component& comp) const {
- if (!comp.is_nonempty())
+ if (comp.is_empty())
return gurl_base::StringPiece();
- return gurl_base::StringPiece(&spec_[static_cast<size_t>(comp.begin)],
- static_cast<size_t>(comp.len));
+ return gurl_base::StringPiece(spec_).substr(static_cast<size_t>(comp.begin),
+ static_cast<size_t>(comp.len));
}
void ProcessFileSystemURLAfterReplaceComponents();
diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
index 16e3a8e..c6be656 100644
--- a/url/gurl_unittest.cc
+++ b/url/gurl_unittest.cc
@@ -478,6 +478,81 @@
}
}
+TEST(GURLTest, GetWithoutRef) {
+ struct TestCase {
+ const char* input;
+ const char* expected;
+ } cases[] = {
+ // Common Standard URLs.
+ {"https://www.google.com/index.html",
+ "https://www.google.com/index.html"},
+ {"https://www.google.com/index.html#maps/",
+ "https://www.google.com/index.html"},
+
+ {"https://foo:bar@www.google.com/maps.htm",
+ "https://foo:bar@www.google.com/maps.htm"},
+ {"https://foo:bar@www.google.com/maps.htm#fragment",
+ "https://foo:bar@www.google.com/maps.htm"},
+
+ {"https://www.google.com/maps/au/index.html?q=maps",
+ "https://www.google.com/maps/au/index.html?q=maps"},
+ {"https://www.google.com/maps/au/index.html?q=maps#fragment/",
+ "https://www.google.com/maps/au/index.html?q=maps"},
+
+ {"http://www.google.com:8000/maps/au/index.html?q=maps",
+ "http://www.google.com:8000/maps/au/index.html?q=maps"},
+ {"http://www.google.com:8000/maps/au/index.html?q=maps#fragment/",
+ "http://www.google.com:8000/maps/au/index.html?q=maps"},
+
+ {"https://www.google.com/maps/au/north/?q=maps",
+ "https://www.google.com/maps/au/north/?q=maps"},
+ {"https://www.google.com/maps/au/north?q=maps#fragment",
+ "https://www.google.com/maps/au/north?q=maps"},
+
+ // Less common standard URLs.
+ {"filesystem:http://www.google.com/temporary/bar.html?baz=22",
+ "filesystem:http://www.google.com/temporary/bar.html?baz=22"},
+ {"file:///temporary/bar.html?baz=22#fragment",
+ "file:///temporary/bar.html?baz=22"},
+
+ {"ftp://foo/test/index.html", "ftp://foo/test/index.html"},
+ {"ftp://foo/test/index.html#fragment", "ftp://foo/test/index.html"},
+
+ {"gopher://foo/test/index.html", "gopher://foo/test/index.html"},
+ {"gopher://foo/test/index.html#fragment", "gopher://foo/test/index.html"},
+
+ {"ws://foo/test/index.html", "ws://foo/test/index.html"},
+ {"ws://foo/test/index.html#fragment", "ws://foo/test/index.html"},
+
+ // Non-standard, hierarchical URLs.
+ {"chrome://foo/bar.html", "chrome://foo/bar.html"},
+ {"chrome://foo/bar.html#fragment", "chrome://foo/bar.html"},
+
+ {"httpa://foo/test/index.html", "httpa://foo/test/index.html"},
+ {"httpa://foo/test/index.html#fragment", "httpa://foo/test/index.html"},
+
+ // Non-standard, non-hierarchical URLs.
+ {"blob:https://foo.bar/test/index.html",
+ "blob:https://foo.bar/test/index.html"},
+ {"blob:https://foo.bar/test/index.html#fragment",
+ "blob:https://foo.bar/test/index.html"},
+
+ {"about:blank", "about:blank"},
+ {"about:blank#ref", "about:blank"},
+
+ {"data:foobar", "data:foobar"},
+ {"scheme:opaque_data", "scheme:opaque_data"},
+ // Invalid URLs.
+ {"foobar", ""},
+ };
+
+ for (size_t i = 0; i < std::size(cases); i++) {
+ GURL url(cases[i].input);
+ GURL without_ref = url.GetWithoutRef();
+ EXPECT_EQ(cases[i].expected, without_ref.spec());
+ }
+}
+
TEST(GURLTest, Replacements) {
// The URL canonicalizer replacement test will handle most of these case.
// The most important thing to do here is to check that the proper
diff --git a/url/origin.h b/url/origin.h
index 2b8caa5..5da5d84 100644
--- a/url/origin.h
+++ b/url/origin.h
@@ -18,7 +18,6 @@
#include "base/unguessable_token.h"
#include "build/build_config.h"
#include "build/buildflag.h"
-#include "ipc/ipc_param_traits.h"
#include "absl/types/optional.h"
#include "polyfills/third_party/perfetto/include/perfetto/tracing/traced_value.h"
#include "url/scheme_host_port.h"
@@ -43,6 +42,11 @@
class SecurityOriginTest;
} // namespace blink
+namespace IPC {
+template <class P>
+struct ParamTraits;
+} // namespace IPC
+
namespace ipc_fuzzer {
template <class T>
struct FuzzTraits;
diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc
index 2500fc6..a7b72a5 100644
--- a/url/third_party/mozilla/url_parse.cc
+++ b/url/third_party/mozilla/url_parse.cc
@@ -57,7 +57,7 @@
// Returns the offset of the next authority terminator in the input starting
// from start_offset. If no terminator is found, the return value will be equal
// to spec_len.
-template<typename CHAR>
+template <typename CHAR>
int FindNextAuthorityTerminator(const CHAR* spec,
int start_offset,
int spec_len) {
@@ -68,7 +68,7 @@
return spec_len; // Not found.
}
-template<typename CHAR>
+template <typename CHAR>
void ParseUserInfo(const CHAR* spec,
const Component& user,
Component* username,
@@ -82,8 +82,7 @@
if (colon_offset < user.len) {
// Found separator: <username>:<password>
*username = Component(user.begin, colon_offset);
- *password = MakeRange(user.begin + colon_offset + 1,
- user.begin + user.len);
+ *password = MakeRange(user.begin + colon_offset + 1, user.begin + user.len);
} else {
// No separator, treat everything as the username
*username = user;
@@ -91,7 +90,7 @@
}
}
-template<typename CHAR>
+template <typename CHAR>
void ParseServerInfo(const CHAR* spec,
const Component& serverinfo,
Component* hostname,
@@ -141,7 +140,7 @@
// parts. The port number will be parsed and the resulting integer will be
// filled into the given *port variable, or -1 if there is no port number or it
// is invalid.
-template<typename CHAR>
+template <typename CHAR>
void DoParseAuthority(const CHAR* spec,
const Component& auth,
Component* username,
@@ -165,10 +164,10 @@
if (spec[i] == '@') {
// Found user info: <user-info>@<server-info>
- ParseUserInfo(spec, Component(auth.begin, i - auth.begin),
- username, password);
- ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len),
- hostname, port_num);
+ ParseUserInfo(spec, Component(auth.begin, i - auth.begin), username,
+ password);
+ ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), hostname,
+ port_num);
} else {
// No user info, everything is server info.
username->reset();
@@ -179,30 +178,47 @@
template <typename CHAR>
inline void FindQueryAndRefParts(const CHAR* spec,
- const Component& path,
- int* query_separator,
- int* ref_separator) {
- int path_end = path.begin + path.len;
- for (int i = path.begin; i < path_end; i++) {
- switch (spec[i]) {
- case '?':
- // Only match the query string if it precedes the reference fragment
- // and when we haven't found one already.
- if (*query_separator < 0)
- *query_separator = i;
- break;
- case '#':
- // Record the first # sign only.
- if (*ref_separator < 0) {
- *ref_separator = i;
- return;
- }
- break;
+ const Component& path,
+ int* query_separator,
+ int* ref_separator) {
+ if constexpr (sizeof(*spec) == 1) {
+ // memchr is much faster than any scalar code we can write.
+ const CHAR* ptr = spec + path.begin;
+ const CHAR* first_hash =
+ reinterpret_cast<const CHAR*>(memchr(ptr, '#', path.len));
+ size_t len_before_fragment =
+ first_hash == nullptr ? path.len : first_hash - ptr;
+ const CHAR* first_question =
+ reinterpret_cast<const CHAR*>(memchr(ptr, '?', len_before_fragment));
+ if (first_hash != nullptr) {
+ *ref_separator = first_hash - spec;
+ }
+ if (first_question != nullptr) {
+ *query_separator = first_question - spec;
+ }
+ } else {
+ int path_end = path.begin + path.len;
+ for (int i = path.begin; i < path_end; i++) {
+ switch (spec[i]) {
+ case '?':
+ // Only match the query string if it precedes the reference fragment
+ // and when we haven't found one already.
+ if (*query_separator < 0)
+ *query_separator = i;
+ break;
+ case '#':
+ // Record the first # sign only.
+ if (*ref_separator < 0) {
+ *ref_separator = i;
+ return;
+ }
+ break;
+ }
}
}
}
-template<typename CHAR>
+template <typename CHAR>
void ParsePath(const CHAR* spec,
const Component& path,
Component* filepath,
@@ -217,7 +233,7 @@
ref->reset();
return;
}
- GURL_DCHECK(path.len > 0) << "We should never have 0 length paths";
+ GURL_DCHECK(path.is_nonempty()) << "We should never have 0 length paths";
// Search for first occurrence of either ? or #.
int query_separator = -1; // Index of the '?'
@@ -255,10 +271,8 @@
filepath->reset();
}
-template<typename CHAR>
-bool DoExtractScheme(const CHAR* url,
- int url_len,
- Component* scheme) {
+template <typename CHAR>
+bool DoExtractScheme(const CHAR* url, int url_len, Component* scheme) {
// Skip leading whitespace and control characters.
int begin = 0;
while (begin < url_len && ShouldTrimFromURL(url[begin]))
@@ -326,7 +340,7 @@
// The main parsing function for standard URLs. Standard URLs have a scheme,
// host, path, etc.
-template<typename CHAR>
+template <typename CHAR>
void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
GURL_DCHECK(spec_len >= 0);
@@ -347,7 +361,7 @@
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
}
-template<typename CHAR>
+template <typename CHAR>
void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
GURL_DCHECK(spec_len >= 0);
@@ -356,9 +370,9 @@
parsed->password.reset();
parsed->host.reset();
parsed->port.reset();
- parsed->path.reset(); // May use this; reset for convenience.
- parsed->ref.reset(); // May use this; reset for convenience.
- parsed->query.reset(); // May use this; reset for convenience.
+ parsed->path.reset(); // May use this; reset for convenience.
+ parsed->ref.reset(); // May use this; reset for convenience.
+ parsed->query.reset(); // May use this; reset for convenience.
parsed->clear_inner_parsed(); // May use this; reset for convenience.
// Strip leading & trailing spaces and control characters.
@@ -453,8 +467,7 @@
return;
}
int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash
- while (inner_path_end < spec_len &&
- !IsURLSlash(spec[inner_path_end]))
+ while (inner_path_end < spec_len && !IsURLSlash(spec[inner_path_end]))
++inner_path_end;
parsed->path.begin = inner_path_end;
int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
@@ -464,8 +477,9 @@
// Initializes a path URL which is merely a scheme followed by a path. Examples
// include "about:foo" and "javascript:alert('bar');"
-template<typename CHAR>
-void DoParsePathURL(const CHAR* spec, int spec_len,
+template <typename CHAR>
+void DoParsePathURL(const CHAR* spec,
+ int spec_len,
bool trim_path_end,
Parsed* parsed) {
// Get the non-path and non-scheme parts of the URL out of the way, we never
@@ -507,14 +521,11 @@
return;
GURL_DCHECK_LT(path_begin, spec_len);
- ParsePath(spec,
- MakeRange(path_begin, spec_len),
- &parsed->path,
- &parsed->query,
- &parsed->ref);
+ ParsePath(spec, MakeRange(path_begin, spec_len), &parsed->path,
+ &parsed->query, &parsed->ref);
}
-template<typename CHAR>
+template <typename CHAR>
void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
GURL_DCHECK(spec_len >= 0);
@@ -580,11 +591,11 @@
// sscanf but our input is not NULL-terminated, which sscanf requires. Instead,
// we copy the digits to a small stack buffer (since we know the maximum number
// of digits in a valid port number) that we can NULL terminate.
-template<typename CHAR>
+template <typename CHAR>
int DoParsePort(const CHAR* spec, const Component& component) {
// Easy success case when there is no port.
const int kMaxDigits = 5;
- if (!component.is_nonempty())
+ if (component.is_empty())
return PORT_UNSPECIFIED;
// Skip over any leading 0s.
@@ -623,12 +634,12 @@
return port;
}
-template<typename CHAR>
+template <typename CHAR>
void DoExtractFileName(const CHAR* spec,
const Component& path,
Component* file_name) {
// Handle empty paths: they have no file names.
- if (!path.is_nonempty()) {
+ if (path.is_empty()) {
file_name->reset();
return;
}
@@ -652,7 +663,7 @@
return;
}
-template<typename CHAR>
+template <typename CHAR>
bool DoExtractQueryKeyValue(const CHAR* spec,
Component* query,
Component* key,
diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h
index 2246d53..d44e20a 100644
--- a/url/third_party/mozilla/url_parse.h
+++ b/url/third_party/mozilla/url_parse.h
@@ -24,17 +24,14 @@
return begin + len;
}
- // Returns true if this component is valid, meaning the length is given. Even
- // valid components may be empty to record the fact that they exist.
- bool is_valid() const {
- return (len != -1);
- }
+ // Returns true if this component is valid, meaning the length is given.
+ // Valid components may be empty to record the fact that they exist.
+ bool is_valid() const { return len >= 0; }
- // Returns true if the given component is specified on false, the component
- // is either empty or invalid.
- bool is_nonempty() const {
- return (len > 0);
- }
+ // Determine if the component is empty or not. Empty means the length is
+ // zero or the component is invalid.
+ bool is_empty() const { return len <= 0; }
+ bool is_nonempty() const { return len > 0; }
void reset() {
begin = 0;
diff --git a/url/url_canon.h b/url/url_canon.h
index abeea84..1eed379 100644
--- a/url/url_canon.h
+++ b/url/url_canon.h
@@ -26,7 +26,7 @@
// resize function that is called when the existing buffer is not big enough.
// The derived class is then in charge of setting up our buffer which we will
// manage.
-template<typename T>
+template <typename T>
class CanonOutputT {
public:
CanonOutputT() = default;
@@ -60,12 +60,8 @@
// Called by the user of this class to get the output. The output will NOT
// be NULL-terminated. Call length() to get the
// length.
- const T* data() const {
- return buffer_;
- }
- T* data() {
- return buffer_;
- }
+ const T* data() const { return buffer_; }
+ T* data() { return buffer_; }
// Shortens the URL to the new length. Used for "backing up" when processing
// relative paths. This can also be used if an external function writes a lot
@@ -102,8 +98,7 @@
if (!Grow(str_len - (buffer_len_ - cur_len_)))
return;
}
- for (size_t i = 0; i < str_len; i++)
- buffer_[cur_len_ + i] = str[i];
+ memcpy(buffer_ + cur_len_, str, str_len * sizeof(T));
cur_len_ += str_len;
}
@@ -140,7 +135,7 @@
// Simple implementation of the CanonOutput using new[]. This class
// also supports a static buffer so if it is allocated on the stack, most
// URLs can be canonicalized with no heap allocations.
-template<typename T, int fixed_capacity = 1024>
+template <typename T, int fixed_capacity = 1024>
class RawCanonOutputT : public CanonOutputT<T> {
public:
RawCanonOutputT() : CanonOutputT<T>() {
@@ -178,7 +173,7 @@
typedef CanonOutputT<char> CanonOutput;
typedef CanonOutputT<char16_t> CanonOutputW;
-template<int fixed_capacity>
+template <int fixed_capacity>
class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
template <int fixed_capacity>
class RawCanonOutputW : public RawCanonOutputT<char16_t, fixed_capacity> {};
@@ -357,16 +352,16 @@
// This field summarizes how the input was classified by the canonicalizer.
enum Family {
- NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
- // canonicalizer is concerned, it should be treated as a
- // hostname.
- BROKEN, // - Almost an IP, but was not canonicalized. This could be an
- // IPv4 address where truncation occurred, or something
- // containing the special characters :[] which did not parse
- // as an IPv6 address. Never attempt to connect to this
- // address, because it might actually succeed!
- IPV4, // - Successfully canonicalized as an IPv4 address.
- IPV6, // - Successfully canonicalized as an IPv6 address.
+ NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
+ // canonicalizer is concerned, it should be treated as a
+ // hostname.
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an
+ // IPv4 address where truncation occurred, or something
+ // containing the special characters :[] which did not parse
+ // as an IPv6 address. Never attempt to connect to this
+ // address, because it might actually succeed!
+ IPV4, // - Successfully canonicalized as an IPv4 address.
+ IPV6, // - Successfully canonicalized as an IPv6 address.
};
Family family;
@@ -392,7 +387,6 @@
}
};
-
// Host.
//
// The 8-bit version requires UTF-8 encoding. Use this version when you only
@@ -709,7 +703,7 @@
// This structures does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
-template<typename CHAR>
+template <typename CHAR>
struct URLComponentSource {
// Constructor normally used by callers wishing to replace components. This
// will make them all NULL, which is no replacement. The caller would then
@@ -734,8 +728,7 @@
port(default_value),
path(default_value),
query(default_value),
- ref(default_value) {
- }
+ ref(default_value) {}
const CHAR* scheme;
const CHAR* username;
@@ -757,11 +750,10 @@
// IN SCOPE BY THE CALLER for as long as this object exists!
//
// Prefer the 8-bit replacement version if possible since it is more efficient.
-template<typename CHAR>
+template <typename CHAR>
class Replacements {
public:
- Replacements() {
- }
+ Replacements() {}
// Scheme
void SetScheme(const CHAR* s, const Component& comp) {
diff --git a/url/url_canon_etc.cc b/url/url_canon_etc.cc
index e54b843..cfe3fe8 100644
--- a/url/url_canon_etc.cc
+++ b/url/url_canon_etc.cc
@@ -31,12 +31,22 @@
// Fast verification that there's nothing that needs removal. This is the 99%
// case, so we want it to be fast and don't care about impacting the speed
// when we do find whitespace.
- int found_whitespace = false;
- for (int i = 0; i < input_len; i++) {
- if (!IsRemovableURLWhitespace(input[i]))
- continue;
- found_whitespace = true;
- break;
+ bool found_whitespace = false;
+ if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
+ // For large strings, memchr is much faster than any scalar code we can
+ // write, even if we need to run it three times. (If this turns out to still
+ // be a bottleneck, we could write our own vector code, but given that
+ // memchr is so fast, it's unlikely to be relevant.)
+ found_whitespace = memchr(input, '\n', input_len) != nullptr ||
+ memchr(input, '\r', input_len) != nullptr ||
+ memchr(input, '\t', input_len) != nullptr;
+ } else {
+ for (int i = 0; i < input_len; i++) {
+ if (!IsRemovableURLWhitespace(input[i]))
+ continue;
+ found_whitespace = true;
+ break;
+ }
}
if (!found_whitespace) {
@@ -72,6 +82,7 @@
// Contains the canonical version of each possible input letter in the scheme
// (basically, lower-cased). The corresponding entry will be 0 if the letter
// is not allowed in a scheme.
+// clang-format off
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -88,6 +99,7 @@
0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
// p q r s t u v w x y z { | } ~
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 };
+// clang-format on
// This could be a table lookup as well by setting the high bit for each
// valid character, but it's only called once per URL, and it makes the lookup
@@ -96,12 +108,12 @@
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
-template<typename CHAR, typename UCHAR>
+template <typename CHAR, typename UCHAR>
bool DoScheme(const CHAR* spec,
const Component& scheme,
CanonOutput* output,
Component* out_scheme) {
- if (!scheme.is_nonempty()) {
+ if (scheme.is_empty()) {
// Scheme is unspecified or empty, convert to empty by appending a colon.
*out_scheme = Component(output->length(), 0);
output->push_back(':');
@@ -161,7 +173,7 @@
// *_spec strings. Typically, these specs will be the same (we're
// canonicalizing a single source string), but may be different when
// replacing components.
-template<typename CHAR, typename UCHAR>
+template <typename CHAR, typename UCHAR>
bool DoUserInfo(const CHAR* username_spec,
const Component& username,
const CHAR* password_spec,
@@ -169,7 +181,7 @@
CanonOutput* output,
Component* out_username,
Component* out_password) {
- if (username.len <= 0 && password.len <= 0) {
+ if (username.is_empty() && password.is_empty()) {
// Common case: no user info. We strip empty username/passwords.
*out_username = Component();
*out_password = Component();
@@ -178,7 +190,7 @@
// Write the username.
out_username->begin = output->length();
- if (username.len > 0) {
+ if (username.is_nonempty()) {
// This will escape characters not valid for the username.
AppendStringOfType(&username_spec[username.begin],
static_cast<size_t>(username.len), CHAR_USERINFO,
@@ -188,7 +200,7 @@
// When there is a password, we need the separator. Note that we strip
// empty but specified passwords.
- if (password.len > 0) {
+ if (password.is_nonempty()) {
output->push_back(':');
out_password->begin = output->length();
AppendStringOfType(&password_spec[password.begin],
@@ -209,7 +221,7 @@
}
// This function will prepend the colon if there will be a port.
-template<typename CHAR, typename UCHAR>
+template <typename CHAR, typename UCHAR>
bool DoPort(const CHAR* spec,
const Component& port,
int default_port_for_scheme,
@@ -284,7 +296,7 @@
};
// clang-format on
-template<typename CHAR, typename UCHAR>
+template <typename CHAR, typename UCHAR>
void DoCanonicalizeRef(const CHAR* spec,
const Component& ref,
CanonOutput* output,
@@ -364,9 +376,9 @@
CanonOutput* output,
Component* out_username,
Component* out_password) {
- return DoUserInfo<char, unsigned char>(
- username_source, username, password_source, password,
- output, out_username, out_password);
+ return DoUserInfo<char, unsigned char>(username_source, username,
+ password_source, password, output,
+ out_username, out_password);
}
bool CanonicalizeUserInfo(const char16_t* username_source,
@@ -386,8 +398,7 @@
int default_port_for_scheme,
CanonOutput* output,
Component* out_port) {
- return DoPort<char, unsigned char>(spec, port,
- default_port_for_scheme,
+ return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
output, out_port);
}
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc
index d29f7ab..eacc69f 100644
--- a/url/url_canon_host.cc
+++ b/url/url_canon_host.cc
@@ -357,7 +357,7 @@
const Component& host,
CanonOutput* output,
CanonHostInfo* host_info) {
- if (!host.is_nonempty()) {
+ if (host.is_empty()) {
// Empty hosts don't need anything.
host_info->family = CanonHostInfo::NEUTRAL;
host_info->out_host = Component();
diff --git a/url/url_canon_internal.cc b/url/url_canon_internal.cc
index eb24cee..393fc4b 100644
--- a/url/url_canon_internal.cc
+++ b/url/url_canon_internal.cc
@@ -7,10 +7,16 @@
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
+#ifdef __SSE2__
+#include <immintrin.h>
+#elif defined(__aarch64__)
+#include <arm_neon.h>
+#endif
#include <cstdio>
#include <string>
+#include "base/bits.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/utf_string_conversion_utils.h"
@@ -18,12 +24,62 @@
namespace {
+// Find the initial segment of the given string that consists solely
+// of characters valid for CHAR_QUERY. (We can have false negatives in
+// one specific case, namely the exclamation mark 0x21, but false negatives
+// are fine, and it's not worth adding a separate test for.) This is
+// a fast path to speed up checking of very long query strings that are
+// already valid, which happen on some web pages.
+//
+// This has some startup cost to load the constants and such, so it's
+// usually not worth it for short strings.
+size_t FindInitialQuerySafeString(const char* source, size_t length) {
+#if defined(__SSE2__) || defined(__aarch64__)
+ constexpr size_t kChunkSize = 16;
+ size_t i;
+ for (i = 0; i < gurl_base::bits::AlignDown(length, kChunkSize); i += kChunkSize) {
+ char b __attribute__((vector_size(16)));
+ memcpy(&b, source + i, sizeof(b));
+
+ // Compare each element with the ranges for CHAR_QUERY
+ // (see kSharedCharTypeTable), vectorized so that it creates
+ // a mask of which elements match. For completeness, we could
+ // have had (...) | b == 0x21 here, but exclamation marks are
+ // rare and the extra test costs us some time.
+ auto mask = b >= 0x24 && b <= 0x7e && b != 0x27 && b != 0x3c && b != 0x3e;
+
+#ifdef __SSE2__
+ if (_mm_movemask_epi8(mask) != 0xffff) {
+ return i;
+ }
+#else
+ if (vminvq_u8(mask) == 0) {
+ return i;
+ }
+#endif
+ }
+ return i;
+#else
+ // Need SIMD support (with fast reductions) for this to be efficient.
+ return 0;
+#endif
+}
+
template <typename CHAR, typename UCHAR>
void DoAppendStringOfType(const CHAR* source,
size_t length,
SharedCharTypes type,
CanonOutput* output) {
- for (size_t i = 0; i < length; i++) {
+ size_t i = 0;
+ // We only instantiate this for char, to avoid a Clang crash
+ // (and because Append() does not support converting).
+ if constexpr (sizeof(CHAR) == 1) {
+ if (type == CHAR_QUERY && length >= kMinimumLengthForSIMD) {
+ i = FindInitialQuerySafeString(source, length);
+ output->Append(source, i);
+ }
+ }
+ for (; i < length; i++) {
if (static_cast<UCHAR>(source[i]) >= 0x80) {
// ReadChar will fill the code point with kUnicodeReplacementCharacter
// when the input is invalid, which is what we want.
@@ -113,6 +169,7 @@
} // namespace
// See the header file for this array's declaration.
+// clang-format off
const unsigned char kSharedCharTypeTable[0x100] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f
@@ -221,6 +278,7 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
};
+// clang-format on
const char kHexCharLookup[0x10] = {
'0', '1', '2', '3', '4', '5', '6', '7',
@@ -324,27 +382,27 @@
const URLComponentSource<char>& repl_source = repl.sources();
const Parsed& repl_parsed = repl.components();
- DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,
- &source->scheme, &parsed->scheme);
+ DoOverrideComponent(repl_source.scheme, repl_parsed.scheme, &source->scheme,
+ &parsed->scheme);
DoOverrideComponent(repl_source.username, repl_parsed.username,
&source->username, &parsed->username);
DoOverrideComponent(repl_source.password, repl_parsed.password,
&source->password, &parsed->password);
// Our host should be empty if not present, so override the default setup.
- DoOverrideComponent(repl_source.host, repl_parsed.host,
- &source->host, &parsed->host);
+ DoOverrideComponent(repl_source.host, repl_parsed.host, &source->host,
+ &parsed->host);
if (parsed->host.len == -1)
parsed->host.len = 0;
- DoOverrideComponent(repl_source.port, repl_parsed.port,
- &source->port, &parsed->port);
- DoOverrideComponent(repl_source.path, repl_parsed.path,
- &source->path, &parsed->path);
- DoOverrideComponent(repl_source.query, repl_parsed.query,
- &source->query, &parsed->query);
- DoOverrideComponent(repl_source.ref, repl_parsed.ref,
- &source->ref, &parsed->ref);
+ DoOverrideComponent(repl_source.port, repl_parsed.port, &source->port,
+ &parsed->port);
+ DoOverrideComponent(repl_source.path, repl_parsed.path, &source->path,
+ &parsed->path);
+ DoOverrideComponent(repl_source.query, repl_parsed.query, &source->query,
+ &parsed->query);
+ DoOverrideComponent(repl_source.ref, repl_parsed.ref, &source->ref,
+ &parsed->ref);
}
bool SetupUTF16OverrideComponents(const char* base,
@@ -359,41 +417,43 @@
const Parsed& repl_parsed = repl.components();
success &= PrepareUTF16OverrideComponent(
- repl_source.scheme, repl_parsed.scheme,
- utf8_buffer, &parsed->scheme);
- success &= PrepareUTF16OverrideComponent(
- repl_source.username, repl_parsed.username,
- utf8_buffer, &parsed->username);
- success &= PrepareUTF16OverrideComponent(
- repl_source.password, repl_parsed.password,
- utf8_buffer, &parsed->password);
- success &= PrepareUTF16OverrideComponent(
- repl_source.host, repl_parsed.host,
- utf8_buffer, &parsed->host);
- success &= PrepareUTF16OverrideComponent(
- repl_source.port, repl_parsed.port,
- utf8_buffer, &parsed->port);
- success &= PrepareUTF16OverrideComponent(
- repl_source.path, repl_parsed.path,
- utf8_buffer, &parsed->path);
- success &= PrepareUTF16OverrideComponent(
- repl_source.query, repl_parsed.query,
- utf8_buffer, &parsed->query);
- success &= PrepareUTF16OverrideComponent(
- repl_source.ref, repl_parsed.ref,
- utf8_buffer, &parsed->ref);
+ repl_source.scheme, repl_parsed.scheme, utf8_buffer, &parsed->scheme);
+ success &=
+ PrepareUTF16OverrideComponent(repl_source.username, repl_parsed.username,
+ utf8_buffer, &parsed->username);
+ success &=
+ PrepareUTF16OverrideComponent(repl_source.password, repl_parsed.password,
+ utf8_buffer, &parsed->password);
+ success &= PrepareUTF16OverrideComponent(repl_source.host, repl_parsed.host,
+ utf8_buffer, &parsed->host);
+ success &= PrepareUTF16OverrideComponent(repl_source.port, repl_parsed.port,
+ utf8_buffer, &parsed->port);
+ success &= PrepareUTF16OverrideComponent(repl_source.path, repl_parsed.path,
+ utf8_buffer, &parsed->path);
+ success &= PrepareUTF16OverrideComponent(repl_source.query, repl_parsed.query,
+ utf8_buffer, &parsed->query);
+ success &= PrepareUTF16OverrideComponent(repl_source.ref, repl_parsed.ref,
+ utf8_buffer, &parsed->ref);
// PrepareUTF16OverrideComponent will not have set the data pointer since the
// buffer could be resized, invalidating the pointers. We set the data
// pointers for affected components now that the buffer is finalized.
- if (repl_source.scheme) source->scheme = utf8_buffer->data();
- if (repl_source.username) source->username = utf8_buffer->data();
- if (repl_source.password) source->password = utf8_buffer->data();
- if (repl_source.host) source->host = utf8_buffer->data();
- if (repl_source.port) source->port = utf8_buffer->data();
- if (repl_source.path) source->path = utf8_buffer->data();
- if (repl_source.query) source->query = utf8_buffer->data();
- if (repl_source.ref) source->ref = utf8_buffer->data();
+ if (repl_source.scheme)
+ source->scheme = utf8_buffer->data();
+ if (repl_source.username)
+ source->username = utf8_buffer->data();
+ if (repl_source.password)
+ source->password = utf8_buffer->data();
+ if (repl_source.host)
+ source->host = utf8_buffer->data();
+ if (repl_source.port)
+ source->port = utf8_buffer->data();
+ if (repl_source.path)
+ source->path = utf8_buffer->data();
+ if (repl_source.query)
+ source->query = utf8_buffer->data();
+ if (repl_source.ref)
+ source->ref = utf8_buffer->data();
return success;
}
diff --git a/url/url_canon_internal.h b/url/url_canon_internal.h
index 58ae144..b9ac5bf 100644
--- a/url/url_canon_internal.h
+++ b/url/url_canon_internal.h
@@ -132,9 +132,8 @@
// does no checking that thee character requires escaping.
// Escaping makes sense only 8 bit chars, so code works in all cases of
// input parameters (8/16bit).
-template<typename UINCHAR, typename OUTCHAR>
-inline void AppendEscapedChar(UINCHAR ch,
- CanonOutputT<OUTCHAR>* output) {
+template <typename UINCHAR, typename OUTCHAR>
+inline void AppendEscapedChar(UINCHAR ch, CanonOutputT<OUTCHAR>* output) {
output->push_back('%');
output->push_back(static_cast<OUTCHAR>(kHexCharLookup[(ch >> 4) & 0xf]));
output->push_back(static_cast<OUTCHAR>(kHexCharLookup[ch & 0xf]));
@@ -173,22 +172,17 @@
Appender(static_cast<unsigned char>(char_value), output);
} else if (char_value <= 0x7ff) {
// 110xxxxx 10xxxxxx
- Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)),
- output);
- Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
- output);
+ Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)), output);
+ Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), output);
} else if (char_value <= 0xffff) {
// 1110xxxx 10xxxxxx 10xxxxxx
- Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)),
- output);
+ Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), output);
Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
output);
- Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
- output);
+ Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), output);
} else {
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
- output);
+ Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), output);
Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)),
output);
Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
@@ -316,8 +310,8 @@
size_t* begin,
size_t end,
unsigned char* unescaped_value) {
- if (*begin + 3 > end ||
- !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) {
+ if (*begin + 3 > end || !Is8BitChar(spec[*begin + 1]) ||
+ !Is8BitChar(spec[*begin + 2])) {
// Invalid escape sequence because there's not enough room, or the
// digits are not ASCII.
return false;
@@ -446,7 +440,7 @@
int _itow_s(int value, char16_t* buffer, size_t size_in_chars, int radix);
// Secure template overloads for these functions
-template<size_t N>
+template <size_t N>
inline int _itoa_s(int value, char (&buffer)[N], int radix) {
return _itoa_s(value, buffer, N, radix);
}
@@ -458,12 +452,20 @@
// _strtoui64 and strtoull behave the same
inline unsigned long long _strtoui64(const char* nptr,
- char** endptr, int base) {
+ char** endptr,
+ int base) {
return strtoull(nptr, endptr, base);
}
#endif // WIN32
+// The minimum input length, in bytes, at which we consider SIMD processing.
+// There is no deep theory here; it's just set empirically to a value that
+// seems to be good. (We don't really know why there's a slowdown when it is
+// zero, but a guess would be that there's no point in entering a complex
+// loop with a lot of setup for a five-byte string.)
+static constexpr int kMinimumLengthForSIMD = 50;
+
} // namespace url
#endif // URL_URL_CANON_INTERNAL_H_
diff --git a/url/url_canon_ip.cc b/url/url_canon_ip.cc
index fde31f1..ec8617e 100644
--- a/url/url_canon_ip.cc
+++ b/url/url_canon_ip.cc
@@ -44,7 +44,7 @@
const Component& component,
uint32_t* number) {
// Empty components are considered non-numeric.
- if (!component.is_nonempty())
+ if (component.is_empty())
return CanonHostInfo::NEUTRAL;
// Figure out the base
@@ -133,7 +133,7 @@
--host.len;
// Do nothing if empty.
- if (!host.is_nonempty())
+ if (host.is_empty())
return CanonHostInfo::NEUTRAL;
// Read component values. The first `existing_components` of them are
@@ -302,7 +302,7 @@
// Zero-out the info.
parsed->reset();
- if (!host.is_nonempty())
+ if (host.is_empty())
return false;
// The index for start and end of address range (no brackets).
@@ -447,7 +447,7 @@
unsigned char address[16]) {
// Make sure the component is bounded by '[' and ']'.
int end = host.end();
- if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
+ if (host.is_empty() || spec[host.begin] != '[' || spec[end - 1] != ']')
return false;
// Exclude the square brackets.
diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc
index 3480517..9a03fb4 100644
--- a/url/url_canon_path.cc
+++ b/url/url_canon_path.cc
@@ -255,7 +255,7 @@
const Component& path,
size_t path_begin_in_output,
CanonOutput* output) {
- if (!path.is_nonempty())
+ if (path.is_empty())
return true;
size_t end = static_cast<size_t>(path.end());
@@ -407,7 +407,7 @@
Component* out_path) {
bool success = true;
out_path->begin = output->length();
- if (path.len > 0) {
+ if (path.is_nonempty()) {
// Write out an initial slash if the input has none. If we just parse a URL
// and then canonicalize it, it will of course have a slash already. This
// check is for the replacement and relative URL resolving cases of file
diff --git a/url/url_canon_query.cc b/url/url_canon_query.cc
index d326ce8..b48800c 100644
--- a/url/url_canon_query.cc
+++ b/url/url_canon_query.cc
@@ -106,7 +106,7 @@
CharsetConverter* converter,
CanonOutput* output,
Component* out_query) {
- if (query.len < 0) {
+ if (!query.is_valid()) {
*out_query = Component();
return;
}
diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
index 80588fe..67780b1 100644
--- a/url/url_canon_relative.cc
+++ b/url/url_canon_relative.cc
@@ -239,7 +239,7 @@
const Component& source_component,
CanonOutput* output,
Component* output_component) {
- if (source_component.len < 0) {
+ if (!source_component.is_valid()) {
// This component is not present.
*output_component = Component();
return;
@@ -323,7 +323,7 @@
std::max({path.end(), query.end(), ref.end()}));
output->Append(base_url, base_parsed.path.begin);
- if (path.len > 0) {
+ if (path.is_nonempty()) {
// The path is replaced or modified.
int true_path_begin = output->length();
@@ -492,7 +492,7 @@
// paths (even the default path of "/" is OK).
//
// We allow hosts with no length so we can handle file URLs, for example.
- if (base_parsed.path.len <= 0) {
+ if (base_parsed.path.is_empty()) {
// On error, return the input (resolving a relative URL on a non-relative
// base = the base).
int base_len = base_parsed.Length();
@@ -501,7 +501,7 @@
return false;
}
- if (relative_component.len <= 0) {
+ if (relative_component.is_empty()) {
// Empty relative URL, leave unchanged, only removing the ref component.
int base_len = base_parsed.Length();
base_len -= base_parsed.ref.len + 1;
diff --git a/url/url_canon_stdurl.cc b/url/url_canon_stdurl.cc
index da18d42..8096b56 100644
--- a/url/url_canon_stdurl.cc
+++ b/url/url_canon_stdurl.cc
@@ -58,7 +58,7 @@
output, &new_parsed->host);
// Host must not be empty for standard URLs.
- if (!parsed.host.is_nonempty())
+ if (parsed.host.is_empty())
success = false;
// Port: the port canonicalizer will handle the colon.
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index 62a5c36..8890639 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc
@@ -10,10 +10,12 @@
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/gtest_util.h"
+#include "base/test/scoped_feature_list.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_stdstring.h"
+#include "url/url_features.h"
#include "url/url_test_utils.h"
namespace url {
@@ -285,38 +287,78 @@
EXPECT_EQ(0, out_comp.len);
}
-TEST(URLCanonTest, Host) {
+// IDNA mode to use in CanonHost tests.
+enum class IDNAMode { kTransitional, kNonTransitional };
+
+class URLCanonHostTest : public ::testing::Test,
+ public ::testing::WithParamInterface<IDNAMode> {
+ public:
+ URLCanonHostTest() {
+ if (GetParam() == IDNAMode::kNonTransitional) {
+ scoped_feature_list_.InitAndEnableFeature(kUseIDNA2008NonTransitional);
+ } else {
+ scoped_feature_list_.InitAndDisableFeature(kUseIDNA2008NonTransitional);
+ }
+ }
+
+ private:
+ gurl_base::test::ScopedFeatureList scoped_feature_list_;
+};
+
+INSTANTIATE_TEST_SUITE_P(All,
+ URLCanonHostTest,
+ ::testing::Values(IDNAMode::kTransitional,
+ IDNAMode::kNonTransitional));
+
+TEST_P(URLCanonHostTest, Host) {
+ bool use_idna_non_transitional = IsUsingIDNA2008NonTransitional();
+
IPAddressCase host_cases[] = {
- // Basic canonicalization, uppercase should be converted to lowercase.
- {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // Basic canonicalization, uppercase should be converted to lowercase.
+ {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Spaces and some other characters should be escaped.
- {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
+ {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com",
+ Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
// Exciting different types of spaces!
- {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
+ {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Other types of space (no-break, zero-width, zero-width-no-break) are
// name-prepped away to nothing.
- {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Ideographic full stop (full-width period for Chinese, etc.) should be
// treated as a dot.
- {NULL, L"www.foo\x3002" L"bar.com", "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
+ {NULL,
+ L"www.foo\x3002"
+ L"bar.com",
+ "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
// Invalid unicode characters should fail...
// ...In wide input, ICU will barf and we'll end up with the input as
// escaped UTF-8 (the invalid character should be replaced with the
// replacement character).
- {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com",
+ Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
// ...This is the same as previous but with with escaped.
- {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
- // Test name prepping, fullwidth input should be converted to ASCII and NOT
+ {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com",
+ Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ // Test name prepping, fullwidth input should be converted to ASCII and
+ // NOT
// IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
- {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com",
+ Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
// Test that fullwidth escaped values are properly name-prepped,
// then converted or rejected.
// ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
- {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
- {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com",
+ "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com",
+ "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
// ...%00 in fullwidth should fail (also as escaped UTF-8 input)
- {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
- {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com",
+ "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com",
+ "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
// ICU will convert weird percents into ASCII percents, but not unescape
// further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
// "small percent". At this point we should be within our rights to mark
@@ -324,12 +366,30 @@
// happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
// and kept as valid, so we validate that behavior here, but this level
// of fixing the input shouldn't be seen as required. "%81" is invalid.
- {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
- {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
- {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
- {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"\xef\xb9\xaa"
+ "41.com",
+ L"\xfe6a"
+ L"41.com",
+ "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%b9%aa"
+ "41.com",
+ L"\xfe6a"
+ L"41.com",
+ "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xef\xb9\xaa"
+ "81.com",
+ L"\xfe6a"
+ L"81.com",
+ "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%b9%aa"
+ "81.com",
+ L"\xfe6a"
+ L"81.com",
+ "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
// Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
- {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
+ L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
+ CanonHostInfo::NEUTRAL, -1, ""},
// See http://unicode.org/cldr/utility/idna.jsp for other
// examples/experiments and http://goo.gl/7yG11o
// for the full list of characters handled differently by
@@ -337,169 +397,206 @@
// 4 Deviation characters are mapped/ignored in UTS 46 transitional
// mechansm. UTS 46, table 4 row (g).
- // Sharp-s is mapped to 'ss' in UTS 46 and IDNA 2003.
- // Otherwise, it'd be "xn--fuball-cta.de".
- {"fu\xc3\x9f" "ball.de", L"fu\x00df" L"ball.de", "fussball.de",
- Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
- // Final-sigma (U+03C3) is mapped to regular sigma (U+03C2).
- // Otherwise, it'd be "xn--wxaijb9b".
- {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
- "xn--wxaikc6b", Component(0, 12),
- CanonHostInfo::NEUTRAL, -1, ""},
+ // Sharp-s is mapped to 'ss' in IDNA 2003, not in IDNA 2008 or UTS 46
+ // after transitional period.
+ // Previously, it'd be "fussball.de".
+ {"fu\xc3\x9f"
+ "ball.de",
+ L"fu\x00df"
+ L"ball.de",
+ use_idna_non_transitional ? "xn--fuball-cta.de" : "fussball.de",
+ use_idna_non_transitional ? Component(0, 17) : Component(0, 11),
+ CanonHostInfo::NEUTRAL, -1, ""},
+
+ // Final-sigma (U+03C2) was mapped to regular sigma (U+03C3).
+ // Previously, it'd be "xn--wxaikc6b".
+ {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
+ use_idna_non_transitional ? "xn--wxaijb9b" : "xn--wxaikc6b",
+ Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
+
// ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
- // handling as well as in IDNA 2003.
- {"a\xe2\x80\x8c" "b\xe2\x80\x8d" "c", L"a\x200c" L"b\x200d" L"c", "abc",
- Component(0, 3), CanonHostInfo::NEUTRAL, -1, ""},
- // ZWJ between Devanagari characters is still mapped away in UTS 46
- // transitional handling. IDNA 2008 would give xn--11bo0mv54g.
- {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
- L"\x915\x94d\x200d\x91c", "xn--11bo0m",
- Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // handling as well as in IDNA 2003, but not thereafter.
+ {"a\xe2\x80\x8c"
+ "b\xe2\x80\x8d"
+ "c",
+ L"a\x200c"
+ L"b\x200d"
+ L"c",
+ use_idna_non_transitional ? "xn--abc-9m0ag" : "abc",
+ use_idna_non_transitional ? Component(0, 13) : Component(0, 3),
+ CanonHostInfo::NEUTRAL, -1, ""},
+
+ // ZWJ between Devanagari characters was still mapped away in UTS 46
+ // transitional handling. IDNA 2008 gives xn--11bo0mv54g.
+ // Previously "xn--11bo0m".
+ {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
+ L"\x915\x94d\x200d\x91c",
+ use_idna_non_transitional ? "xn--11bo0mv54g" : "xn--11bo0m",
+ use_idna_non_transitional ? Component(0, 14) : Component(0, 10),
+ CanonHostInfo::NEUTRAL, -1, ""},
+
// Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
// However, we do allow this at the moment because we don't use
// STD3 rules and canonicalize full-width ASCII to ASCII.
- {"wow\xef\xbc\x81", L"wow\xff01", "wow%21",
- Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
+ {"wow\xef\xbc\x81", L"wow\xff01", "wow%21", Component(0, 6),
+ CanonHostInfo::NEUTRAL, -1, ""},
// U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
// Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
- {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo",
- Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
+ {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", Component(0, 11),
+ CanonHostInfo::BROKEN, -1, ""},
// U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
// Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
- {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
- "%F0%AF%A1%A8%E5%A7%BB.cn",
- Component(0, 24), CanonHostInfo::BROKEN, -1, ""},
+ {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
+ "%F0%AF%A1%A8%E5%A7%BB.cn", Component(0, 24), CanonHostInfo::BROKEN, -1,
+ ""},
// Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
- {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya",
- Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", Component(0, 14),
+ CanonHostInfo::NEUTRAL, -1, ""},
// An already-IDNA host is not modified.
- {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya",
- Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", Component(0, 14),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Symbol/punctuations are allowed in IDNA 2003/UTS46.
// Not allowed in IDNA 2008. UTS 46 table 4 row (f).
- {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us",
- Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", Component(0, 13),
+ CanonHostInfo::NEUTRAL, -1, ""},
// U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
// We used to allow it because we passed through unassigned code points.
- {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
- Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
+ Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
// U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
// Used to be allowed in INDA 2003.
- {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg",
- Component(0, 9), CanonHostInfo::BROKEN, -1, ""},
+ {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", Component(0, 9),
+ CanonHostInfo::BROKEN, -1, ""},
// U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
// on Unicode 3.2). We did allow it in the past because we let unassigned
// code point pass. We continue to allow it even though it's a
// "punctuation and symbol" blocked in IDNA 2008.
// UTS 46 table 4, row (j)
- {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com",
- Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", Component(0, 11),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Maps uppercase letters to lower case letters.
// In IDNA 2003, it's allowed without case-folding
// ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
// (added in Unicode 4.1). UTS 46 table 4 row (k)
- {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com",
- Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
+ {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", Component(0, 15),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Maps U+FF43 (Full Width Small Letter C) to 'c'.
- {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz",
- Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
+ {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", Component(0, 7),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Maps U+1D68C (Math Monospace Small C) to 'c'.
// U+1D68C = \xD835\xDE8C in UTF-16
- {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
- Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
+ {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
+ Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
// BiDi check test
// "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
// Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
- {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
- L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw",
- Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
+ L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", Component(0, 13),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Disallowed in both IDNA 2003 and 2008 with BiDi check.
// Labels starting with a RTL character cannot end with a LTR character.
- {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
- "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21),
- CanonHostInfo::BROKEN, -1, ""},
+ {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
+ "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), CanonHostInfo::BROKEN, -1,
+ ""},
// Labels starting with a RTL character can end with BC=EN (European
// number). Disallowed in IDNA 2003 but now allowed.
- {"\xd8\xac\xd8\xa7\xd8\xb1" "2", L"\x62c\x627\x631" L"2",
- "xn--2-ymcov", Component(0, 11),
- CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xd8\xac\xd8\xa7\xd8\xb1"
+ "2",
+ L"\x62c\x627\x631"
+ L"2",
+ "xn--2-ymcov", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
// Labels starting with a RTL character cannot have "L" characters
// even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
- {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
- "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21),
- CanonHostInfo::BROKEN, -1, ""},
+ {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
+ "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), CanonHostInfo::BROKEN, -1,
+ ""},
// Labels starting with a RTL character can end with BC=AN (Arabic number)
// Disallowed in IDNA 2003, but now allowed.
- {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
- "xn--mgbjq0r", Component(0, 11),
- CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
+ "xn--mgbjq0r", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
// Labels starting with a RTL character cannot have "L" characters
// even if it ends with an BC=AN (Arabic number).
// Disallowed in both IDNA 2003/2008.
- {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
- "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26),
- CanonHostInfo::BROKEN, -1, ""},
+ {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
+ "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), CanonHostInfo::BROKEN,
+ -1, ""},
// Labels starting with a RTL character cannot mix BC=EN and BC=AN
- {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
- "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27),
- CanonHostInfo::BROKEN, -1, ""},
+ {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
+ "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), CanonHostInfo::BROKEN,
+ -1, ""},
// As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
- {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com",
- Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
+ {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", Component(0, 13),
+ CanonHostInfo::BROKEN, -1, ""},
// U+0080 is not allowed.
- {"\xc2\x80.com", L"\x80.com", "%C2%80.com",
- Component(0, 10), CanonHostInfo::BROKEN, -1, ""},
+ {"\xc2\x80.com", L"\x80.com", "%C2%80.com", Component(0, 10),
+ CanonHostInfo::BROKEN, -1, ""},
// Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
// Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
// UTF-8 (wide case). The output should be equivalent to the true wide
// character input above).
- {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
- L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba",
- Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
+ L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
+ CanonHostInfo::NEUTRAL, -1, ""},
// Invalid escaped characters should fail and the percents should be
// escaped.
- {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
- CanonHostInfo::BROKEN, -1, ""},
+ {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
+ CanonHostInfo::BROKEN, -1, ""},
// If we get an invalid character that has been escaped.
- {"%25", L"%25", "%25", Component(0, 3),
- CanonHostInfo::BROKEN, -1, ""},
- {"hello%00", L"hello%00", "hello%00", Component(0, 8),
- CanonHostInfo::BROKEN, -1, ""},
+ {"%25", L"%25", "%25", Component(0, 3), CanonHostInfo::BROKEN, -1, ""},
+ {"hello%00", L"hello%00", "hello%00", Component(0, 8),
+ CanonHostInfo::BROKEN, -1, ""},
// Escaped numbers should be treated like IP addresses if they are.
- {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
- "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
- "C0A80001"},
- {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e",
- "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
- "C0A80001"},
+ {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
+ "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ {"%30%78%63%30%2e%30%32%35%30.01%2e",
+ L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", Component(0, 11),
+ CanonHostInfo::IPV4, 3, "C0A80001"},
// Invalid escaping should trigger the regular host error handling.
- {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
+ {"%3g%78%63%30%2e%30%32%35%30%2E.01",
+ L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01",
+ Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
// Something that isn't exactly an IP should get treated as a host and
// spaces escaped.
- {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
+ {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello",
+ Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
// Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
// These are "0Xc0.0250.01" in fullwidth.
- {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%"
+ "8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%"
+ "8E\xef\xbc\x90\xef\xbc\x91",
+ L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10"
+ L"\xff11",
+ "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
// Broken IP addresses get marked as such.
- {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
- {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12), CanonHostInfo::BROKEN, -1, ""},
+ {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
+ CanonHostInfo::BROKEN, -1, ""},
+ {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12),
+ CanonHostInfo::BROKEN, -1, ""},
// Cyrillic letter followed by '(' should return punycode for '(' escaped
// before punycode string was created. I.e.
// if '(' is escaped after punycode is created we would get xn--%28-8tb
// (incorrect).
- {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11),
- CanonHostInfo::NEUTRAL, -1, ""},
- // Address with all hexidecimal characters with leading number of 1<<32
+ {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11),
+ CanonHostInfo::NEUTRAL, -1, ""},
+ // Address with all hexadecimal characters with leading number of 1<<32
// or greater and should return NEUTRAL rather than BROKEN if not all
// components are numbers.
- {"12345678912345.de", L"12345678912345.de", "12345678912345.de", Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
- {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
- {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", "12345678912345.12345678912345.de", Component(0, 32), CanonHostInfo::NEUTRAL, -1, ""},
- {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
- {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
- // A label that starts with "xn--" but contains non-ASCII characters should
- // be an error. Escape the invalid characters.
- {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ {"12345678912345.de", L"12345678912345.de", "12345678912345.de",
+ Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
+ {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de",
+ Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
+ {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de",
+ "12345678912345.12345678912345.de", Component(0, 32),
+ CanonHostInfo::NEUTRAL, -1, ""},
+ {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de",
+ Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
+ {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde",
+ Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
+ // A label that starts with "xn--" but contains non-ASCII characters
+ // should
+ // be an error. Escape the invalid characters.
+ {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen",
+ Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
};
// CanonicalizeHost() non-verbose.
diff --git a/url/url_features.cc b/url/url_features.cc
new file mode 100644
index 0000000..149cd4a
--- /dev/null
+++ b/url/url_features.cc
@@ -0,0 +1,16 @@
+// Copyright 2022 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_features.h"
+
+namespace url {
+
+BASE_FEATURE(kUseIDNA2008NonTransitional,
+ "UseIDNA2008NonTransitional",
+ gurl_base::FEATURE_ENABLED_BY_DEFAULT);
+
+bool IsUsingIDNA2008NonTransitional() {
+ return gurl_base::FeatureList::IsEnabled(kUseIDNA2008NonTransitional);
+}
+} // namespace url
diff --git a/url/url_features.h b/url/url_features.h
new file mode 100644
index 0000000..3fed085
--- /dev/null
+++ b/url/url_features.h
@@ -0,0 +1,19 @@
+// Copyright 2022 The Chromium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_FEATURES_H_
+#define URL_URL_FEATURES_H_
+
+#include "polyfills/base/component_export.h"
+#include "polyfills/base/feature_list.h"
+
+namespace url {
+
+COMPONENT_EXPORT(URL) BASE_DECLARE_FEATURE(kUseIDNA2008NonTransitional);
+
+// Returns true if Chrome is using IDNA 2008 in Non-Transitional mode.
+COMPONENT_EXPORT(URL) bool IsUsingIDNA2008NonTransitional();
+} // namespace url
+
+#endif // URL_URL_FEATURES_H_
diff --git a/url/url_idna_icu.cc b/url/url_idna_icu.cc
index 356a1cd..4a3a602 100644
--- a/url/url_idna_icu.cc
+++ b/url/url_idna_icu.cc
@@ -15,21 +15,25 @@
#include <unicode/utypes.h>
#include "url/url_canon_icu.h"
#include "url/url_canon_internal.h" // for _itoa_s
+#include "url/url_features.h"
namespace url {
+namespace {
+
// Use UIDNA, a C pointer to a UTS46/IDNA 2008 handling object opened with
// uidna_openUTS46().
//
// We use UTS46 with BiDiCheck to migrate from IDNA 2003 (with unassigned
-// code points allowed) to IDNA 2008 with
-// the backward compatibility in mind. What it does:
+// code points allowed) to IDNA 2008 with the backward compatibility in mind.
+// What it does:
//
// 1. Use the up-to-date Unicode data.
// 2. Define a case folding/mapping with the up-to-date Unicode data as
// in IDNA 2003.
-// 3. Use transitional mechanism for 4 deviation characters (sharp-s,
-// final sigma, ZWJ and ZWNJ) for now.
+// 3. If `use_idna_non_transitional` is true, use non-transitional mechanism for
+// 4 deviation characters (sharp-s, final sigma, ZWJ and ZWNJ) per
+// url.spec.whatwg.org.
// 4. Continue to allow symbols and punctuations.
// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules.
// 6. Do not apply STD3 rules
@@ -39,25 +43,39 @@
// http://goo.gl/3XBhqw ).
// See http://http://unicode.org/reports/tr46/ and references therein
// for more details.
-UIDNA* GetUIDNA() {
- static UIDNA* uidna = [] {
- UErrorCode err = U_ZERO_ERROR;
- // TODO(jungshik): Change options as different parties (browsers,
- // registrars, search engines) converge toward a consensus.
- UIDNA* value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
- if (U_FAILURE(err)) {
- GURL_CHECK(false) << "failed to open UTS46 data with error: "
- << u_errorName(err)
- << ". If you see this error message in a test environment "
- << "your test environment likely lacks the required data "
- << "tables for libicu. See https://crbug.com/778929.";
- value = nullptr;
- }
- return value;
- }();
- return uidna;
+UIDNA* CreateIDNA(bool use_idna_non_transitional) {
+ uint32_t options = UIDNA_CHECK_BIDI;  // BiDi checking is requested in both modes.
+ if (use_idna_non_transitional) {
+ // Use non-transitional processing if enabled. See
+ // https://url.spec.whatwg.org/#idna for details.
+ options |=
+ UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE;
+ }
+ UErrorCode err = U_ZERO_ERROR;
+ UIDNA* idna = uidna_openUTS46(options, &err);
+ if (U_FAILURE(err)) {
+ GURL_CHECK(false) << "failed to open UTS46 data with error: " << u_errorName(err)
+ << ". If you see this error message in a test environment "
+ << "your test environment likely lacks the required data "
+ << "tables for libicu. See https://crbug.com/778929.";
+ idna = nullptr;  // NOTE(review): presumably only matters if GURL_CHECK is non-fatal - confirm.
+ }
+ return idna;  // nullptr if opening the UTS46 data failed.
+}
+UIDNA* GetUIDNA() {
+ // Each mode has its own cached handle, so tests may end up with two. This is okay.
+ if (IsUsingIDNA2008NonTransitional()) {
+ static UIDNA* uidna = CreateIDNA(/*use_idna_non_transitional=*/true);  // Created on first use.
+ return uidna;
+ } else {
+ static UIDNA* uidna = CreateIDNA(/*use_idna_non_transitional=*/false);  // Created on first use.
+ return uidna;
+ }
+}
+
+} // namespace
+
// Converts the Unicode input representing a hostname to ASCII using IDN rules.
// The output must be ASCII, but is represented as wide characters.
//
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
index f67a445..88b6f05 100644
--- a/url/url_parse_unittest.cc
+++ b/url/url_parse_unittest.cc
@@ -89,8 +89,8 @@
bool ComponentMatches(const char* input,
const char* reference,
const Component& component) {
- // If the component is nonexistent (length == -1), it should begin at 0.
- EXPECT_TRUE(component.len >= 0 || component.len == -1);
+ // Check that the -1 sentinel is the only allowed negative value.
+ EXPECT_TRUE(component.is_valid() || component.len == -1);
// Begin should be valid.
EXPECT_LE(0, component.begin);
@@ -98,7 +98,7 @@
// A NULL reference means the component should be nonexistent.
if (!reference)
return component.len == -1;
- if (component.len < 0)
+ if (!component.is_valid())
return false; // Reference is not NULL but we don't have anything
if (strlen(reference) != static_cast<size_t>(component.len))
diff --git a/url/url_util.cc b/url/url_util.cc
index 872e469..da29651 100644
--- a/url/url_util.cc
+++ b/url/url_util.cc
@@ -163,7 +163,7 @@
inline bool DoCompareSchemeComponent(const CHAR* spec,
const Component& component,
const char* compare_to) {
- if (!component.is_nonempty())
+ if (component.is_empty())
return compare_to[0] == 0; // When component is empty, match empty scheme.
return gurl_base::EqualsCaseInsensitiveASCII(
typename CharToStringPiece<CHAR>::Piece(&spec[component.begin],
@@ -178,7 +178,7 @@
const Component& scheme,
SchemeType* type,
const std::vector<SchemeWithType>& schemes) {
- if (!scheme.is_nonempty())
+ if (scheme.is_empty())
return false; // Empty or invalid schemes are non-standard.
for (const SchemeWithType& scheme_with_type : schemes) {