| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef BASE_STRINGS_STRING_UTIL_INTERNAL_H_ |
| #define BASE_STRINGS_STRING_UTIL_INTERNAL_H_ |
| |
| #include <algorithm> |
| |
| #include "polyfills/base/check.h" |
| #include "polyfills/base/check_op.h" |
| #include "polyfills/base/logging.h" |
| #include "polyfills/base/notreached.h" |
| #include "base/ranges/algorithm.h" |
| #include "base/strings/string_piece.h" |
| #include "base/third_party/icu/icu_utf.h" |
| |
| namespace gurl_base { |
| |
| namespace internal { |
| |
| // Used by ReplaceStringPlaceholders to track the position in the string of |
| // replaced parameters. |
| struct ReplacementOffset { |
| ReplacementOffset(uintptr_t parameter, size_t offset) |
| : parameter(parameter), offset(offset) {} |
| |
| // Index of the parameter. |
| uintptr_t parameter; |
| |
| // Starting position in the string. |
| size_t offset; |
| }; |
| |
| static bool CompareParameter(const ReplacementOffset& elem1, |
| const ReplacementOffset& elem2) { |
| return elem1.parameter < elem2.parameter; |
| } |
| |
| // Assuming that a pointer is the size of a "machine word", then |
| // uintptr_t is an integer type that is also a machine word. |
| using MachineWord = uintptr_t; |
| |
| inline bool IsMachineWordAligned(const void* pointer) { |
| return !(reinterpret_cast<MachineWord>(pointer) & (sizeof(MachineWord) - 1)); |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| std::basic_string<CharT> ToLowerASCIIImpl(T str) { |
| std::basic_string<CharT> ret; |
| ret.reserve(str.size()); |
| for (size_t i = 0; i < str.size(); i++) |
| ret.push_back(ToLowerASCII(str[i])); |
| return ret; |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| std::basic_string<CharT> ToUpperASCIIImpl(T str) { |
| std::basic_string<CharT> ret; |
| ret.reserve(str.size()); |
| for (size_t i = 0; i < str.size(); i++) |
| ret.push_back(ToUpperASCII(str[i])); |
| return ret; |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| int CompareCaseInsensitiveASCIIT(T a, T b) { |
| // Find the first characters that aren't equal and compare them. If the end |
| // of one of the strings is found before a nonequal character, the lengths |
| // of the strings are compared. |
| size_t i = 0; |
| while (i < a.length() && i < b.length()) { |
| CharT lower_a = ToLowerASCII(a[i]); |
| CharT lower_b = ToLowerASCII(b[i]); |
| if (lower_a < lower_b) |
| return -1; |
| if (lower_a > lower_b) |
| return 1; |
| i++; |
| } |
| |
| // End of one string hit before finding a different character. Expect the |
| // common case to be "strings equal" at this point so check that first. |
| if (a.length() == b.length()) |
| return 0; |
| |
| if (a.length() < b.length()) |
| return -1; |
| return 1; |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| TrimPositions TrimStringT(T input, |
| T trim_chars, |
| TrimPositions positions, |
| std::basic_string<CharT>* output) { |
| // Find the edges of leading/trailing whitespace as desired. Need to use |
| // a StringPiece version of input to be able to call find* on it with the |
| // StringPiece version of trim_chars (normally the trim_chars will be a |
| // constant so avoid making a copy). |
| const size_t last_char = input.length() - 1; |
| const size_t first_good_char = |
| (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0; |
| const size_t last_good_char = (positions & TRIM_TRAILING) |
| ? input.find_last_not_of(trim_chars) |
| : last_char; |
| |
| // When the string was all trimmed, report that we stripped off characters |
| // from whichever position the caller was interested in. For empty input, we |
| // stripped no characters, but we still need to clear |output|. |
| if (input.empty() || first_good_char == std::basic_string<CharT>::npos || |
| last_good_char == std::basic_string<CharT>::npos) { |
| bool input_was_empty = input.empty(); // in case output == &input |
| output->clear(); |
| return input_was_empty ? TRIM_NONE : positions; |
| } |
| |
| // Trim. |
| output->assign(input.data() + first_good_char, |
| last_good_char - first_good_char + 1); |
| |
| // Return where we trimmed from. |
| return static_cast<TrimPositions>( |
| (first_good_char == 0 ? TRIM_NONE : TRIM_LEADING) | |
| (last_good_char == last_char ? TRIM_NONE : TRIM_TRAILING)); |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| T TrimStringPieceT(T input, T trim_chars, TrimPositions positions) { |
| size_t begin = |
| (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0; |
| size_t end = (positions & TRIM_TRAILING) |
| ? input.find_last_not_of(trim_chars) + 1 |
| : input.size(); |
| return input.substr(std::min(begin, input.size()), end - begin); |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| std::basic_string<CharT> CollapseWhitespaceT( |
| T text, |
| bool trim_sequences_with_line_breaks) { |
| std::basic_string<CharT> result; |
| result.resize(text.size()); |
| |
| // Set flags to pretend we're already in a trimmed whitespace sequence, so we |
| // will trim any leading whitespace. |
| bool in_whitespace = true; |
| bool already_trimmed = true; |
| |
| int chars_written = 0; |
| for (auto c : text) { |
| if (IsUnicodeWhitespace(c)) { |
| if (!in_whitespace) { |
| // Reduce all whitespace sequences to a single space. |
| in_whitespace = true; |
| result[chars_written++] = L' '; |
| } |
| if (trim_sequences_with_line_breaks && !already_trimmed && |
| ((c == '\n') || (c == '\r'))) { |
| // Whitespace sequences containing CR or LF are eliminated entirely. |
| already_trimmed = true; |
| --chars_written; |
| } |
| } else { |
| // Non-whitespace characters are copied straight across. |
| in_whitespace = false; |
| already_trimmed = false; |
| result[chars_written++] = c; |
| } |
| } |
| |
| if (in_whitespace && !already_trimmed) { |
| // Any trailing whitespace is eliminated. |
| --chars_written; |
| } |
| |
| result.resize(chars_written); |
| return result; |
| } |
| |
| template <class Char> |
| bool DoIsStringASCII(const Char* characters, size_t length) { |
| // Bitmasks to detect non ASCII characters for character sizes of 8, 16 and 32 |
| // bits. |
| constexpr MachineWord NonASCIIMasks[] = { |
| 0, MachineWord(0x8080808080808080ULL), MachineWord(0xFF80FF80FF80FF80ULL), |
| 0, MachineWord(0xFFFFFF80FFFFFF80ULL), |
| }; |
| |
| if (!length) |
| return true; |
| constexpr MachineWord non_ascii_bit_mask = NonASCIIMasks[sizeof(Char)]; |
| static_assert(non_ascii_bit_mask, "Error: Invalid Mask"); |
| MachineWord all_char_bits = 0; |
| const Char* end = characters + length; |
| |
| // Prologue: align the input. |
| while (!IsMachineWordAligned(characters) && characters < end) |
| all_char_bits |= *characters++; |
| if (all_char_bits & non_ascii_bit_mask) |
| return false; |
| |
| // Compare the values of CPU word size. |
| constexpr size_t chars_per_word = sizeof(MachineWord) / sizeof(Char); |
| constexpr int batch_count = 16; |
| while (characters <= end - batch_count * chars_per_word) { |
| all_char_bits = 0; |
| for (int i = 0; i < batch_count; ++i) { |
| all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); |
| characters += chars_per_word; |
| } |
| if (all_char_bits & non_ascii_bit_mask) |
| return false; |
| } |
| |
| // Process the remaining words. |
| all_char_bits = 0; |
| while (characters <= end - chars_per_word) { |
| all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); |
| characters += chars_per_word; |
| } |
| |
| // Process the remaining bytes. |
| while (characters < end) |
| all_char_bits |= *characters++; |
| |
| return !(all_char_bits & non_ascii_bit_mask); |
| } |
| |
| template <bool (*Validator)(uint32_t)> |
| inline bool DoIsStringUTF8(StringPiece str) { |
| const char* src = str.data(); |
| int32_t src_len = static_cast<int32_t>(str.length()); |
| int32_t char_index = 0; |
| |
| while (char_index < src_len) { |
| int32_t code_point; |
| CBU8_NEXT(src, char_index, src_len, code_point); |
| if (!Validator(code_point)) |
| return false; |
| } |
| return true; |
| } |
| |
| // Implementation note: Normally this function will be called with a hardcoded |
| // constant for the lowercase_ascii parameter. Constructing a StringPiece from |
| // a C constant requires running strlen, so the result will be two passes |
| // through the buffers, one to file the length of lowercase_ascii, and one to |
| // compare each letter. |
| // |
| // This function could have taken a const char* to avoid this and only do one |
| // pass through the string. But the strlen is faster than the case-insensitive |
| // compares and lets us early-exit in the case that the strings are different |
| // lengths (will often be the case for non-matches). So whether one approach or |
| // the other will be faster depends on the case. |
| // |
| // The hardcoded strings are typically very short so it doesn't matter, and the |
| // string piece gives additional flexibility for the caller (doesn't have to be |
| // null terminated) so we choose the StringPiece route. |
| template <typename T, typename CharT = typename T::value_type> |
| inline bool DoLowerCaseEqualsASCII(T str, StringPiece lowercase_ascii) { |
| return std::equal( |
| str.begin(), str.end(), lowercase_ascii.begin(), lowercase_ascii.end(), |
| [](auto lhs, auto rhs) { return ToLowerASCII(lhs) == rhs; }); |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| bool StartsWithT(T str, T search_for, CompareCase case_sensitivity) { |
| if (search_for.size() > str.size()) |
| return false; |
| |
| BasicStringPiece<CharT> source = str.substr(0, search_for.size()); |
| |
| switch (case_sensitivity) { |
| case CompareCase::SENSITIVE: |
| return source == search_for; |
| |
| case CompareCase::INSENSITIVE_ASCII: |
| return std::equal(search_for.begin(), search_for.end(), source.begin(), |
| CaseInsensitiveCompareASCII<CharT>()); |
| |
| default: |
| GURL_NOTREACHED(); |
| return false; |
| } |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| bool EndsWithT(T str, T search_for, CompareCase case_sensitivity) { |
| if (search_for.size() > str.size()) |
| return false; |
| |
| BasicStringPiece<CharT> source = |
| str.substr(str.size() - search_for.size(), search_for.size()); |
| |
| switch (case_sensitivity) { |
| case CompareCase::SENSITIVE: |
| return source == search_for; |
| |
| case CompareCase::INSENSITIVE_ASCII: |
| return std::equal(source.begin(), source.end(), search_for.begin(), |
| CaseInsensitiveCompareASCII<CharT>()); |
| |
| default: |
| GURL_NOTREACHED(); |
| return false; |
| } |
| } |
| |
| // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings. |
| template <class CharT> |
| struct SubstringMatcher { |
| BasicStringPiece<CharT> find_this; |
| |
| size_t Find(const std::basic_string<CharT>& input, size_t pos) { |
| return input.find(find_this.data(), pos, find_this.length()); |
| } |
| size_t MatchSize() { return find_this.length(); } |
| }; |
| |
| // Type deduction helper for SubstringMatcher. |
| template <typename T, typename CharT = typename T::value_type> |
| auto MakeSubstringMatcher(T find_this) { |
| return SubstringMatcher<CharT>{find_this}; |
| } |
| |
| // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters. |
| template <class CharT> |
| struct CharacterMatcher { |
| BasicStringPiece<CharT> find_any_of_these; |
| |
| size_t Find(const std::basic_string<CharT>& input, size_t pos) { |
| return input.find_first_of(find_any_of_these.data(), pos, |
| find_any_of_these.length()); |
| } |
| constexpr size_t MatchSize() { return 1; } |
| }; |
| |
| // Type deduction helper for CharacterMatcher. |
| template <typename T, typename CharT = typename T::value_type> |
| auto MakeCharacterMatcher(T find_any_of_these) { |
| return CharacterMatcher<CharT>{find_any_of_these}; |
| } |
| |
| enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST }; |
| |
| // Runs in O(n) time in the length of |str|, and transforms the string without |
| // reallocating when possible. Returns |true| if any matches were found. |
| // |
| // This is parameterized on a |Matcher| traits type, so that it can be the |
| // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset(). |
| template <typename Matcher, typename T, typename CharT = typename T::value_type> |
| bool DoReplaceMatchesAfterOffset(std::basic_string<CharT>* str, |
| size_t initial_offset, |
| Matcher matcher, |
| T replace_with, |
| ReplaceType replace_type) { |
| using CharTraits = std::char_traits<CharT>; |
| |
| const size_t find_length = matcher.MatchSize(); |
| if (!find_length) |
| return false; |
| |
| // If the find string doesn't appear, there's nothing to do. |
| size_t first_match = matcher.Find(*str, initial_offset); |
| if (first_match == std::basic_string<CharT>::npos) |
| return false; |
| |
| // If we're only replacing one instance, there's no need to do anything |
| // complicated. |
| const size_t replace_length = replace_with.length(); |
| if (replace_type == ReplaceType::REPLACE_FIRST) { |
| str->replace(first_match, find_length, replace_with.data(), replace_length); |
| return true; |
| } |
| |
| // If the find and replace strings are the same length, we can simply use |
| // replace() on each instance, and finish the entire operation in O(n) time. |
| if (find_length == replace_length) { |
| auto* buffer = &((*str)[0]); |
| for (size_t offset = first_match; offset != std::basic_string<CharT>::npos; |
| offset = matcher.Find(*str, offset + replace_length)) { |
| CharTraits::copy(buffer + offset, replace_with.data(), replace_length); |
| } |
| return true; |
| } |
| |
| // Since the find and replace strings aren't the same length, a loop like the |
| // one above would be O(n^2) in the worst case, as replace() will shift the |
| // entire remaining string each time. We need to be more clever to keep things |
| // O(n). |
| // |
| // When the string is being shortened, it's possible to just shift the matches |
| // down in one pass while finding, and truncate the length at the end of the |
| // search. |
| // |
| // If the string is being lengthened, more work is required. The strategy used |
| // here is to make two find() passes through the string. The first pass counts |
| // the number of matches to determine the new size. The second pass will |
| // either construct the new string into a new buffer (if the existing buffer |
| // lacked capacity), or else -- if there is room -- create a region of scratch |
| // space after |first_match| by shifting the tail of the string to a higher |
| // index, and doing in-place moves from the tail to lower indices thereafter. |
| size_t str_length = str->length(); |
| size_t expansion = 0; |
| if (replace_length > find_length) { |
| // This operation lengthens the string; determine the new length by counting |
| // matches. |
| const size_t expansion_per_match = (replace_length - find_length); |
| size_t num_matches = 0; |
| for (size_t match = first_match; match != std::basic_string<CharT>::npos; |
| match = matcher.Find(*str, match + find_length)) { |
| expansion += expansion_per_match; |
| ++num_matches; |
| } |
| const size_t final_length = str_length + expansion; |
| |
| if (str->capacity() < final_length) { |
| // If we'd have to allocate a new buffer to grow the string, build the |
| // result directly into the new allocation via append(). |
| std::basic_string<CharT> src(str->get_allocator()); |
| str->swap(src); |
| str->reserve(final_length); |
| |
| size_t pos = 0; |
| for (size_t match = first_match;; match = matcher.Find(src, pos)) { |
| str->append(src, pos, match - pos); |
| str->append(replace_with.data(), replace_length); |
| pos = match + find_length; |
| |
| // A mid-loop test/break enables skipping the final Find() call; the |
| // number of matches is known, so don't search past the last one. |
| if (!--num_matches) |
| break; |
| } |
| |
| // Handle substring after the final match. |
| str->append(src, pos, str_length - pos); |
| return true; |
| } |
| |
| // Prepare for the copy/move loop below -- expand the string to its final |
| // size by shifting the data after the first match to the end of the resized |
| // string. |
| size_t shift_src = first_match + find_length; |
| size_t shift_dst = shift_src + expansion; |
| |
| // Big |expansion| factors (relative to |str_length|) require padding up to |
| // |shift_dst|. |
| if (shift_dst > str_length) |
| str->resize(shift_dst); |
| |
| str->replace(shift_dst, str_length - shift_src, *str, shift_src, |
| str_length - shift_src); |
| str_length = final_length; |
| } |
| |
| // We can alternate replacement and move operations. This won't overwrite the |
| // unsearched region of the string so long as |write_offset| <= |read_offset|; |
| // that condition is always satisfied because: |
| // |
| // (a) If the string is being shortened, |expansion| is zero and |
| // |write_offset| grows slower than |read_offset|. |
| // |
| // (b) If the string is being lengthened, |write_offset| grows faster than |
| // |read_offset|, but |expansion| is big enough so that |write_offset| |
| // will only catch up to |read_offset| at the point of the last match. |
| auto* buffer = &((*str)[0]); |
| size_t write_offset = first_match; |
| size_t read_offset = first_match + expansion; |
| do { |
| if (replace_length) { |
| CharTraits::copy(buffer + write_offset, replace_with.data(), |
| replace_length); |
| write_offset += replace_length; |
| } |
| read_offset += find_length; |
| |
| // min() clamps std::basic_string<CharT>::npos (the largest unsigned value) |
| // to str_length. |
| size_t match = std::min(matcher.Find(*str, read_offset), str_length); |
| |
| size_t length = match - read_offset; |
| if (length) { |
| CharTraits::move(buffer + write_offset, buffer + read_offset, length); |
| write_offset += length; |
| read_offset += length; |
| } |
| } while (read_offset < str_length); |
| |
| // If we're shortening the string, truncate it now. |
| str->resize(write_offset); |
| return true; |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| bool ReplaceCharsT(T input, |
| T find_any_of_these, |
| T replace_with, |
| std::basic_string<CharT>* output) { |
| // Commonly, this is called with output and input being the same string; in |
| // that case, skip the copy. |
| if (input.data() != output->data() || input.size() != output->size()) |
| output->assign(input.data(), input.size()); |
| |
| return DoReplaceMatchesAfterOffset(output, 0, |
| MakeCharacterMatcher(find_any_of_these), |
| replace_with, ReplaceType::REPLACE_ALL); |
| } |
| |
| template <class string_type> |
| inline typename string_type::value_type* WriteIntoT(string_type* str, |
| size_t length_with_null) { |
| GURL_DCHECK_GE(length_with_null, 1u); |
| str->reserve(length_with_null); |
| str->resize(length_with_null - 1); |
| return &((*str)[0]); |
| } |
| |
| // Generic version for all JoinString overloads. |list_type| must be a sequence |
| // (gurl_base::span or std::initializer_list) of strings/StringPieces (std::string, |
| // std::u16string, StringPiece or StringPiece16). |CharT| is either char or |
| // char16_t. |
| template <typename list_type, |
| typename T, |
| typename CharT = typename T::value_type> |
| static std::basic_string<CharT> JoinStringT(list_type parts, T sep) { |
| if (std::empty(parts)) |
| return std::basic_string<CharT>(); |
| |
| // Pre-allocate the eventual size of the string. Start with the size of all of |
| // the separators (note that this *assumes* parts.size() > 0). |
| size_t total_size = (parts.size() - 1) * sep.size(); |
| for (const auto& part : parts) |
| total_size += part.size(); |
| std::basic_string<CharT> result; |
| result.reserve(total_size); |
| |
| auto iter = parts.begin(); |
| GURL_DCHECK(iter != parts.end()); |
| result.append(iter->data(), iter->size()); |
| ++iter; |
| |
| for (; iter != parts.end(); ++iter) { |
| result.append(sep.data(), sep.size()); |
| result.append(iter->data(), iter->size()); |
| } |
| |
| // Sanity-check that we pre-allocated correctly. |
| GURL_DCHECK_EQ(total_size, result.size()); |
| |
| return result; |
| } |
| |
| template <typename T, typename CharT = typename T::value_type> |
| std::basic_string<CharT> DoReplaceStringPlaceholders( |
| T format_string, |
| const std::vector<std::basic_string<CharT>>& subst, |
| std::vector<size_t>* offsets) { |
| size_t substitutions = subst.size(); |
| GURL_DCHECK_LT(substitutions, 10U); |
| |
| size_t sub_length = 0; |
| for (const auto& cur : subst) |
| sub_length += cur.length(); |
| |
| std::basic_string<CharT> formatted; |
| formatted.reserve(format_string.length() + sub_length); |
| |
| std::vector<ReplacementOffset> r_offsets; |
| for (auto i = format_string.begin(); i != format_string.end(); ++i) { |
| if ('$' == *i) { |
| if (i + 1 != format_string.end()) { |
| ++i; |
| if ('$' == *i) { |
| while (i != format_string.end() && '$' == *i) { |
| formatted.push_back('$'); |
| ++i; |
| } |
| --i; |
| } else { |
| if (*i < '1' || *i > '9') { |
| GURL_DLOG(ERROR) << "Invalid placeholder: $" << *i; |
| continue; |
| } |
| uintptr_t index = *i - '1'; |
| if (offsets) { |
| ReplacementOffset r_offset(index, |
| static_cast<int>(formatted.size())); |
| r_offsets.insert( |
| ranges::upper_bound(r_offsets, r_offset, &CompareParameter), |
| r_offset); |
| } |
| if (index < substitutions) |
| formatted.append(subst.at(index)); |
| } |
| } |
| } else { |
| formatted.push_back(*i); |
| } |
| } |
| if (offsets) { |
| for (const auto& cur : r_offsets) |
| offsets->push_back(cur.offset); |
| } |
| return formatted; |
| } |
| |
| // The following code is compatible with the OpenBSD lcpy interface. See: |
| // http://www.gratisoft.us/todd/papers/strlcpy.html |
| // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c |
| |
| template <typename CHAR> |
| size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { |
| for (size_t i = 0; i < dst_size; ++i) { |
| if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. |
| return i; |
| } |
| |
| // We were left off at dst_size. We over copied 1 byte. Null terminate. |
| if (dst_size != 0) |
| dst[dst_size - 1] = 0; |
| |
| // Count the rest of the |src|, and return it's length in characters. |
| while (src[dst_size]) |
| ++dst_size; |
| return dst_size; |
| } |
| |
| } // namespace internal |
| |
| } // namespace base |
| |
| #endif // BASE_STRINGS_STRING_UTIL_INTERNAL_H_ |