base/strings/utf_offset_string_conversions.cc - googleurl - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "base/strings/utf_offset_string_conversions.h"

 #include <stdint.h>

 #include <algorithm>
 #include <memory>

 #include "polyfills/base/check_op.h"
 #include "base/strings/string_piece.h"
 #include "base/strings/utf_string_conversion_utils.h"

 namespace gurl_base {

 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
                                        size_t original_length,
                                        size_t output_length)
     : original_offset(original_offset),
       original_length(original_length),
       output_length(output_length) {
 }

 // static
 void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
                                    std::vector<size_t>* offsets_for_adjustment,
                                    size_t limit) {
   GURL_DCHECK(offsets_for_adjustment);
   for (auto& i : *offsets_for_adjustment)
     AdjustOffset(adjustments, &i, limit);
 }

 // static
 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
                                   size_t* offset,
                                   size_t limit) {
   GURL_DCHECK(offset);
   if (*offset == std::u16string::npos)
     return;
   int adjustment = 0;
   for (const auto& i : adjustments) {
     if (*offset <= i.original_offset)
       break;
     if (*offset < (i.original_offset + i.original_length)) {
       *offset = std::u16string::npos;
       return;
     }
     adjustment += static_cast<int>(i.original_length - i.output_length);
   }
   *offset -= adjustment;

   if (*offset > limit)
     *offset = std::u16string::npos;
 }

 // static
 void OffsetAdjuster::UnadjustOffsets(
     const Adjustments& adjustments,
     std::vector<size_t>* offsets_for_unadjustment) {
   if (!offsets_for_unadjustment || adjustments.empty())
     return;
   for (auto& i : *offsets_for_unadjustment)
     UnadjustOffset(adjustments, &i);
 }

 // static
 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
                                     size_t* offset) {
   if (*offset == std::u16string::npos)
     return;
   int adjustment = 0;
   for (const auto& i : adjustments) {
     if (*offset + adjustment <= i.original_offset)
       break;
     adjustment += static_cast<int>(i.original_length - i.output_length);
     if ((*offset + adjustment) < (i.original_offset + i.original_length)) {
       *offset = std::u16string::npos;
       return;
     }
   }
   *offset += adjustment;
 }

 // static
 void OffsetAdjuster::MergeSequentialAdjustments(
     const Adjustments& first_adjustments,
     Adjustments* adjustments_on_adjusted_string) {
   auto adjusted_iter = adjustments_on_adjusted_string->begin();
   auto first_iter = first_adjustments.begin();
   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
   // |first_adjustments|, pushing adjustments at the end of
   // |adjustments_builder| as we go.  |shift| keeps track of the current number
   // of characters collapsed by |first_adjustments| up to this point.
   // |currently_collapsing| keeps track of the number of characters collapsed by
   // |first_adjustments| into the current |adjusted_iter|'s length.  These are
   // characters that will change |shift| as soon as we're done processing the
   // current |adjusted_iter|; they are not yet reflected in |shift|.
   size_t shift = 0;
   size_t currently_collapsing = 0;
   // While we *could* update |adjustments_on_adjusted_string| in place by
   // inserting new adjustments into the middle, we would be repeatedly calling
   // |std::vector::insert|. That would cost O(n) time per insert, relative to
   // distance from end of the string.  By instead allocating
   // |adjustments_builder| and calling |std::vector::push_back|, we only pay
   // amortized constant time per push. We are trading space for time.
   Adjustments adjustments_builder;
   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
     if ((first_iter == first_adjustments.end()) ||
         ((adjusted_iter->original_offset + shift +
           adjusted_iter->original_length) <= first_iter->original_offset)) {
       // Entire |adjusted_iter| (accounting for its shift and including its
       // whole original length) comes before |first_iter|.
       //
       // Correct the offset at |adjusted_iter| and move onto the next
       // adjustment that needs revising.
       adjusted_iter->original_offset += shift;
       shift += currently_collapsing;
       currently_collapsing = 0;
       adjustments_builder.push_back(*adjusted_iter);
       ++adjusted_iter;
     } else if ((adjusted_iter->original_offset + shift) >
                first_iter->original_offset) {
       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).

       // It's not possible for the adjustments to overlap.  (It shouldn't
       // be possible that we have an |adjusted_iter->original_offset| that,
       // when adjusted by the computed |shift|, is in the middle of
       // |first_iter|'s output's length.  After all, that would mean the
       // current adjustment_on_adjusted_string somehow points to an offset
       // that was supposed to have been eliminated by the first set of
       // adjustments.)
       GURL_DCHECK_LE(first_iter->original_offset + first_iter->output_length,
                 adjusted_iter->original_offset + shift);

       // Add the |first_iter| to the full set of adjustments.
       shift += first_iter->original_length - first_iter->output_length;
       adjustments_builder.push_back(*first_iter);
       ++first_iter;
     } else {
       // The first adjustment adjusted something that then got further adjusted
       // by the second set of adjustments.  In other words, |first_iter| points
       // to something in the range covered by |adjusted_iter|'s length (after
       // accounting for |shift|).  Precisely,
       //   adjusted_iter->original_offset + shift
       //   <=
       //   first_iter->original_offset
       //   <=
       //   adjusted_iter->original_offset + shift +
       //       adjusted_iter->original_length

       // Modify the current |adjusted_iter| to include whatever collapsing
       // happened in |first_iter|, then advance to the next |first_adjustments|
       // because we dealt with the current one.
       const int collapse = static_cast<int>(first_iter->original_length) -
           static_cast<int>(first_iter->output_length);
       // This function does not know how to deal with a string that expands and
       // then gets modified, only strings that collapse and then get modified.
       GURL_DCHECK_GT(collapse, 0);
       adjusted_iter->original_length += collapse;
       currently_collapsing += collapse;
       ++first_iter;
     }
   }
   GURL_DCHECK_EQ(0u, currently_collapsing);
   if (first_iter != first_adjustments.end()) {
     // Only first adjustments are left.  These do not need to be modified.
     // (Their offsets are already correct with respect to the original string.)
     // Append them all.
     GURL_DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
     adjustments_builder.insert(adjustments_builder.end(), first_iter,
                                first_adjustments.end());
   }
   *adjustments_on_adjusted_string = std::move(adjustments_builder);
 }

 // Converts the given source Unicode character type to the given destination
 // Unicode character type as a STL string. The given input buffer and size
 // determine the source, and the given output STL string will be replaced by
 // the result.  If non-NULL, |adjustments| is set to reflect the all the
 // alterations to the string that are not one-character-to-one-character.
 // It will always be sorted by increasing offset.
 template<typename SrcChar, typename DestStdString>
 bool ConvertUnicode(const SrcChar* src,
                     size_t src_len,
                     DestStdString* output,
                     OffsetAdjuster::Adjustments* adjustments) {
   if (adjustments)
     adjustments->clear();
   // ICU requires 32-bit numbers.
   bool success = true;
   int32_t src_len32 = static_cast<int32_t>(src_len);
   for (int32_t i = 0; i < src_len32; i++) {
     uint32_t code_point;
     size_t original_i = i;
     size_t chars_written = 0;
     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
       chars_written = WriteUnicodeCharacter(code_point, output);
     } else {
       chars_written = WriteUnicodeCharacter(0xFFFD, output);
       success = false;
     }

     // Only bother writing an adjustment if this modification changed the
     // length of this character.
     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
     // character read, not after it (so that incrementing it in the loop
     // increment will place it at the right location), so we need to account
     // for that in determining the amount that was read.
     if (adjustments && ((i - original_i + 1) != chars_written)) {
       adjustments->push_back(OffsetAdjuster::Adjustment(
           original_i, i - original_i + 1, chars_written));
     }
   }
   return success;
 }

 bool UTF8ToUTF16WithAdjustments(
     const char* src,
     size_t src_len,
     std::u16string* output,
     gurl_base::OffsetAdjuster::Adjustments* adjustments) {
   PrepareForUTF16Or32Output(src, src_len, output);
   return ConvertUnicode(src, src_len, output, adjustments);
 }

 std::u16string UTF8ToUTF16WithAdjustments(
     const gurl_base::StringPiece& utf8,
     gurl_base::OffsetAdjuster::Adjustments* adjustments) {
   std::u16string result;
   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
   return result;
 }

 std::u16string UTF8ToUTF16AndAdjustOffsets(
     const gurl_base::StringPiece& utf8,
     std::vector<size_t>* offsets_for_adjustment) {
   for (size_t& offset : *offsets_for_adjustment) {
     if (offset > utf8.length())
       offset = std::u16string::npos;
   }
   OffsetAdjuster::Adjustments adjustments;
   std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
   return result;
 }

 std::string UTF16ToUTF8AndAdjustOffsets(
     const gurl_base::StringPiece16& utf16,
     std::vector<size_t>* offsets_for_adjustment) {
   for (size_t& offset : *offsets_for_adjustment) {
     if (offset > utf16.length())
       offset = std::u16string::npos;
   }
   std::string result;
   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
   OffsetAdjuster::Adjustments adjustments;
   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
   return result;
 }

 }  // namespace base
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "base/strings/utf_offset_string_conversions.h"

	#include <stdint.h>

	#include <algorithm>
	#include <memory>

	#include "polyfills/base/check_op.h"
	#include "base/strings/string_piece.h"
	#include "base/strings/utf_string_conversion_utils.h"

	namespace gurl_base {

	OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
	size_t original_length,
	size_t output_length)
	: original_offset(original_offset),
	original_length(original_length),
	output_length(output_length) {
	}

	// static
	void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
	std::vector<size_t>* offsets_for_adjustment,
	size_t limit) {
	GURL_DCHECK(offsets_for_adjustment);
	for (auto& i : *offsets_for_adjustment)
	AdjustOffset(adjustments, &i, limit);
	}

	// static
	void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
	size_t* offset,
	size_t limit) {
	GURL_DCHECK(offset);
	if (*offset == std::u16string::npos)
	return;
	int adjustment = 0;
	for (const auto& i : adjustments) {
	if (*offset <= i.original_offset)
	break;
	if (*offset < (i.original_offset + i.original_length)) {
	*offset = std::u16string::npos;
	return;
	}
	adjustment += static_cast<int>(i.original_length - i.output_length);
	}
	*offset -= adjustment;

	if (*offset > limit)
	*offset = std::u16string::npos;
	}

	// static
	void OffsetAdjuster::UnadjustOffsets(
	const Adjustments& adjustments,
	std::vector<size_t>* offsets_for_unadjustment) {
	if (!offsets_for_unadjustment \|\| adjustments.empty())
	return;
	for (auto& i : *offsets_for_unadjustment)
	UnadjustOffset(adjustments, &i);
	}

	// static
	void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
	size_t* offset) {
	if (*offset == std::u16string::npos)
	return;
	int adjustment = 0;
	for (const auto& i : adjustments) {
	if (*offset + adjustment <= i.original_offset)
	break;
	adjustment += static_cast<int>(i.original_length - i.output_length);
	if ((*offset + adjustment) < (i.original_offset + i.original_length)) {
	*offset = std::u16string::npos;
	return;
	}
	}
	*offset += adjustment;
	}

	// static
	void OffsetAdjuster::MergeSequentialAdjustments(
	const Adjustments& first_adjustments,
	Adjustments* adjustments_on_adjusted_string) {
	auto adjusted_iter = adjustments_on_adjusted_string->begin();
	auto first_iter = first_adjustments.begin();
	// Simultaneously iterate over all \|adjustments_on_adjusted_string\| and
	// \|first_adjustments\|, pushing adjustments at the end of
	// \|adjustments_builder\| as we go. \|shift\| keeps track of the current number
	// of characters collapsed by \|first_adjustments\| up to this point.
	// \|currently_collapsing\| keeps track of the number of characters collapsed by
	// \|first_adjustments\| into the current \|adjusted_iter\|'s length. These are
	// characters that will change \|shift\| as soon as we're done processing the
	// current \|adjusted_iter\|; they are not yet reflected in \|shift\|.
	size_t shift = 0;
	size_t currently_collapsing = 0;
	// While we could update \|adjustments_on_adjusted_string\| in place by
	// inserting new adjustments into the middle, we would be repeatedly calling
	// \|std::vector::insert\|. That would cost O(n) time per insert, relative to
	// distance from end of the string. By instead allocating
	// \|adjustments_builder\| and calling \|std::vector::push_back\|, we only pay
	// amortized constant time per push. We are trading space for time.
	Adjustments adjustments_builder;
	while (adjusted_iter != adjustments_on_adjusted_string->end()) {
	if ((first_iter == first_adjustments.end()) \|\|
	((adjusted_iter->original_offset + shift +
	adjusted_iter->original_length) <= first_iter->original_offset)) {
	// Entire \|adjusted_iter\| (accounting for its shift and including its
	// whole original length) comes before \|first_iter\|.
	//
	// Correct the offset at \|adjusted_iter\| and move onto the next
	// adjustment that needs revising.
	adjusted_iter->original_offset += shift;
	shift += currently_collapsing;
	currently_collapsing = 0;
	adjustments_builder.push_back(*adjusted_iter);
	++adjusted_iter;
	} else if ((adjusted_iter->original_offset + shift) >
	first_iter->original_offset) {
	// \|first_iter\| comes before the \|adjusted_iter\| (as adjusted by \|shift\|).

	// It's not possible for the adjustments to overlap. (It shouldn't
	// be possible that we have an \|adjusted_iter->original_offset\| that,
	// when adjusted by the computed \|shift\|, is in the middle of
	// \|first_iter\|'s output's length. After all, that would mean the
	// current adjustment_on_adjusted_string somehow points to an offset
	// that was supposed to have been eliminated by the first set of
	// adjustments.)
	GURL_DCHECK_LE(first_iter->original_offset + first_iter->output_length,
	adjusted_iter->original_offset + shift);

	// Add the \|first_iter\| to the full set of adjustments.
	shift += first_iter->original_length - first_iter->output_length;
	adjustments_builder.push_back(*first_iter);
	++first_iter;
	} else {
	// The first adjustment adjusted something that then got further adjusted
	// by the second set of adjustments. In other words, \|first_iter\| points
	// to something in the range covered by \|adjusted_iter\|'s length (after
	// accounting for \|shift\|). Precisely,
	// adjusted_iter->original_offset + shift
	// <=
	// first_iter->original_offset
	// <=
	// adjusted_iter->original_offset + shift +
	// adjusted_iter->original_length

	// Modify the current \|adjusted_iter\| to include whatever collapsing
	// happened in \|first_iter\|, then advance to the next \|first_adjustments\|
	// because we dealt with the current one.
	const int collapse = static_cast<int>(first_iter->original_length) -
	static_cast<int>(first_iter->output_length);
	// This function does not know how to deal with a string that expands and
	// then gets modified, only strings that collapse and then get modified.
	GURL_DCHECK_GT(collapse, 0);
	adjusted_iter->original_length += collapse;
	currently_collapsing += collapse;
	++first_iter;
	}
	}
	GURL_DCHECK_EQ(0u, currently_collapsing);
	if (first_iter != first_adjustments.end()) {
	// Only first adjustments are left. These do not need to be modified.
	// (Their offsets are already correct with respect to the original string.)
	// Append them all.
	GURL_DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
	adjustments_builder.insert(adjustments_builder.end(), first_iter,
	first_adjustments.end());
	}
	*adjustments_on_adjusted_string = std::move(adjustments_builder);
	}

	// Converts the given source Unicode character type to the given destination
	// Unicode character type as a STL string. The given input buffer and size
	// determine the source, and the given output STL string will be replaced by
	// the result. If non-NULL, \|adjustments\| is set to reflect the all the
	// alterations to the string that are not one-character-to-one-character.
	// It will always be sorted by increasing offset.
	template<typename SrcChar, typename DestStdString>
	bool ConvertUnicode(const SrcChar* src,
	size_t src_len,
	DestStdString* output,
	OffsetAdjuster::Adjustments* adjustments) {
	if (adjustments)
	adjustments->clear();
	// ICU requires 32-bit numbers.
	bool success = true;
	int32_t src_len32 = static_cast<int32_t>(src_len);
	for (int32_t i = 0; i < src_len32; i++) {
	uint32_t code_point;
	size_t original_i = i;
	size_t chars_written = 0;
	if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
	chars_written = WriteUnicodeCharacter(code_point, output);
	} else {
	chars_written = WriteUnicodeCharacter(0xFFFD, output);
	success = false;
	}

	// Only bother writing an adjustment if this modification changed the
	// length of this character.
	// NOTE: ReadUnicodeCharacter() adjusts \|i\| to point _at_ the last
	// character read, not after it (so that incrementing it in the loop
	// increment will place it at the right location), so we need to account
	// for that in determining the amount that was read.
	if (adjustments && ((i - original_i + 1) != chars_written)) {
	adjustments->push_back(OffsetAdjuster::Adjustment(
	original_i, i - original_i + 1, chars_written));
	}
	}
	return success;
	}

	bool UTF8ToUTF16WithAdjustments(
	const char* src,
	size_t src_len,
	std::u16string* output,
	gurl_base::OffsetAdjuster::Adjustments* adjustments) {
	PrepareForUTF16Or32Output(src, src_len, output);
	return ConvertUnicode(src, src_len, output, adjustments);
	}

	std::u16string UTF8ToUTF16WithAdjustments(
	const gurl_base::StringPiece& utf8,
	gurl_base::OffsetAdjuster::Adjustments* adjustments) {
	std::u16string result;
	UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
	return result;
	}

	std::u16string UTF8ToUTF16AndAdjustOffsets(
	const gurl_base::StringPiece& utf8,
	std::vector<size_t>* offsets_for_adjustment) {
	for (size_t& offset : *offsets_for_adjustment) {
	if (offset > utf8.length())
	offset = std::u16string::npos;
	}
	OffsetAdjuster::Adjustments adjustments;
	std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
	OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
	return result;
	}

	std::string UTF16ToUTF8AndAdjustOffsets(
	const gurl_base::StringPiece16& utf16,
	std::vector<size_t>* offsets_for_adjustment) {
	for (size_t& offset : *offsets_for_adjustment) {
	if (offset > utf16.length())
	offset = std::u16string::npos;
	}
	std::string result;
	PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
	OffsetAdjuster::Adjustments adjustments;
	ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
	OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
	return result;
	}

	} // namespace base