base/third_party/icu/icu_utf.cc - googleurl - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
 *   Copyright (C) 1999-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  utf_impl.cpp
 *   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 1999sep13
 *   created by: Markus W. Scherer
 *
 *   This file provides implementation functions for macros in the utfXX.h
 *   that would otherwise be too long as macros.
 */

 #include "base/third_party/icu/icu_utf.h"

 namespace base_icu {

 // source/common/utf_impl.cpp

 // Error code points used by the obsolete strict>=0 modes, looked up by
 // errorValue() with its count argument as the index.
 // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
 // but without relying on the obsolete unicode/utf_old.h.
 // Only four of the six slots are listed; the remaining two are zero-initialized.
 static const UChar32 utf8_errorValue[6] = {
     0x15,
     0x9f,
     0xffff,
     0x10ffff,
 };

 // Selects the value reported for an ill-formed sequence.
 //   strict==-3       -> U+FFFD
 //   any other <0     -> CBU_SENTINEL
 //   strict>=0        -> utf8_errorValue[count]
 // count is only used in the strict>=0 case and is not range-checked here.
 static UChar32
 errorValue(int32_t count, int8_t strict) {
     if(strict==-3) {
         return 0xfffd;
     }
     if(strict<0) {
         return CBU_SENTINEL;
     }
     return utf8_errorValue[count];
 }

 /*
  * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
  * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
  *
  * U8_NEXT() supports NUL-terminated strings indicated via length<0.
  *
  * Parameters:
  *   s      - the UTF-8 bytes being decoded.
  *   pi     - in: index one after the lead byte c;
  *            out: index one after the decoded sequence on success, or the
  *            index at which validation stopped on error.
  *   length - index limit for s; may be negative for NUL-terminated strings.
  *   c      - the lead byte, already read by the caller
  *            (presumably >=0x80; confirm against the calling macros).
  *   strict - selects the error behavior, see below.
  *
  * Returns the decoded code point on success, otherwise the value that
  * errorValue() selects for the given strict mode.
  *
  * The "strict" parameter controls the error behavior:
  * <0  "Safe" behavior of U8_NEXT():
  *     -1: All illegal byte sequences yield U_SENTINEL=-1
  *         (returned by errorValue() as CBU_SENTINEL).
  *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
  *         Some implementations use this for roundtripping of
  *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
  *         contain unpaired surrogates.
  *     -3: All illegal byte sequences yield U+FFFD.
  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
  *     All illegal byte sequences yield a positive code point such that this
  *     result code point would be encoded with the same number of bytes as
  *     the illegal sequence.
  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
  *     Same as the obsolete "safe" behavior, but non-characters are also treated
  *     like illegal sequences.
  *
  * Note that a UBool is the same as an int8_t.
  */
 UChar32
 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
     // *pi is one after byte c.
     int32_t i=*pi;
     // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
     if(i==length || c>0xf4) {
         // end of string, or not a lead byte
         // Nothing is consumed here: control falls to the error handling with i==*pi.
     } else if(c>=0xf0) {
         // Test for 4-byte sequences first because
         // U8_NEXT() handles shorter valid sequences inline.
         uint8_t t1=s[i], t2, t3;
         // Keep only the low three bits of the lead byte (0xf0..0xf4 -> 0..4).
         c&=7;
         // Each "++i!=length" stops before a byte at the limit is read.
         // t2/t3 are uint8_t, so "(t=s[i]-0x80)<=0x3f" wraps around for bytes
         // below 0x80 and therefore holds only for trail bytes 0x80..0xbf.
         // CBU8_IS_VALID_LEAD4_AND_T1 comes from the header; presumably it checks
         // that lead byte and first trail byte stay within U+10000..U+10FFFF -- verify there.
         if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
                 ++i!=length && (t2=s[i]-0x80)<=0x3f &&
                 ++i!=length && (t3=s[i]-0x80)<=0x3f) {
             ++i;
             c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
             // strict: forbid non-characters like U+fffe
             if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
                 *pi=i;
                 return c;
             }
             // strict>0 and c is a non-character: fall through to the error handling.
         }
     } else if(c>=0xe0) {
         // Keep only the low four bits of the lead byte (0xe0..0xef -> 0..0xf).
         c&=0xf;
         if(strict!=-2) {
             uint8_t t1=s[i], t2;
             // CBU8_IS_VALID_LEAD3_AND_T1 comes from the header; presumably it rejects
             // overlong forms and surrogates via the first trail byte -- verify there.
             if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
                     ++i!=length && (t2=s[i]-0x80)<=0x3f) {
                 ++i;
                 c=(c<<12)|((t1&0x3f)<<6)|t2;
                 // strict: forbid non-characters like U+fffe
                 if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
                     *pi=i;
                     return c;
                 }
                 // strict>0 and c is a non-character: fall through to the error handling.
             }
         } else {
             // strict=-2 -> lenient: allow surrogates
             uint8_t t1=s[i]-0x80, t2;
             // "c>0 || t1>=0x20" rejects lead byte 0xe0 followed by a trail byte
             // below 0xa0, whose result would be below U+0800 (an overlong form).
             if(t1<=0x3f && (c>0 || t1>=0x20) &&
                     ++i!=length && (t2=s[i]-0x80)<=0x3f) {
                 *pi=i+1;
                 return (c<<12)|(t1<<6)|t2;
             }
         }
     } else if(c>=0xc2) {
         // 2-byte sequence: one trail byte in 0x80..0xbf completes it.
         uint8_t t1=s[i]-0x80;
         if(t1<=0x3f) {
             *pi=i+1;
             return ((c-0xc0)<<6)|t1;
         }
     }  // else 0x80<=c<0xc2 is not a lead byte

     /* error handling */
     // i-*pi is the number of bytes after the lead byte that i advanced past;
     // errorValue() uses it only when strict>=0.
     c=errorValue(i-*pi, strict);
     // Leave *pi at the first byte that was not accepted (or at the limit).
     *pi=i;
     return c;
 }

 }  // namespace base_icu
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	/*
	******************************************************************************
	*
	* Copyright (C) 1999-2012, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	******************************************************************************
	* file name: utf_impl.cpp
	* encoding: UTF-8
	* tab size: 8 (not used)
	* indentation:4
	*
	* created on: 1999sep13
	* created by: Markus W. Scherer
	*
	* This file provides implementation functions for macros in the utfXX.h
	* that would otherwise be too long as macros.
	*/

	#include "base/third_party/icu/icu_utf.h"

	namespace base_icu {

	// source/common/utf_impl.cpp

	// Error code points used by the obsolete strict>=0 modes, looked up by
	// errorValue() with its count argument as the index.
	// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
	// but without relying on the obsolete unicode/utf_old.h.
	// Only four of the six slots are listed; the remaining two are zero-initialized.
	static const UChar32 utf8_errorValue[6] = {
	    0x15,
	    0x9f,
	    0xffff,
	    0x10ffff,
	};

	// Selects the value reported for an ill-formed sequence.
	//   strict==-3       -> U+FFFD
	//   any other <0     -> CBU_SENTINEL
	//   strict>=0        -> utf8_errorValue[count]
	// count is only used in the strict>=0 case and is not range-checked here.
	static UChar32
	errorValue(int32_t count, int8_t strict) {
	    if(strict==-3) {
	        return 0xfffd;
	    }
	    if(strict<0) {
	        return CBU_SENTINEL;
	    }
	    return utf8_errorValue[count];
	}

	/*
	 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
	 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
	 *
	 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
	 *
	 * Parameters:
	 *   s      - the UTF-8 bytes being decoded.
	 *   pi     - in: index one after the lead byte c;
	 *            out: index one after the decoded sequence on success, or the
	 *            index at which validation stopped on error.
	 *   length - index limit for s; may be negative for NUL-terminated strings.
	 *   c      - the lead byte, already read by the caller
	 *            (presumably >=0x80; confirm against the calling macros).
	 *   strict - selects the error behavior, see below.
	 *
	 * Returns the decoded code point on success, otherwise the value that
	 * errorValue() selects for the given strict mode.
	 *
	 * The "strict" parameter controls the error behavior:
	 * <0  "Safe" behavior of U8_NEXT():
	 *     -1: All illegal byte sequences yield U_SENTINEL=-1
	 *         (returned by errorValue() as CBU_SENTINEL).
	 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
	 *         Some implementations use this for roundtripping of
	 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
	 *         contain unpaired surrogates.
	 *     -3: All illegal byte sequences yield U+FFFD.
	 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
	 *     All illegal byte sequences yield a positive code point such that this
	 *     result code point would be encoded with the same number of bytes as
	 *     the illegal sequence.
	 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
	 *     Same as the obsolete "safe" behavior, but non-characters are also treated
	 *     like illegal sequences.
	 *
	 * Note that a UBool is the same as an int8_t.
	 */
	UChar32
	utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
	    // *pi is one after byte c.
	    int32_t i=*pi;
	    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
	    if(i==length || c>0xf4) {
	        // end of string, or not a lead byte
	        // Nothing is consumed here: control falls to the error handling with i==*pi.
	    } else if(c>=0xf0) {
	        // Test for 4-byte sequences first because
	        // U8_NEXT() handles shorter valid sequences inline.
	        uint8_t t1=s[i], t2, t3;
	        // Keep only the low three bits of the lead byte (0xf0..0xf4 -> 0..4).
	        c&=7;
	        // Each "++i!=length" stops before a byte at the limit is read.
	        // t2/t3 are uint8_t, so "(t=s[i]-0x80)<=0x3f" wraps around for bytes
	        // below 0x80 and therefore holds only for trail bytes 0x80..0xbf.
	        if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
	                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
	                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
	            ++i;
	            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
	            // strict: forbid non-characters like U+fffe
	            if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
	                *pi=i;
	                return c;
	            }
	            // strict>0 and c is a non-character: fall through to the error handling.
	        }
	    } else if(c>=0xe0) {
	        // Keep only the low four bits of the lead byte (0xe0..0xef -> 0..0xf).
	        c&=0xf;
	        if(strict!=-2) {
	            uint8_t t1=s[i], t2;
	            if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
	                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
	                ++i;
	                c=(c<<12)|((t1&0x3f)<<6)|t2;
	                // strict: forbid non-characters like U+fffe
	                if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
	                    *pi=i;
	                    return c;
	                }
	                // strict>0 and c is a non-character: fall through to the error handling.
	            }
	        } else {
	            // strict=-2 -> lenient: allow surrogates
	            uint8_t t1=s[i]-0x80, t2;
	            // "c>0 || t1>=0x20" rejects lead byte 0xe0 followed by a trail byte
	            // below 0xa0, whose result would be below U+0800 (an overlong form).
	            if(t1<=0x3f && (c>0 || t1>=0x20) &&
	                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
	                *pi=i+1;
	                return (c<<12)|(t1<<6)|t2;
	            }
	        }
	    } else if(c>=0xc2) {
	        // 2-byte sequence: one trail byte in 0x80..0xbf completes it.
	        uint8_t t1=s[i]-0x80;
	        if(t1<=0x3f) {
	            *pi=i+1;
	            return ((c-0xc0)<<6)|t1;
	        }
	    }  // else 0x80<=c<0xc2 is not a lead byte

	    /* error handling */
	    // i-*pi is the number of bytes after the lead byte that i advanced past;
	    // errorValue() uses it only when strict>=0.
	    c=errorValue(i-*pi, strict);
	    // Leave *pi at the first byte that was not accepted (or at the limit).
	    *pi=i;
	    return c;
	}

	} // namespace base_icu