| // Copyright 2014 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "url/url_canon_icu.h" |
| |
| #include <stddef.h> |
| |
| #include "polyfills/base/logging.h" |
| #include "base/memory/raw_ptr.h" |
| #include "testing/gtest/include/gtest/gtest.h" |
| #include <unicode/ucnv.h> |
| #include "url/url_canon.h" |
| #include "url/url_canon_stdstring.h" |
| #include "url/url_test_utils.h" |
| |
| namespace url { |
| |
| namespace { |
| |
| // Wrapper around a UConverter object that managers creation and destruction. |
| class UConvScoper { |
| public: |
| explicit UConvScoper(const char* charset_name) { |
| UErrorCode err = U_ZERO_ERROR; |
| converter_ = ucnv_open(charset_name, &err); |
| if (!converter_) { |
| GURL_LOG(ERROR) << "Failed to open charset " << charset_name << ": " |
| << u_errorName(err); |
| } |
| } |
| |
| ~UConvScoper() { |
| if (converter_) |
| ucnv_close(converter_); |
| } |
| |
| // Returns the converter object, may be NULL. |
| UConverter* converter() const { return converter_; } |
| |
| private: |
| raw_ptr<UConverter> converter_; |
| }; |
| |
| TEST(URLCanonIcuTest, ICUCharsetConverter) { |
| struct ICUCase { |
| const wchar_t* input; |
| const char* encoding; |
| const char* expected; |
| } icu_cases[] = { |
| // UTF-8. |
| {L"Hello, world", "utf-8", "Hello, world"}, |
| {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"}, |
| // Non-BMP UTF-8. |
| {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"}, |
| // Big5 |
| {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"}, |
| // Unrepresentable character in the destination set. |
| {L"hello\x4f60\x06de\x597dworld", "big5", |
| "hello\xa7\x41%26%231758%3B\xa6\x6eworld"}, |
| }; |
| |
| for (size_t i = 0; i < std::size(icu_cases); i++) { |
| UConvScoper conv(icu_cases[i].encoding); |
| ASSERT_TRUE(conv.converter() != NULL); |
| ICUCharsetConverter converter(conv.converter()); |
| |
| std::string str; |
| StdStringCanonOutput output(&str); |
| |
| std::u16string input_str( |
| test_utils::TruncateWStringToUTF16(icu_cases[i].input)); |
| int input_len = static_cast<int>(input_str.length()); |
| converter.ConvertFromUTF16(input_str.c_str(), input_len, &output); |
| output.Complete(); |
| |
| EXPECT_STREQ(icu_cases[i].expected, str.c_str()); |
| } |
| |
| // Test string sizes around the resize boundary for the output to make sure |
| // the converter resizes as needed. |
| const int static_size = 16; |
| UConvScoper conv("utf-8"); |
| ASSERT_TRUE(conv.converter()); |
| ICUCharsetConverter converter(conv.converter()); |
| for (int i = static_size - 2; i <= static_size + 2; i++) { |
| // Make a string with the appropriate length. |
| std::u16string input; |
| for (int ch = 0; ch < i; ch++) |
| input.push_back('a'); |
| |
| RawCanonOutput<static_size> output; |
| converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()), |
| &output); |
| EXPECT_EQ(input.length(), output.length()); |
| } |
| } |
| |
| TEST(URLCanonIcuTest, QueryWithConverter) { |
| struct QueryCase { |
| const char* input8; |
| const wchar_t* input16; |
| const char* encoding; |
| const char* expected; |
| } query_cases[] = { |
| // Regular ASCII case in some different encodings. |
| {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"}, |
| {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"}, |
| {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"}, |
| // Chinese input/output |
| {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", |
| "?q=%C4%E3%BA%C3"}, |
| {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"}, |
| // Unencodable character in the destination character set should be |
| // escaped. The escape sequence unescapes to be the entity name: |
| // "?q=你" |
| {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", |
| "?q=Chinese%26%2365319%3B"}, |
| }; |
| |
| for (size_t i = 0; i < std::size(query_cases); i++) { |
| Component out_comp; |
| |
| UConvScoper conv(query_cases[i].encoding); |
| ASSERT_TRUE(!query_cases[i].encoding || conv.converter()); |
| ICUCharsetConverter converter(conv.converter()); |
| |
| if (query_cases[i].input8) { |
| int len = static_cast<int>(strlen(query_cases[i].input8)); |
| Component in_comp(0, len); |
| std::string out_str; |
| |
| StdStringCanonOutput output(&out_str); |
| CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output, |
| &out_comp); |
| output.Complete(); |
| |
| EXPECT_EQ(query_cases[i].expected, out_str); |
| } |
| |
| if (query_cases[i].input16) { |
| std::u16string input16( |
| test_utils::TruncateWStringToUTF16(query_cases[i].input16)); |
| int len = static_cast<int>(input16.length()); |
| Component in_comp(0, len); |
| std::string out_str; |
| |
| StdStringCanonOutput output(&out_str); |
| CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output, |
| &out_comp); |
| output.Complete(); |
| |
| EXPECT_EQ(query_cases[i].expected, out_str); |
| } |
| } |
| |
| // Extra test for input with embedded NULL; |
| std::string out_str; |
| StdStringCanonOutput output(&out_str); |
| Component out_comp; |
| CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); |
| output.Complete(); |
| EXPECT_EQ("?a%20%00z%01", out_str); |
| } |
| |
| } // namespace |
| |
| } // namespace url |