/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// test_utf8_codecvt.cpp

// (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
// Use, modification and distribution is subject to the Boost Software
// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
#include <algorithm> // std::copy
#include <fstream>
#include <iostream>
#include <iterator>
#include <locale>
#include <vector>
#include <string>

#include <cstddef> // size_t
#include <cstdlib> // EXIT_SUCCESS
#include <cwchar>
|
|
#include <boost/config.hpp>
|
|
#include <boost/core/no_exceptions_support.hpp>
|
|
|
|
#define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail {
|
|
#define BOOST_UTF8_END_NAMESPACE } }
|
|
#include <boost/detail/utf8_codecvt_facet.hpp>
|
|
#include <boost/detail/utf8_codecvt_facet.ipp>
|
|
|
|
#if defined(BOOST_NO_STDC_NAMESPACE)
|
|
// Compatibility shim for libraries that leave the C library names in the
// global namespace only: re-export the ones this test uses into std.
namespace std{
    using ::size_t;
    using ::wcslen;
#if !defined(UNDER_CE) && !defined(__PGIC__)
    // The type used below is std::wint_t; the previous spelling "w_int"
    // does not name anything and broke the build whenever this branch
    // was compiled.
    using ::wint_t;
#endif
} // namespace std
|
|
#endif
|
|
|
|
// Workaround taken from boost/iostreams/char_traits.hpp.
//
// The Dinkumware library shipped with QNX Momentics 6.3.0 / 4.0.2, and
// Sun C++ 5.8 with STLport 4, define the EOF and WEOF macros in terms of an
// unqualified wint_t. Pulling std::wint_t into this scope makes those
// macros usable here.
// NOTE: Use BOOST_WORKAROUND?
#if defined(__SUNPRO_CC) || (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB))
using ::std::wint_t;
#endif
|
|
|
|
#include <boost/core/lightweight_test.hpp>
|
|
|
|
// Reference data for the codecvt tests, keyed by a size in bytes.
// test_main instantiates this with sizeof(wchar_t); the arrays themselves
// are supplied by the explicit specializations for 2 and 4.
template<std::size_t WideCharBytes>
struct test_data
{
    // UTF-8 byte sequence for the reference characters.
    static unsigned char utf8_encoding[];
    // The same characters as wide characters, in the same order.
    static wchar_t wchar_encoding[];
};
|
|
|
|
template<>
|
|
unsigned char test_data<2>::utf8_encoding[] = {
|
|
0x01,
|
|
0x7f,
|
|
0xc2, 0x80,
|
|
0xdf, 0xbf,
|
|
0xe0, 0xa0, 0x80,
|
|
0xe7, 0xbf, 0xbf
|
|
};
|
|
|
|
template<>
|
|
wchar_t test_data<2>::wchar_encoding[] = {
|
|
0x0001,
|
|
0x007f,
|
|
0x0080,
|
|
0x07ff,
|
|
0x0800,
|
|
0x7fff
|
|
};
|
|
|
|
template<>
|
|
unsigned char test_data<4>::utf8_encoding[] = {
|
|
0x01,
|
|
0x7f,
|
|
0xc2, 0x80,
|
|
0xdf, 0xbf,
|
|
0xe0, 0xa0, 0x80,
|
|
0xef, 0xbf, 0xbf,
|
|
0xf0, 0x90, 0x80, 0x80,
|
|
0xf4, 0x8f, 0xbf, 0xbf,
|
|
/* codecvt implementations for clang and gcc don't handle more than 21 bits and
|
|
* return eof accordlingly. So don't test the whole 32 range
|
|
*/
|
|
/*
|
|
0xf7, 0xbf, 0xbf, 0xbf,
|
|
0xf8, 0x88, 0x80, 0x80, 0x80,
|
|
0xfb, 0xbf, 0xbf, 0xbf, 0xbf,
|
|
0xfc, 0x84, 0x80, 0x80, 0x80, 0x80,
|
|
0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf
|
|
*/
|
|
};
|
|
|
|
template<>
|
|
wchar_t test_data<4>::wchar_encoding[] = {
|
|
(wchar_t)0x00000001,
|
|
(wchar_t)0x0000007f,
|
|
(wchar_t)0x00000080,
|
|
(wchar_t)0x000007ff,
|
|
(wchar_t)0x00000800,
|
|
(wchar_t)0x0000ffff,
|
|
(wchar_t)0x00010000,
|
|
(wchar_t)0x0010ffff,
|
|
/* codecvt implementations for clang and gcc don't handle more than 21 bits and
|
|
* return eof accordlingly. So don't test the whole 32 range
|
|
*/
|
|
/*
|
|
(wchar_t)0x001fffff,
|
|
(wchar_t)0x00200000,
|
|
(wchar_t)0x03ffffff,
|
|
(wchar_t)0x04000000,
|
|
(wchar_t)0x7fffffff
|
|
*/
|
|
};
|
|
|
|
int
|
|
test_main(int /* argc */, char * /* argv */[]) {
|
|
std::locale utf8_locale
|
|
= std::locale(
|
|
std::locale::classic(),
|
|
new boost::detail::utf8_codecvt_facet
|
|
);
|
|
|
|
typedef char utf8_t;
|
|
// define test data compatible with the wchar_t implementation
|
|
// as either ucs-2 or ucs-4 depending on the compiler/library.
|
|
typedef test_data<sizeof(wchar_t)> td;
|
|
|
|
// Send our test UTF-8 data to file
|
|
{
|
|
std::ofstream ofs;
|
|
ofs.open("test.dat");
|
|
std::copy(
|
|
td::utf8_encoding,
|
|
td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
|
|
std::ostream_iterator<utf8_t>(ofs)
|
|
);
|
|
}
|
|
|
|
// Read the test data back in, converting to UCS-4 on the way in
|
|
std::vector<wchar_t> from_file;
|
|
{
|
|
std::wifstream ifs;
|
|
ifs.imbue(utf8_locale);
|
|
ifs.open("test.dat");
|
|
|
|
std::wint_t item = 0;
|
|
// note can't use normal vector from iterator constructor because
|
|
// dinkumware doesn't have it.
|
|
for(;;){
|
|
item = ifs.get();
|
|
if(item == WEOF)
|
|
break;
|
|
//ifs >> item;
|
|
//if(ifs.eof())
|
|
// break;
|
|
from_file.push_back(item);
|
|
}
|
|
}
|
|
|
|
BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
|
|
|
|
// Send the UCS4_data back out, converting to UTF-8
|
|
{
|
|
std::wofstream ofs;
|
|
ofs.imbue(utf8_locale);
|
|
ofs.open("test2.dat");
|
|
std::copy(
|
|
from_file.begin(),
|
|
from_file.end(),
|
|
std::ostream_iterator<wchar_t, wchar_t>(ofs)
|
|
);
|
|
}
|
|
|
|
// Make sure that both files are the same
|
|
{
|
|
typedef std::istream_iterator<utf8_t> is_iter;
|
|
is_iter end_iter;
|
|
|
|
std::ifstream ifs1("test.dat");
|
|
is_iter it1(ifs1);
|
|
std::vector<utf8_t> data1;
|
|
std::copy(it1, end_iter, std::back_inserter(data1));
|
|
|
|
std::ifstream ifs2("test2.dat");
|
|
is_iter it2(ifs2);
|
|
std::vector<utf8_t> data2;
|
|
std::copy(it2, end_iter, std::back_inserter(data2));
|
|
|
|
BOOST_TEST(data1 == data2);
|
|
}
|
|
|
|
// some libraries have trouble that only shows up with longer strings
|
|
|
|
const wchar_t * test3_data = L"\
|
|
<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
|
|
<!DOCTYPE boost_serialization>\
|
|
<boost_serialization signature=\"serialization::archive\" version=\"3\">\
|
|
<a class_id=\"0\" tracking_level=\"0\">\
|
|
<b>1</b>\
|
|
<f>96953204</f>\
|
|
<g>177129195</g>\
|
|
<l>1</l>\
|
|
<m>5627</m>\
|
|
<n>23010</n>\
|
|
<o>7419</o>\
|
|
<p>16212</p>\
|
|
<q>4086</q>\
|
|
<r>2749</r>\
|
|
<c>-33</c>\
|
|
<s>124</s>\
|
|
<t>28</t>\
|
|
<u>32225</u>\
|
|
<v>17543</v>\
|
|
<w>0.84431422</w>\
|
|
<x>1.0170664757130923</x>\
|
|
<y>tjbx</y>\
|
|
<z>cuwjentqpkejp</z>\
|
|
</a>\
|
|
</boost_serialization>\
|
|
";
|
|
|
|
// Send the UCS4_data back out, converting to UTF-8
|
|
std::size_t l = std::wcslen(test3_data);
|
|
{
|
|
std::wofstream ofs;
|
|
ofs.imbue(utf8_locale);
|
|
ofs.open("test3.dat");
|
|
std::copy(
|
|
test3_data,
|
|
test3_data + l,
|
|
std::ostream_iterator<wchar_t, wchar_t>(ofs)
|
|
);
|
|
}
|
|
|
|
// Make sure that both files are the same
|
|
{
|
|
std::wifstream ifs;
|
|
ifs.imbue(utf8_locale);
|
|
ifs.open("test3.dat");
|
|
ifs >> std::noskipws;
|
|
BOOST_TEST(
|
|
std::equal(
|
|
test3_data,
|
|
test3_data + l,
|
|
std::istream_iterator<wchar_t, wchar_t>(ifs)
|
|
)
|
|
);
|
|
}
|
|
|
|
// Test length calculation
|
|
{
|
|
std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
|
|
std::mbstate_t mbs = std::mbstate_t();
|
|
const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding);
|
|
int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u));
|
|
BOOST_TEST_EQ(utf8_len, res);
|
|
}
|
|
|
|
// Test that length calculation detects character boundaries
|
|
{
|
|
std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
|
|
std::mbstate_t mbs = std::mbstate_t();
|
|
// The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character.
|
|
// This last byte should not be accounted by length().
|
|
const int input_len = 5;
|
|
const int utf8_len = 4;
|
|
int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u));
|
|
BOOST_TEST_EQ(utf8_len, res);
|
|
}
|
|
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
int
|
|
main(int argc, char * argv[]){
|
|
|
|
int retval = 1;
|
|
BOOST_TRY{
|
|
retval = test_main(argc, argv);
|
|
}
|
|
#ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE
|
|
BOOST_CATCH(const std::exception & e){
|
|
BOOST_ERROR(e.what());
|
|
}
|
|
#endif
|
|
BOOST_CATCH(...){
|
|
BOOST_ERROR("failed with uncaught exception:");
|
|
}
|
|
BOOST_CATCH_END
|
|
|
|
int error_count = boost::report_errors();
|
|
if(error_count > 0)
|
|
retval = error_count;
|
|
return retval;
|
|
}
|
|
|