/** * A Character Encoding Set Implementation * Nana C++ Library(http://www.nanapro.org) * Copyright(C) 2003-2017 Jinhao(cnjinhao@hotmail.com) * * Distributed under the Boost Software License, Version 1.0. * (See accompanying file LICENSE_1_0.txt or copy at * http://www.boost.org/LICENSE_1_0.txt) * * @file nana/charset.cpp * @brief A conversion between unicode characters and multi bytes characters * @contributions * UTF16 4-byte decoding issue by Renke Yan. * Pr0curo(pr#98) * crillion */ #include #include #include #include #include #include //Added by Pr0curo(pr#98) #include #include //Added by crillion //GCC 4.7.0 does not implement the and codecvt_utfx classes #ifndef STD_CODECVT_NOT_SUPPORTED #include #endif #if defined(NANA_WINDOWS) #include #endif namespace nana { namespace utf { /// return a pointer to the code unit of the character at pos const char* char_ptr(const char* text, unsigned pos) { auto ustr = reinterpret_cast(text); auto const end = ustr + std::strlen(text); for (unsigned i = 0; i != pos; ++i) { const auto uch = *ustr; if (uch < 0x80) { ++ustr; continue; } if (uch < 0xC0) // use police ? return nullptr; if ((uch < 0xE0) && (ustr + 1 < end)) //? *(ustr + 1) < 0xE0 ustr += 2; else if (uch < 0xF0 && (ustr + 2 <= end)) ustr += 3; else if (uch < 0x1F && (ustr + 3 <= end)) ustr += 4; else return nullptr; } return reinterpret_cast(ustr); } /// return a pointer to the code unit of the character at pos - reuse ^ ? const char* char_ptr(const std::string& text_utf8, unsigned pos) { auto ustr = reinterpret_cast(text_utf8.c_str()); auto const end = ustr + text_utf8.size(); for (unsigned i = 0; i != pos; ++i) { const auto uch = *ustr; if (uch < 0x80) { ++ustr; continue; } if (uch < 0xC0) return nullptr; if ((uch < 0xE0) && (ustr + 1 < end)) ustr += 2; else if (uch < 0xF0 && (ustr + 2 <= end)) ustr += 3; else if (uch < 0x1F && (ustr + 3 <= end)) ustr += 4; else return nullptr; } return reinterpret_cast(ustr); } /// return a code point (max 16 bits?) and the len in code units of the character at pos wchar_t char_at(const char* text_utf8, unsigned pos, unsigned * len) { if (!text_utf8) return 0; if (pos) { text_utf8 = char_ptr(text_utf8, pos); if (!text_utf8) return 0; } const wchar_t uch = *reinterpret_cast(text_utf8); if (uch < 0x80) { if (len) *len = 1; return *text_utf8; // uch ? } if (uch < 0xC0) // use police or ?? { if (len) *len = 0; return 0; } const auto end = text_utf8 + std::strlen(text_utf8); if (uch < 0xE0 && (text_utf8 + 1 <= end)) { if (len) *len = 2; return (wchar_t(uch & 0x1F) << 6) | (reinterpret_cast(text_utf8)[1] & 0x3F); } else if (uch < 0xF0 && (text_utf8 + 2 <= end)) { if (len) *len = 3; return ((((uch & 0xF) << 6) | (reinterpret_cast(text_utf8)[1] & 0x3F)) << 6) | (reinterpret_cast(text_utf8)[2] & 0x3F); } else if (uch < 0x1F && (text_utf8 + 3 <= end)) { if (len) *len = 4; return ((((((uch & 0x7) << 6) | (reinterpret_cast(text_utf8)[1] & 0x3F)) << 6) | (reinterpret_cast(text_utf8)[2] & 0x3F)) << 6) | (reinterpret_cast(text_utf8)[3] & 0x3F); } if (len) *len = 0; return 0; } /// return a code point (max 16 bits?) and the len in code units of the character at pos wchar_t char_at(const ::std::string& text_utf8, unsigned pos, unsigned * len) { const char* ptr; if (pos) { ptr = char_ptr(text_utf8, pos); if (!ptr) return 0; } else ptr = text_utf8.c_str(); const wchar_t uch = *reinterpret_cast(ptr); if (uch < 0x80) { if (len) *len = 1; return *ptr; } if (uch < 0xC0) { if (len) *len = 0; return 0; } const auto end = text_utf8.c_str() + text_utf8.size(); if (uch < 0xE0 && (ptr + 1 <= end)) { if (len) *len = 2; return (wchar_t(uch & 0x1F) << 6) | (reinterpret_cast(ptr)[1] & 0x3F); } else if (uch < 0xF0 && (ptr + 2 <= end)) { if (len) *len = 3; return ((((uch & 0xF) << 6) | (reinterpret_cast(ptr)[1] & 0x3F)) << 6) | (reinterpret_cast(ptr)[2] & 0x3F); } else if (uch < 0x1F && (ptr + 3 <= end)) { if (len) *len = 4; return ((((((uch & 0x7) << 6) | (reinterpret_cast(ptr)[1] & 0x3F)) << 6) | (reinterpret_cast(ptr)[2] & 0x3F)) << 6) | (reinterpret_cast(ptr)[3] & 0x3F); } if (len) *len = 0; return 0; } } namespace detail { /// candidate to be more general?? class locale_initializer { public: static void init() { static bool initialized = false; if (initialized) return; initialized = true; //Only set the C library locale std::setlocale(LC_CTYPE, ""); } }; /// convert wchar C string from ? ANSI code page CP_ACP (windows) or LC_CTYPE c locale (-nix) into utf8 std::string bool wc2mb(std::string& mbstr, const wchar_t * s) { if(nullptr == s || *s == 0) { mbstr.clear(); return true; } #if defined(NANA_WINDOWS) int bytes = ::WideCharToMultiByte(CP_ACP, 0, s, -1, 0, 0, 0, 0); if(bytes > 1) { mbstr.resize(bytes - 1); ::WideCharToMultiByte(CP_ACP, 0, s, -1, &(mbstr[0]), bytes - 1, 0, 0); } return true; #else locale_initializer::init(); std::mbstate_t mbstate = std::mbstate_t(); std::size_t len = std::wcsrtombs(nullptr, &s, 0, &mbstate); if(len == static_cast(-1)) return false; if(len) { mbstr.resize(len); std::wcsrtombs(&(mbstr[0]), &s, len, &mbstate); } else mbstr.clear(); #endif return true; } /// convert a char C-string from The system default Windows ANSI code page CP_ACP or from LC_CTYPE c locale (-nix) into utf16 std::wstring bool mb2wc(std::wstring& wcstr, const char* s) { if(nullptr == s || *s == 0) { wcstr.clear(); return true; } #if defined(NANA_WINDOWS) int chars = ::MultiByteToWideChar(CP_ACP, 0, s, -1, 0, 0); if(chars > 1) { wcstr.resize(chars - 1); ::MultiByteToWideChar(CP_ACP, 0, s, -1, &wcstr[0], chars - 1); } #else locale_initializer::init(); std::mbstate_t mbstate = std::mbstate_t(); std::size_t len = std::mbsrtowcs(nullptr, &s, 0, &mbstate); if(len == static_cast(-1)) return false; if(len) { wcstr.resize(len); std::mbsrtowcs(&wcstr[0], &s, len, &mbstate); } else wcstr.clear(); #endif return true; } /// convert a char C string from The system default Windows ANSI code page CP_ACP or LC_CTYPE c locale (-nix) into utf16 std::string bool mb2wc(std::string& wcstr, const char* s) { if(nullptr == s || *s == 0) { wcstr.clear(); return true; } #if defined(NANA_WINDOWS) int chars = ::MultiByteToWideChar(CP_ACP, 0, s, -1, 0, 0); if(chars > 1) { wcstr.resize((chars - 1) * sizeof(wchar_t)); ::MultiByteToWideChar(CP_ACP, 0, s, -1, reinterpret_cast(&wcstr[0]), chars - 1); // ^ the trick ! } #else locale_initializer::init(); std::mbstate_t mbstate = std::mbstate_t(); std::size_t len = std::mbsrtowcs(nullptr, &s, 0, &mbstate); if(len == static_cast(-1)) return false; if(len) { wcstr.resize(sizeof(wchar_t) * len); std::mbsrtowcs(reinterpret_cast(&wcstr[0]), &s, len, &mbstate); } else wcstr.clear(); #endif return true; } class charset_encoding_interface { public: virtual ~charset_encoding_interface(){} virtual charset_encoding_interface * clone() const = 0; virtual std::string str() const = 0; virtual std::string&& str_move() = 0; virtual std::string str(unicode) const = 0; virtual std::wstring wstr() const = 0; virtual std::wstring&& wstr_move() = 0; }; /// playing with the idea - we need a mechanisme to set a user selected police - Testing an abtract interphase struct encoding_error_police { virtual unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) = 0; virtual ~encoding_error_police() = default; }; /// the current nana default: it is safe - you may want to keep it ! use the other at your risk: mainly for debugging struct utf8_error_police : public encoding_error_police { unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override { current_code_unit = end; return 0; } }; /// struct utf8_error_police_def_char : public encoding_error_police { static unsigned long def_error_mark ; unsigned long error_mark{ def_error_mark }; utf8_error_police_def_char() = default; utf8_error_police_def_char( unsigned long mark): error_mark{mark}{} unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override { if(current_code_unit < end) ++current_code_unit; return error_mark; } }; unsigned long utf8_error_police_def_char::def_error_mark{ '*' }; /// struct utf8_error_police_throw : public encoding_error_police { unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override { //utf8_Error::use_throw = true; utf8_Error(std::string("The text is not encoded in UTF8: ") + reinterpret_cast( current_code_unit) ).emit();; current_code_unit = end; return 0; } }; struct utf8_error_police_latin : public encoding_error_police { unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* /*end*/) override { return *(current_code_unit++) ; } }; /// buggie? struct utf8_error_police_system : public encoding_error_police { unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* /*end*/) override { std::wstring wc; mb2wc(wc, reinterpret_cast(current_code_unit)); current_code_unit++; return wc[0]; // use utf16char but what endian? } }; // auto def_encoding_error_police = std::make_unique(); // the nana default // auto def_encoding_error_police = std::make_unique(); // auto def_encoding_error_police = std::make_unique(); // auto def_encoding_error_police = std::make_unique('X'); auto def_encoding_error_police = std::make_unique(); #ifndef STD_CODECVT_NOT_SUPPORTED class charset_string : public charset_encoding_interface { public: charset_string(const std::string& s) : data_(s) {} charset_string(std::string&& s) : data_(std::move(s)) {} charset_string(const std::string& s, unicode encoding) : data_(s), is_unicode_(true), utf_x_(encoding) {} charset_string(std::string&& s, unicode encoding) : data_(std::move(s)), is_unicode_(true), utf_x_(encoding) {} private: virtual charset_encoding_interface * clone() const { return new charset_string(*this); } virtual std::string str() const { if(is_unicode_) { std::wstring wcstr; switch(utf_x_) { case unicode::utf8: wcstr = std::wstring_convert>().from_bytes(data_); break; case unicode::utf16: wcstr = std::wstring_convert>().from_bytes(data_); break; case unicode::utf32: wcstr.append(reinterpret_cast(data_.c_str()), data_.size() / sizeof(wchar_t)); break; } std::string mbstr; wc2mb(mbstr, wcstr.c_str()); return mbstr; } return data_; } virtual std::string&& str_move() { if(is_unicode_) data_ = str(); return std::move(data_); } virtual std::string str(unicode encoding) const { if(is_unicode_ && (utf_x_ != encoding)) { switch(utf_x_) { case unicode::utf8: switch(encoding) { case unicode::utf16: return std::wstring_convert, char16_t>().to_bytes( std::wstring_convert, char16_t>().from_bytes(data_) ); case unicode::utf32: { std::u32string u32str = std::wstring_convert, char32_t>().from_bytes(data_); return std::string(reinterpret_cast(u32str.c_str()), u32str.size() * sizeof(char32_t)); } default: break; //no conversion } break; case unicode::utf16: switch(encoding) { case unicode::utf8: return std::wstring_convert, char16_t>().to_bytes( std::wstring_convert, char16_t>().from_bytes(data_) ); case unicode::utf32: { std::u32string u32str = std::wstring_convert, char32_t>().from_bytes(data_); return std::string(reinterpret_cast(u32str.c_str()), u32str.size() * sizeof(char32_t)); } default: break; //no conversion } break; case unicode::utf32: switch(encoding) { case unicode::utf8: return std::wstring_convert, char32_t>().to_bytes( std::u32string(reinterpret_cast(data_.c_str()), data_.size() / sizeof(char32_t)) ); case unicode::utf16: return std::wstring_convert, char32_t>().to_bytes( std::u32string(reinterpret_cast(data_.c_str()), data_.size() / sizeof(char32_t)) ); default: break; //no conversion } break; } return{}; } std::wstring wcstr; if(mb2wc(wcstr, data_.c_str())) { switch(encoding) { case unicode::utf8: return std::wstring_convert>().to_bytes(wcstr); case unicode::utf16: return std::wstring_convert>().to_bytes(wcstr); case unicode::utf32: #if defined(NANA_WINDOWS) { const char * bytes = reinterpret_cast(wcstr.c_str()); std::u32string utf32str = std::wstring_convert, char32_t>().from_bytes(bytes, bytes + sizeof(wchar_t) * wcstr.size()); return std::string(reinterpret_cast(utf32str.c_str()), sizeof(char32_t) * utf32str.size()); } #elif defined(NANA_POSIX) return std::string(reinterpret_cast(wcstr.c_str()), sizeof(wchar_t) * wcstr.size()); #else throw std::runtime_error("Bad charset"); #endif } } return{}; } virtual std::wstring wstr() const { if(is_unicode_) { switch(utf_x_) { case unicode::utf8: return std::wstring_convert>().from_bytes(data_); case unicode::utf16: return std::wstring_convert>().from_bytes(data_); case unicode::utf32: return std::wstring(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); } return{}; } std::wstring wcstr; mb2wc(wcstr, data_.c_str()); return wcstr; } virtual std::wstring && wstr_move() { wdata_for_move_ = wstr(); return std::move(wdata_for_move_); } private: std::string data_; std::wstring wdata_for_move_{}; bool is_unicode_{ false }; unicode utf_x_{ unicode::utf8 }; }; class charset_wstring : public charset_encoding_interface { public: charset_wstring(const std::wstring& s) : data_(s) {} charset_wstring(std::wstring&& s) : data_(std::move(s)) {} virtual charset_encoding_interface * clone() const { return new charset_wstring(*this); } virtual std::string str() const { if(data_.size()) { std::string mbstr; wc2mb(mbstr, data_.c_str()); return mbstr; } return{}; } virtual std::string&& str_move() { data_for_move_ = str(); return std::move(data_for_move_); } virtual std::string str(unicode encoding) const { switch(encoding) { case unicode::utf8: return std::wstring_convert>().to_bytes(data_); case unicode::utf16: return std::wstring_convert>().to_bytes(data_); case unicode::utf32: #if defined (NANA_WINDOWS) { const char* bytes = reinterpret_cast(data_.c_str()); std::u32string utf32str = std::wstring_convert, char32_t>().from_bytes(bytes, bytes + sizeof(wchar_t) * data_.size()); return std::string(reinterpret_cast(utf32str.c_str()), sizeof(char32_t) * utf32str.size()); } #elif defined(NANA_POSIX) return std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); #else throw std::runtime_error("Bad charset"); #endif } return std::string(); } virtual std::wstring wstr() const { return data_; } virtual std::wstring&& wstr_move() { return std::move(data_); } private: std::wstring data_; std::string data_for_move_; }; #else /// return the first code point and move the pointer to next character, springing to the end by errors unsigned long utf8char(const unsigned char*& p, const unsigned char* end) { if(p != end) { if(*p < 0x80) // ASCII char 0-127 or 0-0x80 { return *(p++); } unsigned ch = *p; unsigned long code; if(ch < 0xC0) // error? - move to end. Posible ANSI or ISO code-page { //return *(p++); // temp: assume equal //p = end; //return 0; return def_encoding_error_police->next_code_point(p, end); } else if(ch < 0xE0 && (p + 1 <= end)) // two byte chararcter { code = ((ch & 0x1F) << 6) | (p[1] & 0x3F); p += 2; } else if(ch < 0xF0 && (p + 2 <= end)) // 3 byte character { code = ((((ch & 0xF) << 6) | (p[1] & 0x3F)) << 6) | (p[2] & 0x3F); p += 3; } else if(ch < 0x1F && (p + 3 <= end)) // 4 byte character { code = ((((((ch & 0x7) << 6) | (p[1] & 0x3F)) << 6) | (p[2] & 0x3F)) << 6) | (p[3] & 0x3F); p += 4; } else // error, go to end { p = end; return 0; } return code; } return 0; } unsigned long utf16char(const unsigned char* & bytes, const unsigned char* end, bool le_or_be) { unsigned long code; if(le_or_be) { if((end - bytes >= 4) && ((bytes[1] & 0xFC) == 0xD8)) { //32bit encoding unsigned long ch0 = bytes[0] | (bytes[1] << 8); unsigned long ch1 = bytes[2] | (bytes[3] << 8); code = ((ch0 & 0x3FF) << 10) | (ch1 & 0x3FF); bytes += 4; } else if(end - bytes >= 2) { code = bytes[0] | (bytes[1] << 8); bytes += 2; } else { bytes = end; return 0; } } else { if((end - bytes >= 4) && ((bytes[0] & 0xFC) == 0xD8)) { //32bit encoding unsigned long ch0 = (bytes[0] << 8) | bytes[1]; unsigned long ch1 = (bytes[2] << 8) | bytes[3]; code = (((ch0 & 0x3FF) << 10) | (ch1 & 0x3FF)) + 0x10000; bytes += 4; } else if(end - bytes >= 2) { code = (bytes[0] << 8) | bytes[1]; bytes += 2; } else { bytes = end; return 0; } } return code; } unsigned long utf32char(const unsigned char* & bytes, const unsigned char* end, bool le_or_be) { if(end - bytes >= 4) { unsigned long code; if(le_or_be) code = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24); else code = bytes[3] | (bytes[2] << 8) | (bytes[1] << 16) | (bytes[0] << 24); bytes += 4; return code; } bytes = end; return 0; } void put_utf8char(std::string& s, unsigned long code) { if(code < 0x80) { s += static_cast(code); } else if(code < 0x800) { s += static_cast(0xC0 | (code >> 6)); s += static_cast(0x80 | (code & 0x3F)); } else if(code < 0x10000) { s += static_cast(0xE0 | (code >> 12)); s += static_cast(0x80 | ((code >> 6) & 0x3F)); s += static_cast(0x80 | (code & 0x3F)); } else { s += static_cast(0xF0 | (code >> 18)); s += static_cast(0x80 | ((code >> 12) & 0x3F)); s += static_cast(0x80 | ((code >> 6) & 0x3F)); s += static_cast(0x80 | (code & 0x3F)); } } //le_or_be, true = le, false = be void put_utf16char(std::string& s, unsigned long code, bool le_or_be) { if(code <= 0xFFFF) { if(le_or_be) { s += static_cast(code & 0xFF); s += static_cast((code & 0xFF00) >> 8); } else { s += static_cast((code & 0xFF00) >> 8); s += static_cast(code & 0xFF); } } else { unsigned long ch0 = (0xD800 | ((code - 0x10000) >> 10)); unsigned long ch1 = (0xDC00 | ((code - 0x10000) & 0x3FF)); if(le_or_be) { s += static_cast(ch0 & 0xFF); s += static_cast((ch0 & 0xFF00) >> 8); s += static_cast(ch1 & 0xFF); s += static_cast((ch1 & 0xFF00) >> 8); } else { s += static_cast((ch0 & 0xFF00) >> 8); s += static_cast(ch0 & 0xFF); s += static_cast((ch1 & 0xFF00) >> 8); s += static_cast(ch1 & 0xFF); } } } void put_utf32char(std::string& s, unsigned long code, bool le_or_be) { if(le_or_be) { s += static_cast(code & 0xFF); s += static_cast((code & 0xFF00) >> 8); s += static_cast((code & 0xFF0000) >> 16); s += static_cast((code & 0xFF000000) >> 24); } else { s += static_cast((code & 0xFF000000) >> 24); s += static_cast((code & 0xFF0000) >> 16); s += static_cast((code & 0xFF00) >> 8); s += static_cast(code & 0xFF); } } std::string utf8_to_utf16(const std::string& s, bool le_or_be) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); std::string utf16str; //If there is a BOM, ignore it. if(s.size() >= 3) { if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { bytes += 3; put_utf16char(utf16str, 0xFEFF, le_or_be); } } while(bytes != end) { put_utf16char(utf16str, utf8char(bytes, end), le_or_be); } return utf16str; } std::string utf8_to_utf32(const std::string& s, bool le_or_be) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); std::string utf32str; //If there is a BOM, ignore it. if(s.size() >= 3) { if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { bytes += 3; put_utf32char(utf32str, 0xFEFF, le_or_be); } } while(bytes != end) { put_utf32char(utf32str, utf8char(bytes, end), le_or_be); } return utf32str; } std::string utf16_to_utf8(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); bool le_or_be = true; std::string utf8str; //If there is a BOM, ignore it if(s.size() >= 2) { if(bytes[0] == 0xFF && bytes[1] == 0xFE) { bytes += 2; le_or_be = true; utf8str += (char)0xEF; utf8str += (char)0xBB; utf8str += (char)0xBF; } else if(bytes[0] == 0xFE && bytes[1] == 0xFF) { bytes += 2; le_or_be = false; utf8str += (char)(0xEF); utf8str += (char)(0xBB); utf8str += (char)(0xBF); } } while(bytes != end) { put_utf8char(utf8str, utf16char(bytes, end, le_or_be)); } return utf8str; } std::string utf16_to_utf32(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); bool le_or_be = true; std::string utf32str; //If there is a BOM, ignore it if(s.size() >= 2) { if(bytes[0] == 0xFF && bytes[1] == 0xFE) { bytes += 2; le_or_be = true; put_utf32char(utf32str, 0xFEFF, true); } else if(bytes[0] == 0xFE && bytes[1] == 0xFF) { bytes += 2; le_or_be = false; put_utf32char(utf32str, 0xFEFF, false); } } while(bytes != end) { put_utf32char(utf32str, utf16char(bytes, end, le_or_be), le_or_be); } return utf32str; } std::string utf32_to_utf8(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + (s.size() & (~4 + 1)); std::string utf8str; bool le_or_be = true; //If there is a BOM, ignore it if(s.size() >= 4) { if(bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE && bytes[3] == 0xFF) { le_or_be = false; bytes += 4; utf8str += (char)0xEF; utf8str += (char)0xBB; utf8str += (char)0xBF; } else if(bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 && bytes[3] == 0) { le_or_be = true; bytes += 4; utf8str += (char)0xEF; utf8str += (char)0xBB; utf8str += (char)0xBF; } } while(bytes < end) { put_utf8char(utf8str, utf32char(bytes, end, le_or_be)); } return utf8str; } std::string utf32_to_utf16(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + (s.size() & (~4 + 1)); std::string utf16str; bool le_or_be = true; //If there is a BOM, ignore it if(s.size() >= 4) { if(bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE && bytes[3] == 0xFF) { le_or_be = false; bytes += 4; put_utf16char(utf16str, 0xFEFF, false); } else if(bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 && bytes[3] == 0) { le_or_be = true; bytes += 4; put_utf16char(utf16str, 0xFEFF, true); } } while(bytes < end) { put_utf16char(utf16str, utf32char(bytes, end, le_or_be), le_or_be); } return utf16str; } class charset_string : public charset_encoding_interface { public: charset_string(const std::string& s) : data_(s) {} charset_string(std::string&& s) : data_(std::move(s)) {} charset_string(const std::string& s, unicode encoding) : data_(s), is_unicode_(true), utf_x_(encoding) {} charset_string(std::string&& s, unicode encoding) : data_(std::move(s)), is_unicode_(true), utf_x_(encoding) {} private: virtual charset_encoding_interface * clone() const { return new charset_string(*this); } virtual std::string str() const { if(is_unicode_) { std::string strbuf; switch(utf_x_) { case unicode::utf8: #if defined(NANA_WINDOWS) strbuf = detail::utf8_to_utf16(data_, true); detail::put_utf16char(strbuf, 0, true); #else strbuf = detail::utf8_to_utf32(data_, true); detail::put_utf32char(strbuf, 0, true); #endif break; case unicode::utf16: #if defined(NANA_WINDOWS) strbuf = data_; detail::put_utf16char(strbuf, 0, true); #else strbuf = detail::utf16_to_utf32(data_); detail::put_utf32char(strbuf, 0, true); #endif break; case unicode::utf32: #if defined(NANA_WINDOWS) strbuf = detail::utf32_to_utf16(data_); detail::put_utf16char(strbuf, 0, true); #else strbuf = data_; detail::put_utf32char(strbuf, 0, true); #endif break; } std::string mbstr; wc2mb(mbstr, reinterpret_cast(strbuf.c_str())); return mbstr; } return data_; } virtual std::string && str_move() { if(is_unicode_) data_ = std::move(str()); return std::move(data_); } virtual std::string str(unicode encoding) const { if(is_unicode_ && (utf_x_ != encoding)) { switch(utf_x_) { case unicode::utf8: switch(encoding) { case unicode::utf16: return detail::utf8_to_utf16(data_, true); case unicode::utf32: return detail::utf8_to_utf32(data_, true); default: break; } break; case unicode::utf16: switch(encoding) { case unicode::utf8: return detail::utf16_to_utf8(data_); case unicode::utf32: return detail::utf16_to_utf32(data_); default: break; } break; case unicode::utf32: switch(encoding) { case unicode::utf8: return detail::utf32_to_utf8(data_); case unicode::utf16: return detail::utf32_to_utf16(data_); default: break; } break; } return {}; } std::string wcstr; if(mb2wc(wcstr, data_.c_str())) { switch(encoding) { #if defined(NANA_WINDOWS) case unicode::utf8: return utf16_to_utf8(wcstr); case unicode::utf32: return utf16_to_utf32(wcstr); case unicode::utf16: return wcstr; #else //POSIX case unicode::utf8: return utf32_to_utf8(wcstr); case unicode::utf16: return utf32_to_utf16(wcstr); case unicode::utf32: return wcstr; #endif } } return {}; } virtual std::wstring wstr() const { if(is_unicode_) { std::string bytes; switch(utf_x_) { case unicode::utf8: #if defined(NANA_WINDOWS) bytes = detail::utf8_to_utf16(data_, true); #else bytes = detail::utf8_to_utf32(data_, true); #endif break; case unicode::utf16: #if defined(NANA_WINDOWS) bytes = data_; #else bytes = detail::utf16_to_utf32(data_); #endif break; case unicode::utf32: #if defined(NANA_WINDOWS) bytes = detail::utf32_to_utf16(data_); #else bytes = data_; #endif break; } return std::wstring(reinterpret_cast(bytes.c_str()), bytes.size() / sizeof(wchar_t)); } std::wstring wcstr; mb2wc(wcstr, data_.c_str()); return wcstr; } virtual std::wstring&& wstr_move() { wdata_for_move_ = std::move(wstr()); return std::move(wdata_for_move_); } private: std::string data_; std::wstring wdata_for_move_{}; bool is_unicode_{ false }; unicode utf_x_{ unicode::utf8 }; }; class charset_wstring : public charset_encoding_interface { public: charset_wstring(const std::wstring& s) : data_(s) {} virtual charset_encoding_interface * clone() const { return new charset_wstring(*this); } virtual std::string str() const { if(data_.size()) { std::string mbstr; wc2mb(mbstr, data_.c_str()); return mbstr; } return {}; } virtual std::string && str_move() { data_for_move_ = std::move(str()); return std::move(data_for_move_); } virtual std::string str(unicode encoding) const { switch(encoding) { case unicode::utf8: #if defined(NANA_WINDOWS) return detail::utf16_to_utf8(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #else return detail::utf32_to_utf8(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #endif case unicode::utf16: #if defined(NANA_WINDOWS) return std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); #else return detail::utf32_to_utf16(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #endif case unicode::utf32: #if defined(NANA_WINDOWS) return detail::utf16_to_utf32(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #else return std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); #endif } return {}; } virtual std::wstring wstr() const { return data_; } virtual std::wstring && wstr_move() { return std::move(data_); } private: std::wstring data_; std::string data_for_move_{}; }; #endif } //class charset charset::charset(const charset& rhs) : impl_(rhs.impl_ ? rhs.impl_->clone() : 0) {} charset & charset::operator=(const charset& rhs) { if(this != &rhs) { delete impl_; impl_ = (rhs.impl_ ? rhs.impl_->clone() : 0); } return *this; } charset::charset(charset&& r) : impl_(r.impl_) { r.impl_ = 0; } charset & charset::operator=(charset&& r) { if(this != &r) { delete impl_; impl_ = r.impl_; r.impl_ = nullptr; } return *this; } charset::charset(const std::string& s) : impl_(new detail::charset_string(s)) {} charset::charset(std::string&& s) : impl_(new detail::charset_string(std::move(s))) {} charset::charset(const std::string& s, unicode encoding) : impl_(new detail::charset_string(s, encoding)) {} charset::charset(std::string&& s, unicode encoding) : impl_(new detail::charset_string(std::move(s), encoding)) {} charset::charset(const std::wstring& s) : impl_(new detail::charset_wstring(s)) {} charset::charset(std::wstring&& s) : impl_(new detail::charset_wstring(std::move(s))) {} charset::~charset() { delete impl_; } charset::operator std::string() const { return impl_->str(); } charset::operator std::string&&() { return impl_->str_move(); } charset::operator std::wstring() const { return impl_->wstr(); } charset::operator std::wstring&&() { return impl_->wstr_move(); } std::string charset::to_bytes(unicode encoding) const { return impl_->str(encoding); } //end class charset }//end namespace nana