/* * A Character Encoding Set Implementation * Nana C++ Library(http://www.nanapro.org) * Copyright(C) 2003-2014 Jinhao(cnjinhao@hotmail.com) * * Distributed under the Boost Software License, Version 1.0. * (See accompanying file LICENSE_1_0.txt or copy at * http://www.boost.org/LICENSE_1_0.txt) * * @file: nana/charset.cpp * @brief: A conversion between unicode characters and multi bytes characters * @contributions: * UTF16 4-byte decoding issue by Renke Yan. */ #include #include #include #include #include #include //GCC 4.7.0 does not implement the and codecvt_utfx classes #ifndef STD_CODECVT_NOT_SUPPORTED #include #endif #if defined(NANA_WINDOWS) #include #endif namespace nana { namespace utf { const char* char_ptr(const char* text, unsigned pos) { auto ustr = reinterpret_cast(text); auto const end = ustr + std::strlen(text); for (unsigned i = 0; i != pos; ++i) { const auto uch = *ustr; if (uch < 0x80) { ++ustr; continue; } if (uch < 0xC0) return nullptr; if ((uch < 0xE0) && (ustr + 1 < end)) ustr += 2; else if (uch < 0xF0 && (ustr + 2 <= end)) ustr += 3; else if (uch < 0x1F && (ustr + 3 <= end)) ustr += 4; else return nullptr; } return reinterpret_cast(ustr); } const char* char_ptr(const std::string& text_utf8, unsigned pos) { auto ustr = reinterpret_cast(text_utf8.c_str()); auto const end = ustr + text_utf8.size(); for (unsigned i = 0; i != pos; ++i) { const auto uch = *ustr; if (uch < 0x80) { ++ustr; continue; } if (uch < 0xC0) return nullptr; if ((uch < 0xE0) && (ustr + 1 < end)) ustr += 2; else if (uch < 0xF0 && (ustr + 2 <= end)) ustr += 3; else if (uch < 0x1F && (ustr + 3 <= end)) ustr += 4; else return nullptr; } return reinterpret_cast(ustr); } wchar_t char_at(const char* text_utf8, unsigned pos, unsigned * len) { if (!text_utf8) return 0; if (pos) { text_utf8 = char_ptr(text_utf8, pos); if (!text_utf8) return 0; } const wchar_t uch = *reinterpret_cast(text_utf8); if (uch < 0x80) { if (len) *len = 1; return *text_utf8; } if (uch < 0xC0) { if (len) *len = 0; return 0; } const auto end = text_utf8 + std::strlen(text_utf8); if (uch < 0xE0 && (text_utf8 + 1 <= end)) { if (len) *len = 2; return (wchar_t(uch & 0x1F) << 6) | (reinterpret_cast(text_utf8)[1] & 0x3F); } else if (uch < 0xF0 && (text_utf8 + 2 <= end)) { if (len) *len = 3; return ((((uch & 0xF) << 6) | (reinterpret_cast(text_utf8)[1] & 0x3F)) << 6) | (reinterpret_cast(text_utf8)[2] & 0x3F); } else if (uch < 0x1F && (text_utf8 + 3 <= end)) { if (len) *len = 4; return ((((((uch & 0x7) << 6) | (reinterpret_cast(text_utf8)[1] & 0x3F)) << 6) | (reinterpret_cast(text_utf8)[2] & 0x3F)) << 6) | (reinterpret_cast(text_utf8)[3] & 0x3F); } if (len) *len = 0; return 0; } wchar_t char_at(const ::std::string& text_utf8, unsigned pos, unsigned * len) { const char* ptr; if (pos) { ptr = char_ptr(text_utf8, pos); if (!ptr) return 0; } else ptr = text_utf8.c_str(); const wchar_t uch = *reinterpret_cast(ptr); if (uch < 0x80) { if (len) *len = 1; return *ptr; } if (uch < 0xC0) { if (len) *len = 0; return 0; } const auto end = text_utf8.c_str() + text_utf8.size(); if (uch < 0xE0 && (ptr + 1 <= end)) { if (len) *len = 2; return (wchar_t(uch & 0x1F) << 6) | (reinterpret_cast(ptr)[1] & 0x3F); } else if (uch < 0xF0 && (ptr + 2 <= end)) { if (len) *len = 3; return ((((uch & 0xF) << 6) | (reinterpret_cast(ptr)[1] & 0x3F)) << 6) | (reinterpret_cast(ptr)[2] & 0x3F); } else if (uch < 0x1F && (ptr + 3 <= end)) { if (len) *len = 4; return ((((((uch & 0x7) << 6) | (reinterpret_cast(ptr)[1] & 0x3F)) << 6) | (reinterpret_cast(ptr)[2] & 0x3F)) << 6) | (reinterpret_cast(ptr)[3] & 0x3F); } if (len) *len = 0; return 0; } } namespace detail { class locale_initializer { public: static void init() { static bool initialized = false; if(false == initialized) { initialized = true; //Only set the C library locale std::setlocale(LC_CTYPE, ""); } } }; bool wc2mb(std::string& mbstr, const wchar_t * s) { if(nullptr == s || *s == 0) { mbstr.clear(); return true; } #if defined(NANA_WINDOWS) int bytes = ::WideCharToMultiByte(CP_ACP, 0, s, -1, 0, 0, 0, 0); if(bytes > 1) { mbstr.resize(bytes - 1); ::WideCharToMultiByte(CP_ACP, 0, s, -1, &(mbstr[0]), bytes - 1, 0, 0); } return true; #else locale_initializer::init(); std::mbstate_t mbstate = std::mbstate_t(); std::size_t len = std::wcsrtombs(nullptr, &s, 0, &mbstate); if(len == static_cast(-1)) return false; if(len) { mbstr.resize(len); std::wcsrtombs(&(mbstr[0]), &s, len, &mbstate); } else mbstr.clear(); #endif return true; } bool mb2wc(std::wstring& wcstr, const char* s) { if(nullptr == s || *s == 0) { wcstr.clear(); return true; } #if defined(NANA_WINDOWS) int chars = ::MultiByteToWideChar(CP_ACP, 0, s, -1, 0, 0); if(chars > 1) { wcstr.resize(chars - 1); ::MultiByteToWideChar(CP_ACP, 0, s, -1, &wcstr[0], chars - 1); } #else locale_initializer::init(); std::mbstate_t mbstate = std::mbstate_t(); std::size_t len = std::mbsrtowcs(nullptr, &s, 0, &mbstate); if(len == static_cast(-1)) return false; if(len) { wcstr.resize(len); std::mbsrtowcs(&wcstr[0], &s, len, &mbstate); } else wcstr.clear(); #endif return true; } bool mb2wc(std::string& wcstr, const char* s) { if(nullptr == s || *s == 0) { wcstr.clear(); return true; } #if defined(NANA_WINDOWS) int chars = ::MultiByteToWideChar(CP_ACP, 0, s, -1, 0, 0); if(chars > 1) { wcstr.resize((chars - 1) * sizeof(wchar_t)); ::MultiByteToWideChar(CP_ACP, 0, s, -1, reinterpret_cast(&wcstr[0]), chars - 1); } #else locale_initializer::init(); std::mbstate_t mbstate = std::mbstate_t(); std::size_t len = std::mbsrtowcs(nullptr, &s, 0, &mbstate); if(len == static_cast(-1)) return false; if(len) { wcstr.resize(sizeof(wchar_t) * len); std::mbsrtowcs(reinterpret_cast(&wcstr[0]), &s, len, &mbstate); } else wcstr.clear(); #endif return true; } class charset_encoding_interface { public: virtual ~charset_encoding_interface(){} virtual charset_encoding_interface * clone() const = 0; virtual std::string str() const = 0; virtual std::string&& str_move() = 0; virtual std::string str(unicode) const = 0; virtual std::wstring wstr() const = 0; virtual std::wstring&& wstr_move() = 0; }; #ifndef STD_CODECVT_NOT_SUPPORTED class charset_string : public charset_encoding_interface { public: charset_string(const std::string& s) : data_(s), is_unicode_(false) {} charset_string(std::string&& s) : data_(std::move(s)), is_unicode_(false) {} charset_string(const std::string& s, unicode encoding) : data_(s), is_unicode_(true), utf_x_(encoding) {} charset_string(std::string&& s, unicode encoding) : data_(std::move(s)), is_unicode_(true), utf_x_(encoding) {} private: virtual charset_encoding_interface * clone() const { return new charset_string(*this); } virtual std::string str() const { if(is_unicode_) { std::wstring wcstr; switch(utf_x_) { case unicode::utf8: wcstr = std::wstring_convert>().from_bytes(data_); break; case unicode::utf16: wcstr = std::wstring_convert>().from_bytes(data_); break; case unicode::utf32: wcstr.append(reinterpret_cast(data_.c_str()), data_.size() / sizeof(wchar_t)); break; } std::string mbstr; wc2mb(mbstr, wcstr.c_str()); return mbstr; } return data_; } virtual std::string&& str_move() { if(is_unicode_) data_ = std::move(str()); return std::move(data_); } virtual std::string str(unicode encoding) const { if(is_unicode_ && (utf_x_ != encoding)) { switch(utf_x_) { case unicode::utf8: switch(encoding) { case unicode::utf16: return std::wstring_convert, char16_t>().to_bytes( std::wstring_convert, char16_t>().from_bytes(data_) ); case unicode::utf32: { std::u32string u32str = std::wstring_convert, char32_t>().from_bytes(data_); return std::string(reinterpret_cast(u32str.c_str()), u32str.size() * sizeof(char32_t)); } } break; case unicode::utf16: switch(encoding) { case unicode::utf8: return std::wstring_convert, char16_t>().to_bytes( std::wstring_convert, char16_t>().from_bytes(data_) ); case unicode::utf32: { std::u32string u32str = std::wstring_convert, char32_t>().from_bytes(data_); return std::string(reinterpret_cast(u32str.c_str()), u32str.size() * sizeof(char32_t)); } } break; case unicode::utf32: switch(encoding) { case unicode::utf8: return std::wstring_convert, char32_t>().to_bytes( std::u32string(reinterpret_cast(data_.c_str()), data_.size() / sizeof(char32_t)) ); case unicode::utf16: return std::wstring_convert, char32_t>().to_bytes( std::u32string(reinterpret_cast(data_.c_str()), data_.size() / sizeof(char32_t)) ); } break; } return{}; } std::wstring wcstr; if(mb2wc(wcstr, data_.c_str())) { switch(encoding) { case unicode::utf8: return std::wstring_convert>().to_bytes(wcstr); case unicode::utf16: return std::wstring_convert>().to_bytes(wcstr); case unicode::utf32: #if defined(NANA_WINDOWS) { const char * bytes = reinterpret_cast(wcstr.c_str()); std::u32string utf32str = std::wstring_convert, char32_t>().from_bytes(bytes, bytes + sizeof(wchar_t) * wcstr.size()); return std::string(reinterpret_cast(utf32str.c_str()), sizeof(char32_t) * utf32str.size()); } #elif defined(NANA_LINUX) || defined(NANA_MACOS) return std::string(reinterpret_cast(wcstr.c_str()), sizeof(wchar_t) * wcstr.size()); #else throw std::runtime_error("Bad charset"); #endif } } return{}; } virtual std::wstring wstr() const { if(is_unicode_) { switch(utf_x_) { case unicode::utf8: return std::wstring_convert>().from_bytes(data_); case unicode::utf16: return std::wstring_convert>().from_bytes(data_); case unicode::utf32: return std::wstring(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); } return{}; } std::wstring wcstr; mb2wc(wcstr, data_.c_str()); return wcstr; } virtual std::wstring && wstr_move() { wdata_for_move_ = std::move(wstr()); return std::move(wdata_for_move_); } private: std::string data_; std::wstring wdata_for_move_; bool is_unicode_; unicode utf_x_; }; class charset_wstring : public charset_encoding_interface { public: charset_wstring(const std::wstring& s) : data_(s) {} charset_wstring(std::wstring&& s) : data_(std::move(s)) {} virtual charset_encoding_interface * clone() const { return new charset_wstring(*this); } virtual std::string str() const { if(data_.size()) { std::string mbstr; wc2mb(mbstr, data_.c_str()); return mbstr; } return{}; } virtual std::string&& str_move() { data_for_move_ = str(); return std::move(data_for_move_); } virtual std::string str(unicode encoding) const { switch(encoding) { case unicode::utf8: return std::wstring_convert>().to_bytes(data_); case unicode::utf16: return std::wstring_convert>().to_bytes(data_); case unicode::utf32: #if defined (NANA_WINDOWS) { const char* bytes = reinterpret_cast(data_.c_str()); std::u32string utf32str = std::wstring_convert, char32_t>().from_bytes(bytes, bytes + sizeof(wchar_t) * data_.size()); return std::string(reinterpret_cast(utf32str.c_str()), sizeof(char32_t) * utf32str.size()); } #elif defined(NANA_LINUX) || defined(NANA_MACOS) return std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); #else throw std::runtime_error("Bad charset"); #endif } return std::string(); } virtual std::wstring wstr() const { return data_; } virtual std::wstring&& wstr_move() { return std::move(data_); } private: std::wstring data_; std::string data_for_move_; }; #else unsigned long utf8char(const unsigned char*& p, const unsigned char* end) { if(p != end) { if(*p < 0x80) { return *(p++); } unsigned ch = *p; unsigned long code; if(ch < 0xC0) { p = end; return 0; } else if(ch < 0xE0 && (p + 1 <= end)) { code = ((ch & 0x1F) << 6) | (p[1] & 0x3F); p += 2; } else if(ch < 0xF0 && (p + 2 <= end)) { code = ((((ch & 0xF) << 6) | (p[1] & 0x3F)) << 6) | (p[2] & 0x3F); p += 3; } else if(ch < 0x1F && (p + 3 <= end)) { code = ((((((ch & 0x7) << 6) | (p[1] & 0x3F)) << 6) | (p[2] & 0x3F)) << 6) | (p[3] & 0x3F); p += 4; } else { p = end; return 0; } return code; } return 0; } unsigned long utf16char(const unsigned char* & bytes, const unsigned char* end, bool le_or_be) { unsigned long code; if(le_or_be) { if((end - bytes >= 4) && ((bytes[1] & 0xFC) == 0xD8)) { //32bit encoding unsigned long ch0 = bytes[0] | (bytes[1] << 8); unsigned long ch1 = bytes[2] | (bytes[3] << 8); code = ((ch0 & 0x3FF) << 10) | (ch1 & 0x3FF); bytes += 4; } else if(end - bytes >= 2) { code = bytes[0] | (bytes[1] << 8); bytes += 2; } else { bytes = end; return 0; } } else { if((end - bytes >= 4) && ((bytes[0] & 0xFC) == 0xD8)) { //32bit encoding unsigned long ch0 = (bytes[0] << 8) | bytes[1]; unsigned long ch1 = (bytes[2] << 8) | bytes[3]; code = (((ch0 & 0x3FF) << 10) | (ch1 & 0x3FF)) + 0x10000; bytes += 4; } else if(end - bytes >= 2) { code = (bytes[0] << 8) | bytes[1]; bytes += 2; } else { bytes = end; return 0; } } return code; } unsigned long utf32char(const unsigned char* & bytes, const unsigned char* end, bool le_or_be) { if(end - bytes >= 4) { unsigned long code; if(le_or_be) code = bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24); else code = bytes[3] | (bytes[2] << 8) | (bytes[1] << 16) | (bytes[0] << 24); bytes += 4; return code; } bytes = end; return 0; } void put_utf8char(std::string& s, unsigned long code) { if(code < 0x80) { s += static_cast(code); } else if(code < 0x800) { s += static_cast(0xC0 | (code >> 6)); s += static_cast(0x80 | (code & 0x3F)); } else if(code < 0x10000) { s += static_cast(0xE0 | (code >> 12)); s += static_cast(0x80 | ((code >> 6) & 0x3F)); s += static_cast(0x80 | (code & 0x3F)); } else { s += static_cast(0xF0 | (code >> 18)); s += static_cast(0x80 | ((code >> 12) & 0x3F)); s += static_cast(0x80 | ((code >> 6) & 0x3F)); s += static_cast(0x80 | (code & 0x3F)); } } //le_or_be, true = le, false = be void put_utf16char(std::string& s, unsigned long code, bool le_or_be) { if(code <= 0xFFFF) { if(le_or_be) { s += static_cast(code & 0xFF); s += static_cast((code & 0xFF00) >> 8); } else { s += static_cast((code & 0xFF00) >> 8); s += static_cast(code & 0xFF); } } else { unsigned long ch0 = (0xD800 | ((code - 0x10000) >> 10)); unsigned long ch1 = (0xDC00 | ((code - 0x10000) & 0x3FF)); if(le_or_be) { s += static_cast(ch0 & 0xFF); s += static_cast((ch0 & 0xFF00) >> 8); s += static_cast(ch1 & 0xFF); s += static_cast((ch1 & 0xFF00) >> 8); } else { s += static_cast((ch0 & 0xFF00) >> 8); s += static_cast(ch0 & 0xFF); s += static_cast((ch1 & 0xFF00) >> 8); s += static_cast(ch1 & 0xFF); } } } void put_utf32char(std::string& s, unsigned long code, bool le_or_be) { if(le_or_be) { s += static_cast(code & 0xFF); s += static_cast((code & 0xFF00) >> 8); s += static_cast((code & 0xFF0000) >> 16); s += static_cast((code & 0xFF000000) >> 24); } else { s += static_cast((code & 0xFF000000) >> 24); s += static_cast((code & 0xFF0000) >> 16); s += static_cast((code & 0xFF00) >> 8); s += static_cast(code & 0xFF); } } std::string utf8_to_utf16(const std::string& s, bool le_or_be) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); std::string utf16str; //If there is a BOM, ignore it. if(s.size() >= 3) { if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { bytes += 3; put_utf16char(utf16str, 0xFEFF, le_or_be); } } while(bytes != end) { put_utf16char(utf16str, utf8char(bytes, end), le_or_be); } return utf16str; } std::string utf8_to_utf32(const std::string& s, bool le_or_be) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); std::string utf32str; //If there is a BOM, ignore it. if(s.size() >= 3) { if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { bytes += 3; put_utf32char(utf32str, 0xFEFF, le_or_be); } } while(bytes != end) { put_utf32char(utf32str, utf8char(bytes, end), le_or_be); } return utf32str; } std::string utf16_to_utf8(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); bool le_or_be = true; std::string utf8str; //If there is a BOM, ignore it if(s.size() >= 2) { if(bytes[0] == 0xFF && bytes[1] == 0xFE) { bytes += 2; le_or_be = true; utf8str += (char)0xEF; utf8str += (char)0xBB; utf8str += (char)0xBF; } else if(bytes[0] == 0xFE && bytes[1] == 0xFF) { bytes += 2; le_or_be = false; utf8str += (char)(0xEF); utf8str += (char)(0xBB); utf8str += (char)(0xBF); } } while(bytes != end) { put_utf8char(utf8str, utf16char(bytes, end, le_or_be)); } return utf8str; } std::string utf16_to_utf32(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + s.size(); bool le_or_be = true; std::string utf32str; //If there is a BOM, ignore it if(s.size() >= 2) { if(bytes[0] == 0xFF && bytes[1] == 0xFE) { bytes += 2; le_or_be = true; put_utf32char(utf32str, 0xFEFF, true); } else if(bytes[0] == 0xFE && bytes[1] == 0xFF) { bytes += 2; le_or_be = false; put_utf32char(utf32str, 0xFEFF, false); } } while(bytes != end) { put_utf32char(utf32str, utf16char(bytes, end, le_or_be), le_or_be); } return utf32str; } std::string utf32_to_utf8(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + (s.size() & (~4 + 1)); std::string utf8str; bool le_or_be = true; //If there is a BOM, ignore it if(s.size() >= 4) { if(bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE && bytes[3] == 0xFF) { le_or_be = false; bytes += 4; utf8str += (char)0xEF; utf8str += (char)0xBB; utf8str += (char)0xBF; } else if(bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 && bytes[3] == 0) { le_or_be = true; bytes += 4; utf8str += (char)0xEF; utf8str += (char)0xBB; utf8str += (char)0xBF; } } while(bytes < end) { put_utf8char(utf8str, utf32char(bytes, end, le_or_be)); } return utf8str; } std::string utf32_to_utf16(const std::string& s) { const unsigned char * bytes = reinterpret_cast(s.c_str()); const unsigned char * end = bytes + (s.size() & (~4 + 1)); std::string utf16str; bool le_or_be = true; //If there is a BOM, ignore it if(s.size() >= 4) { if(bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE && bytes[3] == 0xFF) { le_or_be = false; bytes += 4; put_utf16char(utf16str, 0xFEFF, false); } else if(bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 && bytes[3] == 0) { le_or_be = true; bytes += 4; put_utf16char(utf16str, 0xFEFF, true); } } while(bytes < end) { put_utf16char(utf16str, utf32char(bytes, end, le_or_be), le_or_be); } return utf16str; } class charset_string : public charset_encoding_interface { public: charset_string(const std::string& s) : data_(s), is_unicode_(false) {} charset_string(std::string&& s) : data_(std::move(s)), is_unicode_(false) {} charset_string(const std::string& s, unicode encoding) : data_(s), is_unicode_(true), utf_x_(encoding) {} charset_string(std::string&& s, unicode encoding) : data_(std::move(s)), is_unicode_(true), utf_x_(encoding) {} private: virtual charset_encoding_interface * clone() const { return new charset_string(*this); } virtual std::string str() const { if(is_unicode_) { std::string strbuf; switch(utf_x_) { case unicode::utf8: #if defined(NANA_WINDOWS) strbuf = detail::utf8_to_utf16(data_, true); detail::put_utf16char(strbuf, 0, true); #else strbuf = detail::utf8_to_utf32(data_, true); detail::put_utf32char(strbuf, 0, true); #endif break; case unicode::utf16: #if defined(NANA_WINDOWS) strbuf = data_; detail::put_utf16char(strbuf, 0, true); #else strbuf = detail::utf16_to_utf32(data_); detail::put_utf32char(strbuf, 0, true); #endif break; case unicode::utf32: #if defined(NANA_WINDOWS) strbuf = detail::utf32_to_utf16(data_); detail::put_utf16char(strbuf, 0, true); #else strbuf = data_; detail::put_utf32char(strbuf, 0, true); #endif break; } std::string mbstr; wc2mb(mbstr, reinterpret_cast(strbuf.c_str())); return mbstr; } return data_; } virtual std::string && str_move() { if(is_unicode_) data_ = std::move(str()); return std::move(data_); } virtual std::string str(unicode encoding) const { if(is_unicode_ && (utf_x_ != encoding)) { switch(utf_x_) { case unicode::utf8: switch(encoding) { case unicode::utf16: return detail::utf8_to_utf16(data_, true); case unicode::utf32: return detail::utf8_to_utf32(data_, true); default: break; } break; case unicode::utf16: switch(encoding) { case unicode::utf8: return detail::utf16_to_utf8(data_); case unicode::utf32: return detail::utf16_to_utf32(data_); default: break; } break; case unicode::utf32: switch(encoding) { case unicode::utf8: return detail::utf32_to_utf8(data_); case unicode::utf16: return detail::utf32_to_utf16(data_); default: break; } break; } return {}; } std::string wcstr; if(mb2wc(wcstr, data_.c_str())) { switch(encoding) { case unicode::utf8: return utf32_to_utf8(wcstr); case unicode::utf16: return utf32_to_utf16(wcstr); case unicode::utf32: return wcstr; } } return {}; } virtual std::wstring wstr() const { if(is_unicode_) { std::string bytes; switch(utf_x_) { case unicode::utf8: #if defined(NANA_WINDOWS) bytes = detail::utf8_to_utf16(data_, true); #else bytes = detail::utf8_to_utf32(data_, true); #endif break; case unicode::utf16: #if defined(NANA_WINDOWS) bytes = data_; #else bytes = detail::utf16_to_utf32(data_); #endif break; case unicode::utf32: #if defined(NANA_WINDOWS) bytes = detail::utf32_to_utf16(data_); #else bytes = data_; #endif break; } return std::wstring(reinterpret_cast(bytes.c_str()), bytes.size() / sizeof(wchar_t)); } std::wstring wcstr; mb2wc(wcstr, data_.c_str()); return wcstr; } virtual std::wstring&& wstr_move() { wdata_for_move_ = std::move(wstr()); return std::move(wdata_for_move_); } private: std::string data_; std::wstring wdata_for_move_; bool is_unicode_; unicode utf_x_; }; class charset_wstring : public charset_encoding_interface { public: charset_wstring(const std::wstring& s) : data_(s) {} virtual charset_encoding_interface * clone() const { return new charset_wstring(*this); } virtual std::string str() const { if(data_.size()) { std::string mbstr; wc2mb(mbstr, data_.c_str()); return mbstr; } return {}; } virtual std::string && str_move() { data_for_move_ = std::move(str()); return std::move(data_for_move_); } virtual std::string str(unicode encoding) const { switch(encoding) { case unicode::utf8: #if defined(NANA_WINDOWS) return detail::utf16_to_utf8(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #else return detail::utf32_to_utf8(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #endif case unicode::utf16: #if defined(NANA_WINDOWS) return std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); #else return detail::utf32_to_utf16(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #endif case unicode::utf32: #if defined(NANA_WINDOWS) return detail::utf16_to_utf32(std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t))); #else return std::string(reinterpret_cast(data_.c_str()), data_.size() * sizeof(wchar_t)); #endif } return {}; } virtual std::wstring wstr() const { return data_; } virtual std::wstring && wstr_move() { return std::move(data_); } private: std::wstring data_; std::string data_for_move_; }; #endif } //class charset charset::charset(const charset& rhs) : impl_(rhs.impl_ ? rhs.impl_->clone() : 0) {} charset & charset::operator=(const charset& rhs) { if(this != &rhs) { delete impl_; impl_ = (rhs.impl_ ? rhs.impl_->clone() : 0); } return *this; } charset::charset(charset&& r) : impl_(r.impl_) { r.impl_ = 0; } charset & charset::operator=(charset&& r) { if(this != &r) { delete impl_; impl_ = r.impl_; r.impl_ = nullptr; } return *this; } charset::charset(const std::string& s) : impl_(new detail::charset_string(s)) {} charset::charset(std::string&& s) : impl_(new detail::charset_string(std::move(s))) {} charset::charset(const std::string& s, unicode encoding) : impl_(new detail::charset_string(s, encoding)) {} charset::charset(std::string&& s, unicode encoding) : impl_(new detail::charset_string(std::move(s), encoding)) {} charset::charset(const std::wstring& s) : impl_(new detail::charset_wstring(s)) {} charset::charset(std::wstring&& s) : impl_(new detail::charset_wstring(std::move(s))) {} charset::~charset() { delete impl_; } charset::operator std::string() const { return impl_->str(); } charset::operator std::string&&() { return impl_->str_move(); } charset::operator std::wstring() const { return impl_->wstr(); } charset::operator std::wstring&&() { return impl_->wstr_move(); } std::string charset::to_bytes(unicode encoding) const { return impl_->str(encoding); } //end class charset }//end namespace nana