add a namespace nana::utf for UTF-8 character processing

2016-01-01 12:30:18 +08:00 · 2016-01-01 12:30:18 +08:00 · 0a4f04267e
commit 0a4f04267e
parent a42ebe19b4
4 changed files with 199 additions and 2 deletions
--- a/include/nana/charset.hpp
+++ b/include/nana/charset.hpp
@ -18,6 +18,24 @@

 namespace nana
 {
+	namespace utf
+	{
+		/// Attempts to get a pointer to a character of a UTF-8 string by a specified character index.
+		/// @param text_utf8 A string encoded as UTF-8.
+		/// @param pos The zero-based Unicode character index (counted in characters, not in bytes).
+		/// @returns A pointer to the first byte of the Unicode character. It returns a null pointer if pos
+		///          is out of range or if an invalid or truncated byte sequence is met before reaching pos.
+		const char* char_ptr(const char* text_utf8, unsigned pos);
+		const char* char_ptr(const ::std::string& text_utf8, unsigned pos);
+
+		/// Gets the Unicode character at a specified character index.
+		/// @param text_utf8 A string encoded as UTF-8.
+		/// @param pos The zero-based Unicode character index (counted in characters, not in bytes).
+		/// @param len A pointer to an unsigned that receives the number of bytes the character occupies in its
+		///            UTF-8 encoding; it receives 0 if the bytes at that position cannot be decoded.
+		///            If len is a nullptr, it is ignored.
+		/// @returns The Unicode character. '\0' if pos is out of range.
+		wchar_t char_at(const char* text_utf8, unsigned pos, unsigned * len);
+		wchar_t char_at(const ::std::string& text_utf8, unsigned pos, unsigned * len);
+	}
+
 	enum class unicode
 	{
 		utf8, utf16, utf32
--- a/source/charset.cpp
+++ b/source/charset.cpp
@ -30,6 +30,183 @@

 namespace nana
 {
+	namespace utf
+	{
+		namespace
+		{
+			/// Returns the number of bytes of the UTF-8 sequence introduced by the specified lead byte.
+			/// @param lead The first byte of a sequence.
+			/// @returns 1 to 4 for a valid lead byte; 0 for a continuation byte (0x80-0xBF)
+			///          or for a byte that can not start a sequence (0xF8-0xFF).
+			unsigned sequence_bytes(unsigned char lead)
+			{
+				if (lead < 0x80)
+					return 1;
+				if (lead < 0xC0)
+					return 0;	//A continuation byte is not allowed at the start of a character.
+				if (lead < 0xE0)
+					return 2;
+				if (lead < 0xF0)
+					return 3;
+				if (lead < 0xF8)
+					return 4;
+				return 0;
+			}
+
+			/// Skips a number of characters in the byte range [cur, end).
+			/// @param cur The first byte of the range, it must refer to the start of a character.
+			/// @param end The byte past the last byte of the range.
+			/// @param pos The number of characters to skip.
+			/// @returns A pointer to the first byte of the character at index pos, it is equal to end
+			///          if pos is exactly the number of characters in the range. A null pointer if the
+			///          range has less than pos characters or if an invalid lead byte or a truncated
+			///          sequence is met. The continuation bytes are not validated.
+			const unsigned char* skip_chars(const unsigned char* cur, const unsigned char* end, unsigned pos)
+			{
+				for (unsigned i = 0; i != pos; ++i)
+				{
+					//Never walk beyond the end of text, pos is out of range.
+					if (cur == end)
+						return nullptr;
+
+					const auto bytes = sequence_bytes(*cur);
+					if ((0 == bytes) || (static_cast<std::size_t>(end - cur) < bytes))
+						return nullptr;
+
+					cur += bytes;
+				}
+				return cur;
+			}
+
+			/// Decodes the character which starts at cur.
+			/// @param cur The first byte of the character, it may be a null pointer or equal to end.
+			/// @param end The byte past the last byte of the text.
+			/// @param len If it is not a nullptr, it receives the number of bytes decoded, 0 on failure.
+			/// @returns The decoded character. 0 if cur is null, cur is at the end, the lead byte is
+			///          invalid or the sequence is truncated. A code point beyond the range of wchar_t
+			///          (a 4-byte sequence with a 16-bit wchar_t) is truncated by the conversion.
+			wchar_t decode_at(const unsigned char* cur, const unsigned char* end, unsigned* len)
+			{
+				unsigned bytes = 0;
+				unsigned long code = 0;
+
+				if (cur && (cur != end))
+				{
+					bytes = sequence_bytes(*cur);
+					if (static_cast<std::size_t>(end - cur) < bytes)
+						bytes = 0;	//The sequence is truncated.
+				}
+
+				if (1 == bytes)
+				{
+					code = *cur;
+				}
+				else if (bytes > 1)
+				{
+					//The payload of a lead byte is 5 bits(2-byte), 4 bits(3-byte) or 3 bits(4-byte).
+					code = *cur & (0x7Fu >> bytes);
+					for (unsigned i = 1; i != bytes; ++i)
+						code = (code << 6) | (cur[i] & 0x3Fu);
+				}
+
+				if (len)
+					*len = bytes;
+
+				return static_cast<wchar_t>(code);
+			}
+		}
+
+		/// Gets a pointer to the character at a specified character index of a null-terminated UTF-8 string.
+		/// @param text A null-terminated string encoded as UTF-8, it may be a null pointer.
+		/// @param pos The zero-based character index.
+		/// @returns A pointer to the first byte of the character, or a pointer to the null terminator if pos
+		///          is equal to the number of characters. A null pointer if text is null, pos is beyond
+		///          the end of text or an invalid/truncated sequence is met before reaching pos.
+		const char* char_ptr(const char* text, unsigned pos)
+		{
+			if (!text)
+				return nullptr;
+
+			auto const begin = reinterpret_cast<const unsigned char*>(text);
+			return reinterpret_cast<const char*>(skip_chars(begin, begin + std::strlen(text), pos));
+		}
+
+		/// Gets a pointer to the character at a specified character index of a UTF-8 string.
+		/// @param text_utf8 A string encoded as UTF-8.
+		/// @param pos The zero-based character index.
+		/// @returns A pointer to the first byte of the character, or a pointer to the end of the string if pos
+		///          is equal to the number of characters. A null pointer if pos is beyond the end of
+		///          the string or an invalid/truncated sequence is met before reaching pos.
+		const char* char_ptr(const std::string& text_utf8, unsigned pos)
+		{
+			auto const begin = reinterpret_cast<const unsigned char*>(text_utf8.c_str());
+			return reinterpret_cast<const char*>(skip_chars(begin, begin + text_utf8.size(), pos));
+		}
+
+		/// Gets the character at a specified character index of a null-terminated UTF-8 string.
+		/// @param text_utf8 A null-terminated string encoded as UTF-8, it may be a null pointer.
+		/// @param pos The zero-based character index.
+		/// @param len If it is not a nullptr, it receives the number of bytes of the character, 0 on failure.
+		/// @returns The character. '\0' if text_utf8 is null, pos is out of range or the bytes can not be decoded.
+		wchar_t char_at(const char* text_utf8, unsigned pos, unsigned * len)
+		{
+			if (!text_utf8)
+			{
+				if (len)
+					*len = 0;
+				return 0;
+			}
+
+			auto const begin = reinterpret_cast<const unsigned char*>(text_utf8);
+			auto const end = begin + std::strlen(text_utf8);
+			return decode_at(skip_chars(begin, end, pos), end, len);
+		}
+
+		/// Gets the character at a specified character index of a UTF-8 string.
+		/// @param text_utf8 A string encoded as UTF-8.
+		/// @param pos The zero-based character index.
+		/// @param len If it is not a nullptr, it receives the number of bytes of the character, 0 on failure.
+		/// @returns The character. '\0' if pos is out of range or the bytes can not be decoded.
+		wchar_t char_at(const ::std::string& text_utf8, unsigned pos, unsigned * len)
+		{
+			auto const begin = reinterpret_cast<const unsigned char*>(text_utf8.c_str());
+			auto const end = begin + text_utf8.size();
+			return decode_at(skip_chars(begin, end, pos), end, len);
+		}
+	}
+
 	namespace detail
 	{
 		class locale_initializer
--- a/source/gui/programming_interface.cpp
+++ b/source/gui/programming_interface.cpp
@ -348,7 +348,7 @@ namespace API
 				text.erase(pos, 1);
 				if(shortkey == 0 && pos < text.length())
 				{
-					shortkey = text.at(pos);
+					shortkey = utf::char_at(text.c_str() + pos, 0, nullptr);
 					if(shortkey == '&')	//This indicates the text contains "&&", it means the symbol have to be ignored.
 						shortkey = 0;
 					else if(skpos)
--- a/source/gui/widgets/button.cpp
+++ b/source/gui/widgets/button.cpp
@ -258,7 +258,9 @@ namespace nana{	namespace drawerbase
 					if(shortkey)
 					{
 						unsigned off_w = (shortkey_pos ? graph.text_extent_size(mbstr.c_str(), static_cast<unsigned>(shortkey_pos)).width : 0);
-						nana::size shortkey_size = graph.text_extent_size(to_wstring(mbstr.c_str() + shortkey_pos), 1);
+
+						wchar_t keystr[2] = {nana::utf::char_at(mbstr.c_str() + shortkey_pos, 0, 0), 0};
+						auto shortkey_size = graph.text_extent_size(keystr, 1);

 						unsigned ascent, descent, inleading;
 						graph.text_metrics(ascent, descent, inleading);