From b46966bf02dc96b09d5a8e35c6f8dbfb0dc93451 Mon Sep 17 00:00:00 2001
From: Jinhao <cnjinhao@hotmail.com>
Date: Tue, 24 Mar 2020 22:20:04 +0800
Subject: [PATCH] fix rendering issue of RTL languages

---
 .../widgets/skeletons/text_token_stream.hpp   | 486 +++++++++++++++---
 1 file changed, 414 insertions(+), 72 deletions(-)

diff --git a/include/nana/gui/widgets/skeletons/text_token_stream.hpp b/include/nana/gui/widgets/skeletons/text_token_stream.hpp
index 36560067..805840c7 100644
--- a/include/nana/gui/widgets/skeletons/text_token_stream.hpp
+++ b/include/nana/gui/widgets/skeletons/text_token_stream.hpp
@@ -1,7 +1,7 @@
 /*
  *	Text Token Stream
  *	Nana C++ Library(http://www.nanapro.org)
- *	Copyright(C) 2003-2018 Jinhao(cnjinhao@hotmail.com)
+ *	Copyright(C) 2003-2020 Jinhao(cnjinhao@hotmail.com)
  *
  *	Distributed under the Boost Software License, Version 1.0.
  *	(See accompanying file LICENSE_1_0.txt or copy at
@@ -44,11 +44,22 @@ namespace nana{ namespace widgets{	namespace skeletons
 	class tokenizer
 	{
 	public:
-		tokenizer(const std::wstring& s, bool format_enabled)
-			:	iptr_(s.data()),
-				endptr_(s.data() + s.size()),
-				format_enabled_(format_enabled)
+		tokenizer(const std::wstring& s, bool format_enabled) :
+#if 0 //deprecated
+			iptr_(s.data()),
+			endptr_(s.data() + s.size()),
+#endif
+			format_enabled_(format_enabled)
 		{
+			entities_ = unicode_bidi{}.reorder(s.c_str(), s.size());
+			for (auto & e : entities_)
+			{
+				ptr_ = e.begin;
+				if (e.begin < e.end)
+					break;
+
+				++idx_;
+			}
 		}
 
 		void push(token tk)
@@ -59,18 +70,23 @@ namespace nana{ namespace widgets{	namespace skeletons
 		//Read the token.
 		token read()
 		{
-			if(revert_token_ != token::eof)
+			if (revert_token_ != token::eof)
 			{
 				token tk = revert_token_;
 				revert_token_ = token::eof;
 				return tk;
 			}
 
-			if(iptr_ == endptr_)
+#if 0	//deprecated
+			if (iptr_ == endptr_)
 				return token::eof;
+#else
+			if (_m_eof())
+				return token::eof;
+#endif
 
 			//Check whether it is a format token.
-			if(format_enabled_ && format_state_)
+			if (format_enabled_ && format_state_)
 				return _m_format_token();
 
 			return _m_token();
@@ -106,15 +122,19 @@ namespace nana{ namespace widgets{	namespace skeletons
 		//Read the data token
 		token _m_token()
 		{
+#if 0	//deprecated
 			wchar_t ch = *iptr_;
-
-			if(ch > 0xFF)
+#else
+			auto ch = _m_get();
+#endif
+			if (ch > 0xFF)
 			{
 				//This is the Unicode.
 
 				idstr_.clear();
 				idstr_.append(1, ch);
 
+#if 0	//deprecated
 				if (_m_unicode_word_breakable(iptr_))
 				{
 					++iptr_;
@@ -122,53 +142,89 @@ namespace nana{ namespace widgets{	namespace skeletons
 				}
 
 				ch = *++iptr_;
-				while((iptr_ != endptr_) && (ch > 0xFF) && (false == _m_unicode_word_breakable(iptr_)))
+				while ((iptr_ != endptr_) && (ch > 0xFF) && (false == _m_unicode_word_breakable(iptr_)))
 				{
 					idstr_.append(1, ch);
 
 					ch = *++iptr_;
 				}
+#else
+				if (_m_unicode_word_breakable(ptr_))
+				{
+					_m_read();
+					return token::data;
+				}
+
+				_m_read();
+				ch = _m_get();
+				while ((!_m_eof()) && (ch > 0xFF) && (false == _m_unicode_word_breakable(ptr_)))
+				{
+					idstr_.append(1, ch);
+					_m_read();
+					ch = _m_get();
+				}
+
+				//When the last _m_unicode_word_breakable returns true, it implies the ch(left character)
+				//is not the breakable character. So it belongs to the data.
+				idstr_.append(1, ch);
+				_m_read();
+#endif
 
 				return token::data;
 			}
 
-			if('\n' == ch)
+			if ('\n' == ch)
 			{
+#if 0 //deprecated
 				++iptr_;
+#else
+				_m_read();
+#endif
 				return token::endl;
 			}
 
-			if(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
+			if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
 			{
+#if 0	//deprecated
 				auto idstr = iptr_;
 				do
 				{
 					ch = *(++iptr_);
-				}
-				while(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'));
+				} while (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'));
 
 				idstr_.assign(idstr, iptr_);
+#else
+				auto idstr = ptr_;
+				do
+				{
+					_m_read();
+					ch = _m_get();
+				} while (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'));
 
+				idstr_.assign(idstr, ptr_);
+#endif
 				return token::data;
 			}
 
-			if('0' <= ch && ch <= '9')
+			if ('0' <= ch && ch <= '9')
 			{
 				_m_read_number();
 				return token::data;
 			}
 
-			if(('<' == ch) && format_enabled_)
+			if (('<' == ch) && format_enabled_)
 			{
-				//pos keeps the current position, and it used for restring
+#if 0 //deprecated
+				//pos keeps the current position, and it used for restoring
 				//iptr_ when the search is failed.
+
 				auto pos = ++iptr_;
 				_m_eat_whitespace();
-				if(*iptr_ == '/')
+				if (*iptr_ == '/')
 				{
 					++iptr_;
 					_m_eat_whitespace();
-					if(*iptr_ == '>')
+					if (*iptr_ == '>')
 					{
 						++iptr_;
 						return token::format_end;
@@ -177,6 +233,29 @@ namespace nana{ namespace widgets{	namespace skeletons
 
 				//Restore the iptr_;
 				iptr_ = pos;
+#else
+				//pos keeps the current position, and it used for restoring
+				//iptr_ when the search is failed.
+				_m_read();
+				auto idx = idx_;
+				auto ptr = ptr_;
+
+				_m_eat_whitespace();
+				if (_m_get() == '/')
+				{
+					_m_read();
+					_m_eat_whitespace();
+					if (_m_get() == '>')
+					{
+						_m_read();
+						return token::format_end;
+					}
+				}
+
+				//Restore the iptr_;
+				idx_ = idx;
+				ptr_ = ptr;
+#endif
 
 				format_state_ = true;
 				return token::tag_begin;
@@ -184,9 +263,10 @@ namespace nana{ namespace widgets{	namespace skeletons
 
 
 			//Escape
-			if(this->format_enabled_ && (ch == '\\'))
+#if 0 //deprecated
+			if (this->format_enabled_ && (ch == '\\'))
 			{
-				if(iptr_ + 1 < endptr_)
+				if (iptr_ + 1 < endptr_)
 				{
 					ch = *(iptr_ + 1);
 
@@ -209,6 +289,32 @@ namespace nana{ namespace widgets{	namespace skeletons
 			}
 			else
 				++iptr_;
+#else
+			if (this->format_enabled_ && (ch == '\\'))
+			{
+				if (!_m_eof(1))
+				{
+					_m_read();
+					ch = _m_get();
+					if ('<' == ch || '>' == ch)	//two characters need to be escaped.
+					{
+						_m_read();
+					}
+					else
+					{
+						//ignore escape
+						ch = '\\';
+					}
+				}
+				else
+				{
+					_m_set_eof();
+					return token::eof;
+				}
+			}
+			else
+				_m_read();
+#endif
 
 			idstr_.clear();
 			idstr_.append(1, ch);
@@ -220,9 +326,12 @@ namespace nana{ namespace widgets{	namespace skeletons
 		{
 			_m_eat_whitespace();
 
+#if 0	//deprecated
 			auto ch = *iptr_++;
-
-			switch(ch)
+#else
+			auto ch = _m_read();
+#endif
+			switch (ch)
 			{
 			case ',':	return token::comma;
 			case '/':	return token::backslash;
@@ -232,40 +341,48 @@ namespace nana{ namespace widgets{	namespace skeletons
 				return token::tag_end;
 			case '"':
 				//Here is a string and all the meta characters will be ignored except "
+#if 0 //deprecated
+			{
+				auto str = iptr_;
+
+				while ((iptr_ != endptr_) && (*iptr_ != '"'))
+					++iptr_;
+
+				idstr_.assign(str, iptr_++);
+			}
+#else
+				while (!(_m_eof() || ('"' == _m_get())))
 				{
-					auto str = iptr_;
-
-					while((iptr_ != endptr_) && (*iptr_ != '"'))
-						++iptr_;
-
-					idstr_.assign(str, iptr_++);
+					idstr_ += _m_read();
 				}
+#endif
 				return token::string;
 			case '(':
 				_m_eat_whitespace();
-				if((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
+#if 0 //deprecated
+				if ((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
 				{
 					auto pbegin = iptr_;
-					while((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
+					while ((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
 						++iptr_;
 
 					binary_.first.assign(pbegin, iptr_);
 
 					_m_eat_whitespace();
-					if((iptr_ < endptr_) && (',' == *iptr_))
+					if ((iptr_ < endptr_) && (',' == *iptr_))
 					{
 						++iptr_;
 						_m_eat_whitespace();
-						if((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
+						if ((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
 						{
 							pbegin = iptr_;
-							while((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
+							while ((iptr_ < endptr_) && _m_is_idstr_element(*iptr_))
 								++iptr_;
 
 							binary_.second.assign(pbegin, iptr_);
 
 							_m_eat_whitespace();
-							if((iptr_ < endptr_) && (')' == *iptr_))
+							if ((iptr_ < endptr_) && (')' == *iptr_))
 							{
 								++iptr_;
 								return token::binary;
@@ -273,64 +390,91 @@ namespace nana{ namespace widgets{	namespace skeletons
 						}
 					}
 				}
+#else
+				if ((!_m_eof()) && _m_is_idstr_element(_m_get()))
+				{
+					while ((!_m_eof()) && _m_is_idstr_element(_m_get()))
+						binary_.first += _m_read();
+
+					_m_eat_whitespace();
+					if ((!_m_eof()) && (',' == _m_get()))
+					{
+						_m_read();
+						_m_eat_whitespace();
+						if ((!_m_eof()) && _m_is_idstr_element(_m_get()))
+						{
+							while ((!_m_eof()) && _m_is_idstr_element(_m_get()))
+								binary_.second += _m_read();
+
+							_m_eat_whitespace();
+							if ((!_m_eof()) && (')' == _m_get()))
+							{
+								_m_read();
+								return token::binary;
+							}
+						}
+					}
+				}
+#endif
 				return token::eof;
 			}
 
-			
 
-			if(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || '_' == ch)
+
+			if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || '_' == ch)
 			{
-				--iptr_;
+				_m_move_back();
+
 				//Here is a identifier
 				_m_read_idstr();
 
-				if(L"font" == idstr_)
+				if (L"font" == idstr_)
 					return token::font;
-				else if(L"bold" == idstr_)
+				else if (L"bold" == idstr_)
 					return token::bold;
-				else if(L"size" == idstr_)
+				else if (L"size" == idstr_)
 					return token::size;
-				else if(L"baseline" == idstr_)
+				else if (L"baseline" == idstr_)
 					return token::baseline;
-				else if(L"top" == idstr_)
+				else if (L"top" == idstr_)
 					return token::top;
-				else if(L"center" == idstr_)
+				else if (L"center" == idstr_)
 					return token::center;
-				else if(L"bottom" == idstr_)
+				else if (L"bottom" == idstr_)
 					return token::bottom;
-				else if(L"color" == idstr_)
+				else if (L"color" == idstr_)
 					return token::color;
-				else if(L"image" == idstr_)
+				else if (L"image" == idstr_)
 					return token::image;
-				else if(L"true" == idstr_)
+				else if (L"true" == idstr_)
 					return token::_true;
-				else if(L"url" == idstr_)
+				else if (L"url" == idstr_)
 					return token::url;
-				else if(L"target" == idstr_)
+				else if (L"target" == idstr_)
 					return token::target;
-				else if(L"false" == idstr_)
+				else if (L"false" == idstr_)
 					return token::_false;
-				else if(L"red" == idstr_)
+				else if (L"red" == idstr_)
 					return token::red;
-				else if(L"green" == idstr_)
+				else if (L"green" == idstr_)
 					return token::green;
-				else if(L"blue" == idstr_)
+				else if (L"blue" == idstr_)
 					return token::blue;
-				else if(L"white" == idstr_)
+				else if (L"white" == idstr_)
 					return token::white;
-				else if(L"black" == idstr_)
+				else if (L"black" == idstr_)
 					return token::black;
-				else if(L"min_limited" == idstr_)
+				else if (L"min_limited" == idstr_)
 					return token::min_limited;
-				else if(L"max_limited" == idstr_)
+				else if (L"max_limited" == idstr_)
 					return token::max_limited;
 
 				return token::string;
 			}
 
-			if('0' <= ch && ch <= '9')
+			if ('0' <= ch && ch <= '9')
 			{
-				--iptr_;
+				_m_move_back();
 				_m_read_number();
 				return token::number;
 			}
@@ -346,16 +490,28 @@ namespace nana{ namespace widgets{	namespace skeletons
 		//Read the identifier.
 		void _m_read_idstr()
 		{
+#if 0 //deprecated
 			auto idstr = iptr_;
 
 			wchar_t ch;
 			do
 			{
 				ch = *(++iptr_);
-			}
-			while(('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('_' == ch) || ('0' <= ch && ch <= '9'));
+			} while (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('_' == ch) || ('0' <= ch && ch <= '9'));
 
 			idstr_.assign(idstr, iptr_);
+#else
+			auto idstr = ptr_;
+
+			wchar_t ch;
+			do
+			{
+				_m_read();
+				ch = _m_get();
+			} while (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('_' == ch) || ('0' <= ch && ch <= '9'));
+
+			idstr_.assign(idstr, ptr_);
+#endif
 		}
 
 		//Read the number
@@ -363,23 +519,24 @@ namespace nana{ namespace widgets{	namespace skeletons
 		{
 			idstr_.clear();
 
+#if 0 //deprecated
 			wchar_t ch = *iptr_;
 
 			idstr_ += ch;
 
 			//First check the number whether will be a hex number.
-			if('0' == ch)
+			if ('0' == ch)
 			{
 				ch = *++iptr_;
-				if((!('0' <= ch && ch <= '9')) && (ch != 'x' && ch != 'X'))
+				if ((!('0' <= ch && ch <= '9')) && (ch != 'x' && ch != 'X'))
 					return;
 
-				if(ch == 'x' || ch == 'X')
+				if (ch == 'x' || ch == 'X')
 				{
 					//Here is a hex number
 					idstr_ += 'x';
 					ch = *++iptr_;
-					while(('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F'))
+					while (('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F'))
 					{
 						idstr_ += ch;
 						ch = *++iptr_;
@@ -392,18 +549,60 @@ namespace nana{ namespace widgets{	namespace skeletons
 			}
 
 			ch = *++iptr_;
-			while('0' <= ch && ch <= '9')
+			while ('0' <= ch && ch <= '9')
 			{
 				idstr_ += ch;
 				ch = *++iptr_;
 			}
+#else
+			auto ch = _m_get();
+
+			idstr_ += ch;
+
+			//First check the number whether will be a hex number.
+			if ('0' == ch)
+			{
+				_m_read();
+				ch = _m_get();
+				if ((!('0' <= ch && ch <= '9')) && (ch != 'x' && ch != 'X'))
+					return;
+
+				if (ch == 'x' || ch == 'X')
+				{
+					//Here is a hex number
+					idstr_ += 'x';
+					_m_read();
+					ch = _m_get();
+					while (('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F'))
+					{
+						idstr_ += ch;
+						_m_read();
+						ch = _m_get();
+					}
+					return;
+				}
+
+				//Here is not a hex number
+				idstr_ += ch;
+			}
+
+			_m_read();
+			ch = _m_get();
+			while ('0' <= ch && ch <= '9')
+			{
+				idstr_ += ch;
+				_m_read();
+				ch = _m_get();
+			}
+#endif
 		}
 
 		void _m_eat_whitespace()
 		{
-			while(true)
+			while (true)
 			{
-				switch(*iptr_)
+#if 0 //deprecated
+				switch (*iptr_)
 				{
 				case ' ':
 				case '\t':
@@ -412,11 +611,126 @@ namespace nana{ namespace widgets{	namespace skeletons
 				default:
 					return;
 				}
+#else
+				switch (_m_get())
+				{
+				case ' ':
+				case '\t':
+					_m_read();
+					break;
+				default:
+					return;
+				}
+#endif
 			}
 		}
 	private:
+		wchar_t _m_get() const noexcept
+		{
+			if (idx_ < entities_.size())
+				return *ptr_;
+			return 0;
+		}
+
+		void _m_set_eof()
+		{
+			idx_ = entities_.size();
+			if (0 == idx_)
+				ptr_ = nullptr;
+			else
+				ptr_ = entities_.back().end;
+		}
+
+		bool _m_eof(std::size_t off)
+		{
+			if (0 == off)
+				return _m_eof();
+
+			bool eof = false;
+			auto idx = idx_;
+			auto ptr = ptr_;
+			while (off)
+			{
+				if (_m_eof())
+				{
+					eof = true;
+					break;
+				}
+
+				_m_read();
+			}
+
+			idx_ = idx;
+			ptr_ = ptr;
+			return eof;
+		}
+
+		bool _m_eof() noexcept
+		{
+			if (idx_ == entities_.size())
+				return true;
+
+			if (ptr_ == entities_[idx_].end)
+			{
+				auto idx = idx_;
+
+				while (++idx < entities_.size())
+				{
+					if (entities_[idx].begin != entities_[idx].end)
+						return false;
+				}
+				return true;
+			}
+
+			return false;
+		}
+
+		wchar_t _m_read() noexcept
+		{
+			if (idx_ < entities_.size())
+			{
+				if (ptr_ < entities_[idx_].end)
+				{
+					if (ptr_ + 1 < entities_[idx_].end)
+						return *(ptr_++);
+				}
+
+				auto ch = *ptr_;
+
+				while ((++idx_) < entities_.size())
+				{
+					if (entities_[idx_].begin != entities_[idx_].end)
+					{
+						ptr_ = entities_[idx_].begin;
+						return ch;
+					}
+				}
+			}
+			
+			return 0;
+		}
+
+		void _m_move_back() noexcept
+		{
+			if ((idx_ == entities_.size()) || (entities_[idx_].begin == ptr_))
+			{
+				if (0 == idx_)
+					return;
+				--idx_;
+				ptr_ = entities_[idx_].end;
+			}
+
+			--ptr_;
+		}
+	private:
+		std::vector<unicode_bidi::entity> entities_;
+		std::size_t idx_{ 0 };
+		const wchar_t* ptr_{ nullptr };
+
+#if 0 //deprecated
 		const wchar_t * iptr_;
 		const wchar_t * endptr_;
+#endif
 		const bool	format_enabled_;
 		bool	format_state_{false};
 
@@ -641,6 +955,8 @@ namespace nana{ namespace widgets{	namespace skeletons
 			while(true)
 			{
 				token tk = tknizer.read();
+				if (token::eof == tk)
+					break;
 
 				switch(tk)
 				{
@@ -665,12 +981,38 @@ namespace nana{ namespace widgets{	namespace skeletons
 					if(fstack.size() > 1)
 						fstack.pop();
 					break;
-				case token::eof:
-					return;
 				default:
 					throw std::runtime_error("invalid token");
 				}
 			}
+
+			if (!format_enabled)
+				return;
+
+			//Reorder the sequence of line blocks for RTL languages.
+			for (auto & ln : lines_)
+			{
+				std::wstring str;
+				std::vector<std::size_t> position;
+				for (auto & b : ln)
+				{
+					position.push_back(str.size());
+					str += b.data_ptr->text();
+				}
+
+				std::remove_reference<decltype(ln)>::type dump;
+				dump.swap(ln);
+
+				auto entities = unicode_bidi{}.reorder(str.c_str(), str.size());
+				for (auto & e : entities)
+				{
+					auto pos = e.begin - str.c_str();
+
+					auto i = std::find(position.cbegin(), position.cend(), pos);
+					ln.push_back(dump[i - position.cbegin()]);
+				}
+				
+			}
 		}
 
 		iterator begin()