From c9b9451443aaa394ac326ac6649faceaa4ef7873 Mon Sep 17 00:00:00 2001 From: Jinhao Date: Thu, 26 Mar 2020 01:41:32 +0800 Subject: [PATCH] add partial support of Arabic reshaping on Linux --- .../widgets/skeletons/text_token_stream.hpp | 43 +-- include/nana/unicode_bidi.hpp | 2 +- source/detail/platform_abstraction.cpp | 44 ++- source/detail/text_reshaping.hpp | 297 ++++++++++++++++++ source/unicode_bidi.cpp | 5 + 5 files changed, 363 insertions(+), 28 deletions(-) create mode 100644 source/detail/text_reshaping.hpp diff --git a/include/nana/gui/widgets/skeletons/text_token_stream.hpp b/include/nana/gui/widgets/skeletons/text_token_stream.hpp index 805840c7..e60b4c97 100644 --- a/include/nana/gui/widgets/skeletons/text_token_stream.hpp +++ b/include/nana/gui/widgets/skeletons/text_token_stream.hpp @@ -41,11 +41,12 @@ namespace nana{ namespace widgets{ namespace skeletons eof }; +#define REORDER_TOKENSx class tokenizer { public: tokenizer(const std::wstring& s, bool format_enabled) : -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated iptr_(s.data()), endptr_(s.data() + s.size()), #endif @@ -77,7 +78,7 @@ namespace nana{ namespace widgets{ namespace skeletons return tk; } -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated if (iptr_ == endptr_) return token::eof; #else @@ -122,7 +123,7 @@ namespace nana{ namespace widgets{ namespace skeletons //Read the data token token _m_token() { -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated wchar_t ch = *iptr_; #else auto ch = _m_get(); @@ -134,7 +135,7 @@ namespace nana{ namespace widgets{ namespace skeletons idstr_.clear(); idstr_.append(1, ch); -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated if (_m_unicode_word_breakable(iptr_)) { ++iptr_; @@ -148,6 +149,11 @@ namespace nana{ namespace widgets{ namespace skeletons ch = *++iptr_; } + + //When the last _m_unicode_word_breakable returns true, it implies the ch(left character) + //is not the breakable character. So it belongs to the data. + idstr_.append(1, ch); + ++iptr_; #else if (_m_unicode_word_breakable(ptr_)) { @@ -175,7 +181,7 @@ namespace nana{ namespace widgets{ namespace skeletons if ('\n' == ch) { -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated ++iptr_; #else _m_read(); @@ -185,7 +191,7 @@ namespace nana{ namespace widgets{ namespace skeletons if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) { -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated auto idstr = iptr_; do { @@ -214,7 +220,7 @@ namespace nana{ namespace widgets{ namespace skeletons if (('<' == ch) && format_enabled_) { -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated //pos keeps the current position, and it used for restoring //iptr_ when the search is failed. @@ -263,7 +269,7 @@ namespace nana{ namespace widgets{ namespace skeletons //Escape -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated if (this->format_enabled_ && (ch == '\\')) { if (iptr_ + 1 < endptr_) @@ -326,7 +332,7 @@ namespace nana{ namespace widgets{ namespace skeletons { _m_eat_whitespace(); -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated auto ch = *iptr_++; #else auto ch = _m_read(); @@ -341,7 +347,7 @@ namespace nana{ namespace widgets{ namespace skeletons return token::tag_end; case '"': //Here is a string and all the meta characters will be ignored except " -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated { auto str = iptr_; @@ -359,7 +365,7 @@ namespace nana{ namespace widgets{ namespace skeletons return token::string; case '(': _m_eat_whitespace(); -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated if ((iptr_ < endptr_) && _m_is_idstr_element(*iptr_)) { auto pbegin = iptr_; @@ -423,7 +429,11 @@ namespace nana{ namespace widgets{ namespace skeletons if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || '_' == ch) { +#ifndef REORDER_TOKENS //deprecated + --iptr_; +#else _m_move_back(); +#endif //Here is a identifier _m_read_idstr(); @@ -490,7 +500,7 @@ namespace nana{ namespace widgets{ namespace skeletons //Read the identifier. void _m_read_idstr() { -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated auto idstr = iptr_; wchar_t ch; @@ -519,7 +529,7 @@ namespace nana{ namespace widgets{ namespace skeletons { idstr_.clear(); -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated wchar_t ch = *iptr_; idstr_ += ch; @@ -601,7 +611,7 @@ namespace nana{ namespace widgets{ namespace skeletons { while (true) { -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated switch (*iptr_) { case ' ': @@ -727,7 +737,7 @@ namespace nana{ namespace widgets{ namespace skeletons std::size_t idx_{ 0 }; const wchar_t* ptr_{ nullptr }; -#if 0 //deprecated +#ifndef REORDER_TOKENS //deprecated const wchar_t * iptr_; const wchar_t * endptr_; #endif @@ -986,9 +996,6 @@ namespace nana{ namespace widgets{ namespace skeletons } } - if (!format_enabled) - return; - //Reorder the sequence of line blocks for RTL languages. for (auto & ln : lines_) { diff --git a/include/nana/unicode_bidi.hpp b/include/nana/unicode_bidi.hpp index 5efd7003..2d745167 100644 --- a/include/nana/unicode_bidi.hpp +++ b/include/nana/unicode_bidi.hpp @@ -50,6 +50,7 @@ namespace nana }; std::vector reorder(const char_type*, std::size_t len); + static bool is_text_right(const entity&); private: static unsigned _m_paragraph_level(const char_type * begin, const char_type * end); @@ -72,7 +73,6 @@ namespace nana std::vector unicode_reorder(const wchar_t* text, std::size_t length); bool unicode_wordbreak(wchar_t left, wchar_t right); - } #include diff --git a/source/detail/platform_abstraction.cpp b/source/detail/platform_abstraction.cpp index ea6032b2..d540ea1a 100644 --- a/source/detail/platform_abstraction.cpp +++ b/source/detail/platform_abstraction.cpp @@ -146,6 +146,8 @@ IsWindows8OrGreater() # include "posix/platform_spec.hpp" # include # if defined(NANA_USE_XFT) +# include +# include "text_reshaping.hpp" # include # include # include @@ -211,16 +213,40 @@ namespace nana int const init_x = x; std::unique_ptr glyph_indexes(new FT_UInt[len]); - while(true) + //The RTL and shaping should be handled manually, because the libXft and X doesn't support these language features. + std::wstring rtl; + auto ents = unicode_reorder(str, len); + for(auto & e : ents) { - auto preferred = _m_scan_fonts(xft, str, len, glyph_indexes.get()); - x += _m_draw(xftdraw, xftcolor, preferred.first, x, y, str, preferred.second, glyph_indexes.get()); + auto size = static_cast(e.end - e.begin); + auto p = e.begin; - if(len == preferred.second) - break; + if(unicode_bidi::is_text_right(e)) + { + std::wstring reverse_str; + for(auto i = static_cast(e.end - e.begin) - 1; i > -1; --i) + reverse_str += e.begin[i]; - len -= preferred.second; - str += preferred.second; + //Process reshaping. + rtl = nana::reshaping::arabic::reshape(reverse_str); + p = rtl.c_str(); + size = rtl.size(); + } + + while(true) + { + //Scan the string until the character which font is not same with the font of the first character where the scan begins. + auto preferred = _m_scan_fonts(xft, p, size, glyph_indexes.get()); + x += _m_draw(xftdraw, xftcolor, preferred.first, x, y, p, preferred.second, glyph_indexes.get()); + + if(size == preferred.second) + break; + + size -= preferred.second; + p += preferred.second; + } + + rtl.clear(); } return x - init_x; @@ -298,14 +324,14 @@ namespace nana if(ptab == p) { ++p; - //x += static_cast(tab_pixels_); continue; } auto const size = ptab - p; + ::XftDrawGlyphs(xftdraw, xftcolor, xft, x, y, glyph_indexes + off, size); ::XftGlyphExtents(disp_, xft, glyph_indexes + off, size, &ext); - + x += ext.xOff; if(ptab == end) diff --git a/source/detail/text_reshaping.hpp b/source/detail/text_reshaping.hpp new file mode 100644 index 00000000..31bfac54 --- /dev/null +++ b/source/detail/text_reshaping.hpp @@ -0,0 +1,297 @@ + +namespace nana +{ + namespace reshaping + { + namespace arabic + { + const unsigned short TATWEEL = 0x0640; + const unsigned short ZWJ = 0x200D; + + const int unshaped = 255; + const int isolated = 0; + const int initial = 1; + const int medial = 2; + const int final = 3; + + unsigned short letters[][4] = { + {0xFE80, 0, 0, 0}, //ARABIC LETTER HAMZA + {0xFE81, 0, 0, 0xFE82}, //ARABIC LETTER ALEF WITH MADDA ABOVE + {0xFE83, 0, 0, 0xFE84}, //ARABIC LETTER ALEF WITH HAMZA ABOVE + {0xFE85, 0, 0, 0xFE86}, //ARABIC LETTER WAW WITH HAMZA ABOVE + {0xFE87, 0, 0, 0xFE88}, //ARABIC LETTER ALEF WITH HAMZA BELOW + {0xFE89, 0xFE8B, 0xFE8C, 0xFE8A}, //ARABIC LETTER YEH WITH HAMZA ABOVE + {0xFE8D, 0, 0, 0xFE8E}, //ARABIC LETTER ALEF + {0xFE8F, 0xFE91, 0xFE92, 0xFE90}, //ARABIC LETTER BEH + {0xFE93, 0, 0, 0xFE94}, //ARABIC LETTER TEH MARBUTA + {0xFE95, 0xFE97, 0xFE98, 0xFE96}, //ARABIC LETTER TEH + {0xFE99, 0xFE9B, 0xFE9C, 0xFE9A}, //ARABIC LETTER THEH + {0xFE9D, 0xFE9F, 0xFEA0, 0xFE9E}, //ARABIC LETTER JEEM + {0xFEA1, 0xFEA3, 0xFEA4, 0xFEA2}, //ARABIC LETTER HAH + {0xFEA5, 0xFEA7, 0xFEA8, 0xFEA6}, //ARABIC LETTER KHAH + {0xFEA9, 0, 0, 0xFEAA}, //ARABIC LETTER DAL + {0xFEAB, 0, 0, 0xFEAC}, //ARABIC LETTER THAL + {0xFEAD, 0, 0, 0xFEAE}, //ARABIC LETTER REH + {0xFEAF, 0, 0, 0xFEB0}, //ARABIC LETTER ZAIN + {0xFEB1, 0xFEB3, 0xFEB4, 0xFEB2}, //ARABIC LETTER SEEN + {0xFEB5, 0xFEB7, 0xFEB8, 0xFEB6}, //ARABIC LETTER SHEEN + {0xFEB9, 0xFEBB, 0xFEBC, 0xFEBA}, //ARABIC LETTER SAD + {0xFEBD, 0xFEBF, 0xFEC0, 0xFEBE}, //ARABIC LETTER DAD + {0xFEC1, 0xFEC3, 0xFEC4, 0xFEC2}, //ARABIC LETTER TAH + {0xFEC5, 0xFEC7, 0xFEC8, 0xFEC6}, //ARABIC LETTER ZAH + {0xFEC9, 0xFECB, 0xFECC, 0xFECA}, //ARABIC LETTER AIN + {0xFECD, 0xFECF, 0xFED0, 0xFECE}, //ARABIC LETTER GHAIN + {TATWEEL, TATWEEL, TATWEEL, TATWEEL}, //ARABIC TATWEEL + {0xFED1, 0xFED3, 0xFED4, 0xFED2}, //ARABIC LETTER FEH + {0xFED5, 0xFED7, 0xFED8, 0xFED6}, //ARABIC LETTER QAF + {0xFED9, 0xFEDB, 0xFEDC, 0xFEDA}, //ARABIC LETTER KAF + {0xFEDD, 0xFEDF, 0xFEE0, 0xFEDE}, //ARABIC LETTER LAM + {0xFEE1, 0xFEE3, 0xFEE4, 0xFEE2}, //ARABIC LETTER MEEM + {0xFEE5, 0xFEE7, 0xFEE8, 0xFEE6}, //ARABIC LETTER NOON + {0xFEE9, 0xFEEB, 0xFEEC, 0xFEEA}, //ARABIC LETTER HEH + {0xFEED, 0, 0, 0xFEEE}, //ARABIC LETTER WAW + {0xFEEF, 0xFBE8, 0xFBE9, 0xFEF0}, //ARABIC LETTER (UIGHUR KAZAKH KIRGHIZ)? ALEF MAKSURA + {0xFEF1, 0xFEF3, 0xFEF4, 0xFEF2}, //ARABIC LETTER YEH + {0xFB50, 0, 0, 0xFB51}, //ARABIC LETTER ALEF WASLA + {0xFBDD, 0, 0, 0}, //ARABIC LETTER U WITH HAMZA ABOVE + {0xFB66, 0xFB68, 0xFB69, 0xFB67}, //ARABIC LETTER TTEH + {0xFB5E, 0xFB60, 0xFB61, 0xFB5F}, //ARABIC LETTER TTEHEH + {0xFB52, 0xFB54, 0xFB55, 0xFB53}, //ARABIC LETTER BEEH + {0xFB56, 0xFB58, 0xFB59, 0xFB57}, //ARABIC LETTER PEH + {0xFB62, 0xFB64, 0xFB65, 0xFB63}, //ARABIC LETTER TEHEH + {0xFB5A, 0xFB5C, 0xFB5D, 0xFB5B}, //ARABIC LETTER BEHEH + {0xFB76, 0xFB78, 0xFB79, 0xFB77}, //ARABIC LETTER NYEH + {0xFB72, 0xFB74, 0xFB75, 0xFB73}, //ARABIC LETTER DYEH + {0xFB7A, 0xFB7C, 0xFB7D, 0xFB7B}, //ARABIC LETTER TCHEH + {0xFB7E, 0xFB80, 0xFB81, 0xFB7F}, //ARABIC LETTER TCHEHEH + {0xFB88, 0, 0, 0xFB89}, //ARABIC LETTER DDAL + {0xFB84, 0, 0, 0xFB85}, //ARABIC LETTER DAHAL + {0xFB82, 0, 0, 0xFB83}, //ARABIC LETTER DDAHAL + {0xFB86, 0, 0, 0xFB87}, //ARABIC LETTER DUL + {0xFB8C, 0, 0, 0xFB8D}, //ARABIC LETTER RREH + {0xFB8A, 0, 0, 0xFB8B}, //ARABIC LETTER JEH + {0xFB6A, 0xFB6C, 0xFB6D, 0xFB6B}, //ARABIC LETTER VEH + {0xFB6E, 0xFB70, 0xFB71, 0xFB6F}, //ARABIC LETTER PEHEH + {0xFB8E, 0xFB90, 0xFB91, 0xFB8F}, //ARABIC LETTER KEHEH + {0xFBD3, 0xFBD5, 0xFBD6, 0xFBD4}, //ARABIC LETTER NG + {0xFB92, 0xFB94, 0xFB95, 0xFB93}, //ARABIC LETTER GAF + {0xFB9A, 0xFB9C, 0xFB9D, 0xFB9B}, //ARABIC LETTER NGOEH + {0xFB96, 0xFB98, 0xFB99, 0xFB97}, //ARABIC LETTER GUEH + {0xFB9E, 0, 0, 0xFB9F}, //ARABIC LETTER NOON GHUNNA + {0xFBA0, 0xFBA2, 0xFBA3, 0xFBA1}, //ARABIC LETTER RNOON + {0xFBAA, 0xFBAC, 0xFBAD, 0xFBAB}, //ARABIC LETTER HEH DOACHASHMEE + {0xFBA4, 0, 0, 0xFBA5}, //ARABIC LETTER HEH WITH YEH ABOVE + {0xFBA6, 0xFBA8, 0xFBA9, 0xFBA7}, //ARABIC LETTER HEH GOAL + {0xFBE0, 0, 0, 0xFBE1}, //ARABIC LETTER KIRGHIZ OE + {0xFBD9, 0, 0, 0xFBDA}, //ARABIC LETTER OE + {0xFBD7, 0, 0, 0xFBD8}, //ARABIC LETTER U + {0xFBDB, 0, 0, 0xFBDC}, //ARABIC LETTER YU + {0xFBE2, 0, 0, 0xFBE3}, //ARABIC LETTER KIRGHIZ YU + {0xFBDE, 0, 0, 0xFBDF}, //ARABIC LETTER VE + {0xFBFC, 0xFBFE, 0xFBFF, 0xFBFD}, //ARABIC LETTER FARSI YEH + {0xFBE4, 0xFBE6, 0xFBE7, 0xFBE5}, //ARABIC LETTER E + {0xFBAE, 0, 0, 0xFBAF}, //ARABIC LETTER YEH BARREE + {0xFBB0, 0, 0, 0xFBB1}, //ARABIC LETTER YEH BARREE WITH HAMZA ABOVE + {ZWJ, ZWJ, ZWJ, ZWJ} + }; + + bool harakat(wchar_t letter) + { + return + (0x0610 <= letter && letter <= 0x061A) || + (0x064B <= letter && letter <= 0x065F) || + (0x0670 == letter) || + (0x06D6 <= letter && letter <= 0x06DC) || + (0x06DF <= letter && letter <= 0x06E8) || + (0x06EA <= letter && letter <= 0x06ED) || + (0x08D4 <= letter && letter <= 0x08E1) || + (0x08D4 <= letter && letter <= 0x08ED) || + (0x08E3 <= letter && letter <= 0x08FF); + } + + int form_index(wchar_t letter) + { + static unsigned short ranges[][2]={ + {0x0621, 0x063A}, + {0x0640, 0x064A}, + {0x0671, 0x0671}, + {0x0677, 0x0677}, + {0x0679, 0x067B}, + {0x067E, 0x0680}, + {0x0683, 0x0684}, + {0x0686, 0x0688}, + {0x068C, 0x068E}, + {0x0691, 0x0691}, + {0x0698, 0x0698}, + {0x06A4, 0x06A4}, + {0x06A6, 0x06A6}, + {0x06A9, 0x06A9}, + {0x06AD, 0x06AD}, + {0x06AF, 0x06AF}, + {0x06B1, 0x06B1}, + {0x06B3, 0x06B3}, + {0x06BA, 0x06BB}, + {0x06BE, 0x06BE}, + {0x06C0, 0x06C1}, + {0x06C5, 0x06C9}, + {0x06CB, 0x06CC}, + {0x06D0, 0x06D0}, + {0x06D2, 0x06D3}, + {ZWJ, ZWJ} + }; + + if((letter < 0x0621) || (0x06D3 < letter && letter != ZWJ)) + return -1; + + int base = 0; + for(std::size_t i = 0; i < sizeof(ranges) / sizeof(unsigned short) / 2; ++i) + { + if(ranges[i][0] <= letter && letter <= ranges[i][1]) + return static_cast(letter - ranges[i][0]) + base; + + base += static_cast(ranges[i][1] - ranges[i][0]) + 1; + } + return base; + } + + wchar_t connect_before(wchar_t letter) + { + auto idx = form_index(letter); + if(idx < 0) + return 0; + + return letters[idx][final] || letters[idx][medial]; + } + + wchar_t connect_after(wchar_t letter) + { + auto idx = form_index(letter); + if(idx < 0) + return 0; + + return letters[idx][initial] || letters[idx][medial]; + } + + wchar_t connect_before_after(wchar_t letter) + { + auto idx = form_index(letter); + if(idx < 0) + return 0; + + return letters[idx][medial]; + } + + std::wstring reshape(const std::wstring& text) + { + bool const use_unshaped_instead_of_isolated = false; + bool const delete_harakat = true; + bool const shift_harakat_position = false; + bool const delete_tatweel = false; + bool const support_zwj = true; + + const int no_form = -1; + const int isolated_form = use_unshaped_instead_of_isolated ? unshaped : isolated; + + std::wstring output; + std::vector forms; + + std::map positions_harakat; + + for(auto letter: text) + { + if(harakat(letter)) + { + if(!delete_harakat) + { + int position = static_cast(output.size()) - 1; + + if (shift_harakat_position) + --position; + if (positions_harakat.count(position) == 0) + positions_harakat[position]; + + if (shift_harakat_position) + { + auto & ph = positions_harakat[position]; + ph.insert(ph.cbegin(), letter); + } + else + positions_harakat[position] += letter; + } + continue; + } + else if(((TATWEEL == letter) && delete_tatweel) || (ZWJ == letter && !support_zwj)) + { + continue; + } + + auto idx = form_index(letter); + if(idx < 0) + { + output += letter; + forms.push_back(no_form); + continue; + } + + if(forms.empty()) + { + output += letter; + forms.push_back(isolated_form); + continue; + } + + if((forms.back() == no_form) || (!connect_before(letter)) || (!connect_after(output.back())) || + ((forms.back() == final) && !connect_before_after(output.back()))) + { + output += letter; + forms.push_back(isolated_form); + } + else if(forms.back() == isolated_form) + { + forms.back() = initial; + + output += letter; + forms.push_back(final); + } + else + { + forms.back() = medial; + output += letter; + forms.push_back(final); + } + + //Remove ZWJ if it's the second to last item as it won't be useful + if(support_zwj && (output.size() > 1) && (output[output.size() - 2] == ZWJ)) + output.erase(output.size() - 2, 1); + } + + //Remove ZWJ if it's the second to last item as it won't be useful + if(support_zwj && (output.size() > 0) && (output.back() == ZWJ)) + output.pop_back(); + + + std::wstring result; + if((!delete_harakat) && positions_harakat.count(-1)) + result += positions_harakat[-1]; + + for(std::size_t i = 0; i < output.size(); ++i) + { + if(output[i]) + { + if(forms[i] == no_form || forms[i] == unshaped) + result += output[i]; + else + result += letters[form_index(output[i])][forms[i]]; + } + + if(!delete_harakat) + if(positions_harakat.count(i)) + result += positions_harakat[i]; + } + + return result; + } + + }//end namespace arabic + }//end namespace reshaping +} \ No newline at end of file diff --git a/source/unicode_bidi.cpp b/source/unicode_bidi.cpp index b2f5f6df..5957981e 100644 --- a/source/unicode_bidi.cpp +++ b/source/unicode_bidi.cpp @@ -606,6 +606,11 @@ namespace nana return reordered; } + bool unicode_bidi::is_text_right(const entity& e) + { + return ((e.bidi_char_type != unicode_bidi::bidi_char::L) && (e.level & 1)); + } + unsigned unicode_bidi::_m_paragraph_level(const char_type * begin, const char_type * end) { for(const char_type* i = begin; i != end; ++i)