namespace nana
{
namespace reshaping
{
namespace arabic
{
    // Code points with special handling in reshape().
    const unsigned short TATWEEL = 0x0640;  // ARABIC TATWEEL (kashida)
    const unsigned short ZWJ = 0x200D;      // ZERO WIDTH JOINER

    // Form selectors. isolated/initial/medial/final are column indices into `letters`;
    // `unshaped` is a marker meaning "emit the original code point unchanged".
    const int unshaped = 255;
    const int isolated = 0;
    const int initial = 1;
    const int medial = 2;
    const int final = 3;

    // Presentation forms for every supported letter, one row per letter in the order
    // produced by form_index(). Columns are {isolated, initial, medial, final};
    // a 0 entry means the letter has no such form.
    unsigned short letters[][4] = {
        {0xFE80, 0, 0, 0},                      //ARABIC LETTER HAMZA
        {0xFE81, 0, 0, 0xFE82},                 //ARABIC LETTER ALEF WITH MADDA ABOVE
        {0xFE83, 0, 0, 0xFE84},                 //ARABIC LETTER ALEF WITH HAMZA ABOVE
        {0xFE85, 0, 0, 0xFE86},                 //ARABIC LETTER WAW WITH HAMZA ABOVE
        {0xFE87, 0, 0, 0xFE88},                 //ARABIC LETTER ALEF WITH HAMZA BELOW
        {0xFE89, 0xFE8B, 0xFE8C, 0xFE8A},       //ARABIC LETTER YEH WITH HAMZA ABOVE
        {0xFE8D, 0, 0, 0xFE8E},                 //ARABIC LETTER ALEF
        {0xFE8F, 0xFE91, 0xFE92, 0xFE90},       //ARABIC LETTER BEH
        {0xFE93, 0, 0, 0xFE94},                 //ARABIC LETTER TEH MARBUTA
        {0xFE95, 0xFE97, 0xFE98, 0xFE96},       //ARABIC LETTER TEH
        {0xFE99, 0xFE9B, 0xFE9C, 0xFE9A},       //ARABIC LETTER THEH
        {0xFE9D, 0xFE9F, 0xFEA0, 0xFE9E},       //ARABIC LETTER JEEM
        {0xFEA1, 0xFEA3, 0xFEA4, 0xFEA2},       //ARABIC LETTER HAH
        {0xFEA5, 0xFEA7, 0xFEA8, 0xFEA6},       //ARABIC LETTER KHAH
        {0xFEA9, 0, 0, 0xFEAA},                 //ARABIC LETTER DAL
        {0xFEAB, 0, 0, 0xFEAC},                 //ARABIC LETTER THAL
        {0xFEAD, 0, 0, 0xFEAE},                 //ARABIC LETTER REH
        {0xFEAF, 0, 0, 0xFEB0},                 //ARABIC LETTER ZAIN
        {0xFEB1, 0xFEB3, 0xFEB4, 0xFEB2},       //ARABIC LETTER SEEN
        {0xFEB5, 0xFEB7, 0xFEB8, 0xFEB6},       //ARABIC LETTER SHEEN
        {0xFEB9, 0xFEBB, 0xFEBC, 0xFEBA},       //ARABIC LETTER SAD
        {0xFEBD, 0xFEBF, 0xFEC0, 0xFEBE},       //ARABIC LETTER DAD
        {0xFEC1, 0xFEC3, 0xFEC4, 0xFEC2},       //ARABIC LETTER TAH
        {0xFEC5, 0xFEC7, 0xFEC8, 0xFEC6},       //ARABIC LETTER ZAH
        {0xFEC9, 0xFECB, 0xFECC, 0xFECA},       //ARABIC LETTER AIN
        {0xFECD, 0xFECF, 0xFED0, 0xFECE},       //ARABIC LETTER GHAIN
        {TATWEEL, TATWEEL, TATWEEL, TATWEEL},   //ARABIC TATWEEL
        {0xFED1, 0xFED3, 0xFED4, 0xFED2},       //ARABIC LETTER FEH
        {0xFED5, 0xFED7, 0xFED8, 0xFED6},       //ARABIC LETTER QAF
        {0xFED9, 0xFEDB, 0xFEDC, 0xFEDA},       //ARABIC LETTER KAF
        {0xFEDD, 0xFEDF, 0xFEE0, 0xFEDE},       //ARABIC LETTER LAM
        {0xFEE1, 0xFEE3, 0xFEE4, 0xFEE2},       //ARABIC LETTER MEEM
        {0xFEE5, 0xFEE7, 0xFEE8, 0xFEE6},       //ARABIC LETTER NOON
        {0xFEE9, 0xFEEB, 0xFEEC, 0xFEEA},       //ARABIC LETTER HEH
        {0xFEED, 0, 0, 0xFEEE},                 //ARABIC LETTER WAW
        {0xFEEF, 0xFBE8, 0xFBE9, 0xFEF0},       //ARABIC LETTER (UIGHUR KAZAKH KIRGHIZ)? ALEF MAKSURA
        {0xFEF1, 0xFEF3, 0xFEF4, 0xFEF2},       //ARABIC LETTER YEH
        {0xFB50, 0, 0, 0xFB51},                 //ARABIC LETTER ALEF WASLA
        {0xFBDD, 0, 0, 0},                      //ARABIC LETTER U WITH HAMZA ABOVE
        {0xFB66, 0xFB68, 0xFB69, 0xFB67},       //ARABIC LETTER TTEH
        {0xFB5E, 0xFB60, 0xFB61, 0xFB5F},       //ARABIC LETTER TTEHEH
        {0xFB52, 0xFB54, 0xFB55, 0xFB53},       //ARABIC LETTER BEEH
        {0xFB56, 0xFB58, 0xFB59, 0xFB57},       //ARABIC LETTER PEH
        {0xFB62, 0xFB64, 0xFB65, 0xFB63},       //ARABIC LETTER TEHEH
        {0xFB5A, 0xFB5C, 0xFB5D, 0xFB5B},       //ARABIC LETTER BEHEH
        {0xFB76, 0xFB78, 0xFB79, 0xFB77},       //ARABIC LETTER NYEH
        {0xFB72, 0xFB74, 0xFB75, 0xFB73},       //ARABIC LETTER DYEH
        {0xFB7A, 0xFB7C, 0xFB7D, 0xFB7B},       //ARABIC LETTER TCHEH
        {0xFB7E, 0xFB80, 0xFB81, 0xFB7F},       //ARABIC LETTER TCHEHEH
        {0xFB88, 0, 0, 0xFB89},                 //ARABIC LETTER DDAL
        {0xFB84, 0, 0, 0xFB85},                 //ARABIC LETTER DAHAL
        {0xFB82, 0, 0, 0xFB83},                 //ARABIC LETTER DDAHAL
        {0xFB86, 0, 0, 0xFB87},                 //ARABIC LETTER DUL
        {0xFB8C, 0, 0, 0xFB8D},                 //ARABIC LETTER RREH
        {0xFB8A, 0, 0, 0xFB8B},                 //ARABIC LETTER JEH
        {0xFB6A, 0xFB6C, 0xFB6D, 0xFB6B},       //ARABIC LETTER VEH
        {0xFB6E, 0xFB70, 0xFB71, 0xFB6F},       //ARABIC LETTER PEHEH
        {0xFB8E, 0xFB90, 0xFB91, 0xFB8F},       //ARABIC LETTER KEHEH
        {0xFBD3, 0xFBD5, 0xFBD6, 0xFBD4},       //ARABIC LETTER NG
        {0xFB92, 0xFB94, 0xFB95, 0xFB93},       //ARABIC LETTER GAF
        {0xFB9A, 0xFB9C, 0xFB9D, 0xFB9B},       //ARABIC LETTER NGOEH
        {0xFB96, 0xFB98, 0xFB99, 0xFB97},       //ARABIC LETTER GUEH
        {0xFB9E, 0, 0, 0xFB9F},                 //ARABIC LETTER NOON GHUNNA
        {0xFBA0, 0xFBA2, 0xFBA3, 0xFBA1},       //ARABIC LETTER RNOON
        {0xFBAA, 0xFBAC, 0xFBAD, 0xFBAB},       //ARABIC LETTER HEH DOACHASHMEE
        {0xFBA4, 0, 0, 0xFBA5},                 //ARABIC LETTER HEH WITH YEH ABOVE
        {0xFBA6, 0xFBA8, 0xFBA9, 0xFBA7},       //ARABIC LETTER HEH GOAL
        {0xFBE0, 0, 0, 0xFBE1},                 //ARABIC LETTER KIRGHIZ OE
        {0xFBD9, 0, 0, 0xFBDA},                 //ARABIC LETTER OE
        {0xFBD7, 0, 0, 0xFBD8},                 //ARABIC LETTER U
        {0xFBDB, 0, 0, 0xFBDC},                 //ARABIC LETTER YU
        {0xFBE2, 0, 0, 0xFBE3},                 //ARABIC LETTER KIRGHIZ YU
        {0xFBDE, 0, 0, 0xFBDF},                 //ARABIC LETTER VE
        {0xFBFC, 0xFBFE, 0xFBFF, 0xFBFD},       //ARABIC LETTER FARSI YEH
        {0xFBE4, 0xFBE6, 0xFBE7, 0xFBE5},       //ARABIC LETTER E
        {0xFBAE, 0, 0, 0xFBAF},                 //ARABIC LETTER YEH BARREE
        {0xFBB0, 0, 0, 0xFBB1},                 //ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
        {ZWJ, ZWJ, ZWJ, ZWJ}                    //ZERO WIDTH JOINER
    };

    // form_index() maps the code points covered by its range table onto rows 0..77;
    // the table must therefore have exactly 78 rows.
    static_assert(sizeof(letters) / sizeof(letters[0]) == 78,
                  "letters table must have one row per code point covered by form_index()");

    // Returns true if `letter` is one of the combining marks (harakat) that reshape()
    // treats separately from base letters.
    bool harakat(wchar_t letter)
    {
        return (0x0610 <= letter && letter <= 0x061A)
            || (0x064B <= letter && letter <= 0x065F)
            || (0x0670 == letter)
            || (0x06D6 <= letter && letter <= 0x06DC)
            || (0x06DF <= letter && letter <= 0x06E8)
            || (0x06EA <= letter && letter <= 0x06ED)
            // The original listed 0x08D4-0x08E1, 0x08D4-0x08ED and 0x08E3-0x08FF;
            // their union is this single contiguous range.
            || (0x08D4 <= letter && letter <= 0x08FF);
    }

    // Maps a code point to its row in `letters`.
    // Returns -1 when the code point has no entry in the table.
    int form_index(wchar_t letter)
    {
        // Inclusive code point ranges, in the same order as the rows of `letters`.
        static const unsigned short ranges[][2] = {
            {0x0621, 0x063A}, {0x0640, 0x064A}, {0x0671, 0x0671}, {0x0677, 0x0677},
            {0x0679, 0x067B}, {0x067E, 0x0680}, {0x0683, 0x0684}, {0x0686, 0x0688},
            {0x068C, 0x068E}, {0x0691, 0x0691}, {0x0698, 0x0698}, {0x06A4, 0x06A4},
            {0x06A6, 0x06A6}, {0x06A9, 0x06A9}, {0x06AD, 0x06AD}, {0x06AF, 0x06AF},
            {0x06B1, 0x06B1}, {0x06B3, 0x06B3}, {0x06BA, 0x06BB}, {0x06BE, 0x06BE},
            {0x06C0, 0x06C1}, {0x06C5, 0x06C9}, {0x06CB, 0x06CC}, {0x06D0, 0x06D0},
            {0x06D2, 0x06D3}, {ZWJ, ZWJ}
        };

        // Quick rejection of everything outside the overall span of the table.
        if((letter < 0x0621) || (0x06D3 < letter && letter != ZWJ))
            return -1;

        int base = 0;   // number of rows consumed by the ranges already skipped
        for(std::size_t i = 0; i < sizeof(ranges) / sizeof(ranges[0]); ++i)
        {
            if(ranges[i][0] <= letter && letter <= ranges[i][1])
                return static_cast<int>(letter - ranges[i][0]) + base;

            base += static_cast<int>(ranges[i][1] - ranges[i][0]) + 1;
        }

        // The code point lies in a gap between ranges (e.g. the Arabic-Indic digits
        // 0x0660-0x0669). Returning `base` here would be an index one past the end
        // of `letters`, so report "not found" instead.
        return -1;
    }

    // Returns non-zero (1) if `letter` has a final or medial form, i.e. it can join
    // to the letter preceding it; 0 otherwise or if the letter is not in the table.
    wchar_t connect_before(wchar_t letter)
    {
        const int idx = form_index(letter);
        if(idx < 0)
            return 0;
        return (letters[idx][final] || letters[idx][medial]) ? 1 : 0;
    }

    // Returns non-zero (1) if `letter` has an initial or medial form, i.e. it can join
    // to the letter following it; 0 otherwise or if the letter is not in the table.
    wchar_t connect_after(wchar_t letter)
    {
        const int idx = form_index(letter);
        if(idx < 0)
            return 0;
        return (letters[idx][initial] || letters[idx][medial]) ? 1 : 0;
    }

    // Returns the medial form of `letter` (non-zero if it can join on both sides),
    // or 0 if it has none or is not in the table.
    wchar_t connect_before_after(wchar_t letter)
    {
        const int idx = form_index(letter);
        if(idx < 0)
            return 0;
        return letters[idx][medial];
    }

    // Replaces each supported Arabic letter in `text` with the presentation form
    // (isolated/initial/medial/final) selected by its neighbours. Characters that are
    // not in the `letters` table are copied through unchanged. With the current
    // configuration flags harakat are dropped, and a ZWJ is removed once it has
    // influenced the form of the letter next to it.
    std::wstring reshape(const std::wstring& text)
    {
        // Fixed configuration.
        bool const use_unshaped_instead_of_isolated = false;
        bool const delete_harakat = true;
        bool const shift_harakat_position = false;
        bool const delete_tatweel = false;
        bool const support_zwj = true;

        const int no_form = -1;     // marks characters that are not in `letters`
        const int isolated_form = use_unshaped_instead_of_isolated ? unshaped : isolated;

        // `output` and `forms` are parallel: forms[i] is the form chosen for output[i].
        // They must always be modified together.
        std::wstring output;
        std::vector<int> forms;
        // Harakat to re-insert after the output character at the given index
        // (-1 means before the first character).
        std::map<int, std::wstring> positions_harakat;

        for(auto letter : text)
        {
            if(harakat(letter))
            {
                if(!delete_harakat)
                {
                    int position = static_cast<int>(output.size()) - 1;
                    if(shift_harakat_position)
                        --position;

                    auto & ph = positions_harakat[position];
                    if(shift_harakat_position)
                        ph.insert(ph.cbegin(), letter);
                    else
                        ph += letter;
                }
                continue;
            }
            else if(((TATWEEL == letter) && delete_tatweel) || (ZWJ == letter && !support_zwj))
            {
                continue;
            }

            const int idx = form_index(letter);
            if(idx < 0)
            {
                // Not a supported letter: pass through untouched.
                output += letter;
                forms.push_back(no_form);
                continue;
            }

            if(forms.empty())
            {
                output += letter;
                forms.push_back(isolated_form);
                continue;
            }

            if((forms.back() == no_form)
                || (!connect_before(letter))
                || (!connect_after(output.back()))
                || ((forms.back() == final) && !connect_before_after(output.back())))
            {
                // No join with the previous character is possible.
                output += letter;
                forms.push_back(isolated_form);
            }
            else if(forms.back() == isolated_form)
            {
                // Previous letter starts a joined run.
                forms.back() = initial;
                output += letter;
                forms.push_back(final);
            }
            else
            {
                // Previous letter was the end of a run; it now sits in the middle.
                forms.back() = medial;
                output += letter;
                forms.push_back(final);
            }

            //Remove ZWJ if it's the second to last item as it won't be useful
            if(support_zwj && (output.size() > 1) && (output[output.size() - 2] == ZWJ))
            {
                // Erase from both containers so that forms[i] keeps describing output[i].
                output.erase(output.size() - 2, 1);
                forms.erase(forms.end() - 2);
            }
        }

        //Remove ZWJ if it's the last item as it won't be useful
        if(support_zwj && (output.size() > 0) && (output.back() == ZWJ))
        {
            output.pop_back();
            forms.pop_back();
        }

        std::wstring result;
        if(!delete_harakat)
        {
            const auto leading = positions_harakat.find(-1);
            if(leading != positions_harakat.end())
                result += leading->second;
        }

        for(std::size_t i = 0; i < output.size(); ++i)
        {
            if(output[i])
            {
                if(forms[i] == no_form || forms[i] == unshaped)
                    result += output[i];
                else
                    result += static_cast<wchar_t>(letters[form_index(output[i])][forms[i]]);
            }

            if(!delete_harakat)
            {
                const auto marks = positions_harakat.find(static_cast<int>(i));
                if(marks != positions_harakat.end())
                    result += marks->second;
            }
        }

        return result;
    }
}//end namespace arabic
}//end namespace reshaping
}//end namespace nana