297 lines
9.8 KiB
C++
297 lines
9.8 KiB
C++
|
|
namespace nana
|
|
{
|
|
namespace reshaping
|
|
{
|
|
namespace arabic
|
|
{
|
|
// Code points that get special treatment by the reshaper.
constexpr unsigned short TATWEEL = 0x0640; // ARABIC TATWEEL
constexpr unsigned short ZWJ     = 0x200D; // ZERO WIDTH JOINER

// Form selectors. isolated/initial/medial/final are also the column indices
// of a row in `letters`; `unshaped` marks a character that reshape() emits
// as-is instead of looking it up in the table.
constexpr int isolated = 0;
constexpr int initial  = 1;
constexpr int medial   = 2;
constexpr int final    = 3;
constexpr int unshaped = 255;

// Presentation forms of every supported letter. A zero entry means the
// letter has no glyph for that form. The row order has to stay in step with
// the range list in form_index(), which turns a code point into a row number;
// the code point each row is reached by is noted in the trailing comment.
unsigned short letters[][4] = {
    // isolated, initial, medial,  final
    {0xFE80,  0,       0,       0      }, // U+0621 ARABIC LETTER HAMZA
    {0xFE81,  0,       0,       0xFE82 }, // U+0622 ARABIC LETTER ALEF WITH MADDA ABOVE
    {0xFE83,  0,       0,       0xFE84 }, // U+0623 ARABIC LETTER ALEF WITH HAMZA ABOVE
    {0xFE85,  0,       0,       0xFE86 }, // U+0624 ARABIC LETTER WAW WITH HAMZA ABOVE
    {0xFE87,  0,       0,       0xFE88 }, // U+0625 ARABIC LETTER ALEF WITH HAMZA BELOW
    {0xFE89,  0xFE8B,  0xFE8C,  0xFE8A }, // U+0626 ARABIC LETTER YEH WITH HAMZA ABOVE
    {0xFE8D,  0,       0,       0xFE8E }, // U+0627 ARABIC LETTER ALEF
    {0xFE8F,  0xFE91,  0xFE92,  0xFE90 }, // U+0628 ARABIC LETTER BEH
    {0xFE93,  0,       0,       0xFE94 }, // U+0629 ARABIC LETTER TEH MARBUTA
    {0xFE95,  0xFE97,  0xFE98,  0xFE96 }, // U+062A ARABIC LETTER TEH
    {0xFE99,  0xFE9B,  0xFE9C,  0xFE9A }, // U+062B ARABIC LETTER THEH
    {0xFE9D,  0xFE9F,  0xFEA0,  0xFE9E }, // U+062C ARABIC LETTER JEEM
    {0xFEA1,  0xFEA3,  0xFEA4,  0xFEA2 }, // U+062D ARABIC LETTER HAH
    {0xFEA5,  0xFEA7,  0xFEA8,  0xFEA6 }, // U+062E ARABIC LETTER KHAH
    {0xFEA9,  0,       0,       0xFEAA }, // U+062F ARABIC LETTER DAL
    {0xFEAB,  0,       0,       0xFEAC }, // U+0630 ARABIC LETTER THAL
    {0xFEAD,  0,       0,       0xFEAE }, // U+0631 ARABIC LETTER REH
    {0xFEAF,  0,       0,       0xFEB0 }, // U+0632 ARABIC LETTER ZAIN
    {0xFEB1,  0xFEB3,  0xFEB4,  0xFEB2 }, // U+0633 ARABIC LETTER SEEN
    {0xFEB5,  0xFEB7,  0xFEB8,  0xFEB6 }, // U+0634 ARABIC LETTER SHEEN
    {0xFEB9,  0xFEBB,  0xFEBC,  0xFEBA }, // U+0635 ARABIC LETTER SAD
    {0xFEBD,  0xFEBF,  0xFEC0,  0xFEBE }, // U+0636 ARABIC LETTER DAD
    {0xFEC1,  0xFEC3,  0xFEC4,  0xFEC2 }, // U+0637 ARABIC LETTER TAH
    {0xFEC5,  0xFEC7,  0xFEC8,  0xFEC6 }, // U+0638 ARABIC LETTER ZAH
    {0xFEC9,  0xFECB,  0xFECC,  0xFECA }, // U+0639 ARABIC LETTER AIN
    {0xFECD,  0xFECF,  0xFED0,  0xFECE }, // U+063A ARABIC LETTER GHAIN
    {TATWEEL, TATWEEL, TATWEEL, TATWEEL}, // U+0640 ARABIC TATWEEL
    {0xFED1,  0xFED3,  0xFED4,  0xFED2 }, // U+0641 ARABIC LETTER FEH
    {0xFED5,  0xFED7,  0xFED8,  0xFED6 }, // U+0642 ARABIC LETTER QAF
    {0xFED9,  0xFEDB,  0xFEDC,  0xFEDA }, // U+0643 ARABIC LETTER KAF
    {0xFEDD,  0xFEDF,  0xFEE0,  0xFEDE }, // U+0644 ARABIC LETTER LAM
    {0xFEE1,  0xFEE3,  0xFEE4,  0xFEE2 }, // U+0645 ARABIC LETTER MEEM
    {0xFEE5,  0xFEE7,  0xFEE8,  0xFEE6 }, // U+0646 ARABIC LETTER NOON
    {0xFEE9,  0xFEEB,  0xFEEC,  0xFEEA }, // U+0647 ARABIC LETTER HEH
    {0xFEED,  0,       0,       0xFEEE }, // U+0648 ARABIC LETTER WAW
    {0xFEEF,  0xFBE8,  0xFBE9,  0xFEF0 }, // U+0649 ARABIC LETTER (UIGHUR KAZAKH KIRGHIZ)? ALEF MAKSURA
    {0xFEF1,  0xFEF3,  0xFEF4,  0xFEF2 }, // U+064A ARABIC LETTER YEH
    {0xFB50,  0,       0,       0xFB51 }, // U+0671 ARABIC LETTER ALEF WASLA
    {0xFBDD,  0,       0,       0      }, // U+0677 ARABIC LETTER U WITH HAMZA ABOVE
    {0xFB66,  0xFB68,  0xFB69,  0xFB67 }, // U+0679 ARABIC LETTER TTEH
    {0xFB5E,  0xFB60,  0xFB61,  0xFB5F }, // U+067A ARABIC LETTER TTEHEH
    {0xFB52,  0xFB54,  0xFB55,  0xFB53 }, // U+067B ARABIC LETTER BEEH
    {0xFB56,  0xFB58,  0xFB59,  0xFB57 }, // U+067E ARABIC LETTER PEH
    {0xFB62,  0xFB64,  0xFB65,  0xFB63 }, // U+067F ARABIC LETTER TEHEH
    {0xFB5A,  0xFB5C,  0xFB5D,  0xFB5B }, // U+0680 ARABIC LETTER BEHEH
    {0xFB76,  0xFB78,  0xFB79,  0xFB77 }, // U+0683 ARABIC LETTER NYEH
    {0xFB72,  0xFB74,  0xFB75,  0xFB73 }, // U+0684 ARABIC LETTER DYEH
    {0xFB7A,  0xFB7C,  0xFB7D,  0xFB7B }, // U+0686 ARABIC LETTER TCHEH
    {0xFB7E,  0xFB80,  0xFB81,  0xFB7F }, // U+0687 ARABIC LETTER TCHEHEH
    {0xFB88,  0,       0,       0xFB89 }, // U+0688 ARABIC LETTER DDAL
    {0xFB84,  0,       0,       0xFB85 }, // U+068C ARABIC LETTER DAHAL
    {0xFB82,  0,       0,       0xFB83 }, // U+068D ARABIC LETTER DDAHAL
    {0xFB86,  0,       0,       0xFB87 }, // U+068E ARABIC LETTER DUL
    {0xFB8C,  0,       0,       0xFB8D }, // U+0691 ARABIC LETTER RREH
    {0xFB8A,  0,       0,       0xFB8B }, // U+0698 ARABIC LETTER JEH
    {0xFB6A,  0xFB6C,  0xFB6D,  0xFB6B }, // U+06A4 ARABIC LETTER VEH
    {0xFB6E,  0xFB70,  0xFB71,  0xFB6F }, // U+06A6 ARABIC LETTER PEHEH
    {0xFB8E,  0xFB90,  0xFB91,  0xFB8F }, // U+06A9 ARABIC LETTER KEHEH
    {0xFBD3,  0xFBD5,  0xFBD6,  0xFBD4 }, // U+06AD ARABIC LETTER NG
    {0xFB92,  0xFB94,  0xFB95,  0xFB93 }, // U+06AF ARABIC LETTER GAF
    {0xFB9A,  0xFB9C,  0xFB9D,  0xFB9B }, // U+06B1 ARABIC LETTER NGOEH
    {0xFB96,  0xFB98,  0xFB99,  0xFB97 }, // U+06B3 ARABIC LETTER GUEH
    {0xFB9E,  0,       0,       0xFB9F }, // U+06BA ARABIC LETTER NOON GHUNNA
    {0xFBA0,  0xFBA2,  0xFBA3,  0xFBA1 }, // U+06BB ARABIC LETTER RNOON
    {0xFBAA,  0xFBAC,  0xFBAD,  0xFBAB }, // U+06BE ARABIC LETTER HEH DOACHASHMEE
    {0xFBA4,  0,       0,       0xFBA5 }, // U+06C0 ARABIC LETTER HEH WITH YEH ABOVE
    {0xFBA6,  0xFBA8,  0xFBA9,  0xFBA7 }, // U+06C1 ARABIC LETTER HEH GOAL
    {0xFBE0,  0,       0,       0xFBE1 }, // U+06C5 ARABIC LETTER KIRGHIZ OE
    {0xFBD9,  0,       0,       0xFBDA }, // U+06C6 ARABIC LETTER OE
    {0xFBD7,  0,       0,       0xFBD8 }, // U+06C7 ARABIC LETTER U
    {0xFBDB,  0,       0,       0xFBDC }, // U+06C8 ARABIC LETTER YU
    {0xFBE2,  0,       0,       0xFBE3 }, // U+06C9 ARABIC LETTER KIRGHIZ YU
    {0xFBDE,  0,       0,       0xFBDF }, // U+06CB ARABIC LETTER VE
    {0xFBFC,  0xFBFE,  0xFBFF,  0xFBFD }, // U+06CC ARABIC LETTER FARSI YEH
    {0xFBE4,  0xFBE6,  0xFBE7,  0xFBE5 }, // U+06D0 ARABIC LETTER E
    {0xFBAE,  0,       0,       0xFBAF }, // U+06D2 ARABIC LETTER YEH BARREE
    {0xFBB0,  0,       0,       0xFBB1 }, // U+06D3 ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
    {ZWJ,     ZWJ,     ZWJ,     ZWJ    }  // U+200D ZERO WIDTH JOINER
};
|
|
|
|
/// Tells whether `letter` belongs to the set of code points that reshape()
/// handles as harakat (removed or re-attached, never shaped).
///
/// @param letter  Code point to classify.
/// @return true when `letter` lies in one of the inclusive intervals below.
bool harakat(wchar_t letter)
{
    // Inclusive [first, last] intervals, ascending. The last entry is the
    // union of the overlapping intervals 08D4-08E1, 08D4-08ED and 08E3-08FF.
    static const unsigned short marks[][2] = {
        {0x0610, 0x061A},
        {0x064B, 0x065F},
        {0x0670, 0x0670},
        {0x06D6, 0x06DC},
        {0x06DF, 0x06E8},
        {0x06EA, 0x06ED},
        {0x08D4, 0x08FF}
    };

    for(const auto& span : marks)
    {
        if(span[0] <= letter && letter <= span[1])
            return true;
    }
    return false;
}
|
|
|
|
int form_index(wchar_t letter)
|
|
{
|
|
static unsigned short ranges[][2]={
|
|
{0x0621, 0x063A},
|
|
{0x0640, 0x064A},
|
|
{0x0671, 0x0671},
|
|
{0x0677, 0x0677},
|
|
{0x0679, 0x067B},
|
|
{0x067E, 0x0680},
|
|
{0x0683, 0x0684},
|
|
{0x0686, 0x0688},
|
|
{0x068C, 0x068E},
|
|
{0x0691, 0x0691},
|
|
{0x0698, 0x0698},
|
|
{0x06A4, 0x06A4},
|
|
{0x06A6, 0x06A6},
|
|
{0x06A9, 0x06A9},
|
|
{0x06AD, 0x06AD},
|
|
{0x06AF, 0x06AF},
|
|
{0x06B1, 0x06B1},
|
|
{0x06B3, 0x06B3},
|
|
{0x06BA, 0x06BB},
|
|
{0x06BE, 0x06BE},
|
|
{0x06C0, 0x06C1},
|
|
{0x06C5, 0x06C9},
|
|
{0x06CB, 0x06CC},
|
|
{0x06D0, 0x06D0},
|
|
{0x06D2, 0x06D3},
|
|
{ZWJ, ZWJ}
|
|
};
|
|
|
|
if((letter < 0x0621) || (0x06D3 < letter && letter != ZWJ))
|
|
return -1;
|
|
|
|
int base = 0;
|
|
for(std::size_t i = 0; i < sizeof(ranges) / sizeof(unsigned short) / 2; ++i)
|
|
{
|
|
if(ranges[i][0] <= letter && letter <= ranges[i][1])
|
|
return static_cast<int>(letter - ranges[i][0]) + base;
|
|
|
|
base += static_cast<int>(ranges[i][1] - ranges[i][0]) + 1;
|
|
}
|
|
return base;
|
|
}
|
|
|
|
wchar_t connect_before(wchar_t letter)
|
|
{
|
|
auto idx = form_index(letter);
|
|
if(idx < 0)
|
|
return 0;
|
|
|
|
return letters[idx][final] || letters[idx][medial];
|
|
}
|
|
|
|
wchar_t connect_after(wchar_t letter)
|
|
{
|
|
auto idx = form_index(letter);
|
|
if(idx < 0)
|
|
return 0;
|
|
|
|
return letters[idx][initial] || letters[idx][medial];
|
|
}
|
|
|
|
wchar_t connect_before_after(wchar_t letter)
|
|
{
|
|
auto idx = form_index(letter);
|
|
if(idx < 0)
|
|
return 0;
|
|
|
|
return letters[idx][medial];
|
|
}
|
|
|
|
std::wstring reshape(const std::wstring& text)
|
|
{
|
|
bool const use_unshaped_instead_of_isolated = false;
|
|
bool const delete_harakat = true;
|
|
bool const shift_harakat_position = false;
|
|
bool const delete_tatweel = false;
|
|
bool const support_zwj = true;
|
|
|
|
const int no_form = -1;
|
|
const int isolated_form = use_unshaped_instead_of_isolated ? unshaped : isolated;
|
|
|
|
std::wstring output;
|
|
std::vector<int> forms;
|
|
|
|
std::map<int, std::wstring> positions_harakat;
|
|
|
|
for(auto letter: text)
|
|
{
|
|
if(harakat(letter))
|
|
{
|
|
if(!delete_harakat)
|
|
{
|
|
int position = static_cast<int>(output.size()) - 1;
|
|
|
|
if (shift_harakat_position)
|
|
--position;
|
|
if (positions_harakat.count(position) == 0)
|
|
positions_harakat[position];
|
|
|
|
if (shift_harakat_position)
|
|
{
|
|
auto & ph = positions_harakat[position];
|
|
ph.insert(ph.cbegin(), letter);
|
|
}
|
|
else
|
|
positions_harakat[position] += letter;
|
|
}
|
|
continue;
|
|
}
|
|
else if(((TATWEEL == letter) && delete_tatweel) || (ZWJ == letter && !support_zwj))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
auto idx = form_index(letter);
|
|
if(idx < 0)
|
|
{
|
|
output += letter;
|
|
forms.push_back(no_form);
|
|
continue;
|
|
}
|
|
|
|
if(forms.empty())
|
|
{
|
|
output += letter;
|
|
forms.push_back(isolated_form);
|
|
continue;
|
|
}
|
|
|
|
if((forms.back() == no_form) || (!connect_before(letter)) || (!connect_after(output.back())) ||
|
|
((forms.back() == final) && !connect_before_after(output.back())))
|
|
{
|
|
output += letter;
|
|
forms.push_back(isolated_form);
|
|
}
|
|
else if(forms.back() == isolated_form)
|
|
{
|
|
forms.back() = initial;
|
|
|
|
output += letter;
|
|
forms.push_back(final);
|
|
}
|
|
else
|
|
{
|
|
forms.back() = medial;
|
|
output += letter;
|
|
forms.push_back(final);
|
|
}
|
|
|
|
//Remove ZWJ if it's the second to last item as it won't be useful
|
|
if(support_zwj && (output.size() > 1) && (output[output.size() - 2] == ZWJ))
|
|
output.erase(output.size() - 2, 1);
|
|
}
|
|
|
|
//Remove ZWJ if it's the second to last item as it won't be useful
|
|
if(support_zwj && (output.size() > 0) && (output.back() == ZWJ))
|
|
output.pop_back();
|
|
|
|
|
|
std::wstring result;
|
|
if((!delete_harakat) && positions_harakat.count(-1))
|
|
result += positions_harakat[-1];
|
|
|
|
for(std::size_t i = 0; i < output.size(); ++i)
|
|
{
|
|
if(output[i])
|
|
{
|
|
if(forms[i] == no_form || forms[i] == unshaped)
|
|
result += output[i];
|
|
else
|
|
result += letters[form_index(output[i])][forms[i]];
|
|
}
|
|
|
|
if(!delete_harakat)
|
|
if(positions_harakat.count(i))
|
|
result += positions_harakat[i];
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
}//end namespace arabic
|
|
}//end namespace reshaping
|
|
} |