experimenting with def_encoding_error_police

in my case all the 5 variant are working very well !!
This commit is contained in:
qPCR4vir 2016-03-24 03:30:14 +01:00
parent 6b2d5afa6b
commit b2b2bf2858

View File

@ -1,4 +1,4 @@
/* /**
* A Character Encoding Set Implementation * A Character Encoding Set Implementation
* Nana C++ Library(http://www.nanapro.org) * Nana C++ Library(http://www.nanapro.org)
* Copyright(C) 2003-2016 Jinhao(cnjinhao@hotmail.com) * Copyright(C) 2003-2016 Jinhao(cnjinhao@hotmail.com)
@ -7,9 +7,9 @@
* (See accompanying file LICENSE_1_0.txt or copy at * (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt) * http://www.boost.org/LICENSE_1_0.txt)
* *
* @file: nana/charset.cpp * @file nana/charset.cpp
* @brief: A conversion between unicode characters and multi bytes characters * @brief A conversion between unicode characters and multi bytes characters
* @contributions: * @contributions
* UTF16 4-byte decoding issue by Renke Yan. * UTF16 4-byte decoding issue by Renke Yan.
* Pr0curo(pr#98) * Pr0curo(pr#98)
*/ */
@ -20,6 +20,7 @@
#include <cwchar> #include <cwchar>
#include <clocale> #include <clocale>
#include <cstring> //Added by Pr0curo(pr#98) #include <cstring> //Added by Pr0curo(pr#98)
#include <memory>
//GCC 4.7.0 does not implement the <codecvt> and codecvt_utfx classes //GCC 4.7.0 does not implement the <codecvt> and codecvt_utfx classes
#ifndef STD_CODECVT_NOT_SUPPORTED #ifndef STD_CODECVT_NOT_SUPPORTED
@ -211,21 +212,22 @@ namespace nana
namespace detail namespace detail
{ {
/// candidate to be more general??
class locale_initializer class locale_initializer
{ {
public: public:
static void init() static void init()
{ {
static bool initialized = false; static bool initialized = false;
if(false == initialized) if (initialized) return;
{
initialized = true; initialized = true;
//Only set the C library locale //Only set the C library locale
std::setlocale(LC_CTYPE, ""); std::setlocale(LC_CTYPE, "");
} }
}
}; };
/// convert wchar C string from ? ANSI code page CP_ACP (windows) or LC_CTYPE c locale (-nix) into utf8 std::string
bool wc2mb(std::string& mbstr, const wchar_t * s) bool wc2mb(std::string& mbstr, const wchar_t * s)
{ {
if(nullptr == s || *s == 0) if(nullptr == s || *s == 0)
@ -259,6 +261,7 @@ namespace nana
return true; return true;
} }
/// convert a char C-string from The system default Windows ANSI code page CP_ACP or from LC_CTYPE c locale (-nix) into utf16 std::wstring
bool mb2wc(std::wstring& wcstr, const char* s) bool mb2wc(std::wstring& wcstr, const char* s)
{ {
if(nullptr == s || *s == 0) if(nullptr == s || *s == 0)
@ -291,6 +294,7 @@ namespace nana
return true; return true;
} }
/// convert a char C string from The system default Windows ANSI code page CP_ACP or LC_CTYPE c locale (-nix) into utf16 std::string
bool mb2wc(std::string& wcstr, const char* s) bool mb2wc(std::string& wcstr, const char* s)
{ {
if(nullptr == s || *s == 0) if(nullptr == s || *s == 0)
@ -304,6 +308,7 @@ namespace nana
{ {
wcstr.resize((chars - 1) * sizeof(wchar_t)); wcstr.resize((chars - 1) * sizeof(wchar_t));
::MultiByteToWideChar(CP_ACP, 0, s, -1, reinterpret_cast<wchar_t*>(&wcstr[0]), chars - 1); ::MultiByteToWideChar(CP_ACP, 0, s, -1, reinterpret_cast<wchar_t*>(&wcstr[0]), chars - 1);
// ^ the trick !
} }
#else #else
locale_initializer::init(); locale_initializer::init();
@ -338,6 +343,84 @@ namespace nana
virtual std::wstring&& wstr_move() = 0; virtual std::wstring&& wstr_move() = 0;
}; };
/// playing with the idea - we need a mechanisme to set a user selected police - Testing an abtract interphase
struct encoding_error_police
{
virtual unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) = 0;
virtual ~encoding_error_police() = default;
};
/// the current nana default: it is safe - you may want to keep it ! use the other at your risk: mainly for debugging
struct utf8_error_police : public encoding_error_police
{
unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override
{
current_code_unit = end;
return 0;
}
};
///
struct utf8_error_police_def_char : public encoding_error_police
{
static unsigned long def_error_mark ;
unsigned long error_mark{ def_error_mark };
utf8_error_police_def_char() = default;
utf8_error_police_def_char( unsigned long mark): error_mark{mark}{}
unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override
{
++current_code_unit; //check (p != end) ?
return error_mark;
}
};
unsigned long utf8_error_police_def_char::def_error_mark{ '*' };
///
struct utf8_error_police_throw : public encoding_error_police
{
unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override
{
//utf8_Error::use_throw = true;
utf8_Error(std::string("The text is not encoded in UTF8: ") +
reinterpret_cast<const char*>( current_code_unit) ).emit();;
current_code_unit = end;
return 0;
}
};
struct utf8_error_police_latin : public encoding_error_police
{
unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override
{
return *(current_code_unit++) ;
}
};
struct utf8_error_police_system : public encoding_error_police
{
unsigned long next_code_point(const unsigned char*& current_code_unit, const unsigned char* end) override
{
std::wstring wc;
mb2wc(wc, reinterpret_cast<const char*>(current_code_unit));
current_code_unit++;
return wc[0]; // use utf16char
}
};
auto def_encoding_error_police = std::make_unique<utf8_error_police>(); // the nana default
// auto def_encoding_error_police = std::make_unique<utf8_error_police_latin>();
// auto def_encoding_error_police = std::make_unique<utf8_error_police_throw>();
// auto def_encoding_error_police = std::make_unique<utf8_error_police_def_char>('X');
// auto def_encoding_error_police = std::make_unique<utf8_error_police_system>();
#ifndef STD_CODECVT_NOT_SUPPORTED #ifndef STD_CODECVT_NOT_SUPPORTED
class charset_string class charset_string
: public charset_encoding_interface : public charset_encoding_interface
@ -578,6 +661,8 @@ namespace nana
std::string data_for_move_; std::string data_for_move_;
}; };
#else #else
/// return the first code point and move the pointer to next character, springing to the end by errors /// return the first code point and move the pointer to next character, springing to the end by errors
unsigned long utf8char(const unsigned char*& p, const unsigned char* end) unsigned long utf8char(const unsigned char*& p, const unsigned char* end)
{ {
@ -591,9 +676,10 @@ namespace nana
unsigned long code; unsigned long code;
if(ch < 0xC0) // error? - move to end. Posible ANSI or ISO code-page if(ch < 0xC0) // error? - move to end. Posible ANSI or ISO code-page
{ {
return *(p++); // temp: assume equal //return *(p++); // temp: assume equal
p = end; //p = end;
return 0; //return 0;
return def_encoding_error_police->next_code_point(p, end);
} }
else if(ch < 0xE0 && (p + 1 <= end)) // two byte chararcter else if(ch < 0xE0 && (p + 1 <= end)) // two byte chararcter
{ {