//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
#define BOOST_LOCALE_UTF_HPP_INCLUDED

#include <boost/cstdint.hpp>

namespace boost {
namespace locale {
///
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// All functions defined in this namespace do not require linking with Boost.Locale library
///
namespace utf {
    /// \cond INTERNAL
    // Branch-prediction hints: expand to __builtin_expect when __GNUC__ is
    // defined and to the bare expression otherwise.
    #ifdef __GNUC__
    #   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
    #   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
    #else
    #   define BOOST_LOCALE_LIKELY(x)   (x)
    #   define BOOST_LOCALE_UNLIKELY(x) (x)
    #endif
    /// \endcond

    ///
    /// \brief The integral type that can hold a Unicode code point
    ///
    typedef uint32_t code_point;

    ///
    /// \brief Special constant that defines illegal code point
    ///
    /// Returned by the decode functions when the input is not a valid sequence.
    ///
    static const code_point illegal = 0xFFFFFFFFu;

    ///
    /// \brief Special constant that defines incomplete code point
    ///
    /// Returned by the decode functions when the input range ends before
    /// the sequence is complete.
    ///
    static const code_point incomplete = 0xFFFFFFFEu;

    ///
    /// \brief the function checks if \a v is a valid code point
    ///
    /// A value is valid when it is not above 0x10FFFF and does not lie in
    /// the surrogate range [0xD800, 0xDFFF].
    ///
    inline bool is_valid_codepoint(code_point v)
    {
        if(v>0x10FFFF)
            return false;
        if(0xD800 <=v && v<= 0xDFFF) // surrogates
            return false;
        return true;
    }

    #ifdef BOOST_LOCALE_DOXYGEN
    ///
    /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
    ///
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits {
        ///
        /// The type of the character
        ///
        typedef CharType char_type;
        ///
        /// Read one code point from the range [p,e) and return it.
        ///
        /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
        /// - If illegal sequence detected returns \ref illegal
        ///
        /// Requirements
        ///
        /// - Iterator is valid input iterator
        ///
        /// Postconditions
        ///
        /// - p points one past the last consumed code unit
        ///
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e);

        ///
        /// Maximal width of valid sequence in the code units:
        ///
        /// - UTF-8  - 4
        /// - UTF-16 - 2
        /// - UTF-32 - 1
        ///
        static const int max_width;
        ///
        /// The width of specific code point in the code units.
        ///
        /// Requirement: value is a valid Unicode code point
        /// Returns value in range [1..max_width]
        ///
        static int width(code_point value);

        ///
        /// Get the size of the trail part of variable length encoded sequence.
        ///
        /// Returns -1 if \a c is not valid lead character
        ///
        static int trail_length(char_type c);
        ///
        /// Returns true if c is trail code unit, always false for UTF-32
        ///
        static bool is_trail(char_type c);
        ///
        /// Returns true if c is lead code unit, always true for UTF-32
        ///
        static bool is_lead(char_type c);

        ///
        /// Convert valid Unicode code point \a value to the UTF sequence.
        ///
        /// Requirements:
        ///
        /// - \a value is valid code point
        /// - \a out is an output iterator should be able to accept at least width(value) units
        ///
        /// Returns the iterator past the last written code unit.
        ///
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out);
        ///
        /// Decodes valid UTF sequence that is pointed by p into code point.
        ///
        /// If the sequence is invalid or points to end the behavior is undefined
        ///
        template<typename Iterator>
        static code_point decode_valid(Iterator &p);
    };

    #else

    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits;

    // UTF-8: selected for character types whose size is 1.
    template<typename CharType>
    struct utf_traits<CharType,1> {

        typedef CharType char_type;

        // Number of trail bytes announced by the lead byte ci:
        // 0 for values below 128, 1..3 for 194..244, and -1 for everything
        // else (128..193 and 245..255), which cannot start a sequence.
        static int trail_length(char_type ci)
        {
            unsigned char c = ci;
            if(c < 128)
                return 0;
            if(BOOST_LOCALE_UNLIKELY(c < 194))
                return -1;
            if(c < 224)
                return 1;
            if(c < 240)
                return 2;
            if(BOOST_LOCALE_LIKELY(c <=244))
                return 3;
            return -1;
        }

        static const int max_width = 4;

        // Number of bytes used to encode value: 1 up to 0x7F, 2 up to 0x7FF,
        // 3 up to 0xFFFF and 4 above that.
        static int width(code_point value)
        {
            if(value <=0x7F) {
                return 1;
            }
            else if(value <=0x7FF) {
                return 2;
            }
            else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
                return 3;
            }
            else {
                return 4;
            }
        }

        // True when the two top bits of ci are 10 (continuation byte).
        static bool is_trail(char_type ci)
        {
            unsigned char c=ci;
            return (c & 0xC0)==0x80;
        }

        // True for every byte that is not a continuation byte.
        static bool is_lead(char_type ci)
        {
            return !is_trail(ci);
        }

        // Reads one code point from [p,e), advancing p past every byte read.
        // Returns incomplete when the range ends inside a sequence and
        // illegal when the lead byte, a trail byte, the decoded value or the
        // encoding length is not acceptable.
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e)
        {
            if(BOOST_LOCALE_UNLIKELY(p==e))
                return incomplete;

            unsigned char lead = *p++;

            // First byte is fully validated here
            int trail_size = trail_length(lead);

            if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
                return illegal;

            //
            // Ok as only ASCII may be of size = 0
            // also optimize for ASCII text
            //
            if(trail_size == 0)
                return lead;

            // Keep only the payload bits of the lead byte:
            // 5 bits for one trail byte, 4 for two, 3 for three.
            code_point c = lead & ((1<<(6-trail_size))-1);

            // Read the rest; every trail byte must be a continuation byte,
            // otherwise the sequence is malformed.
            unsigned char tmp;
            switch(trail_size) {
            case 3:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if(BOOST_LOCALE_UNLIKELY(!is_trail(tmp)))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 2:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if(BOOST_LOCALE_UNLIKELY(!is_trail(tmp)))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 1:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if(BOOST_LOCALE_UNLIKELY(!is_trail(tmp)))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
            }

            // Check code point validity: no surrogates and
            // valid range
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return illegal;

            // make sure it is the most compact representation
            if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
                return illegal;

            return c;

        }

        // Decodes one code point starting at p without any validation or
        // end-of-range check, advancing p past the bytes read.
        template<typename Iterator>
        static code_point decode_valid(Iterator &p)
        {
            unsigned char lead = *p++;
            // Bytes below 0xC0 are returned unchanged.
            if(lead < 192)
                return lead;

            int trail_size;

            if(lead < 224)
                trail_size = 1;
            else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
                trail_size = 2;
            else
                trail_size = 3;

            code_point c = lead & ((1<<(6-trail_size))-1);

            switch(trail_size) {
            case 3:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 2:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 1:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
            }

            return c;
        }

        // Writes value as 1 to 4 bytes through out and returns the advanced
        // iterator. The value itself is not checked.
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out)
        {
            if(value <=0x7F) {
                *out++ = value;
            }
            else if(value <=0x7FF) {
                *out++=(value >> 6) | 0xC0;
                *out++=(value & 0x3F) | 0x80;
            }
            else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
                *out++=(value >> 12) | 0xE0;
                *out++=((value >> 6) & 0x3F) | 0x80;
                *out++=(value & 0x3F) | 0x80;
            }
            else {
                *out++=(value >> 18) | 0xF0;
                *out++=((value >> 12) & 0x3F) | 0x80;
                *out++=((value >> 6) & 0x3F) | 0x80;
                *out++=(value & 0x3F) | 0x80;
            }
            return out;
        }
    }; // utf8

    // UTF-16: selected for character types whose size is 2.
    template<typename CharType>
    struct utf_traits<CharType,2> {
        typedef CharType char_type;

        // See RFC 2781
        // True when x lies in [0xD800, 0xDBFF], the first unit of a pair.
        static bool is_first_surrogate(uint16_t x)
        {
            return 0xD800 <=x && x<= 0xDBFF;
        }
        // True when x lies in [0xDC00, 0xDFFF], the second unit of a pair.
        static bool is_second_surrogate(uint16_t x)
        {
            return 0xDC00 <=x && x<= 0xDFFF;
        }
        // Joins the low 10 bits of w1 and w2 and adds 0x10000.
        static code_point combine_surrogate(uint16_t w1,uint16_t w2)
        {
            return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
        }
        // 1 for a first surrogate, -1 for a second surrogate (it cannot
        // start a sequence), 0 for any other unit.
        static int trail_length(char_type c)
        {
            if(is_first_surrogate(c))
                return 1;
            if(is_second_surrogate(c))
                return -1;
            return 0;
        }
        ///
        /// Returns true if c is trail code unit, i.e. a second surrogate
        ///
        static bool is_trail(char_type c)
        {
            return is_second_surrogate(c);
        }
        ///
        /// Returns true if c is lead code unit, i.e. anything but a second surrogate
        ///
        static bool is_lead(char_type c)
        {
            return !is_second_surrogate(c);
        }

        // Reads one code point from [current,last), advancing current past
        // every unit read. Returns incomplete when the range is empty or
        // ends after a first surrogate, and illegal for a second surrogate
        // in lead position or a first surrogate not followed by a second one.
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return incomplete;
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            if(w1 > 0xDBFF)
                return illegal;
            if(current==last)
                return incomplete;
            uint16_t w2=*current++;
            if(w2 < 0xDC00 || 0xDFFF < w2)
                return illegal;
            return combine_surrogate(w1,w2);
        }
        // Decodes one code point at current without validation or
        // end-of-range check, advancing current past the units read.
        template<typename It>
        static code_point decode_valid(It &current)
        {
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            uint16_t w2=*current++;
            return combine_surrogate(w1,w2);
        }

        static const int max_width = 2;
        // 2 units for values of 0x10000 and above, 1 otherwise.
        static int width(code_point u)
        {
            return u>=0x10000 ? 2 : 1;
        }
        // Writes u as one unit, or as a surrogate pair when it is above
        // 0xFFFF, and returns the advanced iterator. u is not checked.
        template<typename It>
        static It encode(code_point u,It out)
        {
            if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
                *out++ = u;
            }
            else {
                u-=0x10000;
                *out++=0xD800 | (u>>10);
                *out++=0xDC00 | (u & 0x3FF);
            }
            return out;
        }
    }; // utf16;


    // UTF-32: selected for character types whose size is 4.
    template<typename CharType>
    struct utf_traits<CharType,4> {
        typedef CharType char_type;
        // 0 when c is a valid code point, -1 otherwise.
        static int trail_length(char_type c)
        {
            if(is_valid_codepoint(c))
                return 0;
            return -1;
        }
        // Always false: there are no trail units.
        static bool is_trail(char_type /*c*/)
        {
            return false;
        }
        // Always true: every unit starts a sequence.
        static bool is_lead(char_type /*c*/)
        {
            return true;
        }

        // Returns the unit at current unchecked and advances current.
        template<typename It>
        static code_point decode_valid(It &current)
        {
            return *current++;
        }

        // Reads one unit from [current,last), advancing current past it.
        // Returns incomplete for an empty range and illegal when the unit
        // is not a valid code point.
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return boost::locale::utf::incomplete;
            code_point c=*current++;
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return boost::locale::utf::illegal;
            return c;
        }
        static const int max_width = 1;
        // Always 1.
        static int width(code_point /*u*/)
        {
            return 1;
        }
        // Writes u as a single unit and returns the advanced iterator.
        template<typename It>
        static It encode(code_point u,It out)
        {
            *out++ = u;
            return out;
        }

    }; // utf32

    #endif


} // utf
} // locale
} // boost


#endif

// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4