| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455 | ////  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)////  Distributed under the Boost Software License, Version 1.0. (See//  accompanying file LICENSE_1_0.txt or copy at//  http://www.boost.org/LICENSE_1_0.txt)//#ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED#define BOOST_NOWIDE_UTF_HPP_INCLUDED#include <boost/nowide/config.hpp>#include <boost/cstdint.hpp>namespace boost {namespace nowide {    namespace detail {        ///        /// \brief Namespace that holds basic operations on UTF encoded sequences        ///        /// All functions defined in this namespace do not require linking with Boost.Nowide library        /// Extracted from Boost.Locale        ///        namespace utf {            ///            /// \brief The integral type that can hold a Unicode code point            ///            typedef uint32_t code_point;            ///            /// \brief Special constant that defines illegal code point            ///            static const code_point illegal = 0xFFFFFFFFu;            ///            /// \brief Special constant that defines incomplete code point            ///            static const code_point incomplete = 0xFFFFFFFEu;            ///            /// \brief the function checks if \a v is a valid code point            ///            inline bool is_valid_codepoint(code_point v)            {                if(v > 0x10FFFF)                    return false;                if(0xD800 <= v && v <= 0xDFFF) // surrogates                    return false;                return true;            }#ifdef BOOST_NOWIDE_DOXYGEN            ///            /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points            ///            template<typename CharType, int size = sizeof(CharType)>            struct utf_traits            {                ///                /// The type of the character                ///                typedef CharType char_type;                ///                /// Read one code point from the range [p,e) and return it.                ///                /// - If the sequence that was read is incomplete sequence returns \ref incomplete,                /// - If illegal sequence detected returns \ref illegal                ///                /// Requirements                ///                /// - Iterator is valid input iterator                ///                /// Postconditions                ///                /// - p points to the last consumed character                ///                template<typename Iterator>                static code_point decode(Iterator& p, Iterator e);                ///                /// Maximal width of valid sequence in the code units:                ///                /// - UTF-8  - 4                /// - UTF-16 - 2                /// - UTF-32 - 1                ///                static const int max_width;                ///                /// The width of specific code point in the code units.                ///                /// Requirement: value is a valid Unicode code point                /// Returns value in range [1..max_width]                ///                static int width(code_point value);                ///                /// Get the size of the trail part of variable length encoded sequence.                ///                /// Returns -1 if C is not valid lead character                ///                static int trail_length(char_type c);                ///                /// Returns true if c is trail code unit, always false for UTF-32                ///                static bool is_trail(char_type c);                ///                /// Returns true if c is lead code unit, always true of UTF-32                ///                static bool is_lead(char_type c);                ///                /// Convert valid Unicode code point \a value to the UTF sequence.                ///                /// Requirements:                ///                /// - \a value is valid code point                /// - \a out is an output iterator should be able to accept at least width(value) units                ///                /// Returns the iterator past the last written code unit.                ///                template<typename Iterator>                static Iterator encode(code_point value, Iterator out);                ///                /// Decodes valid UTF sequence that is pointed by p into code point.                ///                /// If the sequence is invalid or points to end the behavior is undefined                ///                template<typename Iterator>                static code_point decode_valid(Iterator& p);            };#else            template<typename CharType, int size = sizeof(CharType)>            struct utf_traits;            template<typename CharType>            struct utf_traits<CharType, 1>            {                typedef CharType char_type;                static int trail_length(char_type ci)                {                    unsigned char c = ci;                    if(c < 128)                        return 0;                    if(BOOST_UNLIKELY(c < 194))                        return -1;                    if(c < 224)                        return 1;                    if(c < 240)                        return 2;                    if(BOOST_LIKELY(c <= 244))                        return 3;                    return -1;                }                static const int max_width = 4;                static int width(code_point value)                {                    if(value <= 0x7F)                    {                        return 1;                    } else if(value <= 0x7FF)                    {                        return 2;                    } else if(BOOST_LIKELY(value <= 0xFFFF))                    {                        return 3;                    } else                    {                        return 4;                    }                }                static bool is_trail(char_type ci)                {                    unsigned char c = ci;                    return (c & 0xC0) == 0x80;                }                static bool is_lead(char_type ci)                {                    return !is_trail(ci);                }                template<typename Iterator>                static code_point decode(Iterator& p, Iterator e)                {                    if(BOOST_UNLIKELY(p == e))                        return incomplete;                    unsigned char lead = *p++;                    // First byte is fully validated here                    int trail_size = trail_length(lead);                    if(BOOST_UNLIKELY(trail_size < 0))                        return illegal;                    //                    // OK as only ASCII may be of size = 0                    // also optimize for ASCII text                    //                    if(trail_size == 0)                        return lead;                    code_point c = lead & ((1 << (6 - trail_size)) - 1);                    // Read the rest                    unsigned char tmp;                    switch(trail_size)                    {                    case 3:                        if(BOOST_UNLIKELY(p == e))                            return incomplete;                        tmp = *p++;                        if(!is_trail(tmp))                            return illegal;                        c = (c << 6) | (tmp & 0x3F);                        BOOST_NOWIDE_FALLTHROUGH;                    case 2:                        if(BOOST_UNLIKELY(p == e))                            return incomplete;                        tmp = *p++;                        if(!is_trail(tmp))                            return illegal;                        c = (c << 6) | (tmp & 0x3F);                        BOOST_NOWIDE_FALLTHROUGH;                    case 1:                        if(BOOST_UNLIKELY(p == e))                            return incomplete;                        tmp = *p++;                        if(!is_trail(tmp))                            return illegal;                        c = (c << 6) | (tmp & 0x3F);                    }                    // Check code point validity: no surrogates and                    // valid range                    if(BOOST_UNLIKELY(!is_valid_codepoint(c)))                        return illegal;                    // make sure it is the most compact representation                    if(BOOST_UNLIKELY(width(c) != trail_size + 1))                        return illegal;                    return c;                }                template<typename Iterator>                static code_point decode_valid(Iterator& p)                {                    unsigned char lead = *p++;                    if(lead < 192)                        return lead;                    int trail_size;                    if(lead < 224)                        trail_size = 1;                    else if(BOOST_LIKELY(lead < 240)) // non-BMP rare                        trail_size = 2;                    else                        trail_size = 3;                    code_point c = lead & ((1 << (6 - trail_size)) - 1);                    switch(trail_size)                    {                    case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;                    case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;                    case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);                    }                    return c;                }                template<typename Iterator>                static Iterator encode(code_point value, Iterator out)                {                    if(value <= 0x7F)                    {                        *out++ = static_cast<char_type>(value);                    } else if(value <= 0x7FF)                    {                        *out++ = static_cast<char_type>((value >> 6) | 0xC0);                        *out++ = static_cast<char_type>((value & 0x3F) | 0x80);                    } else if(BOOST_LIKELY(value <= 0xFFFF))                    {                        *out++ = static_cast<char_type>((value >> 12) | 0xE0);                        *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);                        *out++ = static_cast<char_type>((value & 0x3F) | 0x80);                    } else                    {                        *out++ = static_cast<char_type>((value >> 18) | 0xF0);                        *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);                        *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);                        *out++ = static_cast<char_type>((value & 0x3F) | 0x80);                    }                    return out;                }            }; // utf8            template<typename CharType>            struct utf_traits<CharType, 2>            {                typedef CharType char_type;                // See RFC 2781                static bool is_first_surrogate(uint16_t x)                {                    return 0xD800 <= x && x <= 0xDBFF;                }                static bool is_second_surrogate(uint16_t x)                {                    return 0xDC00 <= x && x <= 0xDFFF;                }                static code_point combine_surrogate(uint16_t w1, uint16_t w2)                {                    return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;                }                static int trail_length(char_type c)                {                    if(is_first_surrogate(c))                        return 1;                    if(is_second_surrogate(c))                        return -1;                    return 0;                }                ///                /// Returns true if c is trail code unit, always false for UTF-32                ///                static bool is_trail(char_type c)                {                    return is_second_surrogate(c);                }                ///                /// Returns true if c is lead code unit, always true of UTF-32                ///                static bool is_lead(char_type c)                {                    return !is_second_surrogate(c);                }                template<typename It>                static code_point decode(It& current, It last)                {                    if(BOOST_UNLIKELY(current == last))                        return incomplete;                    uint16_t w1 = *current++;                    if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))                    {                        return w1;                    }                    if(w1 > 0xDBFF)                        return illegal;                    if(current == last)                        return incomplete;                    uint16_t w2 = *current++;                    if(w2 < 0xDC00 || 0xDFFF < w2)                        return illegal;                    return combine_surrogate(w1, w2);                }                template<typename It>                static code_point decode_valid(It& current)                {                    uint16_t w1 = *current++;                    if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))                    {                        return w1;                    }                    uint16_t w2 = *current++;                    return combine_surrogate(w1, w2);                }                static const int max_width = 2;                static int width(code_point u)                {                    return u >= 0x10000 ? 2 : 1;                }                template<typename It>                static It encode(code_point u, It out)                {                    if(BOOST_LIKELY(u <= 0xFFFF))                    {                        *out++ = static_cast<char_type>(u);                    } else                    {                        u -= 0x10000;                        *out++ = static_cast<char_type>(0xD800 | (u >> 10));                        *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));                    }                    return out;                }            }; // utf16;            template<typename CharType>            struct utf_traits<CharType, 4>            {                typedef CharType char_type;                static int trail_length(char_type c)                {                    if(is_valid_codepoint(c))                        return 0;                    return -1;                }                static bool is_trail(char_type /*c*/)                {                    return false;                }                static bool is_lead(char_type /*c*/)                {                    return true;                }                template<typename It>                static code_point decode_valid(It& current)                {                    return *current++;                }                template<typename It>                static code_point decode(It& current, It last)                {                    if(BOOST_UNLIKELY(current == last))                        return incomplete;                    code_point c = *current++;                    if(BOOST_UNLIKELY(!is_valid_codepoint(c)))                        return illegal;                    return c;                }                static const int max_width = 1;                static int width(code_point /*u*/)                {                    return 1;                }                template<typename It>                static It encode(code_point u, It out)                {                    *out++ = static_cast<char_type>(u);                    return out;                }            }; // utf32#endif        } // namespace utf    }     // namespace detail} // namespace nowide} // namespace boost#endif
 |