utf.hpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
  9. #define BOOST_NOWIDE_UTF_HPP_INCLUDED
  10. #include <boost/nowide/config.hpp>
  11. #include <boost/cstdint.hpp>
  12. namespace boost {
  13. namespace nowide {
  14. namespace detail {
  15. ///
  16. /// \brief Namespace that holds basic operations on UTF encoded sequences
  17. ///
  18. /// All functions defined in this namespace do not require linking with Boost.Nowide library
  19. /// Extracted from Boost.Locale
  20. ///
  21. namespace utf {
  22. ///
  23. /// \brief Unsigned 32-bit integral type used to hold a Unicode code point
  24. ///
  25. typedef uint32_t code_point;
  26. ///
  27. /// \brief Sentinel returned by decode() for an invalid sequence; it is above 0x10FFFF, so never a valid code point
  28. ///
  29. static const code_point illegal = 0xFFFFFFFFu;
  30. ///
  31. /// \brief Sentinel returned by decode() when the input range ends before a sequence is complete; also above 0x10FFFF
  32. ///
  33. static const code_point incomplete = 0xFFFFFFFEu;
  34. ///
  35. /// \brief the function checks if \a v is a valid code point
  36. ///
  37. inline bool is_valid_codepoint(code_point v)
  38. {
  39. if(v > 0x10FFFF)
  40. return false;
  41. if(0xD800 <= v && v <= 0xDFFF) // surrogates
  42. return false;
  43. return true;
  44. }
  45. #ifdef BOOST_NOWIDE_DOXYGEN
  46. ///
  47. /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
  48. ///
  49. template<typename CharType, int size = sizeof(CharType)>
  50. struct utf_traits
  51. {
  52. ///
  53. /// The type of a single code unit
  54. ///
  55. typedef CharType char_type;
  56. ///
  57. /// Read one code point from the range [p,e) and return it.
  58. ///
  59. /// - If the range ends before the sequence is complete, returns \ref incomplete,
  60. /// - If an illegal sequence is detected, returns \ref illegal
  61. ///
  62. /// Requirements
  63. ///
  64. /// - Iterator is a valid input iterator
  65. ///
  66. /// Postconditions
  67. ///
  68. /// - p points one past the last code unit that was read (this also holds when an error value is returned)
  69. ///
  70. template<typename Iterator>
  71. static code_point decode(Iterator& p, Iterator e);
  72. ///
  73. /// Maximal width of a valid sequence in code units:
  74. ///
  75. /// - UTF-8 - 4
  76. /// - UTF-16 - 2
  77. /// - UTF-32 - 1
  78. ///
  79. static const int max_width;
  80. ///
  81. /// The width of a specific code point in code units.
  82. ///
  83. /// Requirement: value is a valid Unicode code point
  84. /// Returns a value in the range [1..max_width]
  85. ///
  86. static int width(code_point value);
  87. ///
  88. /// Get the number of trail code units that follow the lead code unit \a c.
  89. ///
  90. /// Returns -1 if \a c is not a valid lead code unit
  91. ///
  92. static int trail_length(char_type c);
  93. ///
  94. /// Returns true if c is a trail code unit, always false for UTF-32
  95. ///
  96. static bool is_trail(char_type c);
  97. ///
  98. /// Returns true if c is not a trail code unit, always true for UTF-32
  99. ///
  100. static bool is_lead(char_type c);
  101. ///
  102. /// Convert the valid Unicode code point \a value to a UTF sequence.
  103. ///
  104. /// Requirements:
  105. ///
  106. /// - \a value is a valid code point
  107. /// - \a out is an output iterator that can accept at least width(value) code units
  108. ///
  109. /// Returns the iterator past the last written code unit.
  110. ///
  111. template<typename Iterator>
  112. static Iterator encode(code_point value, Iterator out);
  113. ///
  114. /// Decodes the valid UTF sequence starting at p into a code point and advances p past it.
  115. ///
  116. /// No validation is performed: if the sequence is invalid or p is at the end, the behavior is undefined
  117. ///
  118. template<typename Iterator>
  119. static code_point decode_valid(Iterator& p);
  120. };
  121. #else
  122. template<typename CharType, int size = sizeof(CharType)>
  123. struct utf_traits;
  124. template<typename CharType>
  125. struct utf_traits<CharType, 1>
  126. {
  127. typedef CharType char_type;
  128. static int trail_length(char_type ci)
  129. {
  130. unsigned char c = ci;
  131. if(c < 128)
  132. return 0;
  133. if(BOOST_UNLIKELY(c < 194))
  134. return -1;
  135. if(c < 224)
  136. return 1;
  137. if(c < 240)
  138. return 2;
  139. if(BOOST_LIKELY(c <= 244))
  140. return 3;
  141. return -1;
  142. }
  143. static const int max_width = 4;
  144. static int width(code_point value)
  145. {
  146. if(value <= 0x7F)
  147. {
  148. return 1;
  149. } else if(value <= 0x7FF)
  150. {
  151. return 2;
  152. } else if(BOOST_LIKELY(value <= 0xFFFF))
  153. {
  154. return 3;
  155. } else
  156. {
  157. return 4;
  158. }
  159. }
  160. static bool is_trail(char_type ci)
  161. {
  162. unsigned char c = ci;
  163. return (c & 0xC0) == 0x80;
  164. }
  165. static bool is_lead(char_type ci)
  166. {
  167. return !is_trail(ci);
  168. }
  169. template<typename Iterator>
  170. static code_point decode(Iterator& p, Iterator e)
  171. {
  172. if(BOOST_UNLIKELY(p == e))
  173. return incomplete;
  174. unsigned char lead = *p++;
  175. // First byte is fully validated here
  176. int trail_size = trail_length(lead);
  177. if(BOOST_UNLIKELY(trail_size < 0))
  178. return illegal;
  179. //
  180. // OK as only ASCII may be of size = 0
  181. // also optimize for ASCII text
  182. //
  183. if(trail_size == 0)
  184. return lead;
  185. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  186. // Read the rest
  187. unsigned char tmp;
  188. switch(trail_size)
  189. {
  190. case 3:
  191. if(BOOST_UNLIKELY(p == e))
  192. return incomplete;
  193. tmp = *p++;
  194. if(!is_trail(tmp))
  195. return illegal;
  196. c = (c << 6) | (tmp & 0x3F);
  197. BOOST_NOWIDE_FALLTHROUGH;
  198. case 2:
  199. if(BOOST_UNLIKELY(p == e))
  200. return incomplete;
  201. tmp = *p++;
  202. if(!is_trail(tmp))
  203. return illegal;
  204. c = (c << 6) | (tmp & 0x3F);
  205. BOOST_NOWIDE_FALLTHROUGH;
  206. case 1:
  207. if(BOOST_UNLIKELY(p == e))
  208. return incomplete;
  209. tmp = *p++;
  210. if(!is_trail(tmp))
  211. return illegal;
  212. c = (c << 6) | (tmp & 0x3F);
  213. }
  214. // Check code point validity: no surrogates and
  215. // valid range
  216. if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
  217. return illegal;
  218. // make sure it is the most compact representation
  219. if(BOOST_UNLIKELY(width(c) != trail_size + 1))
  220. return illegal;
  221. return c;
  222. }
  223. template<typename Iterator>
  224. static code_point decode_valid(Iterator& p)
  225. {
  226. unsigned char lead = *p++;
  227. if(lead < 192)
  228. return lead;
  229. int trail_size;
  230. if(lead < 224)
  231. trail_size = 1;
  232. else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
  233. trail_size = 2;
  234. else
  235. trail_size = 3;
  236. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  237. switch(trail_size)
  238. {
  239. case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
  240. case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
  241. case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
  242. }
  243. return c;
  244. }
  245. template<typename Iterator>
  246. static Iterator encode(code_point value, Iterator out)
  247. {
  248. if(value <= 0x7F)
  249. {
  250. *out++ = static_cast<char_type>(value);
  251. } else if(value <= 0x7FF)
  252. {
  253. *out++ = static_cast<char_type>((value >> 6) | 0xC0);
  254. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  255. } else if(BOOST_LIKELY(value <= 0xFFFF))
  256. {
  257. *out++ = static_cast<char_type>((value >> 12) | 0xE0);
  258. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  259. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  260. } else
  261. {
  262. *out++ = static_cast<char_type>((value >> 18) | 0xF0);
  263. *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
  264. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  265. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  266. }
  267. return out;
  268. }
  269. }; // utf8
  270. template<typename CharType>
  271. struct utf_traits<CharType, 2>
  272. {
  273. typedef CharType char_type;
  274. // See RFC 2781
  275. static bool is_first_surrogate(uint16_t x)
  276. {
  277. return 0xD800 <= x && x <= 0xDBFF;
  278. }
  279. static bool is_second_surrogate(uint16_t x)
  280. {
  281. return 0xDC00 <= x && x <= 0xDFFF;
  282. }
  283. static code_point combine_surrogate(uint16_t w1, uint16_t w2)
  284. {
  285. return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
  286. }
  287. static int trail_length(char_type c)
  288. {
  289. if(is_first_surrogate(c))
  290. return 1;
  291. if(is_second_surrogate(c))
  292. return -1;
  293. return 0;
  294. }
  295. ///
  296. /// Returns true if c is trail code unit, always false for UTF-32
  297. ///
  298. static bool is_trail(char_type c)
  299. {
  300. return is_second_surrogate(c);
  301. }
  302. ///
  303. /// Returns true if c is lead code unit, always true of UTF-32
  304. ///
  305. static bool is_lead(char_type c)
  306. {
  307. return !is_second_surrogate(c);
  308. }
  309. template<typename It>
  310. static code_point decode(It& current, It last)
  311. {
  312. if(BOOST_UNLIKELY(current == last))
  313. return incomplete;
  314. uint16_t w1 = *current++;
  315. if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
  316. {
  317. return w1;
  318. }
  319. if(w1 > 0xDBFF)
  320. return illegal;
  321. if(current == last)
  322. return incomplete;
  323. uint16_t w2 = *current++;
  324. if(w2 < 0xDC00 || 0xDFFF < w2)
  325. return illegal;
  326. return combine_surrogate(w1, w2);
  327. }
  328. template<typename It>
  329. static code_point decode_valid(It& current)
  330. {
  331. uint16_t w1 = *current++;
  332. if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
  333. {
  334. return w1;
  335. }
  336. uint16_t w2 = *current++;
  337. return combine_surrogate(w1, w2);
  338. }
  339. static const int max_width = 2;
  340. static int width(code_point u)
  341. {
  342. return u >= 0x10000 ? 2 : 1;
  343. }
  344. template<typename It>
  345. static It encode(code_point u, It out)
  346. {
  347. if(BOOST_LIKELY(u <= 0xFFFF))
  348. {
  349. *out++ = static_cast<char_type>(u);
  350. } else
  351. {
  352. u -= 0x10000;
  353. *out++ = static_cast<char_type>(0xD800 | (u >> 10));
  354. *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
  355. }
  356. return out;
  357. }
  358. }; // utf16;
  359. template<typename CharType>
  360. struct utf_traits<CharType, 4>
  361. {
  362. typedef CharType char_type;
  363. static int trail_length(char_type c)
  364. {
  365. if(is_valid_codepoint(c))
  366. return 0;
  367. return -1;
  368. }
  369. static bool is_trail(char_type /*c*/)
  370. {
  371. return false;
  372. }
  373. static bool is_lead(char_type /*c*/)
  374. {
  375. return true;
  376. }
  377. template<typename It>
  378. static code_point decode_valid(It& current)
  379. {
  380. return *current++;
  381. }
  382. template<typename It>
  383. static code_point decode(It& current, It last)
  384. {
  385. if(BOOST_UNLIKELY(current == last))
  386. return incomplete;
  387. code_point c = *current++;
  388. if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
  389. return illegal;
  390. return c;
  391. }
  392. static const int max_width = 1;
  393. static int width(code_point /*u*/)
  394. {
  395. return 1;
  396. }
  397. template<typename It>
  398. static It encode(code_point u, It out)
  399. {
  400. *out++ = static_cast<char_type>(u);
  401. return out;
  402. }
  403. }; // utf32
  404. #endif
  405. } // namespace utf
  406. } // namespace detail
  407. } // namespace nowide
  408. } // namespace boost
  409. #endif