Boost.Nowide
utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
8 //
9 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
11 
12 #include <boost/nowide/config.hpp>
13 #include <cstdint>
14 
15 namespace boost {
16 namespace nowide {
23  namespace utf {
24 
/// The integral type that can hold a Unicode code point.
28  using code_point = uint32_t;
29 
/// Special constant that defines an illegal code point.
/// Returned by the decode functions in this file when the input is not a valid sequence.
33  static const code_point illegal = 0xFFFFFFFFu;
34 
/// Special constant that defines an incomplete code point.
/// Returned by the decode functions in this file when the input range ends
/// before a full sequence has been read.
38  static const code_point incomplete = 0xFFFFFFFEu;
39 
44  {
45  if(v > 0x10FFFF)
46  return false;
47  if(0xD800 <= v && v <= 0xDFFF) // surrogates
48  return false;
49  return true;
50  }
51 
52 #ifdef BOOST_NOWIDE_DOXYGEN
/// UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
/// This declaration is only visible when BOOST_NOWIDE_DOXYGEN is defined; the
/// implementations are the specializations on the code unit size (1, 2 and 4).
53  template<typename CharType, int size = sizeof(CharType)>
57  struct utf_traits
58  {
/// The type of a single code unit
62  using char_type = CharType;
/// Read one code point from the range [p, e), advancing p past the units consumed.
/// Returns `incomplete` if the range ends before the sequence is complete
/// and `illegal` if the sequence is not valid.
77  template<typename Iterator>
78  static code_point decode(Iterator& p, Iterator e);
79 
/// Largest number of code units used for one code point
/// (4, 2 and 1 in the specializations for unit sizes 1, 2 and 4)
87  static const int max_width;
/// Number of code units needed to encode value
94  static int width(code_point value);
95 
/// Number of units that follow the lead unit c, or -1 if c cannot start a sequence
101  static int trail_length(char_type c);
/// True if c is a trailing (non-first) unit of a sequence
105  static bool is_trail(char_type c);
/// True if c is not a trailing unit, i.e. the negation of is_trail
109  static bool is_lead(char_type c);
110 
/// Write the encoding of value through out and return the advanced iterator.
/// The specializations in this file do not validate value.
121  template<typename Iterator>
122  static Iterator encode(code_point value, Iterator out);
/// Read one code point starting at p, advancing p.
/// The specializations in this file perform no bounds or validity checks here.
128  template<typename Iterator>
129  static code_point decode_valid(Iterator& p);
130  };
131 
132 #else
133 
/// Primary template, declared only: the partial specializations on the
/// code unit size (1, 2 and 4 bytes) that follow provide the implementations.
134  template<typename CharType, int size = sizeof(CharType)>
135  struct utf_traits;
136 
137  template<typename CharType>
138  struct utf_traits<CharType, 1>
139  {
140  using char_type = CharType;
141 
142  static int trail_length(char_type ci)
143  {
144  unsigned char c = ci;
145  if(c < 128)
146  return 0;
147  if(BOOST_UNLIKELY(c < 194))
148  return -1;
149  if(c < 224)
150  return 1;
151  if(c < 240)
152  return 2;
153  if(BOOST_LIKELY(c <= 244))
154  return 3;
155  return -1;
156  }
157 
158  static const int max_width = 4;
159 
160  static int width(code_point value)
161  {
162  if(value <= 0x7F)
163  {
164  return 1;
165  } else if(value <= 0x7FF)
166  {
167  return 2;
168  } else if(BOOST_LIKELY(value <= 0xFFFF))
169  {
170  return 3;
171  } else
172  {
173  return 4;
174  }
175  }
176 
177  static bool is_trail(char_type ci)
178  {
179  unsigned char c = ci;
180  return (c & 0xC0) == 0x80;
181  }
182 
183  static bool is_lead(char_type ci)
184  {
185  return !is_trail(ci);
186  }
187 
188  template<typename Iterator>
189  static code_point decode(Iterator& p, Iterator e)
190  {
191  if(BOOST_UNLIKELY(p == e))
192  return incomplete;
193 
194  unsigned char lead = *p++;
195 
196  // First byte is fully validated here
197  int trail_size = trail_length(lead);
198 
199  if(BOOST_UNLIKELY(trail_size < 0))
200  return illegal;
201 
202  // OK as only ASCII may be of size = 0
203  // also optimize for ASCII text
204  if(trail_size == 0)
205  return lead;
206 
207  code_point c = lead & ((1 << (6 - trail_size)) - 1);
208 
209  // Read the rest
210  unsigned char tmp;
211  switch(trail_size)
212  {
213  case 3:
214  if(BOOST_UNLIKELY(p == e))
215  return incomplete;
216  tmp = *p++;
217  if(!is_trail(tmp))
218  return illegal;
219  c = (c << 6) | (tmp & 0x3F);
220  BOOST_NOWIDE_FALLTHROUGH;
221  case 2:
222  if(BOOST_UNLIKELY(p == e))
223  return incomplete;
224  tmp = *p++;
225  if(!is_trail(tmp))
226  return illegal;
227  c = (c << 6) | (tmp & 0x3F);
228  BOOST_NOWIDE_FALLTHROUGH;
229  case 1:
230  if(BOOST_UNLIKELY(p == e))
231  return incomplete;
232  tmp = *p++;
233  if(!is_trail(tmp))
234  return illegal;
235  c = (c << 6) | (tmp & 0x3F);
236  }
237 
238  // Check code point validity:
239  // - no surrogates and valid range
240  // - most compact representation
241  if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
242  {
243  p -= trail_size;
244  return illegal;
245  }
246 
247  return c;
248  }
249 
250  template<typename Iterator>
251  static code_point decode_valid(Iterator& p)
252  {
253  unsigned char lead = *p++;
254  if(lead < 192)
255  return lead;
256 
257  int trail_size;
258 
259  if(lead < 224)
260  trail_size = 1;
261  else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
262  trail_size = 2;
263  else
264  trail_size = 3;
265 
266  code_point c = lead & ((1 << (6 - trail_size)) - 1);
267 
268  switch(trail_size)
269  {
270  case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
271  case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
272  case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
273  }
274 
275  return c;
276  }
277 
278  template<typename Iterator>
279  static Iterator encode(code_point value, Iterator out)
280  {
281  if(value <= 0x7F)
282  {
283  *out++ = static_cast<char_type>(value);
284  } else if(value <= 0x7FF)
285  {
286  *out++ = static_cast<char_type>((value >> 6) | 0xC0);
287  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
288  } else if(BOOST_LIKELY(value <= 0xFFFF))
289  {
290  *out++ = static_cast<char_type>((value >> 12) | 0xE0);
291  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
292  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
293  } else
294  {
295  *out++ = static_cast<char_type>((value >> 18) | 0xF0);
296  *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
297  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
298  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
299  }
300  return out;
301  }
302  }; // utf8
303 
304  template<typename CharType>
305  struct utf_traits<CharType, 2>
306  {
307  using char_type = CharType;
308 
309  // See RFC 2781
310  static bool is_single_codepoint(uint16_t x)
311  {
312  // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16
313  return x <= 0xD7FF || x >= 0xE000;
314  }
315  static bool is_first_surrogate(uint16_t x)
316  {
317  // Range [U+D800, 0+DBFF]: High surrogate
318  return 0xD800 <= x && x <= 0xDBFF;
319  }
320  static bool is_second_surrogate(uint16_t x)
321  {
322  // Range [U+DC00, 0+DFFF]: Low surrogate
323  return 0xDC00 <= x && x <= 0xDFFF;
324  }
325  static code_point combine_surrogate(uint16_t w1, uint16_t w2)
326  {
327  return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
328  }
329  static int trail_length(char_type c)
330  {
331  if(is_first_surrogate(c))
332  return 1;
333  if(is_second_surrogate(c))
334  return -1;
335  return 0;
336  }
338  static bool is_trail(char_type c)
339  {
340  return is_second_surrogate(c);
341  }
343  static bool is_lead(char_type c)
344  {
345  return !is_second_surrogate(c);
346  }
347 
348  template<typename It>
349  static code_point decode(It& current, It last)
350  {
351  if(BOOST_UNLIKELY(current == last))
352  return incomplete;
353  uint16_t w1 = *current++;
354  if(BOOST_LIKELY(is_single_codepoint(w1)))
355  {
356  return w1;
357  }
358  // Now it's either a high or a low surrogate, the latter is invalid
359  if(w1 >= 0xDC00)
360  return illegal;
361  if(current == last)
362  return incomplete;
363  uint16_t w2 = *current++;
364  if(!is_second_surrogate(w2))
365  return illegal;
366  return combine_surrogate(w1, w2);
367  }
368  template<typename It>
369  static code_point decode_valid(It& current)
370  {
371  uint16_t w1 = *current++;
372  if(BOOST_LIKELY(is_single_codepoint(w1)))
373  {
374  return w1;
375  }
376  uint16_t w2 = *current++;
377  return combine_surrogate(w1, w2);
378  }
379 
380  static const int max_width = 2;
381  static int width(code_point u)
382  {
383  return u >= 0x10000 ? 2 : 1;
384  }
385  template<typename It>
386  static It encode(code_point u, It out)
387  {
388  if(BOOST_LIKELY(u <= 0xFFFF))
389  {
390  *out++ = static_cast<char_type>(u);
391  } else
392  {
393  u -= 0x10000;
394  *out++ = static_cast<char_type>(0xD800 | (u >> 10));
395  *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
396  }
397  return out;
398  }
399  }; // utf16;
400 
401  template<typename CharType>
402  struct utf_traits<CharType, 4>
403  {
404  using char_type = CharType;
405  static int trail_length(char_type c)
406  {
407  if(is_valid_codepoint(c))
408  return 0;
409  return -1;
410  }
411  static bool is_trail(char_type /*c*/)
412  {
413  return false;
414  }
415  static bool is_lead(char_type /*c*/)
416  {
417  return true;
418  }
419 
420  template<typename It>
421  static code_point decode_valid(It& current)
422  {
423  return *current++;
424  }
425 
426  template<typename It>
427  static code_point decode(It& current, It last)
428  {
429  if(BOOST_UNLIKELY(current == last))
430  return incomplete;
431  code_point c = *current++;
432  if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
433  return illegal;
434  return c;
435  }
436  static const int max_width = 1;
437  static int width(code_point /*u*/)
438  {
439  return 1;
440  }
441  template<typename It>
442  static It encode(code_point u, It out)
443  {
444  *out++ = static_cast<char_type>(u);
445  return out;
446  }
447  }; // utf32
448 
449 #endif
450 
451  } // namespace utf
452 } // namespace nowide
453 } // namespace boost
454 
455 #endif
static const int max_width
Definition: utf.hpp:87
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:57
static bool is_trail(char_type c)
Namespace that holds basic operations on UTF encoded sequences.
Definition: convert.hpp:20
static Iterator encode(code_point value, Iterator out)
static bool is_lead(char_type c)
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:33
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:28
CharType char_type
Definition: utf.hpp:62
static code_point decode_valid(Iterator &p)
static int trail_length(char_type c)
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:38
static int width(code_point value)
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:43
static code_point decode(Iterator &p, Iterator e)