Boost.Nowide
utf8_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
8 //
9 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
11 
#include <boost/nowide/replacement.hpp>
#include <boost/nowide/utf/utf.hpp>
#include <cstdint>
#include <locale>
16 
17 namespace boost {
18 namespace nowide {
19 
// The conversion state is (ab)used to carry a single pending UTF-16 code unit
// between calls, so it must provide at least two bytes of storage.
static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is too small to store a UTF-16 codepoint");
namespace detail {
    /// Copy exactly two bytes from \a src to \a dst.
    /// Hand-rolled to avoid including <cstring> just for std::memcpy.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        unsigned char* cdst = static_cast<unsigned char*>(dst);
        const unsigned char* csrc = static_cast<const unsigned char*>(src);
        cdst[0] = csrc[0];
        cdst[1] = csrc[1];
    }

    /// Read the 16-bit value stored in the first two bytes of the conversion state.
    /// \return the stored value
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t dst;
        copy_uint16_t(&dst, &src);
        return dst;
    }

    /// Store the 16-bit value \a src in the first two bytes of the conversion state \a dst.
    /// The remaining bytes of the state are left untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
41 
/// std::codecvt facet converting between UTF-8 (external, char) and a wide
/// encoding stored in CharType (internal).
///
/// Only declared here; the behaviour is provided by the partial
/// specializations selected through \a CharSize, which defaults to
/// sizeof(CharType).
template<typename CharType, int CharSize = sizeof(CharType)>
class utf8_codecvt;
50 
52  template<typename CharType>
53  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
54  {
55  public:
56  static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
57 
58  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
59  {}
60 
61  protected:
62  using uchar = CharType;
63 
64  std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
65  {
66  if(detail::read_state(s) != 0)
67  return std::codecvt_base::error;
68  next = from;
69  return std::codecvt_base::ok;
70  }
71  int do_encoding() const noexcept override
72  {
73  return 0;
74  }
75  int do_max_length() const noexcept override
76  {
77  return 4;
78  }
79  bool do_always_noconv() const noexcept override
80  {
81  return false;
82  }
83 
84  int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
85  {
86  using utf16_traits = utf::utf_traits<uchar, 2>;
87  std::uint16_t state = detail::read_state(std_state);
88  const char* save_from = from;
89  if(state && max > 0)
90  {
91  max--;
92  state = 0;
93  }
94  while(max > 0 && from < from_end)
95  {
96  const char* prev_from = from;
97  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
98  if(ch == utf::illegal)
99  {
101  } else if(ch == utf::incomplete)
102  {
103  from = prev_from;
104  break;
105  }
106  // If we can't write the char, we have to save the low surrogate in state
107  if(BOOST_LIKELY(static_cast<size_t>(utf16_traits::width(ch)) <= max))
108  {
109  max -= utf16_traits::width(ch);
110  } else
111  {
112  static_assert(utf16_traits::max_width == 2, "Required for below");
113  std::uint16_t tmpOut[2]{};
114  utf16_traits::encode(ch, tmpOut);
115  state = tmpOut[1];
116  break;
117  }
118  }
119  detail::write_state(std_state, state);
120  return static_cast<int>(from - save_from);
121  }
122 
123  std::codecvt_base::result do_in(std::mbstate_t& std_state,
124  const char* from,
125  const char* from_end,
126  const char*& from_next,
127  uchar* to,
128  uchar* to_end,
129  uchar*& to_next) const override
130  {
131  std::codecvt_base::result r = std::codecvt_base::ok;
132  using utf16_traits = utf::utf_traits<uchar, 2>;
133 
134  // mbstate_t is POD type and should be initialized to 0 (i.e. state = stateT())
135  // according to standard.
136  // We use it to store a low surrogate if it was not yet written, else state is 0
137  std::uint16_t state = detail::read_state(std_state);
138  // Write low surrogate if present
139  if(state && to < to_end)
140  {
141  *to++ = static_cast<CharType>(state);
142  state = 0;
143  }
144  while(to < to_end && from < from_end)
145  {
146  const char* from_saved = from;
147 
148  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
149 
150  if(ch == utf::illegal)
151  {
153  } else if(ch == utf::incomplete)
154  {
155  from = from_saved;
156  r = std::codecvt_base::partial;
157  break;
158  }
159  // If the encoded char fits, write directly, else safe the low surrogate in state
160  if(BOOST_LIKELY(utf16_traits::width(ch) <= to_end - to))
161  {
162  to = utf16_traits::encode(ch, to);
163  } else
164  {
165  static_assert(utf16_traits::max_width == 2, "Required for below");
166  std::uint16_t tmpOut[2]{};
167  utf16_traits::encode(ch, tmpOut);
168  *to++ = static_cast<CharType>(tmpOut[0]);
169  state = tmpOut[1];
170  break;
171  }
172  }
173  from_next = from;
174  to_next = to;
175  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
176  r = std::codecvt_base::partial;
177  detail::write_state(std_state, state);
178  return r;
179  }
180 
181  std::codecvt_base::result do_out(std::mbstate_t& std_state,
182  const uchar* from,
183  const uchar* from_end,
184  const uchar*& from_next,
185  char* to,
186  char* to_end,
187  char*& to_next) const override
188  {
189  std::codecvt_base::result r = std::codecvt_base::ok;
190  using utf16_traits = utf::utf_traits<uchar, 2>;
191  // mbstate_t is POD type and should be initialized to 0
192  // (i.e. state = stateT()) according to standard.
193  // We use it to store the first observed surrogate pair, or 0 if there is none yet
194  std::uint16_t state = detail::read_state(std_state);
195  for(; to < to_end && from < from_end; ++from)
196  {
197  std::uint32_t ch = 0;
198  if(state != 0)
199  {
200  // We have a high surrogate, so now there should be a low surrogate
201  std::uint16_t w1 = state;
202  std::uint16_t w2 = *from;
203  if(BOOST_LIKELY(utf16_traits::is_trail(w2)))
204  {
205  ch = utf16_traits::combine_surrogate(w1, w2);
206  } else
207  {
209  }
210  } else
211  {
212  std::uint16_t w1 = *from;
213  if(BOOST_LIKELY(utf16_traits::is_single_codepoint(w1)))
214  {
215  ch = w1;
216  } else if(BOOST_LIKELY(utf16_traits::is_first_surrogate(w1)))
217  {
218  // Store into state and continue at next character
219  state = w1;
220  continue;
221  } else
222  {
223  // Neither a single codepoint nor a high surrogate so must be low surrogate.
224  // This is an error -> Replace character
226  }
227  }
228  if(!utf::is_valid_codepoint(ch))
229  {
230  r = std::codecvt_base::error;
231  break;
232  }
233  int len = utf::utf_traits<char>::width(ch);
234  if(to_end - to < len)
235  {
236  r = std::codecvt_base::partial;
237  break;
238  }
239  to = utf::utf_traits<char>::encode(ch, to);
240  state = 0;
241  }
242  from_next = from;
243  to_next = to;
244  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
245  r = std::codecvt_base::partial;
246  detail::write_state(std_state, state);
247  return r;
248  }
249  };
250 
252  template<typename CharType>
253  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
254  {
255  public:
256  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
257  {}
258 
259  protected:
260  using uchar = CharType;
261 
262  std::codecvt_base::result
263  do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
264  {
265  next = from;
266  return std::codecvt_base::ok;
267  }
268  int do_encoding() const noexcept override
269  {
270  return 0;
271  }
272  int do_max_length() const noexcept override
273  {
274  return 4;
275  }
276  bool do_always_noconv() const noexcept override
277  {
278  return false;
279  }
280 
281  int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
282  {
283  const char* start_from = from;
284 
285  while(max > 0 && from < from_end)
286  {
287  const char* save_from = from;
288  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
289  if(ch == utf::incomplete)
290  {
291  from = save_from;
292  break;
293  } else if(ch == utf::illegal)
294  {
296  }
297  max--;
298  }
299  return from - start_from;
300  }
301 
302  std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
303  const char* from,
304  const char* from_end,
305  const char*& from_next,
306  uchar* to,
307  uchar* to_end,
308  uchar*& to_next) const override
309  {
310  std::codecvt_base::result r = std::codecvt_base::ok;
311 
312  while(to < to_end && from < from_end)
313  {
314  const char* from_saved = from;
315 
316  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
317 
318  if(ch == utf::illegal)
319  {
321  } else if(ch == utf::incomplete)
322  {
323  r = std::codecvt_base::partial;
324  from = from_saved;
325  break;
326  }
327  *to++ = ch;
328  }
329  from_next = from;
330  to_next = to;
331  if(r == std::codecvt_base::ok && from != from_end)
332  r = std::codecvt_base::partial;
333  return r;
334  }
335 
336  std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
337  const uchar* from,
338  const uchar* from_end,
339  const uchar*& from_next,
340  char* to,
341  char* to_end,
342  char*& to_next) const override
343  {
344  std::codecvt_base::result r = std::codecvt_base::ok;
345  while(to < to_end && from < from_end)
346  {
347  std::uint32_t ch = 0;
348  ch = *from;
349  if(!utf::is_valid_codepoint(ch))
350  {
352  }
353  int len = utf::utf_traits<char>::width(ch);
354  if(to_end - to < len)
355  {
356  r = std::codecvt_base::partial;
357  break;
358  }
359  to = utf::utf_traits<char>::encode(ch, to);
360  from++;
361  }
362  from_next = from;
363  to_next = to;
364  if(r == std::codecvt_base::ok && from != from_end)
365  r = std::codecvt_base::partial;
366  return r;
367  }
368  };
369 
370 } // namespace nowide
371 } // namespace boost
372 
373 #endif
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:57
static Iterator encode(code_point value, Iterator out)
Definition: utf8_codecvt.hpp:49
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:33
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:16
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:38
static int width(code_point value)
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:43
static code_point decode(Iterator &p, Iterator e)