GG
checked.h
Go to the documentation of this file.
1 // Copyright 2006 Nemanja Trifunovic
2 
3 /*
4 Permission is hereby granted, free of charge, to any person or organization
5 obtaining a copy of the software and accompanying documentation covered by
6 this license (the "Software") to use, reproduce, display, distribute,
7 execute, and transmit the Software, and to prepare derivative works of the
8 Software, and to permit third-parties to whom the Software is furnished to
9 do so, all subject to the following:
10 
11 The copyright notices in the Software and this entire statement, including
12 the above license grant, this restriction and the following disclaimer,
13 must be included in all copies of the Software, in whole or in part, and
14 all derivative works of the Software, unless such copies or derivative
15 works are solely in the form of machine-executable object code generated by
16 a source language processor.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.
25 */
26 
31 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
32 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
33 
34 #include "core.h"
35 #include <stdexcept>
36 
37 namespace utf8
38 {
39  // Exceptions that may be thrown from the library functions.
// Exception reporting an invalid code point.
// The 32-bit value handed to the constructor is kept and can be read back
// through code_point().
class invalid_code_point : public std::exception
{
public:
    invalid_code_point(uint32_t code_point_value) : cp_(code_point_value) {}

    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    // The value this exception was constructed with.
    uint32_t code_point() const
    {
        return cp_;
    }

private:
    uint32_t cp_;
};
47 
// Exception reporting invalid UTF-8 input.
// The octet handed to the constructor is kept and can be read back through
// utf8_octet().
class invalid_utf8 : public std::exception
{
public:
    invalid_utf8(uint8_t octet) : u8_(octet) {}

    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    // The octet this exception was constructed with.
    uint8_t utf8_octet() const
    {
        return u8_;
    }

private:
    uint8_t u8_;
};
55 
// Exception reporting invalid UTF-16 input.
// The 16-bit word handed to the constructor is kept and can be read back
// through utf16_word().
class invalid_utf16 : public std::exception
{
public:
    invalid_utf16(uint16_t word) : u16_(word) {}

    // Fixed, human-readable description of the error.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    // The word this exception was constructed with.
    uint16_t utf16_word() const
    {
        return u16_;
    }

private:
    uint16_t u16_;
};
63 
// Exception carrying no payload; what() yields a fixed "Not enough space"
// message.
class not_enough_room : public std::exception
{
public:
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
68 
70 
71  template <typename octet_iterator, typename output_iterator>
72  output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
73  {
74  while (start != end) {
75  octet_iterator sequence_start = start;
76  internal::utf_error err_code = internal::validate_next(start, end);
77  switch (err_code) {
78  case internal::OK :
79  for (octet_iterator it = sequence_start; it != start; ++it)
80  *out++ = *it;
81  break;
82  case internal::NOT_ENOUGH_ROOM:
83  throw not_enough_room();
84  case internal::INVALID_LEAD:
85  append (replacement, out);
86  ++start;
87  break;
88  case internal::INCOMPLETE_SEQUENCE:
89  case internal::OVERLONG_SEQUENCE:
90  case internal::INVALID_CODE_POINT:
91  append (replacement, out);
92  ++start;
93  // just one replacement mark for the sequence
94  while (internal::is_trail(*start) && start != end)
95  ++start;
96  break;
97  }
98  }
99  return out;
100  }
101 
102  template <typename octet_iterator, typename output_iterator>
103  inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
104  {
105  static const uint32_t replacement_marker = internal::mask16(0xfffd);
106  return replace_invalid(start, end, out, replacement_marker);
107  }
108 
109  template <typename octet_iterator>
110  octet_iterator append(uint32_t cp, octet_iterator result)
111  {
112  if (!internal::is_code_point_valid(cp))
113  throw invalid_code_point(cp);
114 
115  if (cp < 0x80) // one octet
116  *(result++) = static_cast<uint8_t>(cp);
117  else if (cp < 0x800) { // two octets
118  *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
119  *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
120  }
121  else if (cp < 0x10000) { // three octets
122  *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
123  *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
124  *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
125  }
126  else if (cp <= internal::CODE_POINT_MAX) { // four octets
127  *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
128  *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
129  *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
130  *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
131  }
132  else
133  throw invalid_code_point(cp);
134 
135  return result;
136  }
137 
138  template <typename octet_iterator>
139  uint32_t next(octet_iterator& it, octet_iterator end)
140  {
141  uint32_t cp = 0;
142  internal::utf_error err_code = internal::validate_next(it, end, &cp);
143  switch (err_code) {
144  case internal::OK :
145  break;
146  case internal::NOT_ENOUGH_ROOM :
147  throw not_enough_room();
148  case internal::INVALID_LEAD :
149  case internal::INCOMPLETE_SEQUENCE :
150  case internal::OVERLONG_SEQUENCE :
151  throw invalid_utf8(*it);
152  case internal::INVALID_CODE_POINT :
153  throw invalid_code_point(cp);
154  }
155  return cp;
156  }
157 
// Returns the code point that next() would decode at `it`, without moving the
// caller's iterator: `it` is received by value, so only the local copy is
// advanced. Propagates any exception thrown by next().
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t decoded = next(it, end);
    return decoded;
}
163 
164  template <typename octet_iterator>
165  uint32_t prior(octet_iterator& it, octet_iterator start)
166  {
167  octet_iterator end = it;
168  while (internal::is_trail(*(--it)))
169  if (it < start)
170  throw invalid_utf8(*it); // error - no lead byte in the sequence
171  octet_iterator temp = it;
172  return next(temp, end);
173  }
174 
176  template <typename octet_iterator>
177  uint32_t previous(octet_iterator& it, octet_iterator pass_start)
178  {
179  octet_iterator end = it;
180  while (internal::is_trail(*(--it)))
181  if (it == pass_start)
182  throw invalid_utf8(*it); // error - no lead byte in the sequence
183  octet_iterator temp = it;
184  return next(temp, end);
185  }
186 
// Advances `it` by calling next(it, end) once per step, `n` steps in total.
// Does nothing when `n` is not greater than zero. Propagates any exception
// thrown by next().
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    for (distance_type step = 0; step < n; ++step) {
        next(it, end);
    }
}
193 
// Counts how many times next() has to be called to move `first` up to (or
// past) `last`, and returns that count. The loop condition uses operator<, so
// the iterator type must support it. Propagates any exception thrown by
// next().
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type count = 0;
    while (first < last) {
        next(first, last);
        ++count;
    }
    return count;
}
203 
204  template <typename u16bit_iterator, typename octet_iterator>
205  octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
206  {
207  while (start != end) {
208  uint32_t cp = internal::mask16(*start++);
209  // Take care of surrogate pairs first
210  if (internal::is_surrogate(cp)) {
211  if (start != end) {
212  uint32_t trail_surrogate = internal::mask16(*start++);
213  if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
214  cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
215  else
216  throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
217  }
218  else
219  throw invalid_utf16(static_cast<uint16_t>(*start));
220 
221  }
222  result = append(cp, result);
223  }
224  return result;
225  }
226 
227  template <typename u16bit_iterator, typename octet_iterator>
228  u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
229  {
230  while (start != end) {
231  uint32_t cp = next(start, end);
232  if (cp > 0xffff) { //make a surrogate pair
233  *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
234  *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
235  }
236  else
237  *result++ = static_cast<uint16_t>(cp);
238  }
239  return result;
240  }
241 
// Converts the 32-bit code points in [start, end) to UTF-8 by passing each one
// to append(). Returns the output iterator positioned after the last octet
// written. Propagates any exception thrown by append().
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start) {
        result = append(*start, result);
    }
    return result;
}
250 
// Converts the UTF-8 octets in [start, end) to 32-bit code points, writing
// each value decoded by next() through `result`. The loop condition uses
// operator<, so the input iterator type must support it.
// Returns the output iterator positioned after the last value written.
// Propagates any exception thrown by next().
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t code_point = next(start, end);
        *result = code_point;
        ++result;
    }
    return result;
}
259 
// The iterator class
//
// Bidirectional iterator adaptor over a range of octets. Dereferencing decodes
// the code point at the current position with next(); ++ moves forward with
// next() and -- moves backward with prior(). The adaptor remembers the range
// it was created for and refuses to compare iterators built over different
// ranges.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;           // current position
    octet_iterator range_start;  // first position of the underlying range
    octet_iterator range_end;    // one past the last position of the range
public:
    // Value-initialise the members so that a default-constructed iterator over
    // raw pointers holds null pointers instead of indeterminate values; two
    // such iterators can then be compared safely.
    iterator () : it(), range_start(), range_end() {}

    // Creates an iterator positioned at octet_it inside [rangestart, rangeend].
    // Throws std::out_of_range when octet_it lies outside that range.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& rangestart,
                       const octet_iterator& rangeend) :
        it(octet_it), range_start(rangestart), range_end(rangeend)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }
    // the default "big three" are OK

    // The underlying octet position.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position; works on a copy so the
    // position itself is not moved.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    // Throws std::logic_error when the two iterators were built over
    // different ranges.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }
    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
316 
// The wchar_t iterator class
//
// Same adaptor as the uint32_t iterator, but dereferencing yields a wchar_t.
// Decoding uses next(); ++ moves forward with next() and -- moves backward
// with prior(). Iterators built over different ranges cannot be compared.
template <typename octet_iterator>
class wchar_iterator :
    public std::iterator<std::bidirectional_iterator_tag, wchar_t>
{
    octet_iterator it;           // current position
    octet_iterator range_start;  // first position of the underlying range
    octet_iterator range_end;    // one past the last position of the range
public:
    // Value-initialise the members so that a default-constructed iterator over
    // raw pointers holds null pointers instead of indeterminate values; two
    // such iterators can then be compared safely.
    wchar_iterator () : it(), range_start(), range_end() {}

    // Creates an iterator positioned at octet_it inside [rangestart, rangeend].
    // Throws std::out_of_range when octet_it lies outside that range.
    wchar_iterator (const octet_iterator& octet_it,
                    const octet_iterator& rangestart,
                    const octet_iterator& rangeend) :
        it(octet_it), range_start(rangestart), range_end(rangeend)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }
    // the default "big three" are OK

    // The underlying octet position.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position and returns it as a
    // wchar_t. The assert documents the expectation that the value fits; the
    // conversion is written out explicitly rather than left implicit.
    wchar_t operator * () const
    {
        octet_iterator temp = it;
        uint32_t retval = next(temp, range_end);
        assert(retval <= static_cast<uint32_t>(WCHAR_MAX));
        return static_cast<wchar_t>(retval);
    }

    // Throws std::logic_error when the two iterators were built over
    // different ranges.
    bool operator == (const wchar_iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }
    bool operator != (const wchar_iterator& rhs) const
    {
        return !(operator == (rhs));
    }
    wchar_iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }
    wchar_iterator operator ++ (int)
    {
        wchar_iterator temp = *this;
        next(it, range_end);
        return temp;
    }
    wchar_iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }
    wchar_iterator operator -- (int)
    {
        wchar_iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
};
377 
378 } // namespace utf8
379 
380 #endif //header guard
381 
382