Drizzled Public API Documentation

core.h
1 // Copyright 2006 Nemanja Trifunovic
2 
3 /*
4 Permission is hereby granted, free of charge, to any person or organization
5 obtaining a copy of the software and accompanying documentation covered by
6 this license (the "Software") to use, reproduce, display, distribute,
7 execute, and transmit the Software, and to prepare derivative works of the
8 Software, and to permit third-parties to whom the Software is furnished to
9 do so, all subject to the following:
10 
11 The copyright notices in the Software and this entire statement, including
12 the above license grant, this restriction and the following disclaimer,
13 must be included in all copies of the Software, in whole or in part, and
14 all derivative works of the Software, unless such copies or derivative
15 works are solely in the form of machine-executable object code generated by
16 a source language processor.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.
25 */
26 
27 #pragma once
28 
29  #include <iterator>
30 
31 namespace drizzled
32 {
33 namespace utf8
34 {
35 
// Helper code - not intended to be called directly by the library users.
// May be changed at any time.
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Derived offsets (0xd7c0 and 0xfca02400 respectively). They are not used in
    // this header; presumably consumed by UTF-16 conversion code elsewhere - verify.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet (so a signed char with
    // the high bit set is seen as 0x80-0xff rather than a negative value).
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True if oc has the bit pattern 10xxxxxx, i.e. is a UTF-8 continuation octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True if cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    // True if cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True if cp is any surrogate (leading or trailing).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True if cp is accepted as a code point by this library: not above
    // CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the length (1-4) of the UTF-8 sequence announced by the lead octet
    // at lead_it, or 0 if that octet cannot start a sequence.
    // The iterator is dereferenced unconditionally, so it must be dereferenceable.
    // Only the prefix bits are examined; overlong or out-of-range sequences are
    // rejected later by validate_next.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)                // 0xxxxxxx
            return 1;
        else if ((lead >> 5) == 0x6)    // 110xxxxx
            return 2;
        else if ((lead >> 4) == 0xe)    // 1110xxxx
            return 3;
        else if ((lead >> 3) == 0x1e)   // 11110xxx
            return 4;
        else
            return 0;
    }

    // True if cp was encoded with more octets (length) than its shortest form needs.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    // Result codes of the decoding/validation helpers below.
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Steps it to the next octet and checks that it is a continuation octet.
    // Returns NOT_ENOUGH_ROOM if the step reaches end, INCOMPLETE_SEQUENCE if the
    // octet found is not a continuation octet, UTF8_OK otherwise.
    template <typename octet_iterator>
    inline utf_error advance_to_trail(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        return UTF8_OK;
    }

    // get_sequence_N: decode an N-octet sequence whose lead octet is at it.
    //  - On success, it is left ON the last octet of the sequence (not past it),
    //    the decoded value is stored through code_point if it is non-null, and
    //    UTF8_OK is returned.
    //  - On failure, it is left wherever decoding stopped; callers that need the
    //    original position must save it themselves.
    // No validity or overlong checks are made here.

    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it != end) {
            if (code_point)
                *code_point = mask8(*it);
            return UTF8_OK;
        }
        return NOT_ENOUGH_ROOM;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        uint32_t cp = mask8(*it);

        const utf_error err = advance_to_trail(it, end);
        if (err != UTF8_OK)
            return err;
        cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        uint32_t cp = mask8(*it);

        utf_error err = advance_to_trail(it, end);
        if (err != UTF8_OK)
            return err;
        cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);

        err = advance_to_trail(it, end);
        if (err != UTF8_OK)
            return err;
        cp += (*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        uint32_t cp = mask8(*it);

        utf_error err = advance_to_trail(it, end);
        if (err != UTF8_OK)
            return err;
        cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);

        err = advance_to_trail(it, end);
        if (err != UTF8_OK)
            return err;
        cp += (mask8(*it) << 6) & 0xfff;

        err = advance_to_trail(it, end);
        if (err != UTF8_OK)
            return err;
        cp += (*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    // Decodes and validates the UTF-8 sequence starting at it.
    //  - On success returns UTF8_OK, advances it past the sequence and, if
    //    code_point is non-null, stores the decoded value through it.
    //  - On failure returns the error code and leaves it at its original position.
    //  - An empty range (it == end) yields NOT_ENOUGH_ROOM without dereferencing it.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // Nothing to read: sequence_length() below would otherwise dereference end.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with e.g. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Now that we have a valid sequence length, get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (!is_code_point_valid(cp))
                err = INVALID_CODE_POINT;
            else if (is_overlong_sequence(cp, length))
                err = OVERLONG_SEQUENCE;
            else {
                // Passed! Return here.
                if (code_point)
                    *code_point = cp;
                ++it;   // get_sequence_N left it on the last octet; step past it
                return UTF8_OK;
            }
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    // Same as the three-argument overload, but discards the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, static_cast<uint32_t*>(0));
    }

} // namespace internal
304 
306 
// UTF-8 byte order mark: the three octets EF BB BF
const uint8_t bom[3] = { 0xEF, 0xBB, 0xBF };
309 
310  template <typename octet_iterator>
311  octet_iterator find_invalid(octet_iterator start, octet_iterator end)
312  {
313  octet_iterator result = start;
314  while (result != end) {
315  internal::utf_error err_code = internal::validate_next(result, end);
316  if (err_code != internal::UTF8_OK)
317  return result;
318  }
319  return result;
320  }
321 
// True if find_invalid finds no offending octet in [start, end).
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    const octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
327 
328  template <typename octet_iterator>
329  inline bool starts_with_bom (octet_iterator it, octet_iterator end)
330  {
331  return (
332  ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
333  ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
334  ((it != end) && (internal::mask8(*it)) == bom[2])
335  );
336  }
337 
338  //Deprecated in release 2.3
339  template <typename octet_iterator>
340  inline bool is_bom (octet_iterator it)
341  {
342  return (
343  (internal::mask8(*it++)) == bom[0] &&
344  (internal::mask8(*it++)) == bom[1] &&
345  (internal::mask8(*it)) == bom[2]
346  );
347  }
348 } // namespace utf8
349 } // namespace drizzled
350 
351 
352 
TODO: Rename this file - func.h is stupid.