src
utf8.cc
Go to the documentation of this file.
2 
3 namespace re2c {
4 
5 const uint32_t utf8::ERROR = 0xFFFDu;
6 
7 const utf8::rune utf8::MAX_1BYTE_RUNE = 0x7Fu;
8 const utf8::rune utf8::MAX_2BYTE_RUNE = 0x7FFu;
9 const utf8::rune utf8::MAX_3BYTE_RUNE = 0xFFFFu;
10 const utf8::rune utf8::MAX_4BYTE_RUNE = 0x10FFFFu;
12 
13 const uint32_t utf8::PREFIX_1BYTE = 0u; // 0000 0000
14 const uint32_t utf8::INFIX = 0x80u; // 1000 0000
15 const uint32_t utf8::PREFIX_2BYTE = 0xC0u; // 1100 0000
16 const uint32_t utf8::PREFIX_3BYTE = 0xE0u; // 1110 0000
17 const uint32_t utf8::PREFIX_4BYTE = 0xF0u; // 1111 0000
18 
19 const uint32_t utf8::SHIFT = 6u;
20 const uint32_t utf8::MASK = 0x3Fu; // 0011 1111
21 
22 uint32_t utf8::rune_to_bytes(uint32_t *str, rune c)
23 {
24  // one byte sequence: 0-0x7F => 0xxxxxxx
25  if (c <= MAX_1BYTE_RUNE)
26  {
27  str[0] = PREFIX_1BYTE | c;
28  return 1;
29  }
30 
31  // two byte sequence: 0x80-0x7FF => 110xxxxx 10xxxxxx
32  if (c <= MAX_2BYTE_RUNE)
33  {
34  str[0] = PREFIX_2BYTE | (c >> 1*SHIFT);
35  str[1] = INFIX | (c & MASK);
36  return 2;
37  }
38 
39  // If the Rune is out of range, convert it to the error rune.
40  // Do this test here because the error rune encodes to three bytes.
41  // Doing it earlier would duplicate work, since an out of range
42  // Rune wouldn't have fit in one or two bytes.
43  if (c > MAX_RUNE)
44  c = ERROR;
45 
46  // three byte sequence: 0x800 - 0xFFFF => 1110xxxx 10xxxxxx 10xxxxxx
47  if (c <= MAX_3BYTE_RUNE)
48  {
49  str[0] = PREFIX_3BYTE | (c >> 2*SHIFT);
50  str[1] = INFIX | ((c >> 1*SHIFT) & MASK);
51  str[2] = INFIX | (c & MASK);
52  return 3;
53  }
54 
55  // four byte sequence (21-bit value):
56  // 0x10000 - 0x1FFFFF => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
57  str[0] = PREFIX_4BYTE | (c >> 3*SHIFT);
58  str[1] = INFIX | ((c >> 2*SHIFT) & MASK);
59  str[2] = INFIX | ((c >> 1*SHIFT) & MASK);
60  str[3] = INFIX | (c & MASK);
61  return 4;
62 }
63 
65 {
66  if (r <= MAX_2BYTE_RUNE)
67  return r <= MAX_1BYTE_RUNE ? 1 : 2;
68  else
69  return r <= MAX_3BYTE_RUNE ? 3 : 4;
70 }
71 
73 {
74  switch (i)
75  {
76  case 1: return MAX_1BYTE_RUNE;
77  case 2: return MAX_2BYTE_RUNE;
78  case 3: return MAX_3BYTE_RUNE;
79  case 4: return MAX_4BYTE_RUNE;
80  default: return ERROR;
81  }
82 }
83 
84 } // namespace re2c
static const rune MAX_3BYTE_RUNE
Definition: utf8.h:23
static const uint32_t PREFIX_2BYTE
Definition: utf8.h:29
static const uint32_t SHIFT
Definition: utf8.h:33
uint32_t rune
Definition: utf8.h:11
static rune max_rune(uint32_t i)
Definition: utf8.cc:72
static const rune MAX_2BYTE_RUNE
Definition: utf8.h:22
static const uint32_t MASK
Definition: utf8.h:34
static const uint32_t PREFIX_1BYTE
Definition: utf8.h:27
static const rune MAX_4BYTE_RUNE
Definition: utf8.h:24
static const rune MAX_1BYTE_RUNE
Definition: utf8.h:21
static const uint32_t INFIX
Definition: utf8.h:28
static const uint32_t PREFIX_4BYTE
Definition: utf8.h:31
static const uint32_t ERROR
Definition: utf8.h:18
static uint32_t rune_to_bytes(uint32_t *s, rune r)
Definition: utf8.cc:22
static uint32_t rune_length(rune r)
Definition: utf8.cc:64
static const uint32_t PREFIX_3BYTE
Definition: utf8.h:30
Definition: bitmap.cc:10
static const rune MAX_RUNE
Definition: utf8.h:25