src
enc.h
Go to the documentation of this file.
1 #ifndef _RE2C_IR_REGEXP_ENCODING_ENC_
2 #define _RE2C_IR_REGEXP_ENCODING_ENC_
3 
4 #include "src/util/c99_stdint.h"
5 
6 namespace re2c {
7 
8 class Range;
9 
10 /*
11  * note [encodings]
12  *
13  * Each encoding defines two concepts:
14  *
15  * 1) Code point -- an abstract number that represents a single symbol of
16  * an encoding. E.g., Unicode defines code points in the range [0 - 0x10FFFF],
17  * so each Unicode encoding must be capable of representing 0x110000 code points.
18  *
19  * 2) Code unit -- the smallest unit of memory that is used in the encoded
20  * text. One or more code units may be needed to represent a single code
21  * point, depending on the encoding. For each encoding, code points are
22  * represented either with an equal number of code units (fixed-length
23  * encodings) or with a variable number of code units (variable-length
24  * encodings).
25  *
26  * +----------+------------------+-----------------------+-----------------+----------------+
27  * | encoding | code point range | code point size | code unit range | code unit size |
28  * +----------+------------------+-----------------------+-----------------+----------------+
29  * | ASCII | 0 - 0xFF | fixed, 1 byte | 0 - 0xFF | 1 byte |
30  * | EBCDIC | 0 - 0xFF | fixed, 1 byte | 0 - 0xFF | 1 byte |
31  * | UCS2 | 0 - 0xFFFF | fixed, 2 bytes | 0 - 0xFFFF | 2 bytes |
32  * | UTF16 | 0 - 0x10FFFF | variable, 2 - 4 bytes | 0 - 0xFFFF | 2 bytes |
33  * | UTF32 | 0 - 0x10FFFF | fixed, 4 bytes | 0 - 0x10FFFF | 4 bytes |
34  * | UTF8 | 0 - 0x10FFFF | variable, 1 - 4 bytes | 0 - 0xFF | 1 byte |
35  * +----------+------------------+-----------------------+-----------------+----------------+
36  */
37 
// Enc stores an encoding kind (type_) together with a policy for invalid
// code points (policy_), and exposes per-encoding size/count queries plus
// encode/decode helpers.
// NOTE(review): the enumerator lists of type_t and policy_t are only partly
// visible in this listing. The switch statements later in this header
// reference ASCII, EBCDIC, UCS2, UTF16, UTF32 and UTF8; the constructor
// references POLICY_IGNORE.
38 class Enc
39 {
40 public:
41  // Supported encodings.
42  enum type_t
45  , UCS2
48  , UTF8
49  };
50 
51  // What to do with invalid code points
52  enum policy_t
56  };
57 
58 private:
 // 256-entry lookup tables, not defined in this header.
 // Presumably ASCII -> EBCDIC and EBCDIC -> ASCII translation -- TODO confirm.
59  static const uint32_t asc2ebc[256];
60  static const uint32_t ebc2asc[256];
 // Constants not defined in this header. The names suggest the bounds of the
 // surrogate range and a substitute value for invalid input -- TODO confirm.
61  static const uint32_t SURR_MIN;
62  static const uint32_t SURR_MAX;
63  static const uint32_t UNICODE_ERROR;
64 
 // Currently selected encoding and invalid-code-point policy.
65  type_t type_;
66  policy_t policy_;
67 
68 public:
 // A default-constructed Enc is ASCII with POLICY_IGNORE.
69  Enc()
70  : type_ (ASCII)
71  , policy_ (POLICY_IGNORE)
72  { }
73 
 // Human-readable name of an encoding (defined inline later in this header).
74  static const char * name (type_t t);
75 
 // Compares the encoding type only; policy_ is not taken into account.
76  bool operator != (const Enc & e) const { return type_ != e.type_; }
77 
 // Per-encoding counts and sizes, selected by type_ (defined inline later in
 // this header). szCodePoint() returns the maximal code point size.
78  inline uint32_t nCodePoints() const;
79  inline uint32_t nCodeUnits() const;
80  inline uint32_t szCodePoint() const;
81  inline uint32_t szCodeUnit() const;
82 
 // set(t) returns true if the encoding already is t, or if it was ASCII and
 // has now been switched to t; it returns false (changing nothing) otherwise.
 // unset(t) resets the encoding to ASCII only if it currently equals t.
83  inline bool set(type_t t);
84  inline void unset(type_t);
85  inline type_t type () const;
86 
 // Overwrites policy_ unconditionally.
87  inline void setPolicy(policy_t t);
88 
 // Not defined in this header.
 // NOTE(review): encode() takes its argument by non-const reference, so it
 // presumably rewrites it in place and reports success -- confirm in enc.cc.
89  bool encode(uint32_t & c) const;
90  uint32_t decodeUnsafe(uint32_t c) const;
91  Range * encodeRange(uint32_t l, uint32_t h) const;
92  Range * fullRange() const;
93 };
94 
95 inline const char * Enc::name (type_t t)
96 {
97  switch (t)
98  {
99  case ASCII: return "ASCII";
100  case EBCDIC: return "EBCDIC";
101  case UTF8: return "UTF8";
102  case UCS2: return "USC2";
103  case UTF16: return "UTF16";
104  case UTF32: return "UTF32";
105  default: return "<bad encoding>";
106  }
107 }
108 
109 inline uint32_t Enc::nCodePoints() const
110 {
111  switch (type_)
112  {
113  case ASCII:
114  case EBCDIC: return 0x100;
115  case UCS2: return 0x10000;
116  case UTF16:
117  case UTF32:
118  case UTF8:
119  default: return 0x110000;
120  }
121 }
122 
123 inline uint32_t Enc::nCodeUnits() const
124 {
125  switch (type_)
126  {
127  case ASCII:
128  case EBCDIC:
129  case UTF8: return 0x100;
130  case UCS2:
131  case UTF16: return 0x10000;
132  case UTF32:
133  default: return 0x110000;
134  }
135 }
136 
137 // returns *maximal* code point size for encoding
138 inline uint32_t Enc::szCodePoint() const
139 {
140  switch (type_)
141  {
142  case ASCII:
143  case EBCDIC: return 1;
144  case UCS2: return 2;
145  case UTF16:
146  case UTF32:
147  case UTF8:
148  default: return 4;
149  }
150 }
151 
152 inline uint32_t Enc::szCodeUnit() const
153 {
154  switch (type_)
155  {
156  case ASCII:
157  case EBCDIC:
158  case UTF8: return 1;
159  case UCS2:
160  case UTF16: return 2;
161  case UTF32:
162  default: return 4;
163  }
164 }
165 
166 inline bool Enc::set(type_t t)
167 {
168  if (type_ == t)
169  return true;
170  else if (type_ != ASCII)
171  return false;
172  else
173  {
174  type_ = t;
175  return true;
176  }
177 }
178 
179 inline void Enc::unset(type_t t)
180 {
181  if (type_ == t)
182  type_ = ASCII;
183 }
184 
185 inline Enc::type_t Enc::type () const
186 {
187  return type_;
188 }
189 
190 inline void Enc::setPolicy(policy_t t)
191 {
192  policy_ = t;
193 }
194 
195 } // namespace re2c
196 
197 #endif // _RE2C_IR_REGEXP_ENCODING_ENC_
void unset(type_t)
Definition: enc.h:179
bool set(type_t t)
Definition: enc.h:166
Range * encodeRange(uint32_t l, uint32_t h) const
Definition: enc.cc:132
type_t type() const
Definition: enc.h:185
Range * fullRange() const
Definition: enc.cc:195
static const char * name(type_t t)
Definition: enc.h:95
bool encode(uint32_t &c) const
Definition: enc.cc:62
uint32_t szCodeUnit() const
Definition: enc.h:152
Definition: enc.h:38
bool operator!=(const Enc &e) const
Definition: enc.h:76
policy_t
Definition: enc.h:52
uint32_t nCodePoints() const
Definition: enc.h:109
uint32_t nCodeUnits() const
Definition: enc.h:123
uint32_t szCodePoint() const
Definition: enc.h:138
Enc()
Definition: enc.h:69
uint32_t decodeUnsafe(uint32_t c) const
Definition: enc.cc:103
void setPolicy(policy_t t)
Definition: enc.h:190
Definition: bitmap.cc:10
type_t
Definition: enc.h:42