src
enc.cc
Go to the documentation of this file.
2 #include "src/util/range.h"
3 
4 namespace re2c {
5 
/*
 * Inclusive bounds of the code point range that encode() and
 * encodeRange() treat specially according to policy_
 * (0xD800 - 0xDFFF, the Unicode surrogate block).
 */
6 const uint32_t Enc::SURR_MIN = 0xD800;
7 const uint32_t Enc::SURR_MAX = 0xDFFF;
/*
 * Code point written in place of a code point from the range above
 * when policy_ is POLICY_SUBSTITUTE (0xFFFD, the Unicode replacement
 * character).
 */
8 const uint32_t Enc::UNICODE_ERROR = 0xFFFD;
9 
/*
 * 256-entry lookup table: index is an ISO 8859-1 (ASCII superset) byte,
 * value is the corresponding Code Page 37 (EBCDIC) byte.
 * Read by encode() and encodeRange() when type_ is EBCDIC; both check
 * the index against nCodePoints() before the lookup.
 * Note that consecutive indices do not map to consecutive values
 * (e.g. entries 0x04 and 0x05 are 0x37 and 0x2d).
 * NOTE(review): expected to be the exact inverse of ebc2asc; only
 * spot-checked, not verified for all 256 entries.
 */
10 const uint32_t Enc::asc2ebc[256] =
11  { /* Based on ISO 8859/1 and Code Page 37 */
12  0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
13  0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
14  0x40, 0x5a, 0x7f, 0x7b, 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
15  0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
16  0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
17  0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xba, 0xe0, 0xbb, 0xb0, 0x6d,
18  0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
19  0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07,
20  0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b,
21  0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b, 0x04, 0x14, 0x3e, 0xff,
22  0x41, 0xaa, 0x4a, 0xb1, 0x9f, 0xb2, 0x6a, 0xb5, 0xbd, 0xb4, 0x9a, 0x8a, 0x5f, 0xca, 0xaf, 0xbc,
23  0x90, 0x8f, 0xea, 0xfa, 0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab,
24  0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77,
25  0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf, 0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xad, 0x8e, 0x59,
26  0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57,
27  0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, 0xdc, 0x8d, 0xae, 0xdf
28  };
29 
/*
 * 256-entry lookup table: index is a Code Page 37 (EBCDIC) byte,
 * value is the corresponding ISO 8859-1 (ASCII superset) byte.
 * Read by decodeUnsafe() when type_ is EBCDIC; the index is masked
 * with 0xFF there, so any input stays inside the table.
 * NOTE(review): expected to be the exact inverse of asc2ebc; only
 * spot-checked, not verified for all 256 entries.
 */
30 const uint32_t Enc::ebc2asc[256] =
31  { /* Based on ISO 8859/1 and Code Page 37 */
32  0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
33  0x10, 0x11, 0x12, 0x13, 0x9d, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f,
34  0x80, 0x81, 0x82, 0x83, 0x84, 0x0a, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
35  0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
36  0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
37  0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0xac,
38  0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
39  0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
40  0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xde, 0xb1,
41  0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
42  0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xfe, 0xae,
43  0x5e, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0x5b, 0x5d, 0xaf, 0xa8, 0xb4, 0xd7,
44  0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
45  0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff,
46  0x5c, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
47  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
48  };
49 
50 /*
51  * Returns code point representation for current
52  * encoding with regard to current policy.
53  *
54  * Since code point is exactly specified by user,
55  * it is assumed that user considers it to be valid.
56  * We must check it.
57  *
58  * Returns false if this code point exceeds maximum
59  * or is forbidden by current policy, otherwise
60  * returns true. Overwrites code point.
61  */
62 bool Enc::encode(uint32_t & c) const
63 {
64  if (c >= nCodePoints ())
65  {
66  return false;
67  }
68 
69  switch (type_)
70  {
71  case ASCII:
72  return true;
73  case EBCDIC:
74  c = asc2ebc[c];
75  return true;
76  case UCS2:
77  case UTF16:
78  case UTF32:
79  case UTF8:
80  if (c < SURR_MIN || c > SURR_MAX)
81  return true;
82  else
83  {
84  switch (policy_)
85  {
86  case POLICY_FAIL:
87  return false;
88  case POLICY_SUBSTITUTE:
89  c = UNICODE_ERROR;
90  return true;
91  case POLICY_IGNORE:
92  return true;
93  }
94  }
95  }
96  return false; // to silence gcc warning
97 }
98 
99 /*
100  * Returns original representation of code point.
101  * Assumes code point is valid (hence 'unsafe').
102  */
103 uint32_t Enc::decodeUnsafe(uint32_t c) const
104 {
105  switch (type_)
106  {
107  case EBCDIC:
108  c = ebc2asc[c & 0xFF];
109  break;
110  case ASCII:
111  case UCS2:
112  case UTF16:
113  case UTF32:
114  case UTF8:
115  break;
116  }
117  return c;
118 }
119 
120 /*
121  * Returns [l - h] range representation for current
122  * encoding with regard to current policy.
123  *
124  * Since range borders are exactly specified by user,
125  * it is assumed that user considers that all code
126  * points from this range are valid. re2c must check it.
127  *
128  * Returns NULL if range contains code points that
129  * exceed maximum or are forbidden by current policy,
130  * otherwise returns pointer to newly constructed range.
131  */
132 Range * Enc::encodeRange(uint32_t l, uint32_t h) const
133 {
134  if (l >= nCodePoints () || h >= nCodePoints ())
135  {
136  return NULL;
137  }
138 
139  Range * r = NULL;
140  switch (type_)
141  {
142  case ASCII:
143  r = Range::ran (l, h + 1);
144  break;
145  case EBCDIC:
146  {
147  const uint32_t el = asc2ebc[l];
148  r = Range::sym (el);
149  for (uint32_t c = l + 1; c <= h; ++c)
150  {
151  const uint32_t ec = asc2ebc[c];
152  r = Range::add (r, Range::sym (ec));
153  }
154  break;
155  }
156  case UCS2:
157  case UTF16:
158  case UTF32:
159  case UTF8:
160  r = Range::ran (l, h + 1);
161  if (l <= SURR_MAX && h >= SURR_MIN)
162  {
163  switch (policy_)
164  {
165  case POLICY_FAIL:
166  r = NULL;
167  break;
168  case POLICY_SUBSTITUTE:
169  {
170  Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1);
171  Range * error = Range::sym (UNICODE_ERROR);
172  r = Range::sub (r, surrs);
173  r = Range::add (r, error);
174  break;
175  }
176  case POLICY_IGNORE:
177  break;
178  }
179  }
180  break;
181  }
182  return r;
183 }
184 
185 /*
186  * Returns full range representation for current encoding
187  * with regard to current policy.
188  *
189  * Since range is defined declaratively, re2c does
190  * all the necessary corrections 'for free'.
191  *
192  * Always succeeds, returns pointer to newly constructed
193  * range.
194  */
196 {
197  Range * r = Range::ran (0, nCodePoints());
198  if (policy_ != POLICY_IGNORE)
199  {
200  Range * surrs = Range::ran (SURR_MIN, SURR_MAX + 1);
201  r = Range::sub (r, surrs);
202  }
203  return r;
204 }
205 
206 } // namespace re2c
static Range * sub(const Range *r1, const Range *r2)
Definition: range.cc:61
static Range * ran(uint32_t l, uint32_t u)
Definition: range.h:31
void error(const char *fmt,...)
Definition: msg.cc:10
Range * encodeRange(uint32_t l, uint32_t h) const
Definition: enc.cc:132
Range * fullRange() const
Definition: enc.cc:195
bool encode(uint32_t &c) const
Definition: enc.cc:62
static Range * sym(uint32_t c)
Definition: range.h:27
uint32_t nCodePoints() const
Definition: enc.h:109
uint32_t decodeUnsafe(uint32_t c) const
Definition: enc.cc:103
Definition: bitmap.cc:10
static Range * add(const Range *r1, const Range *r2)
Definition: range.cc:26