src
utf8_range.cc
Go to the documentation of this file.
3 
4 namespace re2c {
5 
6 /*
7  * Now that we have catenation of byte ranges [l1-h1]...[lN-hN],
8  * we want to add it to existing range, merging suffixes on the fly.
9  */
10 void UTF8addContinuous(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
11 {
12  uint32_t lcs[utf8::MAX_RUNE_LENGTH];
13  uint32_t hcs[utf8::MAX_RUNE_LENGTH];
14  utf8::rune_to_bytes(lcs, l);
15  utf8::rune_to_bytes(hcs, h);
16 
17  RangeSuffix ** p = &root;
18  for (uint32_t i = 1; i <= n; ++i)
19  {
20  const uint32_t lc = lcs[n - i];
21  const uint32_t hc = hcs[n - i];
22  for (;;)
23  {
24  if (*p == NULL)
25  {
26  *p = new RangeSuffix(lc, hc);
27  p = &(*p)->child;
28  break;
29  }
30  else if ((*p)->l == lc && (*p)->h == hc)
31  {
32  p = &(*p)->child;
33  break;
34  }
35  else
36  p = &(*p)->next;
37  }
38  }
39 }
40 
41 /*
42  * Split range into sub-ranges that agree on leading bytes.
43  *
44  * We have two Unicode runes of equal length, L and H, which
45  * map to UTF-8 sequences 'L_1 ... L_n' and 'H_1 ... H_n'.
46  * We want to represent Unicode range [L - H] as a catenation
47  * of byte ranges [L_1 - H_1], ..., [L_n - H_n].
48  *
49  * This is only possible if for all i > 1:
50  * if L_i /= H_i, then L_(i+1) == 0x80 and H_(i+1) == 0xbf.
51  * This condition ensures that:
52  * 1) all possible UTF-8 sequences between L and H are allowed
53  * 2) no byte ranges [b1 - b2] appear, such that b1 > b2
54  *
55  * E.g.:
56  * [\U000e0031-\U000e0043] => [f3-f3],[a0-a0],[80-81],[b1-83].
57  * The last byte range, [b1-83], is incorrect: its lower bound
58  * is greater than its upper bound. To fix this, we must split
59  * the original range into two sub-ranges:
60  * [\U000e0031-\U000e003f] => [f3-f3],[a0-a0],[80-80],[b1-bf]
61  * [\U000e0040-\U000e0043] => [f3-f3],[a0-a0],[81-81],[80-83]
62  *
63  * This function finds all such 'points of discontinuity'
64  * and represents original range as alternation of continuous
65  * sub-ranges.
66  */
67 void UTF8splitByContinuity(RangeSuffix * & root, utf8::rune l, utf8::rune h, uint32_t n)
68 {
69  for (uint32_t i = 1; i < n; ++i)
70  {
71  uint32_t m = (1u << (6u * i)) - 1u; // last i bytes of a UTF-8 sequence
72  if ((l & ~m) != (h & ~m))
73  {
74  if ((l & m) != 0)
75  {
76  UTF8splitByContinuity(root, l, l | m, n);
77  UTF8splitByContinuity(root, (l | m) + 1, h, n);
78  return;
79  }
80  if ((h & m) != m)
81  {
82  UTF8splitByContinuity(root, l, (h & ~m) - 1, n);
83  UTF8splitByContinuity(root, h & ~m, h, n);
84  return;
85  }
86  }
87  }
88  UTF8addContinuous(root, l, h, n);
89 }
90 
91 /*
92  * Split range into sub-ranges, so that all runes in the same
93  * sub-range have equal length of UTF-8 sequence. E.g., full
94  * Unicode range [0-0x10FFFF] gets split into sub-ranges:
95  * [0 - 0x7F] (1-byte UTF-8 sequences)
96  * [0x80 - 0x7FF] (2-byte UTF-8 sequences)
97  * [0x800 - 0xFFFF] (3-byte UTF-8 sequences)
98  * [0x10000 - 0x10FFFF] (4-byte UTF-8 sequences)
99  */
101 {
102  const uint32_t nh = utf8::rune_length(h);
103  for (uint32_t nl = utf8::rune_length(l); nl < nh; ++nl)
104  {
105  utf8::rune r = utf8::max_rune(nl);
106  UTF8splitByContinuity(root, l, r, nl);
107  l = r + 1;
108  }
109  UTF8splitByContinuity(root, l, h, nh);
110 }
111 
112 } // namespace re2c
RangeSuffix * child
Definition: range_suffix.h:21
RangeSuffix * next
Definition: range_suffix.h:20
uint32_t rune
Definition: utf8.h:11
static rune max_rune(uint32_t i)
Definition: utf8.cc:72
void UTF8addContinuous(RangeSuffix *&root, utf8::rune l, utf8::rune h, uint32_t n)
Definition: utf8_range.cc:10
void UTF8splitByContinuity(RangeSuffix *&root, utf8::rune l, utf8::rune h, uint32_t n)
Definition: utf8_range.cc:67
static uint32_t rune_to_bytes(uint32_t *s, rune r)
Definition: utf8.cc:22
static uint32_t rune_length(rune r)
Definition: utf8.cc:64
Definition: bitmap.cc:10
void UTF8splitByRuneLength(RangeSuffix *&root, utf8::rune l, utf8::rune h)
Definition: utf8_range.cc:100