src
utf16_range.cc
Go to the documentation of this file.
3 
4 namespace re2c {
5 
6 /*
7  * Add word range [w1-w2].
8  */
9 void UTF16addContinuous1(RangeSuffix * & root, uint32_t l, uint32_t h)
10 {
11  RangeSuffix ** p = &root;
12  for (;;)
13  {
14  if (*p == NULL)
15  {
16  *p = new RangeSuffix(l, h);
17  break;
18  }
19  else if ((*p)->l == l && (*p)->h == h)
20  {
21  break;
22  }
23  else
24  p = &(*p)->next;
25  }
26 }
27 
28 /*
29  * Now that we have catenation of word ranges [l1-h1],[l2-h2],
30  * we want to add it to existing range, merging suffixes on the fly.
31  */
32 void UTF16addContinuous2(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
33 {
34  RangeSuffix ** p = &root;
35  for (;;)
36  {
37  if (*p == NULL)
38  {
39  *p = new RangeSuffix(l_tr, h_tr);
40  p = &(*p)->child;
41  break;
42  }
43  else if ((*p)->l == l_tr && (*p)->h == h_tr)
44  {
45  p = &(*p)->child;
46  break;
47  }
48  else
49  p = &(*p)->next;
50  }
51  for (;;)
52  {
53  if (*p == NULL)
54  {
55  *p = new RangeSuffix(l_ld, h_ld);
56  break;
57  }
58  else if ((*p)->l == l_ld && (*p)->h == h_ld)
59  {
60  break;
61  }
62  else
63  p = &(*p)->next;
64  }
65 }
66 
67 /*
68  * Split range into sub-ranges that agree on leading surrogates.
69  *
70  * We have two Unicode runes, L and H, both map to UTF-16
71  * surrogate pairs 'L1 L2' and 'H1 H2'.
72  * We want to represent Unicode range [L - H] as a catenation
73  * of word ranges [L1 - H1],[L2 - H2].
74  *
75  * This is only possible if the following condition holds:
76  * if L1 /= H1, then L2 == 0xdc00 and H2 == 0xdfff.
77  * This condition ensures that:
78  * 1) all possible UTF-16 sequences between L and H are allowed
79  * 2) no word ranges [w1 - w2] appear, such that w1 > w2
80  *
81  * E.g.:
82  * [\U00010001-\U00010400] => [d800-d801],[dc01-dc00].
83  * The last word range, [dc01-dc00], is incorrect: its lower bound
84  * is greater than its upper bound. To fix this, we must split
85  * the original range into two sub-ranges:
86  * [\U00010001-\U000103ff] => [d800-d800],[dc01-dfff]
87  * [\U00010400-\U00010400] => [d801-d801],[dc00-dc00]
88  *
89  * This function finds all such 'points of discontinuity'
90  * and represents original range as alternation of continuous
91  * sub-ranges.
92  */
93 void UTF16splitByContinuity(RangeSuffix * & root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
94 {
95  if (l_ld != h_ld)
96  {
97  if (l_tr > utf16::MIN_TRAIL_SURR)
98  {
99  UTF16splitByContinuity(root, l_ld, l_ld, l_tr, utf16::MAX_TRAIL_SURR);
100  UTF16splitByContinuity(root, l_ld + 1, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
101  return;
102  }
103  if (h_tr < utf16::MAX_TRAIL_SURR)
104  {
105  UTF16splitByContinuity(root, l_ld, h_ld - 1, l_tr, utf16::MAX_TRAIL_SURR);
106  UTF16splitByContinuity(root, h_ld, h_ld, utf16::MIN_TRAIL_SURR, h_tr);
107  return;
108  }
109  }
110  UTF16addContinuous2(root, l_ld, h_ld, l_tr, h_tr);
111 }
112 
113 /*
114  * Split range into sub-ranges, so that all runes in the same
115  * sub-range have equal length of UTF-16 sequence. E.g., full
116  * Unicode range [0-0x10FFFF] gets split into sub-ranges:
117  * [0 - 0xFFFF] (2-byte UTF-16 sequences)
118  * [0x10000 - 0x10FFFF] (4-byte UTF-16 sequences)
119  */
121 {
122  if (l <= utf16::MAX_1WORD_RUNE)
123  {
124  if (h <= utf16::MAX_1WORD_RUNE)
125  {
126  UTF16addContinuous1(root, l, h);
127  }
128  else
129  {
131  const uint32_t h_ld = utf16::lead_surr(h);
132  const uint32_t h_tr = utf16::trail_surr(h);
134  }
135  }
136  else
137  {
138  const uint32_t l_ld = utf16::lead_surr(l);
139  const uint32_t l_tr = utf16::trail_surr(l);
140  const uint32_t h_ld = utf16::lead_surr(h);
141  const uint32_t h_tr = utf16::trail_surr(h);
142  UTF16splitByContinuity(root, l_ld, h_ld, l_tr, h_tr);
143  }
144 }
145 
146 } // namespace re2c
RangeSuffix * child
Definition: range_suffix.h:21
RangeSuffix * next
Definition: range_suffix.h:20
static uint32_t trail_surr(rune r)
Definition: utf16.h:30
uint32_t rune
Definition: utf16.h:11
void UTF16addContinuous2(RangeSuffix *&root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
Definition: utf16_range.cc:32
static const uint32_t MIN_TRAIL_SURR
Definition: utf16.h:15
void UTF16splitByRuneLength(RangeSuffix *&root, utf16::rune l, utf16::rune h)
Definition: utf16_range.cc:120
void UTF16addContinuous1(RangeSuffix *&root, uint32_t l, uint32_t h)
Definition: utf16_range.cc:9
void UTF16splitByContinuity(RangeSuffix *&root, uint32_t l_ld, uint32_t h_ld, uint32_t l_tr, uint32_t h_tr)
Definition: utf16_range.cc:93
static const uint32_t MAX_TRAIL_SURR
Definition: utf16.h:16
static uint32_t lead_surr(rune r)
Definition: utf16.h:25
static const uint32_t MAX_1WORD_RUNE
Definition: utf16.h:13
Definition: bitmap.cc:10
static const uint32_t MIN_LEAD_SURR
Definition: utf16.h:14