src
regexp.cc
Go to the documentation of this file.
1 #include <stddef.h>
2 
3 #include "src/conf/opt.h"
4 #include "src/conf/warn.h"
5 #include "src/globals.h"
11 #include "src/ir/regexp/regexp.h"
17 #include "src/parse/scanner.h"
18 #include "src/util/range.h"
19 
20 namespace re2c
21 {
22 
23 static MatchOp * merge (MatchOp * m1, MatchOp * m2);
24 
25 free_list<RegExp*> RegExp::vFreeList;
26 
27 RegExp * doAlt (RegExp * e1, RegExp * e2)
28 {
29  if (!e1)
30  {
31  return e2;
32  }
33  if (!e2)
34  {
35  return e1;
36  }
37  return new AltOp (e1, e2);
38 }
39 
40 RegExp * mkAlt (RegExp * e1, RegExp * e2)
41 {
42  AltOp * a;
43  MatchOp * m1;
44  MatchOp * m2;
45 
46  a = dynamic_cast<AltOp*> (e1);
47  if (a != NULL)
48  {
49  m1 = dynamic_cast<MatchOp*> (a->exp1);
50  if (m1 != NULL)
51  {
52  e1 = a->exp2;
53  }
54  }
55  else
56  {
57  m1 = dynamic_cast<MatchOp*> (e1);
58  if (m1 != NULL)
59  {
60  e1 = NULL;
61  }
62  }
63  a = dynamic_cast<AltOp*> (e2);
64  if (a != NULL)
65  {
66  m2 = dynamic_cast<MatchOp*> (a->exp1);
67  if (m2 != NULL)
68  {
69  e2 = a->exp2;
70  }
71  }
72  else
73  {
74  m2 = dynamic_cast<MatchOp*> (e2);
75  if (m2 != NULL)
76  {
77  e2 = NULL;
78  }
79  }
80 
81  return doAlt (merge (m1, m2), doAlt (e1, e2));
82 }
83 
84 MatchOp * merge (MatchOp * m1, MatchOp * m2)
85 {
86  if (!m1)
87  {
88  return m2;
89  }
90  if (!m2)
91  {
92  return m1;
93  }
94  MatchOp * m = new MatchOp (Range::add (m1->match, m2->match));
95  return m;
96 }
97 
98 RegExp * doCat (RegExp * e1, RegExp * e2)
99 {
100  if (!e1)
101  {
102  return e2;
103  }
104  if (!e2)
105  {
106  return e1;
107  }
108  return new CatOp (e1, e2);
109 }
110 
111 RegExp *Scanner::schr(uint32_t c) const
112 {
113  if (!opts->encoding.encode(c)) {
114  fatalf("Bad code point: '0x%X'", c);
115  }
116  switch (opts->encoding.type ()) {
117  case Enc::UTF16: return UTF16Symbol(c);
118  case Enc::UTF8: return UTF8Symbol(c);
119  default: return new MatchOp(Range::sym(c));
120  }
121 }
122 
123 RegExp *Scanner::ichr(uint32_t c) const
124 {
125  if (is_alpha(c)) {
126  RegExp *l = schr(to_lower_unsafe(c));
127  RegExp *u = schr(to_upper_unsafe(c));
128  return mkAlt(l, u);
129  } else {
130  return schr(c);
131  }
132 }
133 
134 RegExp *Scanner::cls(Range *r) const
135 {
136  if (!r)
137  {
138  switch (opts->empty_class_policy)
139  {
142  return new NullOp;
145  break;
146  case EMPTY_CLASS_ERROR:
147  fatal ("empty character class");
148  break;
149  }
150  }
151 
152  switch (opts->encoding.type ())
153  {
154  case Enc::UTF16: return UTF16Range(r);
155  case Enc::UTF8: return UTF8Range(r);
156  default: return new MatchOp(r);
157  }
158 }
159 
160 RegExp * Scanner::mkDiff (RegExp * e1, RegExp * e2) const
161 {
162  MatchOp * m1 = dynamic_cast<MatchOp *> (e1);
163  MatchOp * m2 = dynamic_cast<MatchOp *> (e2);
164  if (m1 == NULL || m2 == NULL)
165  {
166  fatal("can only difference char sets");
167  }
168  Range * r = Range::sub (m1->match, m2->match);
169 
170  return cls(r);
171 }
172 
174 {
175  Range * full = opts->encoding.fullRange();
176  uint32_t c = '\n';
177  if (!opts->encoding.encode(c))
178  fatalf("Bad code point: '0x%X'", c);
179  Range * ran = Range::sym (c);
180  Range * inv = Range::sub (full, ran);
181 
182  return cls(inv);
183 }
184 
185 /*
186  * Create a byte range that includes all possible input characters.
187  * This may include characters, which do not map to any valid symbol
188  * in current encoding. For encodings, which directly map symbols to
189  * input characters (ASCII, EBCDIC, UTF-32), it equals [^]. For other
190  * encodings (UTF-16, UTF-8), [^] and this range are different.
191  *
192  * Also note that default range doesn't respect encoding policy
193  * (the way invalid code points are treated).
194  */
196 {
197  Range * def = Range::ran (0, opts->encoding.nCodeUnits());
198  return new MatchOp(def);
199 }
200 
201 /*
202  * note [counted repetition expansion]
203  *
204  * r{0} ::= <empty regexp>
205  * r{n} ::= r{n-1} r
206  * r{n,m} ::= r{n} (r{0} | ... | r{m-n})
207  * r{n,} ::= r{n} r*
208  */
209 
210 // see note [counted repetition expansion]
211 RegExp * repeat (RegExp * e, uint32_t n)
212 {
213  RegExp * r = NULL;
214  for (uint32_t i = 0; i < n; ++i)
215  {
216  r = doCat (r, e);
217  }
218  return r;
219 }
220 
221 // see note [counted repetition expansion]
222 RegExp * repeat_from_to (RegExp * e, uint32_t n, uint32_t m)
223 {
224  RegExp * r1 = repeat (e, n);
225  RegExp * r2 = NULL;
226  for (uint32_t i = n; i < m; ++i)
227  {
228  r2 = mkAlt (new NullOp, doCat (e, r2));
229  }
230  return doCat (r1, r2);
231 }
232 
233 // see note [counted repetition expansion]
234 RegExp * repeat_from (RegExp * e, uint32_t n)
235 {
236  RegExp * r1 = repeat (e, n);
237  RegExp * r2 = new CloseOp (e);
238  return doCat (r1, r2);
239 }
240 
241 } // namespace re2c
static Range * sub(const Range *r1, const Range *r2)
Definition: range.cc:61
static Range * ran(uint32_t l, uint32_t u)
Definition: range.h:31
RegExp * mkDot() const
Definition: regexp.cc:173
uint32_t get_line() const
Definition: scanner.h:130
RegExp * doAlt(RegExp *e1, RegExp *e2)
Definition: regexp.cc:27
Warn warn
Definition: warn.cc:11
RegExp * repeat_from(RegExp *e, uint32_t n)
Definition: regexp.cc:234
RegExp * mkDefault() const
Definition: regexp.cc:195
type_t type() const
Definition: enc.h:185
Range * fullRange() const
Definition: enc.cc:195
Range * match
Definition: regexp_match.h:13
static uint32_t merge(Span *x0, State *fg, State *bg)
Definition: prepare.cc:31
RegExp * repeat_from_to(RegExp *e, uint32_t n, uint32_t m)
Definition: regexp.cc:222
void void fatalf(const char *,...) const RE2C_GXX_ATTRIBUTE((format(printf
Definition: scanner.cc:160
void void void fatal(const char *) const
Definition: scanner.h:140
bool encode(uint32_t &c) const
Definition: enc.cc:62
uint32_t to_upper_unsafe(uint32_t c)
Definition: case.h:24
static free_list< RegExp * > vFreeList
Definition: regexp.h:23
void empty_class(uint32_t line)
Definition: warn.cc:99
Enc encoding
Definition: opt.h:118
static Range * sym(uint32_t c)
Definition: range.h:27
uint32_t nCodeUnits() const
Definition: enc.h:123
RegExp * UTF8Symbol(utf8::rune r)
Definition: utf8_regexp.cc:12
empty_class_policy_t empty_class_policy
Definition: opt.h:118
RegExp * UTF8Range(const Range *r)
Definition: utf8_regexp.cc:28
Opt opts
Definition: opt.cc:7
RegExp * mkDiff(RegExp *e1, RegExp *e2) const
Definition: regexp.cc:160
RegExp * UTF16Range(const Range *r)
Definition: utf16_regexp.cc:30
bool is_alpha(uint32_t c)
Definition: case.h:13
uint32_t to_lower_unsafe(uint32_t c)
Definition: case.h:19
RegExp * repeat(RegExp *e, uint32_t n)
Definition: regexp.cc:211
Definition: bitmap.cc:10
RegExp * UTF16Symbol(utf16::rune r)
Definition: utf16_regexp.cc:12
RegExp * doCat(RegExp *e1, RegExp *e2)
Definition: regexp.cc:98
RegExp * mkAlt(RegExp *e1, RegExp *e2)
Definition: regexp.cc:40
static Range * add(const Range *r1, const Range *r2)
Definition: range.cc:26