src
utf8_regexp.cc
Go to the documentation of this file.
1 #include "src/util/c99_stdint.h"
2 
8 #include "src/util/range.h"
9 
10 namespace re2c {
11 
13 {
14  uint32_t chars[utf8::MAX_RUNE_LENGTH];
15  const uint32_t chars_count = utf8::rune_to_bytes(chars, r);
16  RegExp * re = new MatchOp(Range::sym (chars[0]));
17  for (uint32_t i = 1; i < chars_count; ++i)
18  re = new CatOp(re, new MatchOp(Range::sym (chars[i])));
19  return re;
20 }
21 
22 /*
23  * Split Unicode character class {[l1, h1), ..., [lN, hN)} into
24  * ranges [l1, h1-1], ..., [lN, hN-1] and return alternation of
25  * them. We store partially built range in suffix tree, which
26  * allows to eliminate common suffixes while building.
27  */
28 RegExp * UTF8Range(const Range * r)
29 {
30  RangeSuffix * root = NULL;
31  for (; r != NULL; r = r->next ())
32  UTF8splitByRuneLength(root, r->lower (), r->upper () - 1);
33  return to_regexp (root);
34 }
35 
36 } // namespace re2c
uint32_t lower() const
Definition: range.h:40
uint32_t rune
Definition: utf8.h:11
RegExp * to_regexp(RangeSuffix *p)
Definition: range_suffix.cc:12
static Range * sym(uint32_t c)
Definition: range.h:27
RegExp * UTF8Symbol(utf8::rune r)
Definition: utf8_regexp.cc:12
RegExp * UTF8Range(const Range *r)
Definition: utf8_regexp.cc:28
uint32_t upper() const
Definition: range.h:41
static uint32_t rune_to_bytes(uint32_t *s, rune r)
Definition: utf8.cc:22
Definition: bitmap.cc:10
void UTF8splitByRuneLength(RangeSuffix *&root, utf8::rune l, utf8::rune h)
Definition: utf8_range.cc:100
Range * next() const
Definition: range.h:39