src
determinization.cc
Go to the documentation of this file.
1 #include <algorithm>
2 #include <limits>
3 #include <map>
4 #include <set>
5 #include <vector>
6 
7 #include "src/ir/dfa/dfa.h"
8 #include "src/ir/nfa/nfa.h"
9 #include "src/ir/regexp/regexp.h"
11 #include "src/ir/rule_rank.h"
12 #include "src/parse/rules.h"
13 #include "src/util/ord_hash_set.h"
14 #include "src/util/range.h"
15 
16 namespace re2c
17 {
18 
19 const size_t dfa_t::NIL = std::numeric_limits<size_t>::max();
20 
21 /*
22  * note [marking DFA states]
23  *
24  * DFA state is a set of NFA states.
25  * However, DFA state includes not all NFA states that are in
26  * epsilon-closure (NFA states that have only epsilon-transitions
27  * and are not context of final states are omitted).
28  * The included states are called 'kernel' states.
29  *
30  * We mark visited NFA states during closure construction.
31  * These marks serve two purposes:
32  * - avoid loops in NFA
33  * - avoid duplication of NFA states in kernel
34  *
35  * Note that after closure construction:
36  * - all non-kernel states must be unmarked (these states are
37  * not stored in kernel and it is impossible to unmark them
38  * afterwards)
39  * - all kernel states must be marked (because we may later
40  * extend this kernel with epsilon-closure of another NFA
41  * state). Kernel states are unmarked later (before finding
42  * or adding DFA state).
43  */
45 {
46  if (!n->mark)
47  {
48  n->mark = true;
49  switch (n->type)
50  {
51  case nfa_state_t::ALT:
52  cP = closure(cP, n->value.alt.out2);
53  cP = closure(cP, n->value.alt.out1);
54  n->mark = false;
55  break;
56  case nfa_state_t::CTX:
57  *(cP++) = n;
58  cP = closure(cP, n->value.ctx.out);
59  break;
60  default:
61  *(cP++) = n;
62  break;
63  }
64  }
65 
66  return cP;
67 }
68 
69 static size_t find_state
70  ( nfa_state_t **kernel
71  , nfa_state_t **end
72  , ord_hash_set_t &kernels
73  )
74 {
75  // zero-sized kernel corresponds to default state
76  if (kernel == end)
77  {
78  return dfa_t::NIL;
79  }
80 
81  // see note [marking DFA states]
82  for (nfa_state_t **p = kernel; p != end; ++p)
83  {
84  (*p)->mark = false;
85  }
86 
87  // sort kernel states: we need this to get stable hash
88  // and to compare states with simple 'memcmp'
89  std::sort(kernel, end);
90  const size_t size = static_cast<size_t>(end - kernel) * sizeof(nfa_state_t*);
91  return kernels.insert(kernel, size);
92 }
93 
94 dfa_t::dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules)
95  : states()
96  , nchars(charset.size() - 1) // (n + 1) bounds for n ranges
97 {
98  std::map<size_t, std::set<RuleOp*> > s2rules;
99  ord_hash_set_t kernels;
100  nfa_state_t **const buffer = new nfa_state_t*[nfa.size];
101  std::vector<std::vector<nfa_state_t*> > arcs(nchars);
102 
103  find_state(buffer, closure(buffer, nfa.root), kernels);
104  for (size_t i = 0; i < kernels.size(); ++i)
105  {
106  dfa_state_t *s = new dfa_state_t;
107  states.push_back(s);
108 
109  nfa_state_t **kernel;
110  const size_t kernel_size = kernels.deref<nfa_state_t*>(i, kernel);
111  for (size_t j = 0; j < kernel_size; ++j)
112  {
113  nfa_state_t *n = kernel[j];
114  switch (n->type)
115  {
116  case nfa_state_t::RAN:
117  {
118  nfa_state_t *m = n->value.ran.out;
119  size_t c = 0;
120  for (Range *r = n->value.ran.ran; r; r = r->next ())
121  {
122  for (; charset[c] != r->lower(); ++c);
123  for (; charset[c] != r->upper(); ++c)
124  {
125  arcs[c].push_back(m);
126  }
127  }
128  break;
129  }
130  case nfa_state_t::CTX:
131  s->ctx = true;
132  break;
133  case nfa_state_t::FIN:
134  s2rules[i].insert(n->value.fin.rule);
135  break;
136  default:
137  break;
138  }
139  }
140 
141  s->arcs = new size_t[nchars];
142  for(size_t c = 0; c < nchars; ++c)
143  {
144  nfa_state_t **end = buffer;
145  for (std::vector<nfa_state_t*>::const_iterator j = arcs[c].begin(); j != arcs[c].end(); ++j)
146  {
147  end = closure(end, *j);
148  }
149  s->arcs[c] = find_state(buffer, end, kernels);
150  }
151 
152  for(size_t c = 0; c < nchars; ++c)
153  {
154  arcs[c].clear();
155  }
156  }
157  delete[] buffer;
158 
159  const size_t count = states.size();
160  for (size_t i = 0; i < count; ++i)
161  {
162  dfa_state_t *s = states[i];
163  std::set<RuleOp*> &rs = s2rules[i];
164  // for each final state: choose the rule with the smallest rank
165  for (std::set<RuleOp*>::const_iterator j = rs.begin(); j != rs.end(); ++j)
166  {
167  RuleOp *rule = *j;
168  if (!s->rule || rule->rank < s->rule->rank)
169  {
170  s->rule = rule;
171  }
172  }
173  // other rules are shadowed by the chosen rule
174  for (std::set<RuleOp*>::const_iterator j = rs.begin(); j != rs.end(); ++j)
175  {
176  RuleOp *rule = *j;
177  if (s->rule != rule)
178  {
179  rules[rule->rank].shadow.insert(s->rule->rank);
180  }
181  }
182  }
183 }
184 
186 {
187  std::vector<dfa_state_t*>::iterator
188  i = states.begin(),
189  e = states.end();
190  for (; i != e; ++i)
191  {
192  delete *i;
193  }
194 }
195 
196 } // namespace re2c
197 
static Range * ran(uint32_t l, uint32_t u)
Definition: range.h:31
std::vector< uint32_t > charset_t
Definition: regexp.h:16
union re2c::nfa_state_t::@6 value
std::vector< dfa_state_t * > states
Definition: dfa.h:40
std::map< rule_rank_t, rule_info_t > rules_t
Definition: rules.h:25
dfa_t(const nfa_t &nfa, const charset_t &charset, rules_t &rules)
size_t insert(const void *data, size_t size)
Definition: ord_hash_set.h:84
static nfa_state_t ** closure(nfa_state_t **cP, nfa_state_t *n)
bool mark
Definition: nfa.h:45
uint32_t size
Definition: nfa.h:78
enum re2c::nfa_state_t::type_t type
struct re2c::nfa_state_t::@6::@10 fin
size_t deref(size_t i, data_t *&data)
Definition: ord_hash_set.h:106
const size_t nchars
Definition: dfa.h:41
struct re2c::nfa_state_t::@6::@7 alt
rule_rank_t rank
Definition: regexp_rule.h:23
static const size_t NIL
Definition: dfa.h:38
nfa_state_t * root
Definition: nfa.h:80
struct re2c::nfa_state_t::@6::@9 ctx
bool ctx
Definition: dfa.h:21
size_t size() const
Definition: ord_hash_set.h:79
size_t * arcs
Definition: dfa.h:19
static size_t find_state(nfa_state_t **kernel, nfa_state_t **end, ord_hash_set_t &kernels)
Range * ran
Definition: nfa.h:34
Definition: bitmap.cc:10
Range * next() const
Definition: range.h:39
RuleOp * rule
Definition: dfa.h:20