diff options
| -rw-r--r-- | src/parse/regex/dfa.rs | 2 | ||||
| -rw-r--r-- | src/parse/regex/enfa.rs | 143 |
2 files changed, 94 insertions, 51 deletions
diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs index 7e5be7b..2fd1935 100644 --- a/src/parse/regex/dfa.rs +++ b/src/parse/regex/dfa.rs @@ -65,7 +65,7 @@ impl From<ENFA> for DFA { nfa.simplify(); for s in nfa.states.iter() { - if !s.epsilon_trans.is_empty() { + if s.trans.iter().any(|t| t.is_epsilon()) { panic!( "NFA simplification did not remove epsilon transitions - cannot proceed with powerset construction." ); diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index b3d3c4a..6b7b7bb 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -55,7 +55,7 @@ impl<'a> MultiState<'a> { let mut vec: Vec<_> = self .states .iter() - .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.0)) + .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.consumes.unwrap())) .collect(); vec = ByteRange::non_overlapping(vec); vec.sort(); @@ -69,10 +69,13 @@ impl<'a> MultiState<'a> { .states .iter() .flat_map(|&s| { - self.nfa.states[s] - .trans - .iter() - .filter_map(|&(c, k)| if c.overlaps(ch) { Some(k) } else { None }) + self.nfa.states[s].trans.iter().filter_map(|t| { + if t.consumes.unwrap().overlaps(ch) { + Some(t.to) + } else { + None + } + }) }) .collect(); @@ -108,14 +111,8 @@ impl ENFA { let mut s = self.states; for state in s.iter_mut() { - let trans = state.trans.iter().map(|(c, id)| (*c, id + amt)).collect(); - let epsilon_trans = state.epsilon_trans.iter().map(|e| e + amt).collect(); - - *state = EState { - trans, - epsilon_trans, - accept: false, - } + state.remap(|i| i + amt); + state.accept = false; } s @@ -126,8 +123,10 @@ impl ENFA { return; } visited[i] = true; - for &k in self.states[i].epsilon_trans.iter() { - self.epsilon_dfs(k, visited); + for t in self.states[i].trans.iter() { + if t.is_epsilon() { + self.epsilon_dfs(t.to, visited); + } } } @@ -145,12 +144,16 @@ impl ENFA { }) .collect(); - // inlining + // clear epsilons + for s in self.states.iter_mut() { + s.trans.retain(|t| !t.is_epsilon()); + } + + // inline real transitions for i in 0..self.states.len() { for &k in includes[i].iter() { let new = self.states[k].trans.clone(); self.states[i].trans.extend(new); - self.states[i].epsilon_trans.clear(); if self.states[k].accept { self.states[i].accept = true; } @@ -162,10 +165,7 @@ impl ENFA { let mut used = vec![false; self.states.len()]; used[0] = true; for s in self.states.iter() { - for &i in s.epsilon_trans.iter() { - used[i] = true; - } - for &(_, i) in s.trans.iter() { + for i in s.reachable_states() { used[i] = true; } } @@ -184,18 +184,7 @@ impl ENFA { } } for s in self.states.iter_mut() { - s.epsilon_trans = s - .epsilon_trans - .clone() - .into_iter() - .map(|i| remap[i]) - .collect(); - s.trans = s - .trans - .clone() - .into_iter() - .map(|(c, i)| (c, remap[i])) - .collect(); + s.remap(|i| remap[i]); } } @@ -236,8 +225,8 @@ impl ENFA { let mut states = vec![EState::start()]; states.append(&mut self.shift(1)); let len = states.len(); - states[0].epsilon_trans = set![1, len]; - states[len - 1].epsilon_trans = set![0, len]; + states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]); + states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]); states.push(EState::terminal()); Self { states } } @@ -253,7 +242,9 @@ impl ENFA { let mut repped = self.repeat(x); assert_eq!(repped.states.len(), x * len); for i in 1..=x { - repped.states[0].epsilon_trans.insert(i * len - 1); + repped.states[0] + .trans + .insert(Transition::epsilon(i * len - 1)); } repped } @@ -270,7 +261,8 @@ impl ENFA { let len = states.len(); let mut ns = nfa.shift(len); if let Some(n) = states.last_mut() { - n.epsilon_trans = set![len]; + n.trans.retain(|t| t.consumes.is_some()); + n.trans.insert(Transition::epsilon(len)); } states.append(&mut ns); } @@ -287,11 +279,13 @@ impl std::fmt::Debug for ENFA { writeln!(f, "NFA {{")?; for (i, s) in self.states.iter().enumerate() { write!(f, " {i}: ")?; - for k in s.epsilon_trans.iter() { - write!(f, "~>{k} ")?; - } - for (c, k) in s.trans.iter() { - write!(f, "{c:?}=>{k} ")?; + for t in s.trans.iter() { + let k = t.to; + if let Some(c) = t.consumes { + write!(f, "{c:?}=>{k} ")?; + } else { + write!(f, "~>{k} ")?; + } } if s.accept { write!(f, "accept")?; @@ -304,10 +298,38 @@ impl std::fmt::Debug for ENFA { pub type StateId = usize; +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct Transition { + to: StateId, + consumes: Option<ByteRange>, +} + +impl Transition { + fn new(consumes: ByteRange, to: StateId) -> Self { + let consumes = Some(consumes); + Self { to, consumes } + } + + fn epsilon(to: StateId) -> Self { + Self { to, consumes: None } + } + + fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { + self.to = f(self.to); + } + + fn reachable_states(&self) -> impl Iterator<Item = StateId> { + [self.to].into_iter() + } + + pub fn is_epsilon(&self) -> bool { + self.consumes.is_none() + } +} + #[derive(Debug, Clone)] pub struct EState { - pub trans: HashSet<(ByteRange, StateId)>, - pub epsilon_trans: HashSet<StateId>, + pub trans: HashSet<Transition>, pub accept: bool, } @@ -315,17 +337,39 @@ impl EState { fn start() -> Self { Self { trans: HashSet::new(), - epsilon_trans: HashSet::new(), accept: false, } } fn terminal() -> Self { Self { trans: HashSet::new(), - epsilon_trans: HashSet::new(), accept: true, } } + + fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) { + self.trans.retain(|t| t.consumes.is_some()); + for transition in trans.into_iter() { + assert!(transition.consumes.is_none()); + self.trans.insert(transition); + } + } + + fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { + self.trans = self + .trans + .iter() + .cloned() + .map(|mut t| { + t.remap(&mut f); + t + }) + .collect(); + } + + fn reachable_states(&self) -> impl Iterator<Item = StateId> { + self.trans.iter().flat_map(|t| t.reachable_states()) + } } #[derive(Debug)] @@ -342,8 +386,7 @@ impl TryFrom<Pattern> for ENFA { Pattern::Range(c1, c2) => Self { states: vec![ EState { - trans: set![(ByteRange::new_range(c1, c2), 1)], - epsilon_trans: set![], + trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], accept: false, }, EState::terminal(), @@ -358,14 +401,14 @@ impl TryFrom<Pattern> for ENFA { let mut ends = vec![]; for nfa in nfas.into_iter() { let len = states.len(); - states[0].epsilon_trans.insert(len); + states[0].trans.insert(Transition::epsilon(len)); states.append(&mut (nfa.shift(len))); ends.push(states.len() - 1); } states.push(EState::terminal()); for end in ends.into_iter() { let last = states.len() - 1; - states[end].epsilon_trans.insert(last); + states[end].trans.insert(Transition::epsilon(last)); } Self { states } } |
