diff options
Diffstat (limited to 'src/parse/regex/enfa.rs')
| -rw-r--r-- | src/parse/regex/enfa.rs | 383 |
1 files changed, 383 insertions, 0 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs new file mode 100644 index 0000000..71998c9 --- /dev/null +++ b/src/parse/regex/enfa.rs @@ -0,0 +1,383 @@ +use std::{ + collections::HashSet, + hash::{DefaultHasher, Hash, Hasher}, +}; + +use super::Pattern; +use super::byte_range::ByteRange; + +/// NFA with epsilon transitions +#[derive(Clone)] +pub struct ENFA { + pub states: Vec<EState>, +} + +#[derive(Clone)] +pub struct MultiState<'a> { + nfa: &'a ENFA, + states: Vec<StateId>, + accept: bool, + hash: u64, +} + +impl<'a> PartialEq for MultiState<'a> { + fn eq(&self, other: &Self) -> bool { + (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) + && self.states == other.states + && self.accept == other.accept + && self.hash == other.hash + } +} +impl<'a> Eq for MultiState<'a> {} + +impl<'a> MultiState<'a> { + pub fn new(nfa: &'a ENFA, mut states: Vec<StateId>) -> Self { + states.sort(); + states.dedup(); + states.shrink_to_fit(); + + let accept = states.iter().any(|&x| nfa.states[x].accept); + let mut hasher = DefaultHasher::new(); + states.hash(&mut hasher); + let hash = hasher.finish(); + + Self { + nfa, + states, + accept, + hash, + } + } + + /// all the chars that will make an interesting transition + pub fn possible_transitions(&self) -> Vec<ByteRange> { + let mut vec: Vec<_> = self + .states + .iter() + .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.0)) + .collect(); + vec = ByteRange::non_overlapping(vec); + vec.sort(); + vec.dedup(); + vec.shrink_to_fit(); + vec + } + + pub fn transition(&self, ch: ByteRange) -> Self { + let new_states = self + .states + .iter() + .flat_map(|&s| { + self.nfa.states[s] + .trans + .iter() + .filter_map(|&(c, k)| if c.overlaps(ch) { Some(k) } else { None }) + }) + .collect(); + + Self::new(self.nfa, new_states) + } + + pub fn accept(&self) -> bool { + self.accept + } +} + +impl<'a> Hash for MultiState<'a> { + fn hash<H: Hasher>(&self, state: &mut H) { + self.hash.hash(state) + } +} + +macro_rules! set { + () => { + std::collections::HashSet::new() + }; + ( $( $x:expr ),* ) => {{ + let mut set = std::collections::HashSet::new(); + $( + set.insert($x); + )* + set + }}; +} + +impl ENFA { + fn shift(self, amt: usize) -> Vec<EState> { + let mut s = self.states; + + for state in s.iter_mut() { + let trans = state.trans.iter().map(|(c, id)| (*c, id + amt)).collect(); + let epsilon_trans = state.epsilon_trans.iter().map(|e| e + amt).collect(); + + *state = EState { + trans, + epsilon_trans, + accept: false, + } + } + + s + } + + fn epsilon_dfs(&self, i: StateId, visited: &mut [bool]) { + if visited[i] { + return; + } + visited[i] = true; + for &k in self.states[i].epsilon_trans.iter() { + self.epsilon_dfs(k, visited); + } + } + + fn resolve_epsilon(&mut self) { + // state X --> { state Y, Z, W which get inlined into X } + let includes: Vec<Vec<StateId>> = (0..self.states.len()) + .map(|i| { + let mut reach = vec![false; self.states.len()]; + self.epsilon_dfs(i, &mut reach); + reach + .into_iter() + .enumerate() + .filter_map(|(x, r)| if r { Some(x) } else { None }) + .collect() + }) + .collect(); + + // inlining + for i in 0..self.states.len() { + for &k in includes[i].iter() { + let new = self.states[k].trans.clone(); + self.states[i].trans.extend(new); + self.states[i].epsilon_trans.clear(); + if self.states[k].accept { + self.states[i].accept = true; + } + } + } + } + + fn remove_unreachable(&mut self) { + let mut used = vec![false; self.states.len()]; + used[0] = true; + for s in self.states.iter() { + for &i in s.epsilon_trans.iter() { + used[i] = true; + } + for &(_, i) in s.trans.iter() { + used[i] = true; + } + } + let mut remap = vec![0; self.states.len()]; + let mut shift = 0; + for i in 0..self.states.len() { + if used[i] { + remap[i] = i - shift; + } else { + shift += 1; + } + } + for i in (0..self.states.len()).rev() { + if !used[i] { + self.states.remove(i); + } + } + for s in self.states.iter_mut() { + s.epsilon_trans = s + .epsilon_trans + .clone() + .into_iter() + .map(|i| remap[i]) + .collect(); + s.trans = s + .trans + .clone() + .into_iter() + .map(|(c, i)| (c, remap[i])) + .collect(); + } + } + + pub fn simplify(&mut self) { + self.resolve_epsilon(); + self.remove_unreachable(); + } + + pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { + MultiState::new(self, vec![0]) + } + + pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { + MultiState::new(self, vec![]) + } + + pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> { + let mut states = set![self.start_multi_state()]; + let mut q = vec![self.start_multi_state()]; + + while let Some(state) = q.pop() { + let chars = state.possible_transitions(); + + for chr in chars { + let new = state.transition(chr); + + if !states.contains(&new) { + states.insert(new.clone()); + q.push(new); + } + } + } + + states + } + + fn looping(self) -> Self { + let mut states = vec![EState::start()]; + states.append(&mut self.shift(1)); + let len = states.len(); + states[0].epsilon_trans = set![1, len]; + states[len - 1].epsilon_trans = set![0, len]; + states.push(EState::terminal()); + Self { states } + } + + fn repeat(self, times: usize) -> Self { + let reps = vec![self; times]; + Self::concat(reps) + } + + /// between 0 and x repetitions + fn optx(self, x: usize) -> Self { + let len = self.states.len(); + let mut repped = self.repeat(x); + assert_eq!(repped.states.len(), x * len); + for i in 1..=x { + repped.states[0].epsilon_trans.insert(i * len - 1); + } + repped + } + + fn concat(nfas: Vec<Self>) -> Self { + if nfas.is_empty() { + return Self { + states: vec![EState::terminal()], + }; + } + + let mut states: Vec<EState> = Vec::new(); + for nfa in nfas.into_iter() { + let len = states.len(); + let mut ns = nfa.shift(len); + if let Some(n) = states.last_mut() { + n.epsilon_trans = set![len]; + } + states.append(&mut ns); + } + + let len = states.len(); + states[len - 1].accept = true; + + Self { states } + } +} + +impl std::fmt::Debug for ENFA { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "NFA {{")?; + for (i, s) in self.states.iter().enumerate() { + write!(f, " {i}: ")?; + for k in s.epsilon_trans.iter() { + write!(f, "~>{k} ")?; + } + for (c, k) in s.trans.iter() { + write!(f, "{c:?}=>{k} ")?; + } + if s.accept { + write!(f, "accept")?; + } + writeln!(f)?; + } + write!(f, "}}") + } +} + +pub type StateId = usize; + +#[derive(Debug, Clone)] +pub struct EState { + pub trans: HashSet<(ByteRange, StateId)>, + pub epsilon_trans: HashSet<StateId>, + pub accept: bool, +} + +impl EState { + fn start() -> Self { + Self { + trans: HashSet::new(), + epsilon_trans: HashSet::new(), + accept: false, + } + } + fn terminal() -> Self { + Self { + trans: HashSet::new(), + epsilon_trans: HashSet::new(), + accept: true, + } + } +} + +impl From<Pattern> for ENFA { + fn from(value: Pattern) -> Self { + match value { + Pattern::Byte(c) => Self::from(Pattern::Range(c, c)), + Pattern::Range(c1, c2) => Self { + states: vec![ + EState { + trans: set![(ByteRange::new_range(c1, c2), 1)], + epsilon_trans: set![], + accept: false, + }, + EState::terminal(), + ], + }, + Pattern::Alt(alts) => { + let nfas: Vec<ENFA> = alts.into_iter().map(ENFA::from).collect(); + let mut states = vec![EState::start()]; + let mut ends = vec![]; + for nfa in nfas.into_iter() { + let len = states.len(); + states[0].epsilon_trans.insert(len); + states.append(&mut (nfa.shift(len))); + ends.push(states.len() - 1); + } + states.push(EState::terminal()); + for end in ends.into_iter() { + let last = states.len() - 1; + states[end].epsilon_trans.insert(last); + } + Self { states } + } + Pattern::Concat(seq) => { + let nfas: Vec<Self> = seq.into_iter().map(ENFA::from).collect(); + Self::concat(nfas) + } + Pattern::Rep(regex, min, None) => { + let nfa = ENFA::from(*regex); + let base = nfa.clone().repeat(min as usize); + let tail = nfa.looping(); + Self::concat(vec![base, tail]) + } + Pattern::Rep(regex, min, Some(max)) => { + assert!(min < max); + let nfa = Self::from(*regex); + let base = nfa.clone().repeat(min as usize); + let tail = nfa.optx((max - min) as usize); + Self::concat(vec![base, tail]) + } + Pattern::Nothing => Self { + states: vec![EState::terminal()], + }, + } + } +} |
