use std::{ collections::HashSet, hash::{DefaultHasher, Hash, Hasher}, }; use super::Pattern; use super::byte_range::ByteRange; /// NFA with epsilon transitions #[derive(Clone)] pub struct ENFA { pub states: Vec, } #[derive(Clone)] pub struct MultiState<'a> { nfa: &'a ENFA, states: Vec, accept: bool, hash: u64, } impl<'a> PartialEq for MultiState<'a> { fn eq(&self, other: &Self) -> bool { (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) && self.states == other.states && self.accept == other.accept && self.hash == other.hash } } impl<'a> Eq for MultiState<'a> {} impl<'a> MultiState<'a> { pub fn new(nfa: &'a ENFA, mut states: Vec) -> Self { states.sort(); states.dedup(); states.shrink_to_fit(); let accept = states.iter().any(|&x| nfa.states[x].accept); let mut hasher = DefaultHasher::new(); states.hash(&mut hasher); let hash = hasher.finish(); Self { nfa, states, accept, hash, } } /// all the chars that will make an interesting transition pub fn possible_transitions(&self) -> Vec { let mut vec: Vec<_> = self .states .iter() .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.0)) .collect(); vec = ByteRange::non_overlapping(vec); vec.sort(); vec.dedup(); vec.shrink_to_fit(); vec } pub fn transition(&self, ch: ByteRange) -> Self { let new_states = self .states .iter() .flat_map(|&s| { self.nfa.states[s] .trans .iter() .filter_map(|&(c, k)| if c.overlaps(ch) { Some(k) } else { None }) }) .collect(); Self::new(self.nfa, new_states) } pub fn accept(&self) -> bool { self.accept } } impl<'a> Hash for MultiState<'a> { fn hash(&self, state: &mut H) { self.hash.hash(state) } } macro_rules! set { () => { std::collections::HashSet::new() }; ( $( $x:expr ),* ) => {{ let mut set = std::collections::HashSet::new(); $( set.insert($x); )* set }}; } impl ENFA { fn shift(self, amt: usize) -> Vec { let mut s = self.states; for state in s.iter_mut() { let trans = state.trans.iter().map(|(c, id)| (*c, id + amt)).collect(); let epsilon_trans = state.epsilon_trans.iter().map(|e| e + amt).collect(); *state = EState { trans, epsilon_trans, accept: false, } } s } fn epsilon_dfs(&self, i: StateId, visited: &mut [bool]) { if visited[i] { return; } visited[i] = true; for &k in self.states[i].epsilon_trans.iter() { self.epsilon_dfs(k, visited); } } fn resolve_epsilon(&mut self) { // state X --> { state Y, Z, W which get inlined into X } let includes: Vec> = (0..self.states.len()) .map(|i| { let mut reach = vec![false; self.states.len()]; self.epsilon_dfs(i, &mut reach); reach .into_iter() .enumerate() .filter_map(|(x, r)| if r { Some(x) } else { None }) .collect() }) .collect(); // inlining for i in 0..self.states.len() { for &k in includes[i].iter() { let new = self.states[k].trans.clone(); self.states[i].trans.extend(new); self.states[i].epsilon_trans.clear(); if self.states[k].accept { self.states[i].accept = true; } } } } fn remove_unreachable(&mut self) { let mut used = vec![false; self.states.len()]; used[0] = true; for s in self.states.iter() { for &i in s.epsilon_trans.iter() { used[i] = true; } for &(_, i) in s.trans.iter() { used[i] = true; } } let mut remap = vec![0; self.states.len()]; let mut shift = 0; for i in 0..self.states.len() { if used[i] { remap[i] = i - shift; } else { shift += 1; } } for i in (0..self.states.len()).rev() { if !used[i] { self.states.remove(i); } } for s in self.states.iter_mut() { s.epsilon_trans = s .epsilon_trans .clone() .into_iter() .map(|i| remap[i]) .collect(); s.trans = s .trans .clone() .into_iter() .map(|(c, i)| (c, remap[i])) .collect(); } } pub fn simplify(&mut self) { self.resolve_epsilon(); self.remove_unreachable(); } pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { MultiState::new(self, vec![0]) } pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { MultiState::new(self, vec![]) } pub fn all_multi_states<'a>(&'a self) -> HashSet> { let mut states = set![self.start_multi_state()]; let mut q = vec![self.start_multi_state()]; while let Some(state) = q.pop() { let chars = state.possible_transitions(); for chr in chars { let new = state.transition(chr); if !states.contains(&new) { states.insert(new.clone()); q.push(new); } } } states } fn looping(self) -> Self { let mut states = vec![EState::start()]; states.append(&mut self.shift(1)); let len = states.len(); states[0].epsilon_trans = set![1, len]; states[len - 1].epsilon_trans = set![0, len]; states.push(EState::terminal()); Self { states } } fn repeat(self, times: usize) -> Self { let reps = vec![self; times]; Self::concat(reps) } /// between 0 and x repetitions fn optx(self, x: usize) -> Self { let len = self.states.len(); let mut repped = self.repeat(x); assert_eq!(repped.states.len(), x * len); for i in 1..=x { repped.states[0].epsilon_trans.insert(i * len - 1); } repped } fn concat(nfas: Vec) -> Self { if nfas.is_empty() { return Self { states: vec![EState::terminal()], }; } let mut states: Vec = Vec::new(); for nfa in nfas.into_iter() { let len = states.len(); let mut ns = nfa.shift(len); if let Some(n) = states.last_mut() { n.epsilon_trans = set![len]; } states.append(&mut ns); } let len = states.len(); states[len - 1].accept = true; Self { states } } } impl std::fmt::Debug for ENFA { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(f, "NFA {{")?; for (i, s) in self.states.iter().enumerate() { write!(f, " {i}: ")?; for k in s.epsilon_trans.iter() { write!(f, "~>{k} ")?; } for (c, k) in s.trans.iter() { write!(f, "{c:?}=>{k} ")?; } if s.accept { write!(f, "accept")?; } writeln!(f)?; } write!(f, "}}") } } pub type StateId = usize; #[derive(Debug, Clone)] pub struct EState { pub trans: HashSet<(ByteRange, StateId)>, pub epsilon_trans: HashSet, pub accept: bool, } impl EState { fn start() -> Self { Self { trans: HashSet::new(), epsilon_trans: HashSet::new(), accept: false, } } fn terminal() -> Self { Self { trans: HashSet::new(), epsilon_trans: HashSet::new(), accept: true, } } } impl From for ENFA { fn from(value: Pattern) -> Self { match value { Pattern::Byte(c) => Self::from(Pattern::Range(c, c)), Pattern::Range(c1, c2) => Self { states: vec![ EState { trans: set![(ByteRange::new_range(c1, c2), 1)], epsilon_trans: set![], accept: false, }, EState::terminal(), ], }, Pattern::Alt(alts) => { let nfas: Vec = alts.into_iter().map(ENFA::from).collect(); let mut states = vec![EState::start()]; let mut ends = vec![]; for nfa in nfas.into_iter() { let len = states.len(); states[0].epsilon_trans.insert(len); states.append(&mut (nfa.shift(len))); ends.push(states.len() - 1); } states.push(EState::terminal()); for end in ends.into_iter() { let last = states.len() - 1; states[end].epsilon_trans.insert(last); } Self { states } } Pattern::Concat(seq) => { let nfas: Vec = seq.into_iter().map(ENFA::from).collect(); Self::concat(nfas) } Pattern::Rep(regex, min, None) => { let nfa = ENFA::from(*regex); let base = nfa.clone().repeat(min as usize); let tail = nfa.looping(); Self::concat(vec![base, tail]) } Pattern::Rep(regex, min, Some(max)) => { assert!(min < max); let nfa = Self::from(*regex); let base = nfa.clone().repeat(min as usize); let tail = nfa.optx((max - min) as usize); Self::concat(vec![base, tail]) } Pattern::Nothing => Self { states: vec![EState::terminal()], }, } } }