use std::{ collections::HashSet, hash::{DefaultHasher, Hash, Hasher}, }; use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange}; /// NFA with epsilon transitions #[derive(Clone)] #[allow(clippy::upper_case_acronyms)] pub struct ENFA { pub states: Vec, pub has_submatches: bool, } fn cartesian_product(x: Vec>) -> Vec> { let mut result = vec![Vec::new()]; for xs in x { let mut next = Vec::new(); for prefix in &result { for item in &xs { let mut v = prefix.clone(); v.push(item.clone()); next.push(v); } } result = next; } result } #[cfg(test)] mod product_tests { use super::cartesian_product; #[test] fn basic_case() { let x = vec![vec![1, 2], vec![10, 20]]; let out = cartesian_product(x); assert_eq!( out, vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],] ); } #[test] fn single_dimension() { let x = vec![vec![1, 2, 3]]; let out = cartesian_product(x); assert_eq!(out, vec![vec![1], vec![2], vec![3],]); } #[test] fn empty_outer_vector() { let x: Vec> = vec![]; let out: Vec> = cartesian_product(x); let r: Vec> = vec![vec![]]; // One empty combination. assert_eq!(out, r); } #[test] fn empty_inner_vector() { let x = vec![vec![1, 2], vec![], vec![3, 4]]; let out = cartesian_product(x); assert!(out.is_empty()); } #[test] fn output_size_matches_product() { let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; let out = cartesian_product(x); assert_eq!(out.len(), 3 * 2 * 4); } #[test] fn every_output_has_correct_length() { let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']]; let out = cartesian_product(x); assert!(out.iter().all(|v| v.len() == 3)); } #[test] fn works_with_strings() { let x = vec![ vec!["a".to_string(), "b".to_string()], vec!["x".to_string()], ]; let out = cartesian_product(x); assert_eq!( out, vec![ vec!["a".to_string(), "x".to_string()], vec!["b".to_string(), "x".to_string()], ] ); } } #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] struct Thread { state: StateId, positives: Vec, negatives: Vec, } impl Thread { fn new_simple(state: StateId) -> Self { Self { state, positives: Vec::new(), negatives: Vec::new(), } } fn new( enfa: &ENFA, state: StateId, mut positives: Vec, mut negatives: Vec, ) -> Option { positives.sort(); positives.dedup(); positives.retain(|t| !t.accept(enfa)); positives.shrink_to_fit(); negatives.sort(); negatives.dedup(); negatives.shrink_to_fit(); if negatives.iter().any(|t| t.accept(enfa)) { return None; } Some(Self { state, positives, negatives, }) } fn accept(&self, enfa: &ENFA) -> bool { let pos = self.positives.iter().all(|t| t.accept(enfa)); let neg = self.negatives.iter().all(|t| !t.accept(enfa)); let this = match enfa.states[self.state].accept { Acceptance::Accept => true, Acceptance::Assertion => true, Acceptance::NotYet => false, }; pos && neg && this } fn step_epsilon0( self, enfa: &ENFA, ret: &mut impl FnMut(Thread), visited: &mut [bool], new_assertions: Vec, ) { if visited[self.state] { return; } visited[self.state] = true; for t in enfa.states[self.state].trans.iter() { if t.consumes.is_some() { continue; } let mut new_assertions = new_assertions.clone(); if let Some(assertion) = enfa.states[self.state].assert.as_ref() { new_assertions.push(assertion.clone()); } Self { state: t.to, ..self.clone() } .step_epsilon0(enfa, ret, visited, new_assertions); } let Self { state, positives, negatives, } = self; let mut p = Vec::new(); let mut n = Vec::new(); for assertion in new_assertions { let threads = Self::new_simple(assertion.to).step_epsilon(enfa); let vec = match assertion.polarity { LookPolarity::Positive => &mut p, LookPolarity::Negative => &mut n, }; vec.push(threads); } let p = cartesian_product(p); for mut p in p { let mut positives = positives.clone(); let mut negatives = negatives.clone(); positives.append(&mut p); negatives.append(&mut n.iter().flatten().cloned().collect()); if let Some(thread) = Self::new(enfa, state, positives, negatives) { ret(thread); } } } fn step_epsilon(self, enfa: &ENFA) -> Vec { let mut vec = Vec::new(); self.step_epsilon0( enfa, &mut |t| vec.push(t), &mut vec![false; enfa.states.len()], Vec::new(), ); vec } fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) { let positives = self .positives .clone() .into_iter() .map(|t| t.step(enfa, input)) .collect(); let negatives: Vec<_> = self .negatives .into_iter() .flat_map(|t| t.step(enfa, input)) .collect(); let positives = cartesian_product(positives); let next_states: Vec = enfa.states[self.state] .trans .iter() .filter_map(|t| { if let Some(ch) = t.consumes && ch.overlaps(input) { Some(t.to) } else { None } }) .collect(); for s in next_states { for p in positives.clone() { if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) { thread.step_epsilon0( enfa, ret, &mut vec![false; enfa.states.len()], Vec::new(), ); } } } } fn step(self, enfa: &ENFA, input: ByteRange) -> Vec { let mut vec = Vec::new(); self.step0(enfa, input, &mut |x| vec.push(x)); vec } fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) { for t in enfa.states[self.state].trans.iter() { if let Some(ch) = t.consumes { f(ch); } } for t in self.positives.iter() { t.possible_transitions(enfa, f); } for t in self.negatives.iter() { t.possible_transitions(enfa, f); } } } #[derive(Clone)] pub struct MultiState<'a> { nfa: &'a ENFA, threads: Vec, accept: bool, hash: u64, } impl<'a> PartialEq for MultiState<'a> { fn eq(&self, other: &Self) -> bool { (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) && self.threads == other.threads && self.accept == other.accept && self.hash == other.hash } } impl<'a> Eq for MultiState<'a> {} impl<'a> MultiState<'a> { fn new(nfa: &'a ENFA, mut threads: Vec) -> Self { threads.sort(); threads.dedup(); threads.shrink_to_fit(); let accept = threads.iter().any(|t| t.accept(nfa)); let mut hasher = DefaultHasher::new(); threads.hash(&mut hasher); let hash = hasher.finish(); Self { nfa, threads, accept, hash, } } /// all the chars that will make an interesting transition pub fn possible_transitions(&self) -> Vec { let mut vec = Vec::new(); for t in self.threads.iter() { t.possible_transitions(self.nfa, &mut |x| vec.push(x)); } vec = ByteRange::split_to_disjoint(vec); vec.sort(); vec.dedup(); vec.shrink_to_fit(); vec } pub fn transition(&self, ch: ByteRange) -> Self { let new_states = self .threads .iter() .flat_map(|t| t.clone().step(self.nfa, ch)) .collect(); Self::new(self.nfa, new_states) } pub fn accept(&self) -> bool { self.accept } } impl<'a> Hash for MultiState<'a> { fn hash(&self, state: &mut H) { self.hash.hash(state) } } macro_rules! set { () => { std::collections::HashSet::new() }; ( $( $x:expr ),* ) => {{ let mut set = std::collections::HashSet::new(); $( set.insert($x); )* set }}; } impl ENFA { fn shift(self, amt: usize) -> Vec { let mut s = self.states; for state in s.iter_mut() { state.remap(|i| i + amt); if state.accept == Acceptance::Accept { state.accept = Acceptance::NotYet; } } s } pub fn remove_unreachable(&mut self) { let mut used = vec![false; self.states.len()]; used[0] = true; for s in self.states.iter() { for i in s.reachable_states() { used[i] = true; } } let mut remap = vec![0; self.states.len()]; let mut shift = 0; for i in 0..self.states.len() { if used[i] { remap[i] = i - shift; } else { shift += 1; } } for i in (0..self.states.len()).rev() { if !used[i] { self.states.remove(i); } } for s in self.states.iter_mut() { s.remap(|i| remap[i]); } } } impl ENFA { fn looping(self) -> Self { let has_submatches = self.has_submatches; let mut states = vec![EState::start()]; states.append(&mut self.shift(1)); let len = states.len(); states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]); states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]); states.push(EState::terminal()); Self { states, has_submatches } } fn repeat(self, times: usize) -> Self { let reps = vec![self; times]; Self::concat(reps) } /// between 0 and x repetitions fn optx(self, x: usize) -> Self { let len = self.states.len(); let mut repped = self.repeat(x); assert_eq!(repped.states.len(), x * len); for i in 1..=x { repped.states[0] .trans .insert(Transition::epsilon(i * len - 1)); } repped } fn concat(nfas: Vec) -> Self { if nfas.is_empty() { return Self { states: vec![EState::terminal()], has_submatches: false, }; } let mut has_submatches = false; let mut states: Vec = Vec::new(); for nfa in nfas.into_iter() { has_submatches = has_submatches || nfa.has_submatches; let len = states.len(); let mut ns = nfa.shift(len); if let Some(n) = states.last_mut() { n.trans.retain(|t| t.consumes.is_some()); n.trans.insert(Transition::epsilon(len)); } states.append(&mut ns); } let len = states.len(); states[len - 1].accept = Acceptance::Accept; Self { states, has_submatches } } } impl ENFA { pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { let threads = Thread::new_simple(0).step_epsilon(self); MultiState::new(self, threads) } pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { MultiState::new(self, vec![]) } pub fn all_multi_states<'a>(&'a self) -> HashSet> { let mut states = set![self.start_multi_state()]; let mut q = vec![self.start_multi_state()]; while let Some(state) = q.pop() { let chars = state.possible_transitions(); for chr in chars { let new = state.transition(chr); if !states.contains(&new) { states.insert(new.clone()); q.push(new); } } } states } } impl std::fmt::Debug for ENFA { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(f, "NFA {{")?; for (i, s) in self.states.iter().enumerate() { write!(f, " {i}: ")?; if let Some(a) = s.assert.as_ref() { match a.polarity { LookPolarity::Positive => write!(f, "+{} ", a.to)?, LookPolarity::Negative => write!(f, "-{} ", a.to)?, } } for t in s.trans.iter() { let k = t.to; if let Some(c) = t.consumes { write!(f, "{c:?}=>{k} ")?; } else { write!(f, "~>{k} ")?; } } match s.accept { Acceptance::Accept => write!(f, "accept")?, Acceptance::Assertion => write!(f, "assert")?, Acceptance::NotYet => {} } writeln!(f)?; } write!(f, "}}") } } pub type StateId = usize; #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub struct Assertion { to: StateId, polarity: LookPolarity, } #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub struct Transition { to: StateId, consumes: Option, } impl Transition { fn new(consumes: ByteRange, to: StateId) -> Self { Self { to, consumes: Some(consumes), } } fn epsilon(to: StateId) -> Self { Self { to, consumes: None } } fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { self.to = f(self.to); } fn reachable_states(&self) -> impl Iterator { [self.to].into_iter() } } #[derive(Debug, Copy, Clone, PartialEq)] pub enum Acceptance { Accept, Assertion, NotYet, } #[derive(Debug, Clone)] pub struct EState { pub trans: HashSet, pub assert: Option, pub accept: Acceptance, } impl EState { fn set_epsilon_transitions(&mut self, trans: impl IntoIterator) { self.trans.retain(|t| t.consumes.is_some()); for transition in trans.into_iter() { assert!(transition.consumes.is_none()); self.trans.insert(transition); } } fn start() -> Self { Self { trans: HashSet::new(), assert: None, accept: Acceptance::NotYet, } } fn terminal() -> Self { Self { trans: HashSet::new(), assert: None, accept: Acceptance::Accept, } } fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { self.trans = self .trans .iter() .cloned() .map(|mut t| { t.remap(&mut f); t }) .collect(); if let Some(a) = self.assert.as_mut() { a.to = f(a.to); } } fn reachable_states(&self) -> impl Iterator { self.trans .iter() .flat_map(|t| t.reachable_states()) .chain(self.assert.iter().map(|a| a.to)) } } #[derive(Clone, Debug)] pub enum EnfaTranslationError { CharacterClassNotSupported, AssertionsNotSupported, } impl TryFrom for ENFA { type Error = EnfaTranslationError; fn try_from(value: Pattern) -> Result { Ok(match value { Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?, Pattern::Range(c1, c2) => Self { states: vec![ EState { trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], assert: None, accept: Acceptance::NotYet, }, EState::terminal(), ], has_submatches: false, }, Pattern::CharacterClass(_) => { return Err(EnfaTranslationError::CharacterClassNotSupported); } Pattern::Alt(alts) => { let nfas: Vec = alts .into_iter() .map(Self::try_from) .collect::>()?; let mut states = vec![EState::start()]; let mut ends = vec![]; let mut has_submatches = false; for nfa in nfas.into_iter() { has_submatches = has_submatches || nfa.has_submatches; let len = states.len(); states[0].trans.insert(Transition::epsilon(len)); states.append(&mut (nfa.shift(len))); ends.push(states.len() - 1); } states.push(EState::terminal()); for end in ends.into_iter() { let last = states.len() - 1; states[end].trans.insert(Transition::epsilon(last)); } Self { states, has_submatches } } Pattern::Concat(seq) => { let nfas: Vec = seq .into_iter() .map(Self::try_from) .collect::>()?; Self::concat(nfas) } Pattern::Rep(regex, min, None, _) => { let nfa = ENFA::try_from(*regex)?; let base = nfa.clone().repeat(min as usize); let tail = nfa.looping(); Self::concat(vec![base, tail]) } Pattern::Rep(regex, min, Some(max), _) => { assert!(min < max); let nfa = Self::try_from(*regex)?; let base = nfa.clone().repeat(min as usize); let tail = nfa.optx((max - min) as usize); Self::concat(vec![base, tail]) } Pattern::Nothing => Self { states: vec![EState::terminal()], has_submatches: false, }, Pattern::Assertion(dir, polarity, pat) => { if dir == LookDirection::Behind { return Err(EnfaTranslationError::AssertionsNotSupported); } let mut regex = Self::try_from(*pat)?; for s in regex.states.iter_mut() { if s.accept == Acceptance::Accept { s.accept = Acceptance::Assertion; } } let mut regex = regex.shift(1); let mut states = Vec::with_capacity(regex.len() + 2); states.push(EState { trans: set![Transition::epsilon(regex.len() + 1)], assert: Some(Assertion { to: 1, polarity }), accept: Acceptance::NotYet, }); states.append(&mut regex); states.push(EState::terminal()); Self { states, has_submatches: false, } } Pattern::Submatch(pat) => { let mut this = Self::try_from(*pat)?; this.has_submatches = true; this } }) } }