diff options
Diffstat (limited to 'src/parse/regex/enfa.rs')
| -rw-r--r-- | src/parse/regex/enfa.rs | 733 |
1 files changed, 0 insertions, 733 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs deleted file mode 100644 index dd4cf6e..0000000 --- a/src/parse/regex/enfa.rs +++ /dev/null @@ -1,733 +0,0 @@ -use std::{ - collections::HashSet, - hash::{DefaultHasher, Hash, Hasher}, -}; - -use crate::parse::regex::{LookDirection, LookPolarity}; - -use super::Pattern; -use super::byte_range::ByteRange; - -/// NFA with epsilon transitions -#[derive(Clone)] -#[allow(clippy::upper_case_acronyms)] -pub struct ENFA { - pub states: Vec<EState>, -} - -fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> { - let mut result = vec![Vec::new()]; - - for xs in x { - let mut next = Vec::new(); - - for prefix in &result { - for item in &xs { - let mut v = prefix.clone(); - v.push(item.clone()); - next.push(v); - } - } - - result = next; - } - - result -} - -#[cfg(test)] -mod product_tests { - use super::cartesian_product; - - #[test] - fn basic_case() { - let x = vec![vec![1, 2], vec![10, 20]]; - - let out = cartesian_product(x); - - assert_eq!( - out, - vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],] - ); - } - - #[test] - fn single_dimension() { - let x = vec![vec![1, 2, 3]]; - - let out = cartesian_product(x); - - assert_eq!(out, vec![vec![1], vec![2], vec![3],]); - } - - #[test] - fn empty_outer_vector() { - let x: Vec<Vec<i32>> = vec![]; - - let out = cartesian_product(x); - - // One empty combination. - assert_eq!(out, vec![vec![]]); - } - - #[test] - fn empty_inner_vector() { - let x = vec![vec![1, 2], vec![], vec![3, 4]]; - - let out = cartesian_product(x); - - assert!(out.is_empty()); - } - - #[test] - fn output_size_matches_product() { - let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; - - let out = cartesian_product(x); - - assert_eq!(out.len(), 3 * 2 * 4); - } - - #[test] - fn every_output_has_correct_length() { - let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']]; - - let out = cartesian_product(x); - - assert!(out.iter().all(|v| v.len() == 3)); - } - - #[test] - fn works_with_strings() { - let x = vec![ - vec!["a".to_string(), "b".to_string()], - vec!["x".to_string()], - ]; - - let out = cartesian_product(x); - - assert_eq!( - out, - vec![ - vec!["a".to_string(), "x".to_string()], - vec!["b".to_string(), "x".to_string()], - ] - ); - } -} - -#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] -struct Thread { - state: StateId, - positives: Vec<Thread>, - negatives: Vec<Thread>, -} - -impl Thread { - fn new_simple(state: StateId) -> Self { - Self { - state, - positives: Vec::new(), - negatives: Vec::new(), - } - } - - fn new( - enfa: &ENFA, - state: StateId, - mut positives: Vec<Thread>, - mut negatives: Vec<Thread>, - ) -> Option<Self> { - positives.sort(); - positives.dedup(); - positives.retain(|t| !t.accept(enfa)); - positives.shrink_to_fit(); - - negatives.sort(); - negatives.dedup(); - negatives.shrink_to_fit(); - - if negatives.iter().any(|t| t.accept(enfa)) { - return None; - } - - Some(Self { - state, - positives, - negatives, - }) - } - - fn accept(&self, enfa: &ENFA) -> bool { - let pos = self.positives.iter().all(|t| t.accept(enfa)); - let neg = self.negatives.iter().all(|t| !t.accept(enfa)); - let this = match enfa.states[self.state].accept { - Acceptance::Accept => true, - Acceptance::Assertion => true, - Acceptance::NotYet => false, - }; - pos && neg && this - } - - fn step_epsilon0( - self, - enfa: &ENFA, - ret: &mut impl FnMut(Thread), - visited: &mut [bool], - new_assertions: Vec<Assertion>, - ) { - if visited[self.state] { - return; - } - visited[self.state] = true; - - for t in enfa.states[self.state].trans.iter() { - if t.consumes.is_some() { - continue; - } - - let mut new_assertions = new_assertions.clone(); - if let Some(assertion) = enfa.states[self.state].assert.as_ref() { - new_assertions.push(assertion.clone()); - } - - Self { - state: t.to, - ..self.clone() - } - .step_epsilon0(enfa, ret, visited, new_assertions); - } - - let Self { - state, - positives, - negatives, - } = self; - let mut p = Vec::new(); - let mut n = Vec::new(); - for assertion in new_assertions { - let threads = Self::new_simple(assertion.to).step_epsilon(enfa); - let vec = match assertion.polarity { - LookPolarity::Positive => &mut p, - LookPolarity::Negative => &mut n, - }; - vec.push(threads); - } - let p = cartesian_product(p); - for mut p in p { - let mut positives = positives.clone(); - let mut negatives = negatives.clone(); - positives.append(&mut p); - negatives.append(&mut n.iter().flatten().cloned().collect()); - if let Some(thread) = Self::new(enfa, state, positives, negatives) { - ret(thread); - } - } - } - - fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> { - let mut vec = Vec::new(); - self.step_epsilon0( - enfa, - &mut |t| vec.push(t), - &mut vec![false; enfa.states.len()], - Vec::new(), - ); - vec - } - - fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) { - let positives = self - .positives - .clone() - .into_iter() - .map(|t| t.step(enfa, input)) - .collect(); - let negatives: Vec<_> = self - .negatives - .into_iter() - .flat_map(|t| t.step(enfa, input)) - .collect(); - let positives = cartesian_product(positives); - let next_states: Vec<StateId> = enfa.states[self.state] - .trans - .iter() - .filter_map(|t| { - if let Some(ch) = t.consumes - && ch.overlaps(input) - { - Some(t.to) - } else { - None - } - }) - .collect(); - - for s in next_states { - for p in positives.clone() { - if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) { - thread.step_epsilon0( - enfa, - ret, - &mut vec![false; enfa.states.len()], - Vec::new(), - ); - } - } - } - } - - fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> { - let mut vec = Vec::new(); - self.step0(enfa, input, &mut |x| vec.push(x)); - vec - } - - fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) { - for t in enfa.states[self.state].trans.iter() { - if let Some(ch) = t.consumes { - f(ch); - } - } - for t in self.positives.iter() { - t.possible_transitions(enfa, f); - } - for t in self.negatives.iter() { - t.possible_transitions(enfa, f); - } - } -} - -#[derive(Clone)] -pub struct MultiState<'a> { - nfa: &'a ENFA, - threads: Vec<Thread>, - accept: bool, - hash: u64, -} - -impl<'a> PartialEq for MultiState<'a> { - fn eq(&self, other: &Self) -> bool { - (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) - && self.threads == other.threads - && self.accept == other.accept - && self.hash == other.hash - } -} -impl<'a> Eq for MultiState<'a> {} - -impl<'a> MultiState<'a> { - fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self { - threads.sort(); - threads.dedup(); - threads.shrink_to_fit(); - - let accept = threads.iter().any(|t| t.accept(nfa)); - let mut hasher = DefaultHasher::new(); - threads.hash(&mut hasher); - let hash = hasher.finish(); - - Self { - nfa, - threads, - accept, - hash, - } - } - - /// all the chars that will make an interesting transition - pub fn possible_transitions(&self) -> Vec<ByteRange> { - let mut vec = Vec::new(); - for t in self.threads.iter() { - t.possible_transitions(self.nfa, &mut |x| vec.push(x)); - } - vec = ByteRange::split_to_disjoint(vec); - vec.sort(); - vec.dedup(); - vec.shrink_to_fit(); - vec - } - - pub fn transition(&self, ch: ByteRange) -> Self { - let new_states = self - .threads - .iter() - .flat_map(|t| t.clone().step(self.nfa, ch)) - .collect(); - - Self::new(self.nfa, new_states) - } - - pub fn accept(&self) -> bool { - self.accept - } -} - -impl<'a> Hash for MultiState<'a> { - fn hash<H: Hasher>(&self, state: &mut H) { - self.hash.hash(state) - } -} - -macro_rules! set { - () => { - std::collections::HashSet::new() - }; - ( $( $x:expr ),* ) => {{ - let mut set = std::collections::HashSet::new(); - $( - set.insert($x); - )* - set - }}; -} - -impl ENFA { - fn shift(self, amt: usize) -> Vec<EState> { - let mut s = self.states; - - for state in s.iter_mut() { - state.remap(|i| i + amt); - if state.accept == Acceptance::Accept { - state.accept = Acceptance::NotYet; - } - } - - s - } - - pub fn remove_unreachable(&mut self) { - let mut used = vec![false; self.states.len()]; - used[0] = true; - for s in self.states.iter() { - for i in s.reachable_states() { - used[i] = true; - } - } - let mut remap = vec![0; self.states.len()]; - let mut shift = 0; - for i in 0..self.states.len() { - if used[i] { - remap[i] = i - shift; - } else { - shift += 1; - } - } - for i in (0..self.states.len()).rev() { - if !used[i] { - self.states.remove(i); - } - } - for s in self.states.iter_mut() { - s.remap(|i| remap[i]); - } - } -} - -impl ENFA { - fn looping(self) -> Self { - let mut states = vec![EState::start()]; - states.append(&mut self.shift(1)); - let len = states.len(); - states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]); - states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]); - states.push(EState::terminal()); - Self { states } - } - - fn repeat(self, times: usize) -> Self { - let reps = vec![self; times]; - Self::concat(reps) - } - - /// between 0 and x repetitions - fn optx(self, x: usize) -> Self { - let len = self.states.len(); - let mut repped = self.repeat(x); - assert_eq!(repped.states.len(), x * len); - for i in 1..=x { - repped.states[0] - .trans - .insert(Transition::epsilon(i * len - 1)); - } - repped - } - - fn concat(nfas: Vec<Self>) -> Self { - if nfas.is_empty() { - return Self { - states: vec![EState::terminal()], - }; - } - - let mut states: Vec<EState> = Vec::new(); - for nfa in nfas.into_iter() { - let len = states.len(); - let mut ns = nfa.shift(len); - if let Some(n) = states.last_mut() { - n.trans.retain(|t| t.consumes.is_some()); - n.trans.insert(Transition::epsilon(len)); - } - states.append(&mut ns); - } - - let len = states.len(); - states[len - 1].accept = Acceptance::Accept; - - Self { states } - } -} - -impl ENFA { - pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { - let threads = Thread::new_simple(0).step_epsilon(self); - MultiState::new(self, threads) - } - - pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { - MultiState::new(self, vec![]) - } - - pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> { - let mut states = set![self.start_multi_state()]; - let mut q = vec![self.start_multi_state()]; - - while let Some(state) = q.pop() { - let chars = state.possible_transitions(); - - for chr in chars { - let new = state.transition(chr); - - if !states.contains(&new) { - states.insert(new.clone()); - q.push(new); - } - } - } - - states - } -} - -impl std::fmt::Debug for ENFA { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "NFA {{")?; - for (i, s) in self.states.iter().enumerate() { - write!(f, " {i}: ")?; - if let Some(a) = s.assert.as_ref() { - match a.polarity { - LookPolarity::Positive => write!(f, "+{} ", a.to)?, - LookPolarity::Negative => write!(f, "-{} ", a.to)?, - } - } - for t in s.trans.iter() { - let k = t.to; - if let Some(c) = t.consumes { - write!(f, "{c:?}=>{k} ")?; - } else { - write!(f, "~>{k} ")?; - } - } - match s.accept { - Acceptance::Accept => write!(f, "accept")?, - Acceptance::Assertion => write!(f, "assert")?, - Acceptance::NotYet => {} - } - writeln!(f)?; - } - write!(f, "}}") - } -} - -pub type StateId = usize; - -#[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub struct Assertion { - to: StateId, - polarity: LookPolarity, -} - -#[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub struct Transition { - to: StateId, - consumes: Option<ByteRange>, -} - -impl Transition { - fn new(consumes: ByteRange, to: StateId) -> Self { - Self { - to, - consumes: Some(consumes), - } - } - - fn epsilon(to: StateId) -> Self { - Self { to, consumes: None } - } - - fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { - self.to = f(self.to); - } - - fn reachable_states(&self) -> impl Iterator<Item = StateId> { - [self.to].into_iter() - } -} - -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum Acceptance { - Accept, - Assertion, - NotYet, -} - -#[derive(Debug, Clone)] -pub struct EState { - pub trans: HashSet<Transition>, - pub assert: Option<Assertion>, - pub accept: Acceptance, -} - -impl EState { - fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) { - self.trans.retain(|t| t.consumes.is_some()); - for transition in trans.into_iter() { - assert!(transition.consumes.is_none()); - self.trans.insert(transition); - } - } - - fn start() -> Self { - Self { - trans: HashSet::new(), - assert: None, - accept: Acceptance::NotYet, - } - } - fn terminal() -> Self { - Self { - trans: HashSet::new(), - assert: None, - accept: Acceptance::Accept, - } - } - - fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { - self.trans = self - .trans - .iter() - .cloned() - .map(|mut t| { - t.remap(&mut f); - t - }) - .collect(); - if let Some(a) = self.assert.as_mut() { - a.to = f(a.to); - } - } - - fn reachable_states(&self) -> impl Iterator<Item = StateId> { - self.trans - .iter() - .flat_map(|t| t.reachable_states()) - .chain(self.assert.iter().map(|a| a.to)) - } -} - -#[derive(Debug)] -pub enum EnfaTranslationError { - CharacterClassNotSupported, - AssertionsNotSupported, -} - -impl TryFrom<Pattern> for ENFA { - type Error = EnfaTranslationError; - - fn try_from(value: Pattern) -> Result<Self, Self::Error> { - Ok(match value { - Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?, - Pattern::Range(c1, c2) => Self { - states: vec![ - EState { - trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], - assert: None, - accept: Acceptance::NotYet, - }, - EState::terminal(), - ], - }, - Pattern::CharacterClass(_) => { - return Err(EnfaTranslationError::CharacterClassNotSupported); - } - Pattern::Alt(alts) => { - let nfas: Vec<ENFA> = alts - .into_iter() - .map(Self::try_from) - .collect::<Result<_, _>>()?; - let mut states = vec![EState::start()]; - let mut ends = vec![]; - for nfa in nfas.into_iter() { - let len = states.len(); - states[0].trans.insert(Transition::epsilon(len)); - states.append(&mut (nfa.shift(len))); - ends.push(states.len() - 1); - } - states.push(EState::terminal()); - for end in ends.into_iter() { - let last = states.len() - 1; - states[end].trans.insert(Transition::epsilon(last)); - } - Self { states } - } - Pattern::Concat(seq) => { - let nfas: Vec<Self> = seq - .into_iter() - .map(Self::try_from) - .collect::<Result<_, _>>()?; - Self::concat(nfas) - } - Pattern::Rep(regex, min, None, _) => { - let nfa = ENFA::try_from(*regex)?; - let base = nfa.clone().repeat(min as usize); - let tail = nfa.looping(); - Self::concat(vec![base, tail]) - } - Pattern::Rep(regex, min, Some(max), _) => { - assert!(min < max); - let nfa = Self::try_from(*regex)?; - let base = nfa.clone().repeat(min as usize); - let tail = nfa.optx((max - min) as usize); - Self::concat(vec![base, tail]) - } - Pattern::Nothing => Self { - states: vec![EState::terminal()], - }, - Pattern::Assertion(dir, polarity, pat) => { - if dir == LookDirection::Behind { - return Err(EnfaTranslationError::AssertionsNotSupported); - } - let mut regex = Self::try_from(*pat)?; - for s in regex.states.iter_mut() { - if s.accept == Acceptance::Accept { - s.accept = Acceptance::Assertion; - } - } - let mut regex = regex.shift(1); - let mut states = Vec::with_capacity(regex.len() + 2); - states.push(EState { - trans: set![Transition::epsilon(regex.len() + 1)], - assert: Some(Assertion { to: 1, polarity }), - accept: Acceptance::NotYet, - }); - states.append(&mut regex); - states.push(EState::terminal()); - Self { states } - } - Pattern::Submatch(pat) => { - // NFA does not track submatches, so we are skipping it. - Self::try_from(*pat)? - } - }) - } -} |
