From 53980774c327675e886179c0a2c140744dcf9b95 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Sat, 6 Jun 2026 12:15:52 +0200 Subject: special cased regex for performance --- src/regex/enfa.rs | 741 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 741 insertions(+) create mode 100644 src/regex/enfa.rs (limited to 'src/regex/enfa.rs') diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs new file mode 100644 index 0000000..8392642 --- /dev/null +++ b/src/regex/enfa.rs @@ -0,0 +1,741 @@ +use std::{ + collections::HashSet, + hash::{DefaultHasher, Hash, Hasher}, +}; + +use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange}; + +/// NFA with epsilon transitions +#[derive(Clone)] +#[allow(clippy::upper_case_acronyms)] +pub struct ENFA { + pub states: Vec, + pub has_submatches: bool, +} + +fn cartesian_product(x: Vec>) -> Vec> { + let mut result = vec![Vec::new()]; + + for xs in x { + let mut next = Vec::new(); + + for prefix in &result { + for item in &xs { + let mut v = prefix.clone(); + v.push(item.clone()); + next.push(v); + } + } + + result = next; + } + + result +} + +#[cfg(test)] +mod product_tests { + use super::cartesian_product; + + #[test] + fn basic_case() { + let x = vec![vec![1, 2], vec![10, 20]]; + + let out = cartesian_product(x); + + assert_eq!( + out, + vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],] + ); + } + + #[test] + fn single_dimension() { + let x = vec![vec![1, 2, 3]]; + + let out = cartesian_product(x); + + assert_eq!(out, vec![vec![1], vec![2], vec![3],]); + } + + #[test] + fn empty_outer_vector() { + let x: Vec> = vec![]; + + let out: Vec> = cartesian_product(x); + let r: Vec> = vec![vec![]]; + + // One empty combination. + assert_eq!(out, r); + } + + #[test] + fn empty_inner_vector() { + let x = vec![vec![1, 2], vec![], vec![3, 4]]; + + let out = cartesian_product(x); + + assert!(out.is_empty()); + } + + #[test] + fn output_size_matches_product() { + let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; + + let out = cartesian_product(x); + + assert_eq!(out.len(), 3 * 2 * 4); + } + + #[test] + fn every_output_has_correct_length() { + let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']]; + + let out = cartesian_product(x); + + assert!(out.iter().all(|v| v.len() == 3)); + } + + #[test] + fn works_with_strings() { + let x = vec![ + vec!["a".to_string(), "b".to_string()], + vec!["x".to_string()], + ]; + + let out = cartesian_product(x); + + assert_eq!( + out, + vec![ + vec!["a".to_string(), "x".to_string()], + vec!["b".to_string(), "x".to_string()], + ] + ); + } +} + +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +struct Thread { + state: StateId, + positives: Vec, + negatives: Vec, +} + +impl Thread { + fn new_simple(state: StateId) -> Self { + Self { + state, + positives: Vec::new(), + negatives: Vec::new(), + } + } + + fn new( + enfa: &ENFA, + state: StateId, + mut positives: Vec, + mut negatives: Vec, + ) -> Option { + positives.sort(); + positives.dedup(); + positives.retain(|t| !t.accept(enfa)); + positives.shrink_to_fit(); + + negatives.sort(); + negatives.dedup(); + negatives.shrink_to_fit(); + + if negatives.iter().any(|t| t.accept(enfa)) { + return None; + } + + Some(Self { + state, + positives, + negatives, + }) + } + + fn accept(&self, enfa: &ENFA) -> bool { + let pos = self.positives.iter().all(|t| t.accept(enfa)); + let neg = self.negatives.iter().all(|t| !t.accept(enfa)); + let this = match enfa.states[self.state].accept { + Acceptance::Accept => true, + Acceptance::Assertion => true, + Acceptance::NotYet => false, + }; + pos && neg && this + } + + fn step_epsilon0( + self, + enfa: &ENFA, + ret: &mut impl FnMut(Thread), + visited: &mut [bool], + new_assertions: Vec, + ) { + if visited[self.state] { + return; + } + visited[self.state] = true; + + for t in enfa.states[self.state].trans.iter() { + if t.consumes.is_some() { + continue; + } + + let mut new_assertions = new_assertions.clone(); + if let Some(assertion) = enfa.states[self.state].assert.as_ref() { + new_assertions.push(assertion.clone()); + } + + Self { + state: t.to, + ..self.clone() + } + .step_epsilon0(enfa, ret, visited, new_assertions); + } + + let Self { + state, + positives, + negatives, + } = self; + let mut p = Vec::new(); + let mut n = Vec::new(); + for assertion in new_assertions { + let threads = Self::new_simple(assertion.to).step_epsilon(enfa); + let vec = match assertion.polarity { + LookPolarity::Positive => &mut p, + LookPolarity::Negative => &mut n, + }; + vec.push(threads); + } + let p = cartesian_product(p); + for mut p in p { + let mut positives = positives.clone(); + let mut negatives = negatives.clone(); + positives.append(&mut p); + negatives.append(&mut n.iter().flatten().cloned().collect()); + if let Some(thread) = Self::new(enfa, state, positives, negatives) { + ret(thread); + } + } + } + + fn step_epsilon(self, enfa: &ENFA) -> Vec { + let mut vec = Vec::new(); + self.step_epsilon0( + enfa, + &mut |t| vec.push(t), + &mut vec![false; enfa.states.len()], + Vec::new(), + ); + vec + } + + fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) { + let positives = self + .positives + .clone() + .into_iter() + .map(|t| t.step(enfa, input)) + .collect(); + let negatives: Vec<_> = self + .negatives + .into_iter() + .flat_map(|t| t.step(enfa, input)) + .collect(); + let positives = cartesian_product(positives); + let next_states: Vec = enfa.states[self.state] + .trans + .iter() + .filter_map(|t| { + if let Some(ch) = t.consumes + && ch.overlaps(input) + { + Some(t.to) + } else { + None + } + }) + .collect(); + + for s in next_states { + for p in positives.clone() { + if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) { + thread.step_epsilon0( + enfa, + ret, + &mut vec![false; enfa.states.len()], + Vec::new(), + ); + } + } + } + } + + fn step(self, enfa: &ENFA, input: ByteRange) -> Vec { + let mut vec = Vec::new(); + self.step0(enfa, input, &mut |x| vec.push(x)); + vec + } + + fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) { + for t in enfa.states[self.state].trans.iter() { + if let Some(ch) = t.consumes { + f(ch); + } + } + for t in self.positives.iter() { + t.possible_transitions(enfa, f); + } + for t in self.negatives.iter() { + t.possible_transitions(enfa, f); + } + } +} + +#[derive(Clone)] +pub struct MultiState<'a> { + nfa: &'a ENFA, + threads: Vec, + accept: bool, + hash: u64, +} + +impl<'a> PartialEq for MultiState<'a> { + fn eq(&self, other: &Self) -> bool { + (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) + && self.threads == other.threads + && self.accept == other.accept + && self.hash == other.hash + } +} +impl<'a> Eq for MultiState<'a> {} + +impl<'a> MultiState<'a> { + fn new(nfa: &'a ENFA, mut threads: Vec) -> Self { + threads.sort(); + threads.dedup(); + threads.shrink_to_fit(); + + let accept = threads.iter().any(|t| t.accept(nfa)); + let mut hasher = DefaultHasher::new(); + threads.hash(&mut hasher); + let hash = hasher.finish(); + + Self { + nfa, + threads, + accept, + hash, + } + } + + /// all the chars that will make an interesting transition + pub fn possible_transitions(&self) -> Vec { + let mut vec = Vec::new(); + for t in self.threads.iter() { + t.possible_transitions(self.nfa, &mut |x| vec.push(x)); + } + vec = ByteRange::split_to_disjoint(vec); + vec.sort(); + vec.dedup(); + vec.shrink_to_fit(); + vec + } + + pub fn transition(&self, ch: ByteRange) -> Self { + let new_states = self + .threads + .iter() + .flat_map(|t| t.clone().step(self.nfa, ch)) + .collect(); + + Self::new(self.nfa, new_states) + } + + pub fn accept(&self) -> bool { + self.accept + } +} + +impl<'a> Hash for MultiState<'a> { + fn hash(&self, state: &mut H) { + self.hash.hash(state) + } +} + +macro_rules! set { + () => { + std::collections::HashSet::new() + }; + ( $( $x:expr ),* ) => {{ + let mut set = std::collections::HashSet::new(); + $( + set.insert($x); + )* + set + }}; +} + +impl ENFA { + fn shift(self, amt: usize) -> Vec { + let mut s = self.states; + + for state in s.iter_mut() { + state.remap(|i| i + amt); + if state.accept == Acceptance::Accept { + state.accept = Acceptance::NotYet; + } + } + + s + } + + pub fn remove_unreachable(&mut self) { + let mut used = vec![false; self.states.len()]; + used[0] = true; + for s in self.states.iter() { + for i in s.reachable_states() { + used[i] = true; + } + } + let mut remap = vec![0; self.states.len()]; + let mut shift = 0; + for i in 0..self.states.len() { + if used[i] { + remap[i] = i - shift; + } else { + shift += 1; + } + } + for i in (0..self.states.len()).rev() { + if !used[i] { + self.states.remove(i); + } + } + for s in self.states.iter_mut() { + s.remap(|i| remap[i]); + } + } +} + +impl ENFA { + fn looping(self) -> Self { + let has_submatches = self.has_submatches; + let mut states = vec![EState::start()]; + states.append(&mut self.shift(1)); + let len = states.len(); + states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]); + states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]); + states.push(EState::terminal()); + Self { states, has_submatches } + } + + fn repeat(self, times: usize) -> Self { + let reps = vec![self; times]; + Self::concat(reps) + } + + /// between 0 and x repetitions + fn optx(self, x: usize) -> Self { + let len = self.states.len(); + let mut repped = self.repeat(x); + assert_eq!(repped.states.len(), x * len); + for i in 1..=x { + repped.states[0] + .trans + .insert(Transition::epsilon(i * len - 1)); + } + repped + } + + fn concat(nfas: Vec) -> Self { + if nfas.is_empty() { + return Self { + states: vec![EState::terminal()], + has_submatches: false, + }; + } + + let mut has_submatches = false; + let mut states: Vec = Vec::new(); + for nfa in nfas.into_iter() { + has_submatches = has_submatches || nfa.has_submatches; + let len = states.len(); + let mut ns = nfa.shift(len); + if let Some(n) = states.last_mut() { + n.trans.retain(|t| t.consumes.is_some()); + n.trans.insert(Transition::epsilon(len)); + } + states.append(&mut ns); + } + + let len = states.len(); + states[len - 1].accept = Acceptance::Accept; + + Self { states, has_submatches } + } +} + +impl ENFA { + pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { + let threads = Thread::new_simple(0).step_epsilon(self); + MultiState::new(self, threads) + } + + pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { + MultiState::new(self, vec![]) + } + + pub fn all_multi_states<'a>(&'a self) -> HashSet> { + let mut states = set![self.start_multi_state()]; + let mut q = vec![self.start_multi_state()]; + + while let Some(state) = q.pop() { + let chars = state.possible_transitions(); + + for chr in chars { + let new = state.transition(chr); + + if !states.contains(&new) { + states.insert(new.clone()); + q.push(new); + } + } + } + + states + } +} + +impl std::fmt::Debug for ENFA { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "NFA {{")?; + for (i, s) in self.states.iter().enumerate() { + write!(f, " {i}: ")?; + if let Some(a) = s.assert.as_ref() { + match a.polarity { + LookPolarity::Positive => write!(f, "+{} ", a.to)?, + LookPolarity::Negative => write!(f, "-{} ", a.to)?, + } + } + for t in s.trans.iter() { + let k = t.to; + if let Some(c) = t.consumes { + write!(f, "{c:?}=>{k} ")?; + } else { + write!(f, "~>{k} ")?; + } + } + match s.accept { + Acceptance::Accept => write!(f, "accept")?, + Acceptance::Assertion => write!(f, "assert")?, + Acceptance::NotYet => {} + } + writeln!(f)?; + } + write!(f, "}}") + } +} + +pub type StateId = usize; + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct Assertion { + to: StateId, + polarity: LookPolarity, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct Transition { + to: StateId, + consumes: Option, +} + +impl Transition { + fn new(consumes: ByteRange, to: StateId) -> Self { + Self { + to, + consumes: Some(consumes), + } + } + + fn epsilon(to: StateId) -> Self { + Self { to, consumes: None } + } + + fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { + self.to = f(self.to); + } + + fn reachable_states(&self) -> impl Iterator { + [self.to].into_iter() + } +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum Acceptance { + Accept, + Assertion, + NotYet, +} + +#[derive(Debug, Clone)] +pub struct EState { + pub trans: HashSet, + pub assert: Option, + pub accept: Acceptance, +} + +impl EState { + fn set_epsilon_transitions(&mut self, trans: impl IntoIterator) { + self.trans.retain(|t| t.consumes.is_some()); + for transition in trans.into_iter() { + assert!(transition.consumes.is_none()); + self.trans.insert(transition); + } + } + + fn start() -> Self { + Self { + trans: HashSet::new(), + assert: None, + accept: Acceptance::NotYet, + } + } + fn terminal() -> Self { + Self { + trans: HashSet::new(), + assert: None, + accept: Acceptance::Accept, + } + } + + fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { + self.trans = self + .trans + .iter() + .cloned() + .map(|mut t| { + t.remap(&mut f); + t + }) + .collect(); + if let Some(a) = self.assert.as_mut() { + a.to = f(a.to); + } + } + + fn reachable_states(&self) -> impl Iterator { + self.trans + .iter() + .flat_map(|t| t.reachable_states()) + .chain(self.assert.iter().map(|a| a.to)) + } +} + +#[derive(Clone, Debug)] +pub enum EnfaTranslationError { + CharacterClassNotSupported, + AssertionsNotSupported, +} + +impl TryFrom for ENFA { + type Error = EnfaTranslationError; + + fn try_from(value: Pattern) -> Result { + Ok(match value { + Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?, + Pattern::Range(c1, c2) => Self { + states: vec![ + EState { + trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], + assert: None, + accept: Acceptance::NotYet, + }, + EState::terminal(), + ], + has_submatches: false, + }, + Pattern::CharacterClass(_) => { + return Err(EnfaTranslationError::CharacterClassNotSupported); + } + Pattern::Alt(alts) => { + let nfas: Vec = alts + .into_iter() + .map(Self::try_from) + .collect::>()?; + let mut states = vec![EState::start()]; + let mut ends = vec![]; + let mut has_submatches = false; + for nfa in nfas.into_iter() { + has_submatches = has_submatches || nfa.has_submatches; + let len = states.len(); + states[0].trans.insert(Transition::epsilon(len)); + states.append(&mut (nfa.shift(len))); + ends.push(states.len() - 1); + } + states.push(EState::terminal()); + for end in ends.into_iter() { + let last = states.len() - 1; + states[end].trans.insert(Transition::epsilon(last)); + } + Self { states, has_submatches } + } + Pattern::Concat(seq) => { + let nfas: Vec = seq + .into_iter() + .map(Self::try_from) + .collect::>()?; + Self::concat(nfas) + } + Pattern::Rep(regex, min, None, _) => { + let nfa = ENFA::try_from(*regex)?; + let base = nfa.clone().repeat(min as usize); + let tail = nfa.looping(); + Self::concat(vec![base, tail]) + } + Pattern::Rep(regex, min, Some(max), _) => { + assert!(min < max); + let nfa = Self::try_from(*regex)?; + let base = nfa.clone().repeat(min as usize); + let tail = nfa.optx((max - min) as usize); + Self::concat(vec![base, tail]) + } + Pattern::Nothing => Self { + states: vec![EState::terminal()], + has_submatches: false, + }, + Pattern::Assertion(dir, polarity, pat) => { + if dir == LookDirection::Behind { + return Err(EnfaTranslationError::AssertionsNotSupported); + } + let mut regex = Self::try_from(*pat)?; + for s in regex.states.iter_mut() { + if s.accept == Acceptance::Accept { + s.accept = Acceptance::Assertion; + } + } + let mut regex = regex.shift(1); + let mut states = Vec::with_capacity(regex.len() + 2); + states.push(EState { + trans: set![Transition::epsilon(regex.len() + 1)], + assert: Some(Assertion { to: 1, polarity }), + accept: Acceptance::NotYet, + }); + states.append(&mut regex); + states.push(EState::terminal()); + Self { states, has_submatches: false, } + } + Pattern::Submatch(pat) => { + let mut this = Self::try_from(*pat)?; + this.has_submatches = true; + this + } + }) + } +} -- cgit v1.2.3