aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/enfa.rs
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-06 12:15:52 +0200
committerJonas Maier <jonas@x77.dev>2026-06-06 12:15:52 +0200
commit53980774c327675e886179c0a2c140744dcf9b95 (patch)
treeca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/parse/regex/enfa.rs
parent75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff)
downloadpish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz
special cased regex for performance
Diffstat (limited to 'src/parse/regex/enfa.rs')
-rw-r--r--src/parse/regex/enfa.rs733
1 files changed, 0 insertions, 733 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs
deleted file mode 100644
index dd4cf6e..0000000
--- a/src/parse/regex/enfa.rs
+++ /dev/null
@@ -1,733 +0,0 @@
-use std::{
- collections::HashSet,
- hash::{DefaultHasher, Hash, Hasher},
-};
-
-use crate::parse::regex::{LookDirection, LookPolarity};
-
-use super::Pattern;
-use super::byte_range::ByteRange;
-
-/// NFA with epsilon transitions
-#[derive(Clone)]
-#[allow(clippy::upper_case_acronyms)]
-pub struct ENFA {
- pub states: Vec<EState>,
-}
-
-fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> {
- let mut result = vec![Vec::new()];
-
- for xs in x {
- let mut next = Vec::new();
-
- for prefix in &result {
- for item in &xs {
- let mut v = prefix.clone();
- v.push(item.clone());
- next.push(v);
- }
- }
-
- result = next;
- }
-
- result
-}
-
-#[cfg(test)]
-mod product_tests {
- use super::cartesian_product;
-
- #[test]
- fn basic_case() {
- let x = vec![vec![1, 2], vec![10, 20]];
-
- let out = cartesian_product(x);
-
- assert_eq!(
- out,
- vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],]
- );
- }
-
- #[test]
- fn single_dimension() {
- let x = vec![vec![1, 2, 3]];
-
- let out = cartesian_product(x);
-
- assert_eq!(out, vec![vec![1], vec![2], vec![3],]);
- }
-
- #[test]
- fn empty_outer_vector() {
- let x: Vec<Vec<i32>> = vec![];
-
- let out = cartesian_product(x);
-
- // One empty combination.
- assert_eq!(out, vec![vec![]]);
- }
-
- #[test]
- fn empty_inner_vector() {
- let x = vec![vec![1, 2], vec![], vec![3, 4]];
-
- let out = cartesian_product(x);
-
- assert!(out.is_empty());
- }
-
- #[test]
- fn output_size_matches_product() {
- let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
-
- let out = cartesian_product(x);
-
- assert_eq!(out.len(), 3 * 2 * 4);
- }
-
- #[test]
- fn every_output_has_correct_length() {
- let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']];
-
- let out = cartesian_product(x);
-
- assert!(out.iter().all(|v| v.len() == 3));
- }
-
- #[test]
- fn works_with_strings() {
- let x = vec![
- vec!["a".to_string(), "b".to_string()],
- vec!["x".to_string()],
- ];
-
- let out = cartesian_product(x);
-
- assert_eq!(
- out,
- vec![
- vec!["a".to_string(), "x".to_string()],
- vec!["b".to_string(), "x".to_string()],
- ]
- );
- }
-}
-
-#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
-struct Thread {
- state: StateId,
- positives: Vec<Thread>,
- negatives: Vec<Thread>,
-}
-
-impl Thread {
- fn new_simple(state: StateId) -> Self {
- Self {
- state,
- positives: Vec::new(),
- negatives: Vec::new(),
- }
- }
-
- fn new(
- enfa: &ENFA,
- state: StateId,
- mut positives: Vec<Thread>,
- mut negatives: Vec<Thread>,
- ) -> Option<Self> {
- positives.sort();
- positives.dedup();
- positives.retain(|t| !t.accept(enfa));
- positives.shrink_to_fit();
-
- negatives.sort();
- negatives.dedup();
- negatives.shrink_to_fit();
-
- if negatives.iter().any(|t| t.accept(enfa)) {
- return None;
- }
-
- Some(Self {
- state,
- positives,
- negatives,
- })
- }
-
- fn accept(&self, enfa: &ENFA) -> bool {
- let pos = self.positives.iter().all(|t| t.accept(enfa));
- let neg = self.negatives.iter().all(|t| !t.accept(enfa));
- let this = match enfa.states[self.state].accept {
- Acceptance::Accept => true,
- Acceptance::Assertion => true,
- Acceptance::NotYet => false,
- };
- pos && neg && this
- }
-
- fn step_epsilon0(
- self,
- enfa: &ENFA,
- ret: &mut impl FnMut(Thread),
- visited: &mut [bool],
- new_assertions: Vec<Assertion>,
- ) {
- if visited[self.state] {
- return;
- }
- visited[self.state] = true;
-
- for t in enfa.states[self.state].trans.iter() {
- if t.consumes.is_some() {
- continue;
- }
-
- let mut new_assertions = new_assertions.clone();
- if let Some(assertion) = enfa.states[self.state].assert.as_ref() {
- new_assertions.push(assertion.clone());
- }
-
- Self {
- state: t.to,
- ..self.clone()
- }
- .step_epsilon0(enfa, ret, visited, new_assertions);
- }
-
- let Self {
- state,
- positives,
- negatives,
- } = self;
- let mut p = Vec::new();
- let mut n = Vec::new();
- for assertion in new_assertions {
- let threads = Self::new_simple(assertion.to).step_epsilon(enfa);
- let vec = match assertion.polarity {
- LookPolarity::Positive => &mut p,
- LookPolarity::Negative => &mut n,
- };
- vec.push(threads);
- }
- let p = cartesian_product(p);
- for mut p in p {
- let mut positives = positives.clone();
- let mut negatives = negatives.clone();
- positives.append(&mut p);
- negatives.append(&mut n.iter().flatten().cloned().collect());
- if let Some(thread) = Self::new(enfa, state, positives, negatives) {
- ret(thread);
- }
- }
- }
-
- fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> {
- let mut vec = Vec::new();
- self.step_epsilon0(
- enfa,
- &mut |t| vec.push(t),
- &mut vec![false; enfa.states.len()],
- Vec::new(),
- );
- vec
- }
-
- fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) {
- let positives = self
- .positives
- .clone()
- .into_iter()
- .map(|t| t.step(enfa, input))
- .collect();
- let negatives: Vec<_> = self
- .negatives
- .into_iter()
- .flat_map(|t| t.step(enfa, input))
- .collect();
- let positives = cartesian_product(positives);
- let next_states: Vec<StateId> = enfa.states[self.state]
- .trans
- .iter()
- .filter_map(|t| {
- if let Some(ch) = t.consumes
- && ch.overlaps(input)
- {
- Some(t.to)
- } else {
- None
- }
- })
- .collect();
-
- for s in next_states {
- for p in positives.clone() {
- if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) {
- thread.step_epsilon0(
- enfa,
- ret,
- &mut vec![false; enfa.states.len()],
- Vec::new(),
- );
- }
- }
- }
- }
-
- fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> {
- let mut vec = Vec::new();
- self.step0(enfa, input, &mut |x| vec.push(x));
- vec
- }
-
- fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) {
- for t in enfa.states[self.state].trans.iter() {
- if let Some(ch) = t.consumes {
- f(ch);
- }
- }
- for t in self.positives.iter() {
- t.possible_transitions(enfa, f);
- }
- for t in self.negatives.iter() {
- t.possible_transitions(enfa, f);
- }
- }
-}
-
-#[derive(Clone)]
-pub struct MultiState<'a> {
- nfa: &'a ENFA,
- threads: Vec<Thread>,
- accept: bool,
- hash: u64,
-}
-
-impl<'a> PartialEq for MultiState<'a> {
- fn eq(&self, other: &Self) -> bool {
- (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64)
- && self.threads == other.threads
- && self.accept == other.accept
- && self.hash == other.hash
- }
-}
-impl<'a> Eq for MultiState<'a> {}
-
-impl<'a> MultiState<'a> {
- fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self {
- threads.sort();
- threads.dedup();
- threads.shrink_to_fit();
-
- let accept = threads.iter().any(|t| t.accept(nfa));
- let mut hasher = DefaultHasher::new();
- threads.hash(&mut hasher);
- let hash = hasher.finish();
-
- Self {
- nfa,
- threads,
- accept,
- hash,
- }
- }
-
- /// all the chars that will make an interesting transition
- pub fn possible_transitions(&self) -> Vec<ByteRange> {
- let mut vec = Vec::new();
- for t in self.threads.iter() {
- t.possible_transitions(self.nfa, &mut |x| vec.push(x));
- }
- vec = ByteRange::split_to_disjoint(vec);
- vec.sort();
- vec.dedup();
- vec.shrink_to_fit();
- vec
- }
-
- pub fn transition(&self, ch: ByteRange) -> Self {
- let new_states = self
- .threads
- .iter()
- .flat_map(|t| t.clone().step(self.nfa, ch))
- .collect();
-
- Self::new(self.nfa, new_states)
- }
-
- pub fn accept(&self) -> bool {
- self.accept
- }
-}
-
-impl<'a> Hash for MultiState<'a> {
- fn hash<H: Hasher>(&self, state: &mut H) {
- self.hash.hash(state)
- }
-}
-
-macro_rules! set {
- () => {
- std::collections::HashSet::new()
- };
- ( $( $x:expr ),* ) => {{
- let mut set = std::collections::HashSet::new();
- $(
- set.insert($x);
- )*
- set
- }};
-}
-
-impl ENFA {
- fn shift(self, amt: usize) -> Vec<EState> {
- let mut s = self.states;
-
- for state in s.iter_mut() {
- state.remap(|i| i + amt);
- if state.accept == Acceptance::Accept {
- state.accept = Acceptance::NotYet;
- }
- }
-
- s
- }
-
- pub fn remove_unreachable(&mut self) {
- let mut used = vec![false; self.states.len()];
- used[0] = true;
- for s in self.states.iter() {
- for i in s.reachable_states() {
- used[i] = true;
- }
- }
- let mut remap = vec![0; self.states.len()];
- let mut shift = 0;
- for i in 0..self.states.len() {
- if used[i] {
- remap[i] = i - shift;
- } else {
- shift += 1;
- }
- }
- for i in (0..self.states.len()).rev() {
- if !used[i] {
- self.states.remove(i);
- }
- }
- for s in self.states.iter_mut() {
- s.remap(|i| remap[i]);
- }
- }
-}
-
-impl ENFA {
- fn looping(self) -> Self {
- let mut states = vec![EState::start()];
- states.append(&mut self.shift(1));
- let len = states.len();
- states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]);
- states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]);
- states.push(EState::terminal());
- Self { states }
- }
-
- fn repeat(self, times: usize) -> Self {
- let reps = vec![self; times];
- Self::concat(reps)
- }
-
- /// between 0 and x repetitions
- fn optx(self, x: usize) -> Self {
- let len = self.states.len();
- let mut repped = self.repeat(x);
- assert_eq!(repped.states.len(), x * len);
- for i in 1..=x {
- repped.states[0]
- .trans
- .insert(Transition::epsilon(i * len - 1));
- }
- repped
- }
-
- fn concat(nfas: Vec<Self>) -> Self {
- if nfas.is_empty() {
- return Self {
- states: vec![EState::terminal()],
- };
- }
-
- let mut states: Vec<EState> = Vec::new();
- for nfa in nfas.into_iter() {
- let len = states.len();
- let mut ns = nfa.shift(len);
- if let Some(n) = states.last_mut() {
- n.trans.retain(|t| t.consumes.is_some());
- n.trans.insert(Transition::epsilon(len));
- }
- states.append(&mut ns);
- }
-
- let len = states.len();
- states[len - 1].accept = Acceptance::Accept;
-
- Self { states }
- }
-}
-
-impl ENFA {
- pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> {
- let threads = Thread::new_simple(0).step_epsilon(self);
- MultiState::new(self, threads)
- }
-
- pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> {
- MultiState::new(self, vec![])
- }
-
- pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> {
- let mut states = set![self.start_multi_state()];
- let mut q = vec![self.start_multi_state()];
-
- while let Some(state) = q.pop() {
- let chars = state.possible_transitions();
-
- for chr in chars {
- let new = state.transition(chr);
-
- if !states.contains(&new) {
- states.insert(new.clone());
- q.push(new);
- }
- }
- }
-
- states
- }
-}
-
-impl std::fmt::Debug for ENFA {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- writeln!(f, "NFA {{")?;
- for (i, s) in self.states.iter().enumerate() {
- write!(f, " {i}: ")?;
- if let Some(a) = s.assert.as_ref() {
- match a.polarity {
- LookPolarity::Positive => write!(f, "+{} ", a.to)?,
- LookPolarity::Negative => write!(f, "-{} ", a.to)?,
- }
- }
- for t in s.trans.iter() {
- let k = t.to;
- if let Some(c) = t.consumes {
- write!(f, "{c:?}=>{k} ")?;
- } else {
- write!(f, "~>{k} ")?;
- }
- }
- match s.accept {
- Acceptance::Accept => write!(f, "accept")?,
- Acceptance::Assertion => write!(f, "assert")?,
- Acceptance::NotYet => {}
- }
- writeln!(f)?;
- }
- write!(f, "}}")
- }
-}
-
-pub type StateId = usize;
-
-#[derive(Debug, Clone, Hash, PartialEq, Eq)]
-pub struct Assertion {
- to: StateId,
- polarity: LookPolarity,
-}
-
-#[derive(Debug, Clone, Hash, PartialEq, Eq)]
-pub struct Transition {
- to: StateId,
- consumes: Option<ByteRange>,
-}
-
-impl Transition {
- fn new(consumes: ByteRange, to: StateId) -> Self {
- Self {
- to,
- consumes: Some(consumes),
- }
- }
-
- fn epsilon(to: StateId) -> Self {
- Self { to, consumes: None }
- }
-
- fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
- self.to = f(self.to);
- }
-
- fn reachable_states(&self) -> impl Iterator<Item = StateId> {
- [self.to].into_iter()
- }
-}
-
-#[derive(Debug, Copy, Clone, PartialEq)]
-pub enum Acceptance {
- Accept,
- Assertion,
- NotYet,
-}
-
-#[derive(Debug, Clone)]
-pub struct EState {
- pub trans: HashSet<Transition>,
- pub assert: Option<Assertion>,
- pub accept: Acceptance,
-}
-
-impl EState {
- fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) {
- self.trans.retain(|t| t.consumes.is_some());
- for transition in trans.into_iter() {
- assert!(transition.consumes.is_none());
- self.trans.insert(transition);
- }
- }
-
- fn start() -> Self {
- Self {
- trans: HashSet::new(),
- assert: None,
- accept: Acceptance::NotYet,
- }
- }
- fn terminal() -> Self {
- Self {
- trans: HashSet::new(),
- assert: None,
- accept: Acceptance::Accept,
- }
- }
-
- fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
- self.trans = self
- .trans
- .iter()
- .cloned()
- .map(|mut t| {
- t.remap(&mut f);
- t
- })
- .collect();
- if let Some(a) = self.assert.as_mut() {
- a.to = f(a.to);
- }
- }
-
- fn reachable_states(&self) -> impl Iterator<Item = StateId> {
- self.trans
- .iter()
- .flat_map(|t| t.reachable_states())
- .chain(self.assert.iter().map(|a| a.to))
- }
-}
-
-#[derive(Debug)]
-pub enum EnfaTranslationError {
- CharacterClassNotSupported,
- AssertionsNotSupported,
-}
-
-impl TryFrom<Pattern> for ENFA {
- type Error = EnfaTranslationError;
-
- fn try_from(value: Pattern) -> Result<Self, Self::Error> {
- Ok(match value {
- Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?,
- Pattern::Range(c1, c2) => Self {
- states: vec![
- EState {
- trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)],
- assert: None,
- accept: Acceptance::NotYet,
- },
- EState::terminal(),
- ],
- },
- Pattern::CharacterClass(_) => {
- return Err(EnfaTranslationError::CharacterClassNotSupported);
- }
- Pattern::Alt(alts) => {
- let nfas: Vec<ENFA> = alts
- .into_iter()
- .map(Self::try_from)
- .collect::<Result<_, _>>()?;
- let mut states = vec![EState::start()];
- let mut ends = vec![];
- for nfa in nfas.into_iter() {
- let len = states.len();
- states[0].trans.insert(Transition::epsilon(len));
- states.append(&mut (nfa.shift(len)));
- ends.push(states.len() - 1);
- }
- states.push(EState::terminal());
- for end in ends.into_iter() {
- let last = states.len() - 1;
- states[end].trans.insert(Transition::epsilon(last));
- }
- Self { states }
- }
- Pattern::Concat(seq) => {
- let nfas: Vec<Self> = seq
- .into_iter()
- .map(Self::try_from)
- .collect::<Result<_, _>>()?;
- Self::concat(nfas)
- }
- Pattern::Rep(regex, min, None, _) => {
- let nfa = ENFA::try_from(*regex)?;
- let base = nfa.clone().repeat(min as usize);
- let tail = nfa.looping();
- Self::concat(vec![base, tail])
- }
- Pattern::Rep(regex, min, Some(max), _) => {
- assert!(min < max);
- let nfa = Self::try_from(*regex)?;
- let base = nfa.clone().repeat(min as usize);
- let tail = nfa.optx((max - min) as usize);
- Self::concat(vec![base, tail])
- }
- Pattern::Nothing => Self {
- states: vec![EState::terminal()],
- },
- Pattern::Assertion(dir, polarity, pat) => {
- if dir == LookDirection::Behind {
- return Err(EnfaTranslationError::AssertionsNotSupported);
- }
- let mut regex = Self::try_from(*pat)?;
- for s in regex.states.iter_mut() {
- if s.accept == Acceptance::Accept {
- s.accept = Acceptance::Assertion;
- }
- }
- let mut regex = regex.shift(1);
- let mut states = Vec::with_capacity(regex.len() + 2);
- states.push(EState {
- trans: set![Transition::epsilon(regex.len() + 1)],
- assert: Some(Assertion { to: 1, polarity }),
- accept: Acceptance::NotYet,
- });
- states.append(&mut regex);
- states.push(EState::terminal());
- Self { states }
- }
- Pattern::Submatch(pat) => {
- // NFA does not track submatches, so we are skipping it.
- Self::try_from(*pat)?
- }
- })
- }
-}