diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-02 21:18:48 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-02 21:18:48 +0200 |
| commit | 5caaf765648b12a8b6002633f849f7fc4f1efc21 (patch) | |
| tree | 03170181fe77c3e29a051db09187eee84f349e7b /src | |
| parent | fda08e286d3eeca8db598bcd3dd1802e99209363 (diff) | |
| download | pish-5caaf765648b12a8b6002633f849f7fc4f1efc21.tar.gz | |
some code but it does not work
Diffstat (limited to 'src')
| -rw-r--r-- | src/parse/regex/enfa.rs | 369 |
1 files changed, 167 insertions, 202 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index b702b40..3cac2dd 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -3,22 +3,6 @@ use std::{ hash::{DefaultHasher, Hash, Hasher}, }; -pub trait Stage: Clone + std::fmt::Debug + std::hash::Hash + Eq { - type Consume: Clone + std::fmt::Debug + std::hash::Hash + Eq; -} - -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -pub struct Epsilon; -impl Stage for Epsilon { - type Consume = Option<ByteRange>; -} - -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -pub struct Resolved; -impl Stage for Resolved { - type Consume = ByteRange; -} - use crate::parse::regex::{LookDirection, LookPolarity}; use super::Pattern; @@ -27,15 +11,8 @@ use super::byte_range::ByteRange; /// NFA with epsilon transitions #[derive(Clone)] #[allow(clippy::upper_case_acronyms)] -pub struct ENFA<S: Stage> { - pub states: Vec<EState<S>>, -} - -#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] -struct Thread { - own_state: StateId, - positives: Vec<Thread>, - negatives: Vec<Thread>, +pub struct ENFA { + pub states: Vec<EState>, } fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> { @@ -139,10 +116,25 @@ mod product_tests { } } +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +struct Thread { + state: StateId, + positives: Vec<Thread>, + negatives: Vec<Thread>, +} + impl Thread { + fn new_simple(state: StateId) -> Self { + Self { + state, + positives: Vec::new(), + negatives: Vec::new(), + } + } + fn new( - enfa: &ENFA<Resolved>, - own_state: StateId, + enfa: &ENFA, + state: StateId, mut positives: Vec<Thread>, mut negatives: Vec<Thread>, ) -> Option<Self> { @@ -160,16 +152,16 @@ impl Thread { } Some(Self { - own_state, + state, positives, negatives, }) } - fn accept(&self, enfa: &ENFA<Resolved>) -> bool { + fn accept(&self, enfa: &ENFA) -> bool { let pos = self.positives.iter().all(|t| t.accept(enfa)); let neg = self.negatives.iter().all(|t| !t.accept(enfa)); - let this = match enfa.states[self.own_state].accept { + let this = match enfa.states[self.state].accept { Acceptance::Accept => true, Acceptance::Assertion => true, Acceptance::NotYet => false, @@ -177,69 +169,133 @@ impl Thread { pos && neg && this } - fn possible_transitions(&self, enfa: &ENFA<Resolved>, f: &mut impl FnMut(ByteRange)) { - for t in enfa.states[self.own_state].trans.iter() { - f(t.consumes); + fn step_epsilon0( + self, + enfa: &ENFA, + ret: &mut impl FnMut(Thread), + visited: &mut [bool], + new_assertions: Vec<Assertion>, + ) { + if visited[self.state] { + return; } - for t in self.positives.iter() { - t.possible_transitions(enfa, f); + visited[self.state] = true; + + for t in enfa.states[self.state].trans.iter() { + if t.consumes.is_some() { + continue; + } + + let mut new_assertions = new_assertions.clone(); + if let Some(assertion) = enfa.states[self.state].assert.as_ref() { + new_assertions.push(assertion.clone()); + } + + Self { + state: t.to, + ..self.clone() + } + .step_epsilon0(enfa, ret, visited, new_assertions); } - for t in self.negatives.iter() { - t.possible_transitions(enfa, f); + + let Self { + state, + mut positives, + mut negatives, + } = self; + for assertion in new_assertions { + let mut threads = Self::new_simple(assertion.to).step_epsilon(enfa); + let vec = match assertion.polarity { + LookPolarity::Positive => &mut positives, + LookPolarity::Negative => &mut negatives, + }; + vec.append(&mut threads); + } + if let Some(thread) = Self::new(enfa, state, positives, negatives) { + ret(thread); } } - fn transition(&self, enfa: &ENFA<Resolved>, ch: ByteRange) -> Vec<Self> { - // if any of our negatives currently accept, we must kill this thread rn. - if self.negatives.iter().any(|t| t.accept(enfa)) { - return Vec::new(); - } + fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> { + let mut vec = Vec::new(); + self.step_epsilon0( + enfa, + &mut |t| vec.push(t), + &mut vec![false; enfa.states.len()], + Vec::new(), + ); + vec + } - let pos: Vec<Vec<Self>> = self + fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) { + let positives = self .positives - .iter() - .map(|t| t.transition(enfa, ch)) + .into_iter() + .map(|t| t.step(enfa, input)) .collect(); - let neg: Vec<Vec<Self>> = self + let negatives = self .negatives + .into_iter() + .map(|t| t.step(enfa, input)) + .collect(); + let positives = cartesian_product(positives); + let negatives = cartesian_product(negatives); + let next_states: Vec<StateId> = enfa.states[self.state] + .trans .iter() - .map(|t| t.transition(enfa, ch)) + .filter_map(|t| { + if let Some(ch) = t.consumes + && ch.overlaps(input) + { + Some(t.to) + } else { + None + } + }) .collect(); - let pos = cartesian_product(pos); - let neg = cartesian_product(neg); - - let mut out = Vec::new(); - - for p in pos { - for n in neg.clone() { - for transition in enfa.states[self.own_state].trans.iter() { - if transition.consumes.overlaps(ch) { - let mut p = p.clone(); - let mut n = n.clone(); - for a in transition.assert.iter() { - if let Some(thread) = Thread::new(enfa, a.to, Vec::new(), Vec::new()) { - match a.polarity { - LookPolarity::Positive => p.push(thread), - LookPolarity::Negative => n.push(thread), - } - } - } - if let Some(thread) = Thread::new(enfa, transition.to, p, n) { - out.push(thread); - } + // TODO inspect negative behavior -- do we need the cartesian product or something more exocit? + + for s in next_states { + for p in positives.clone() { + for n in negatives.clone() { + if let Some(thread) = Self::new(enfa, s, p.clone(), n) { + thread.step_epsilon0( + enfa, + ret, + &mut vec![false; enfa.states.len()], + Vec::new(), + ); } } } } + } + + fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> { + let mut vec = Vec::new(); + self.step0(enfa, input, &mut |x| vec.push(x)); + vec + } - out + fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) { + for t in enfa.states[self.state].trans.iter() { + if let Some(ch) = t.consumes { + f(ch); + } + } + for t in self.positives.iter() { + t.possible_transitions(enfa, f); + } + for t in self.negatives.iter() { + t.possible_transitions(enfa, f); + } } } #[derive(Clone)] pub struct MultiState<'a> { - nfa: &'a ENFA<Resolved>, + nfa: &'a ENFA, threads: Vec<Thread>, accept: bool, hash: u64, @@ -247,7 +303,7 @@ pub struct MultiState<'a> { impl<'a> PartialEq for MultiState<'a> { fn eq(&self, other: &Self) -> bool { - (self.nfa as *const ENFA<Resolved> as u64) == (other.nfa as *const ENFA<Resolved> as u64) + (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) && self.threads == other.threads && self.accept == other.accept && self.hash == other.hash @@ -256,7 +312,7 @@ impl<'a> PartialEq for MultiState<'a> { impl<'a> Eq for MultiState<'a> {} impl<'a> MultiState<'a> { - fn new(nfa: &'a ENFA<Resolved>, mut threads: Vec<Thread>) -> Self { + fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self { threads.sort(); threads.dedup(); threads.shrink_to_fit(); @@ -291,7 +347,7 @@ impl<'a> MultiState<'a> { let new_states = self .threads .iter() - .flat_map(|t| t.transition(self.nfa, ch)) + .flat_map(|t| t.clone().step(self.nfa, ch)) .collect(); Self::new(self.nfa, new_states) @@ -321,8 +377,8 @@ macro_rules! set { }}; } -impl<S: Stage> ENFA<S> { - fn shift(self, amt: usize) -> Vec<EState<S>> { +impl ENFA { + fn shift(self, amt: usize) -> Vec<EState> { let mut s = self.states; for state in s.iter_mut() { @@ -363,66 +419,7 @@ impl<S: Stage> ENFA<S> { } } -impl ENFA<Epsilon> { - fn epsilon_dfs( - &self, - i: StateId, - visited: &mut [Option<Vec<Assertion>>], - path: Vec<Assertion>, - ) { - if visited[i].is_some() { - return; - } - visited[i] = Some(path.clone()); - for t in self.states[i].trans.iter() { - if t.is_epsilon() { - let mut path = path.clone(); - path.append(&mut t.assert.clone()); - self.epsilon_dfs(t.to, visited, path); - } - } - } - - pub fn resolve_epsilon(self) -> ENFA<Resolved> { - // state X --> { state Y, Z, W which get inlined into X } - let includes: Vec<Vec<(StateId, Vec<Assertion>)>> = (0..self.states.len()) - .map(|i| { - let mut reach = vec![None; self.states.len()]; - self.epsilon_dfs(i, &mut reach, Vec::new()); - reach - .into_iter() - .enumerate() - .filter_map(|(x, r)| r.map(|r| (x, r))) - .collect() - }) - .collect(); - - // states without epsilon transitions - let mut states: Vec<EState<Resolved>> = self - .states - .into_iter() - .map(EState::remove_epsilon) - .collect(); - - // inline real transitions - for i in 0..states.len() { - for (k, assertions) in includes[i].iter() { - let new: Vec<Transition<_>> = states[*k] - .trans - .iter() - .cloned() - .map(|t| t.with_assertions(assertions.clone())) - .collect(); - states[i].trans.extend(new); - if states[*k].accept != Acceptance::NotYet { - states[i].accept = states[*k].accept; - } - } - } - - ENFA { states } - } - +impl ENFA { fn looping(self) -> Self { let mut states = vec![EState::start()]; states.append(&mut self.shift(1)); @@ -458,7 +455,7 @@ impl ENFA<Epsilon> { }; } - let mut states: Vec<EState<Epsilon>> = Vec::new(); + let mut states: Vec<EState> = Vec::new(); for nfa in nfas.into_iter() { let len = states.len(); let mut ns = nfa.shift(len); @@ -476,12 +473,10 @@ impl ENFA<Epsilon> { } } -impl ENFA<Resolved> { +impl ENFA { pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { - MultiState::new( - self, - vec![Thread::new(self, 0, Vec::new(), Vec::new()).unwrap()], - ) + let threads = Thread::new_simple(0).step_epsilon(self); + MultiState::new(self, threads) } pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { @@ -509,11 +504,17 @@ impl ENFA<Resolved> { } } -impl std::fmt::Debug for ENFA<Epsilon> { +impl std::fmt::Debug for ENFA { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(f, "NFA {{")?; for (i, s) in self.states.iter().enumerate() { write!(f, " {i}: ")?; + if let Some(a) = s.assert.as_ref() { + match a.polarity { + LookPolarity::Positive => write!(f, "+{} ", a.to)?, + LookPolarity::Negative => write!(f, "-{} ", a.to)?, + } + } for t in s.trans.iter() { let k = t.to; if let Some(c) = t.consumes { @@ -542,49 +543,23 @@ pub struct Assertion { } #[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub struct Transition<S: Stage> { +pub struct Transition { to: StateId, - assert: Vec<Assertion>, - consumes: S::Consume, + consumes: Option<ByteRange>, } -impl Transition<Epsilon> { +impl Transition { fn new(consumes: ByteRange, to: StateId) -> Self { Self { to, consumes: Some(consumes), - assert: Vec::new(), } } fn epsilon(to: StateId) -> Self { - Self { - to, - consumes: None, - assert: Vec::new(), - } + Self { to, consumes: None } } - pub fn is_epsilon(&self) -> bool { - self.consumes.is_none() - } - - fn non_epsilon(self) -> Option<Transition<Resolved>> { - let Self { - to, - assert, - consumes, - } = self; - let consumes = consumes?; - Some(Transition { - consumes, - to, - assert, - }) - } -} - -impl<S: Stage> Transition<S> { fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { self.to = f(self.to); } @@ -592,11 +567,6 @@ impl<S: Stage> Transition<S> { fn reachable_states(&self) -> impl Iterator<Item = StateId> { [self.to].into_iter() } - - fn with_assertions(mut self, mut more: Vec<Assertion>) -> Self { - self.assert.append(&mut more); - self - } } #[derive(Debug, Copy, Clone, PartialEq)] @@ -607,41 +577,32 @@ pub enum Acceptance { } #[derive(Debug, Clone)] -pub struct EState<S: Stage> { - pub trans: HashSet<Transition<S>>, +pub struct EState { + pub trans: HashSet<Transition>, + pub assert: Option<Assertion>, pub accept: Acceptance, } -impl EState<Epsilon> { - fn remove_epsilon(self) -> EState<Resolved> { - let trans = self - .trans - .into_iter() - .filter_map(Transition::non_epsilon) - .collect(); - let accept = self.accept; - EState { trans, accept } - } - - fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition<Epsilon>>) { +impl EState { + fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) { self.trans.retain(|t| t.consumes.is_some()); for transition in trans.into_iter() { assert!(transition.consumes.is_none()); self.trans.insert(transition); } } -} -impl<S: Stage> EState<S> { fn start() -> Self { Self { trans: HashSet::new(), + assert: None, accept: Acceptance::NotYet, } } fn terminal() -> Self { Self { trans: HashSet::new(), + assert: None, accept: Acceptance::Accept, } } @@ -656,10 +617,16 @@ impl<S: Stage> EState<S> { t }) .collect(); + if let Some(a) = self.assert.as_mut() { + a.to = f(a.to); + } } fn reachable_states(&self) -> impl Iterator<Item = StateId> { - self.trans.iter().flat_map(|t| t.reachable_states()) + self.trans + .iter() + .flat_map(|t| t.reachable_states()) + .chain(self.assert.iter().map(|a| a.to)) } } @@ -668,7 +635,7 @@ pub enum EnfaTranslationError { AssertionsNotSupported, } -impl TryFrom<Pattern> for ENFA<Epsilon> { +impl TryFrom<Pattern> for ENFA { type Error = EnfaTranslationError; fn try_from(value: Pattern) -> Result<Self, Self::Error> { @@ -678,13 +645,14 @@ impl TryFrom<Pattern> for ENFA<Epsilon> { states: vec![ EState { trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], + assert: None, accept: Acceptance::NotYet, }, EState::terminal(), ], }, Pattern::Alt(alts) => { - let nfas: Vec<ENFA<Epsilon>> = alts + let nfas: Vec<ENFA> = alts .into_iter() .map(Self::try_from) .collect::<Result<_, _>>()?; @@ -739,11 +707,8 @@ impl TryFrom<Pattern> for ENFA<Epsilon> { let mut regex = regex.shift(1); let mut states = Vec::with_capacity(regex.len() + 2); states.push(EState { - trans: set![Transition { - to: regex.len() + 1, - assert: vec![Assertion { to: 1, polarity }], - consumes: None - }], + trans: set![Transition::epsilon(regex.len() + 1)], + assert: Some(Assertion { to: 1, polarity }), accept: Acceptance::NotYet, }); states.append(&mut regex); |
