diff options
| -rw-r--r-- | src/parse/regex/enfa.rs | 130 | ||||
| -rw-r--r-- | src/parse/regex/mod.rs | 2 |
2 files changed, 101 insertions, 31 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index 88f4536..76b5e63 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -3,7 +3,7 @@ use std::{ hash::{DefaultHasher, Hash, Hasher}, }; -pub trait Stage : Clone + std::fmt::Debug + std::hash::Hash + Eq { +pub trait Stage: Clone + std::fmt::Debug + std::hash::Hash + Eq { type Consume: Clone + std::fmt::Debug + std::hash::Hash + Eq; } @@ -19,6 +19,8 @@ impl Stage for Resolved { type Consume = ByteRange; } +use crate::parse::regex::{LookDirection, LookPolarity}; + use super::Pattern; use super::byte_range::ByteRange; @@ -53,7 +55,9 @@ impl<'a> MultiState<'a> { states.dedup(); states.shrink_to_fit(); - let accept = states.iter().any(|&x| nfa.states[x].accept); + let accept = states + .iter() + .any(|&x| nfa.states[x].accept == Acceptance::Accept); let mut hasher = DefaultHasher::new(); states.hash(&mut hasher); let hash = hasher.finish(); @@ -128,7 +132,9 @@ impl<S: Stage> ENFA<S> { for state in s.iter_mut() { state.remap(|i| i + amt); - state.accept = false; + if state.accept == Acceptance::Accept { + state.accept = Acceptance::NotYet; + } } s @@ -163,28 +169,30 @@ impl<S: Stage> ENFA<S> { } impl ENFA<Epsilon> { - fn epsilon_dfs(&self, i: StateId, visited: &mut [bool]) { - if visited[i] { + fn epsilon_dfs(&self, i: StateId, visited: &mut [Option<Vec<Assertion>>], path: Vec<Assertion>) { + if visited[i].is_some() { return; } - visited[i] = true; + visited[i] = Some(path.clone()); for t in self.states[i].trans.iter() { if t.is_epsilon() { - self.epsilon_dfs(t.to, visited); + let mut path = path.clone(); + path.append(&mut t.assert.clone()); + self.epsilon_dfs(t.to, visited, path); } } } pub fn resolve_epsilon(self) -> ENFA<Resolved> { // state X --> { state Y, Z, W which get inlined into X } - let includes: Vec<Vec<StateId>> = (0..self.states.len()) + let includes: Vec<Vec<(StateId, Vec<Assertion>)>> = (0..self.states.len()) .map(|i| { - let mut reach = vec![false; self.states.len()]; - self.epsilon_dfs(i, &mut reach); + let mut reach = vec![None; self.states.len()]; + self.epsilon_dfs(i, &mut reach, Vec::new()); reach .into_iter() .enumerate() - .filter_map(|(x, r)| if r { Some(x) } else { None }) + .filter_map(|(x, r)| r.map(|r| (x, r))) .collect() }) .collect(); @@ -198,11 +206,16 @@ impl ENFA<Epsilon> { // inline real transitions for i in 0..states.len() { - for &k in includes[i].iter() { - let new = states[k].trans.clone(); + for (k, assertions) in includes[i].iter() { + let new: Vec<Transition<_>> = states[*k] + .trans + .iter() + .cloned() + .map(|t| t.with_assertions(assertions.clone())) + .collect(); states[i].trans.extend(new); - if states[k].accept { - states[i].accept = true; + if states[*k].accept != Acceptance::NotYet { + states[i].accept = states[*k].accept; } } } @@ -257,7 +270,7 @@ impl ENFA<Epsilon> { } let len = states.len(); - states[len - 1].accept = true; + states[len - 1].accept = Acceptance::Accept; Self { states } } @@ -306,8 +319,10 @@ impl std::fmt::Debug for ENFA<Epsilon> { write!(f, "~>{k} ")?; } } - if s.accept { - write!(f, "accept")?; + match s.accept { + Acceptance::Accept => write!(f, "accept")?, + Acceptance::Assertion => write!(f, "assert")?, + Acceptance::NotYet => {} } writeln!(f)?; } @@ -318,19 +333,33 @@ impl std::fmt::Debug for ENFA<Epsilon> { pub type StateId = usize; #[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct Assertion { + to: StateId, + polarity: LookPolarity, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] pub struct Transition<S: Stage> { to: StateId, + assert: Vec<Assertion>, consumes: S::Consume, } impl Transition<Epsilon> { fn new(consumes: ByteRange, to: StateId) -> Self { - let consumes = Some(consumes); - Self { to, consumes } + Self { + to, + consumes: Some(consumes), + assert: Vec::new(), + } } fn epsilon(to: StateId) -> Self { - Self { to, consumes: None } + Self { + to, + consumes: None, + assert: Vec::new(), + } } pub fn is_epsilon(&self) -> bool { @@ -338,9 +367,17 @@ impl Transition<Epsilon> { } fn non_epsilon(self) -> Option<Transition<Resolved>> { - let consumes = self.consumes?; - let to = self.to; - Some(Transition { consumes, to }) + let Self { + to, + assert, + consumes, + } = self; + let consumes = consumes?; + Some(Transition { + consumes, + to, + assert, + }) } } @@ -352,12 +389,24 @@ impl<S: Stage> Transition<S> { fn reachable_states(&self) -> impl Iterator<Item = StateId> { [self.to].into_iter() } + + fn with_assertions(mut self, mut more: Vec<Assertion>) -> Self { + self.assert.append(&mut more); + self + } +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum Acceptance { + Accept, + Assertion, + NotYet, } #[derive(Debug, Clone)] pub struct EState<S: Stage> { pub trans: HashSet<Transition<S>>, - pub accept: bool, + pub accept: Acceptance, } impl EState<Epsilon> { @@ -384,13 +433,13 @@ impl<S: Stage> EState<S> { fn start() -> Self { Self { trans: HashSet::new(), - accept: false, + accept: Acceptance::NotYet, } } fn terminal() -> Self { Self { trans: HashSet::new(), - accept: true, + accept: Acceptance::Accept, } } @@ -426,7 +475,7 @@ impl TryFrom<Pattern> for ENFA<Epsilon> { states: vec![ EState { trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], - accept: false, + accept: Acceptance::NotYet, }, EState::terminal(), ], @@ -474,8 +523,29 @@ impl TryFrom<Pattern> for ENFA<Epsilon> { Pattern::Nothing => Self { states: vec![EState::terminal()], }, - Pattern::Assertion(..) => { - return Err(EnfaTranslationError::AssertionsNotSupported); + Pattern::Assertion(dir, polarity, pat) => { + if dir == LookDirection::Behind { + return Err(EnfaTranslationError::AssertionsNotSupported); + } + let mut regex = Self::try_from(*pat)?; + for s in regex.states.iter_mut() { + if s.accept == Acceptance::Accept { + s.accept = Acceptance::Assertion; + } + } + let mut regex = regex.shift(1); + let mut states = Vec::with_capacity(regex.len() + 2); + states.push(EState { + trans: set![Transition { + to: regex.len() + 1, + assert: vec![Assertion { to: 1, polarity }], + consumes: None + }], + accept: Acceptance::NotYet, + }); + states.append(&mut regex); + states.push(EState::terminal()); + Self { states } } }) } diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs index 6add7a8..2f7c223 100644 --- a/src/parse/regex/mod.rs +++ b/src/parse/regex/mod.rs @@ -12,7 +12,7 @@ pub enum LookDirection { Behind, } -#[derive(PartialEq, Eq, Debug, Clone, Copy)] +#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookPolarity { Positive, Negative, |
