diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-02 17:40:28 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-02 17:40:28 +0200 |
| commit | fda08e286d3eeca8db598bcd3dd1802e99209363 (patch) | |
| tree | 14028ebaa69ee59ef2affe446b8791f0da92c238 /src/parse/regex/enfa.rs | |
| parent | e1f64a93c0246d7832a308c756b654965fe3710d (diff) | |
| download | pish-fda08e286d3eeca8db598bcd3dd1802e99209363.tar.gz | |
lookahead, implemented wrong
Diffstat (limited to 'src/parse/regex/enfa.rs')
| -rw-r--r-- | src/parse/regex/enfa.rs | 259 |
1 files changed, 231 insertions, 28 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index 76b5e63..b702b40 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -31,10 +31,216 @@ pub struct ENFA<S: Stage> { pub states: Vec<EState<S>>, } +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +struct Thread { + own_state: StateId, + positives: Vec<Thread>, + negatives: Vec<Thread>, +} + +fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> { + let mut result = vec![Vec::new()]; + + for xs in x { + let mut next = Vec::new(); + + for prefix in &result { + for item in &xs { + let mut v = prefix.clone(); + v.push(item.clone()); + next.push(v); + } + } + + result = next; + } + + result +} + +#[cfg(test)] +mod product_tests { + use super::cartesian_product; + + #[test] + fn basic_case() { + let x = vec![vec![1, 2], vec![10, 20]]; + + let out = cartesian_product(x); + + assert_eq!( + out, + vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],] + ); + } + + #[test] + fn single_dimension() { + let x = vec![vec![1, 2, 3]]; + + let out = cartesian_product(x); + + assert_eq!(out, vec![vec![1], vec![2], vec![3],]); + } + + #[test] + fn empty_outer_vector() { + let x: Vec<Vec<i32>> = vec![]; + + let out = cartesian_product(x); + + // One empty combination. + assert_eq!(out, vec![vec![]]); + } + + #[test] + fn empty_inner_vector() { + let x = vec![vec![1, 2], vec![], vec![3, 4]]; + + let out = cartesian_product(x); + + assert!(out.is_empty()); + } + + #[test] + fn output_size_matches_product() { + let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; + + let out = cartesian_product(x); + + assert_eq!(out.len(), 3 * 2 * 4); + } + + #[test] + fn every_output_has_correct_length() { + let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']]; + + let out = cartesian_product(x); + + assert!(out.iter().all(|v| v.len() == 3)); + } + + #[test] + fn works_with_strings() { + let x = vec![ + vec!["a".to_string(), "b".to_string()], + vec!["x".to_string()], + ]; + + let out = cartesian_product(x); + + assert_eq!( + out, + vec![ + vec!["a".to_string(), "x".to_string()], + vec!["b".to_string(), "x".to_string()], + ] + ); + } +} + +impl Thread { + fn new( + enfa: &ENFA<Resolved>, + own_state: StateId, + mut positives: Vec<Thread>, + mut negatives: Vec<Thread>, + ) -> Option<Self> { + positives.sort(); + positives.dedup(); + positives.retain(|t| !t.accept(enfa)); + positives.shrink_to_fit(); + + negatives.sort(); + negatives.dedup(); + negatives.shrink_to_fit(); + + if negatives.iter().any(|t| t.accept(enfa)) { + return None; + } + + Some(Self { + own_state, + positives, + negatives, + }) + } + + fn accept(&self, enfa: &ENFA<Resolved>) -> bool { + let pos = self.positives.iter().all(|t| t.accept(enfa)); + let neg = self.negatives.iter().all(|t| !t.accept(enfa)); + let this = match enfa.states[self.own_state].accept { + Acceptance::Accept => true, + Acceptance::Assertion => true, + Acceptance::NotYet => false, + }; + pos && neg && this + } + + fn possible_transitions(&self, enfa: &ENFA<Resolved>, f: &mut impl FnMut(ByteRange)) { + for t in enfa.states[self.own_state].trans.iter() { + f(t.consumes); + } + for t in self.positives.iter() { + t.possible_transitions(enfa, f); + } + for t in self.negatives.iter() { + t.possible_transitions(enfa, f); + } + } + + fn transition(&self, enfa: &ENFA<Resolved>, ch: ByteRange) -> Vec<Self> { + // if any of our negatives currently accept, we must kill this thread rn. + if self.negatives.iter().any(|t| t.accept(enfa)) { + return Vec::new(); + } + + let pos: Vec<Vec<Self>> = self + .positives + .iter() + .map(|t| t.transition(enfa, ch)) + .collect(); + let neg: Vec<Vec<Self>> = self + .negatives + .iter() + .map(|t| t.transition(enfa, ch)) + .collect(); + + let pos = cartesian_product(pos); + let neg = cartesian_product(neg); + + let mut out = Vec::new(); + + for p in pos { + for n in neg.clone() { + for transition in enfa.states[self.own_state].trans.iter() { + if transition.consumes.overlaps(ch) { + let mut p = p.clone(); + let mut n = n.clone(); + for a in transition.assert.iter() { + if let Some(thread) = Thread::new(enfa, a.to, Vec::new(), Vec::new()) { + match a.polarity { + LookPolarity::Positive => p.push(thread), + LookPolarity::Negative => n.push(thread), + } + } + } + if let Some(thread) = Thread::new(enfa, transition.to, p, n) { + out.push(thread); + } + } + } + } + } + + out + } +} + #[derive(Clone)] pub struct MultiState<'a> { nfa: &'a ENFA<Resolved>, - states: Vec<StateId>, + threads: Vec<Thread>, accept: bool, hash: u64, } @@ -42,7 +248,7 @@ pub struct MultiState<'a> { impl<'a> PartialEq for MultiState<'a> { fn eq(&self, other: &Self) -> bool { (self.nfa as *const ENFA<Resolved> as u64) == (other.nfa as *const ENFA<Resolved> as u64) - && self.states == other.states + && self.threads == other.threads && self.accept == other.accept && self.hash == other.hash } @@ -50,21 +256,19 @@ impl<'a> PartialEq for MultiState<'a> { impl<'a> Eq for MultiState<'a> {} impl<'a> MultiState<'a> { - pub fn new(nfa: &'a ENFA<Resolved>, mut states: Vec<StateId>) -> Self { - states.sort(); - states.dedup(); - states.shrink_to_fit(); + fn new(nfa: &'a ENFA<Resolved>, mut threads: Vec<Thread>) -> Self { + threads.sort(); + threads.dedup(); + threads.shrink_to_fit(); - let accept = states - .iter() - .any(|&x| nfa.states[x].accept == Acceptance::Accept); + let accept = threads.iter().any(|t| t.accept(nfa)); let mut hasher = DefaultHasher::new(); - states.hash(&mut hasher); + threads.hash(&mut hasher); let hash = hasher.finish(); Self { nfa, - states, + threads, accept, hash, } @@ -72,11 +276,10 @@ impl<'a> MultiState<'a> { /// all the chars that will make an interesting transition pub fn possible_transitions(&self) -> Vec<ByteRange> { - let mut vec: Vec<_> = self - .states - .iter() - .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.consumes)) - .collect(); + let mut vec = Vec::new(); + for t in self.threads.iter() { + t.possible_transitions(self.nfa, &mut |x| vec.push(x)); + } vec = ByteRange::non_overlapping(vec); vec.sort(); vec.dedup(); @@ -86,17 +289,9 @@ impl<'a> MultiState<'a> { pub fn transition(&self, ch: ByteRange) -> Self { let new_states = self - .states + .threads .iter() - .flat_map(|&s| { - self.nfa.states[s].trans.iter().filter_map(|t| { - if t.consumes.overlaps(ch) { - Some(t.to) - } else { - None - } - }) - }) + .flat_map(|t| t.transition(self.nfa, ch)) .collect(); Self::new(self.nfa, new_states) @@ -169,7 +364,12 @@ impl<S: Stage> ENFA<S> { } impl ENFA<Epsilon> { - fn epsilon_dfs(&self, i: StateId, visited: &mut [Option<Vec<Assertion>>], path: Vec<Assertion>) { + fn epsilon_dfs( + &self, + i: StateId, + visited: &mut [Option<Vec<Assertion>>], + path: Vec<Assertion>, + ) { if visited[i].is_some() { return; } @@ -278,7 +478,10 @@ impl ENFA<Epsilon> { impl ENFA<Resolved> { pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { - MultiState::new(self, vec![0]) + MultiState::new( + self, + vec![Thread::new(self, 0, Vec::new(), Vec::new()).unwrap()], + ) } pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { |
