From fda08e286d3eeca8db598bcd3dd1802e99209363 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Tue, 2 Jun 2026 17:40:28 +0200 Subject: lookahead, implemented wrong --- src/parse/regex/enfa.rs | 259 ++++++++++++++++++++++++++++++---- test-cases/case2_lookahead/script.sh | 38 +++++ test-cases/case2_lookahead/stdout.txt | 0 tests/scripts.rs | 5 + 4 files changed, 274 insertions(+), 28 deletions(-) create mode 100644 test-cases/case2_lookahead/script.sh create mode 100644 test-cases/case2_lookahead/stdout.txt diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index 76b5e63..b702b40 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -31,10 +31,216 @@ pub struct ENFA { pub states: Vec>, } +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +struct Thread { + own_state: StateId, + positives: Vec, + negatives: Vec, +} + +fn cartesian_product(x: Vec>) -> Vec> { + let mut result = vec![Vec::new()]; + + for xs in x { + let mut next = Vec::new(); + + for prefix in &result { + for item in &xs { + let mut v = prefix.clone(); + v.push(item.clone()); + next.push(v); + } + } + + result = next; + } + + result +} + +#[cfg(test)] +mod product_tests { + use super::cartesian_product; + + #[test] + fn basic_case() { + let x = vec![vec![1, 2], vec![10, 20]]; + + let out = cartesian_product(x); + + assert_eq!( + out, + vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],] + ); + } + + #[test] + fn single_dimension() { + let x = vec![vec![1, 2, 3]]; + + let out = cartesian_product(x); + + assert_eq!(out, vec![vec![1], vec![2], vec![3],]); + } + + #[test] + fn empty_outer_vector() { + let x: Vec> = vec![]; + + let out = cartesian_product(x); + + // One empty combination. + assert_eq!(out, vec![vec![]]); + } + + #[test] + fn empty_inner_vector() { + let x = vec![vec![1, 2], vec![], vec![3, 4]]; + + let out = cartesian_product(x); + + assert!(out.is_empty()); + } + + #[test] + fn output_size_matches_product() { + let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; + + let out = cartesian_product(x); + + assert_eq!(out.len(), 3 * 2 * 4); + } + + #[test] + fn every_output_has_correct_length() { + let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']]; + + let out = cartesian_product(x); + + assert!(out.iter().all(|v| v.len() == 3)); + } + + #[test] + fn works_with_strings() { + let x = vec![ + vec!["a".to_string(), "b".to_string()], + vec!["x".to_string()], + ]; + + let out = cartesian_product(x); + + assert_eq!( + out, + vec![ + vec!["a".to_string(), "x".to_string()], + vec!["b".to_string(), "x".to_string()], + ] + ); + } +} + +impl Thread { + fn new( + enfa: &ENFA, + own_state: StateId, + mut positives: Vec, + mut negatives: Vec, + ) -> Option { + positives.sort(); + positives.dedup(); + positives.retain(|t| !t.accept(enfa)); + positives.shrink_to_fit(); + + negatives.sort(); + negatives.dedup(); + negatives.shrink_to_fit(); + + if negatives.iter().any(|t| t.accept(enfa)) { + return None; + } + + Some(Self { + own_state, + positives, + negatives, + }) + } + + fn accept(&self, enfa: &ENFA) -> bool { + let pos = self.positives.iter().all(|t| t.accept(enfa)); + let neg = self.negatives.iter().all(|t| !t.accept(enfa)); + let this = match enfa.states[self.own_state].accept { + Acceptance::Accept => true, + Acceptance::Assertion => true, + Acceptance::NotYet => false, + }; + pos && neg && this + } + + fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) { + for t in enfa.states[self.own_state].trans.iter() { + f(t.consumes); + } + for t in self.positives.iter() { + t.possible_transitions(enfa, f); + } + for t in self.negatives.iter() { + t.possible_transitions(enfa, f); + } + } + + fn transition(&self, enfa: &ENFA, ch: ByteRange) -> Vec { + // if any of our negatives currently accept, we must kill this thread rn. + if self.negatives.iter().any(|t| t.accept(enfa)) { + return Vec::new(); + } + + let pos: Vec> = self + .positives + .iter() + .map(|t| t.transition(enfa, ch)) + .collect(); + let neg: Vec> = self + .negatives + .iter() + .map(|t| t.transition(enfa, ch)) + .collect(); + + let pos = cartesian_product(pos); + let neg = cartesian_product(neg); + + let mut out = Vec::new(); + + for p in pos { + for n in neg.clone() { + for transition in enfa.states[self.own_state].trans.iter() { + if transition.consumes.overlaps(ch) { + let mut p = p.clone(); + let mut n = n.clone(); + for a in transition.assert.iter() { + if let Some(thread) = Thread::new(enfa, a.to, Vec::new(), Vec::new()) { + match a.polarity { + LookPolarity::Positive => p.push(thread), + LookPolarity::Negative => n.push(thread), + } + } + } + if let Some(thread) = Thread::new(enfa, transition.to, p, n) { + out.push(thread); + } + } + } + } + } + + out + } +} + #[derive(Clone)] pub struct MultiState<'a> { nfa: &'a ENFA, - states: Vec, + threads: Vec, accept: bool, hash: u64, } @@ -42,7 +248,7 @@ pub struct MultiState<'a> { impl<'a> PartialEq for MultiState<'a> { fn eq(&self, other: &Self) -> bool { (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) - && self.states == other.states + && self.threads == other.threads && self.accept == other.accept && self.hash == other.hash } @@ -50,21 +256,19 @@ impl<'a> PartialEq for MultiState<'a> { impl<'a> Eq for MultiState<'a> {} impl<'a> MultiState<'a> { - pub fn new(nfa: &'a ENFA, mut states: Vec) -> Self { - states.sort(); - states.dedup(); - states.shrink_to_fit(); + fn new(nfa: &'a ENFA, mut threads: Vec) -> Self { + threads.sort(); + threads.dedup(); + threads.shrink_to_fit(); - let accept = states - .iter() - .any(|&x| nfa.states[x].accept == Acceptance::Accept); + let accept = threads.iter().any(|t| t.accept(nfa)); let mut hasher = DefaultHasher::new(); - states.hash(&mut hasher); + threads.hash(&mut hasher); let hash = hasher.finish(); Self { nfa, - states, + threads, accept, hash, } @@ -72,11 +276,10 @@ impl<'a> MultiState<'a> { /// all the chars that will make an interesting transition pub fn possible_transitions(&self) -> Vec { - let mut vec: Vec<_> = self - .states - .iter() - .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.consumes)) - .collect(); + let mut vec = Vec::new(); + for t in self.threads.iter() { + t.possible_transitions(self.nfa, &mut |x| vec.push(x)); + } vec = ByteRange::non_overlapping(vec); vec.sort(); vec.dedup(); @@ -86,17 +289,9 @@ impl<'a> MultiState<'a> { pub fn transition(&self, ch: ByteRange) -> Self { let new_states = self - .states + .threads .iter() - .flat_map(|&s| { - self.nfa.states[s].trans.iter().filter_map(|t| { - if t.consumes.overlaps(ch) { - Some(t.to) - } else { - None - } - }) - }) + .flat_map(|t| t.transition(self.nfa, ch)) .collect(); Self::new(self.nfa, new_states) @@ -169,7 +364,12 @@ impl ENFA { } impl ENFA { - fn epsilon_dfs(&self, i: StateId, visited: &mut [Option>], path: Vec) { + fn epsilon_dfs( + &self, + i: StateId, + visited: &mut [Option>], + path: Vec, + ) { if visited[i].is_some() { return; } @@ -278,7 +478,10 @@ impl ENFA { impl ENFA { pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { - MultiState::new(self, vec![0]) + MultiState::new( + self, + vec![Thread::new(self, 0, Vec::new(), Vec::new()).unwrap()], + ) } pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { diff --git a/test-cases/case2_lookahead/script.sh b/test-cases/case2_lookahead/script.sh new file mode 100644 index 0000000..1a2226b --- /dev/null +++ b/test-cases/case2_lookahead/script.sh @@ -0,0 +1,38 @@ +case foo { + f(?=o)oo {} + .* { echo fail 1 } +} + +case foo { + fo(?=x)o { echo fail 2 } + .* { } +} + +fun match { + set res = $($1 $2) + if [ $res != $3 ] { + echo "match $1 $2: expected $3, got $res" + } +} + +fun x0 { + case $1 { + a(?=b). { echo yes } + .* { echo no } + } +} +match x0 aa no +match x0 aaa no +match x0 ab yes + +fun x1 { + case $1 { + a(?!x)x { echo yes } + .* { echo no } + } +} +match x1 a no +match x1 aa no +match x1 ax no +match x1 axx no +match x1 aax no diff --git a/test-cases/case2_lookahead/stdout.txt b/test-cases/case2_lookahead/stdout.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/scripts.rs b/tests/scripts.rs index 8ee5f0e..945a506 100644 --- a/tests/scripts.rs +++ b/tests/scripts.rs @@ -11,6 +11,11 @@ fn case1() { common::test_case("case1", include_bytes!("../test-cases/case1/script.sh"), include_bytes!("../test-cases/case1/stdout.txt")); } +#[test] +fn case2_lookahead() { + common::test_case("case2_lookahead", include_bytes!("../test-cases/case2_lookahead/script.sh"), include_bytes!("../test-cases/case2_lookahead/stdout.txt")); +} + #[test] fn comment() { common::test_case("comment", include_bytes!("../test-cases/comment/script.sh"), include_bytes!("../test-cases/comment/stdout.txt")); -- cgit v1.2.3