aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-02 21:18:48 +0200
committerJonas Maier <jonas@x77.dev>2026-06-02 21:18:48 +0200
commit5caaf765648b12a8b6002633f849f7fc4f1efc21 (patch)
tree03170181fe77c3e29a051db09187eee84f349e7b /src/parse
parentfda08e286d3eeca8db598bcd3dd1802e99209363 (diff)
downloadpish-5caaf765648b12a8b6002633f849f7fc4f1efc21.tar.gz
some code but it does not work
Diffstat (limited to 'src/parse')
-rw-r--r--src/parse/regex/enfa.rs369
1 files changed, 167 insertions, 202 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs
index b702b40..3cac2dd 100644
--- a/src/parse/regex/enfa.rs
+++ b/src/parse/regex/enfa.rs
@@ -3,22 +3,6 @@ use std::{
hash::{DefaultHasher, Hash, Hasher},
};
-pub trait Stage: Clone + std::fmt::Debug + std::hash::Hash + Eq {
- type Consume: Clone + std::fmt::Debug + std::hash::Hash + Eq;
-}
-
-#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
-pub struct Epsilon;
-impl Stage for Epsilon {
- type Consume = Option<ByteRange>;
-}
-
-#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
-pub struct Resolved;
-impl Stage for Resolved {
- type Consume = ByteRange;
-}
-
use crate::parse::regex::{LookDirection, LookPolarity};
use super::Pattern;
@@ -27,15 +11,8 @@ use super::byte_range::ByteRange;
/// NFA with epsilon transitions
#[derive(Clone)]
#[allow(clippy::upper_case_acronyms)]
-pub struct ENFA<S: Stage> {
- pub states: Vec<EState<S>>,
-}
-
-#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
-struct Thread {
- own_state: StateId,
- positives: Vec<Thread>,
- negatives: Vec<Thread>,
+pub struct ENFA {
+ pub states: Vec<EState>,
}
fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> {
@@ -139,10 +116,25 @@ mod product_tests {
}
}
+#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct Thread {
+ state: StateId,
+ positives: Vec<Thread>,
+ negatives: Vec<Thread>,
+}
+
impl Thread {
+ fn new_simple(state: StateId) -> Self {
+ Self {
+ state,
+ positives: Vec::new(),
+ negatives: Vec::new(),
+ }
+ }
+
fn new(
- enfa: &ENFA<Resolved>,
- own_state: StateId,
+ enfa: &ENFA,
+ state: StateId,
mut positives: Vec<Thread>,
mut negatives: Vec<Thread>,
) -> Option<Self> {
@@ -160,16 +152,16 @@ impl Thread {
}
Some(Self {
- own_state,
+ state,
positives,
negatives,
})
}
- fn accept(&self, enfa: &ENFA<Resolved>) -> bool {
+ fn accept(&self, enfa: &ENFA) -> bool {
let pos = self.positives.iter().all(|t| t.accept(enfa));
let neg = self.negatives.iter().all(|t| !t.accept(enfa));
- let this = match enfa.states[self.own_state].accept {
+ let this = match enfa.states[self.state].accept {
Acceptance::Accept => true,
Acceptance::Assertion => true,
Acceptance::NotYet => false,
@@ -177,69 +169,133 @@ impl Thread {
pos && neg && this
}
- fn possible_transitions(&self, enfa: &ENFA<Resolved>, f: &mut impl FnMut(ByteRange)) {
- for t in enfa.states[self.own_state].trans.iter() {
- f(t.consumes);
+ fn step_epsilon0(
+ self,
+ enfa: &ENFA,
+ ret: &mut impl FnMut(Thread),
+ visited: &mut [bool],
+ new_assertions: Vec<Assertion>,
+ ) {
+ if visited[self.state] {
+ return;
}
- for t in self.positives.iter() {
- t.possible_transitions(enfa, f);
+ visited[self.state] = true;
+
+ for t in enfa.states[self.state].trans.iter() {
+ if t.consumes.is_some() {
+ continue;
+ }
+
+ let mut new_assertions = new_assertions.clone();
+ if let Some(assertion) = enfa.states[self.state].assert.as_ref() {
+ new_assertions.push(assertion.clone());
+ }
+
+ Self {
+ state: t.to,
+ ..self.clone()
+ }
+ .step_epsilon0(enfa, ret, visited, new_assertions);
}
- for t in self.negatives.iter() {
- t.possible_transitions(enfa, f);
+
+ let Self {
+ state,
+ mut positives,
+ mut negatives,
+ } = self;
+ for assertion in new_assertions {
+ let mut threads = Self::new_simple(assertion.to).step_epsilon(enfa);
+ let vec = match assertion.polarity {
+ LookPolarity::Positive => &mut positives,
+ LookPolarity::Negative => &mut negatives,
+ };
+ vec.append(&mut threads);
+ }
+ if let Some(thread) = Self::new(enfa, state, positives, negatives) {
+ ret(thread);
}
}
- fn transition(&self, enfa: &ENFA<Resolved>, ch: ByteRange) -> Vec<Self> {
- // if any of our negatives currently accept, we must kill this thread rn.
- if self.negatives.iter().any(|t| t.accept(enfa)) {
- return Vec::new();
- }
+ fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> {
+ let mut vec = Vec::new();
+ self.step_epsilon0(
+ enfa,
+ &mut |t| vec.push(t),
+ &mut vec![false; enfa.states.len()],
+ Vec::new(),
+ );
+ vec
+ }
- let pos: Vec<Vec<Self>> = self
+ fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) {
+ let positives = self
.positives
- .iter()
- .map(|t| t.transition(enfa, ch))
+ .into_iter()
+ .map(|t| t.step(enfa, input))
.collect();
- let neg: Vec<Vec<Self>> = self
+ let negatives = self
.negatives
+ .into_iter()
+ .map(|t| t.step(enfa, input))
+ .collect();
+ let positives = cartesian_product(positives);
+ let negatives = cartesian_product(negatives);
+ let next_states: Vec<StateId> = enfa.states[self.state]
+ .trans
.iter()
- .map(|t| t.transition(enfa, ch))
+ .filter_map(|t| {
+ if let Some(ch) = t.consumes
+ && ch.overlaps(input)
+ {
+ Some(t.to)
+ } else {
+ None
+ }
+ })
.collect();
- let pos = cartesian_product(pos);
- let neg = cartesian_product(neg);
-
- let mut out = Vec::new();
-
- for p in pos {
- for n in neg.clone() {
- for transition in enfa.states[self.own_state].trans.iter() {
- if transition.consumes.overlaps(ch) {
- let mut p = p.clone();
- let mut n = n.clone();
- for a in transition.assert.iter() {
- if let Some(thread) = Thread::new(enfa, a.to, Vec::new(), Vec::new()) {
- match a.polarity {
- LookPolarity::Positive => p.push(thread),
- LookPolarity::Negative => n.push(thread),
- }
- }
- }
- if let Some(thread) = Thread::new(enfa, transition.to, p, n) {
- out.push(thread);
- }
+ // TODO inspect negative behavior -- do we need the cartesian product or something more exocit?
+
+ for s in next_states {
+ for p in positives.clone() {
+ for n in negatives.clone() {
+ if let Some(thread) = Self::new(enfa, s, p.clone(), n) {
+ thread.step_epsilon0(
+ enfa,
+ ret,
+ &mut vec![false; enfa.states.len()],
+ Vec::new(),
+ );
}
}
}
}
+ }
+
+ fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> {
+ let mut vec = Vec::new();
+ self.step0(enfa, input, &mut |x| vec.push(x));
+ vec
+ }
- out
+ fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) {
+ for t in enfa.states[self.state].trans.iter() {
+ if let Some(ch) = t.consumes {
+ f(ch);
+ }
+ }
+ for t in self.positives.iter() {
+ t.possible_transitions(enfa, f);
+ }
+ for t in self.negatives.iter() {
+ t.possible_transitions(enfa, f);
+ }
}
}
#[derive(Clone)]
pub struct MultiState<'a> {
- nfa: &'a ENFA<Resolved>,
+ nfa: &'a ENFA,
threads: Vec<Thread>,
accept: bool,
hash: u64,
@@ -247,7 +303,7 @@ pub struct MultiState<'a> {
impl<'a> PartialEq for MultiState<'a> {
fn eq(&self, other: &Self) -> bool {
- (self.nfa as *const ENFA<Resolved> as u64) == (other.nfa as *const ENFA<Resolved> as u64)
+ (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64)
&& self.threads == other.threads
&& self.accept == other.accept
&& self.hash == other.hash
@@ -256,7 +312,7 @@ impl<'a> PartialEq for MultiState<'a> {
impl<'a> Eq for MultiState<'a> {}
impl<'a> MultiState<'a> {
- fn new(nfa: &'a ENFA<Resolved>, mut threads: Vec<Thread>) -> Self {
+ fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self {
threads.sort();
threads.dedup();
threads.shrink_to_fit();
@@ -291,7 +347,7 @@ impl<'a> MultiState<'a> {
let new_states = self
.threads
.iter()
- .flat_map(|t| t.transition(self.nfa, ch))
+ .flat_map(|t| t.clone().step(self.nfa, ch))
.collect();
Self::new(self.nfa, new_states)
@@ -321,8 +377,8 @@ macro_rules! set {
}};
}
-impl<S: Stage> ENFA<S> {
- fn shift(self, amt: usize) -> Vec<EState<S>> {
+impl ENFA {
+ fn shift(self, amt: usize) -> Vec<EState> {
let mut s = self.states;
for state in s.iter_mut() {
@@ -363,66 +419,7 @@ impl<S: Stage> ENFA<S> {
}
}
-impl ENFA<Epsilon> {
- fn epsilon_dfs(
- &self,
- i: StateId,
- visited: &mut [Option<Vec<Assertion>>],
- path: Vec<Assertion>,
- ) {
- if visited[i].is_some() {
- return;
- }
- visited[i] = Some(path.clone());
- for t in self.states[i].trans.iter() {
- if t.is_epsilon() {
- let mut path = path.clone();
- path.append(&mut t.assert.clone());
- self.epsilon_dfs(t.to, visited, path);
- }
- }
- }
-
- pub fn resolve_epsilon(self) -> ENFA<Resolved> {
- // state X --> { state Y, Z, W which get inlined into X }
- let includes: Vec<Vec<(StateId, Vec<Assertion>)>> = (0..self.states.len())
- .map(|i| {
- let mut reach = vec![None; self.states.len()];
- self.epsilon_dfs(i, &mut reach, Vec::new());
- reach
- .into_iter()
- .enumerate()
- .filter_map(|(x, r)| r.map(|r| (x, r)))
- .collect()
- })
- .collect();
-
- // states without epsilon transitions
- let mut states: Vec<EState<Resolved>> = self
- .states
- .into_iter()
- .map(EState::remove_epsilon)
- .collect();
-
- // inline real transitions
- for i in 0..states.len() {
- for (k, assertions) in includes[i].iter() {
- let new: Vec<Transition<_>> = states[*k]
- .trans
- .iter()
- .cloned()
- .map(|t| t.with_assertions(assertions.clone()))
- .collect();
- states[i].trans.extend(new);
- if states[*k].accept != Acceptance::NotYet {
- states[i].accept = states[*k].accept;
- }
- }
- }
-
- ENFA { states }
- }
-
+impl ENFA {
fn looping(self) -> Self {
let mut states = vec![EState::start()];
states.append(&mut self.shift(1));
@@ -458,7 +455,7 @@ impl ENFA<Epsilon> {
};
}
- let mut states: Vec<EState<Epsilon>> = Vec::new();
+ let mut states: Vec<EState> = Vec::new();
for nfa in nfas.into_iter() {
let len = states.len();
let mut ns = nfa.shift(len);
@@ -476,12 +473,10 @@ impl ENFA<Epsilon> {
}
}
-impl ENFA<Resolved> {
+impl ENFA {
pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> {
- MultiState::new(
- self,
- vec![Thread::new(self, 0, Vec::new(), Vec::new()).unwrap()],
- )
+ let threads = Thread::new_simple(0).step_epsilon(self);
+ MultiState::new(self, threads)
}
pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> {
@@ -509,11 +504,17 @@ impl ENFA<Resolved> {
}
}
-impl std::fmt::Debug for ENFA<Epsilon> {
+impl std::fmt::Debug for ENFA {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "NFA {{")?;
for (i, s) in self.states.iter().enumerate() {
write!(f, " {i}: ")?;
+ if let Some(a) = s.assert.as_ref() {
+ match a.polarity {
+ LookPolarity::Positive => write!(f, "+{} ", a.to)?,
+ LookPolarity::Negative => write!(f, "-{} ", a.to)?,
+ }
+ }
for t in s.trans.iter() {
let k = t.to;
if let Some(c) = t.consumes {
@@ -542,49 +543,23 @@ pub struct Assertion {
}
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
-pub struct Transition<S: Stage> {
+pub struct Transition {
to: StateId,
- assert: Vec<Assertion>,
- consumes: S::Consume,
+ consumes: Option<ByteRange>,
}
-impl Transition<Epsilon> {
+impl Transition {
fn new(consumes: ByteRange, to: StateId) -> Self {
Self {
to,
consumes: Some(consumes),
- assert: Vec::new(),
}
}
fn epsilon(to: StateId) -> Self {
- Self {
- to,
- consumes: None,
- assert: Vec::new(),
- }
+ Self { to, consumes: None }
}
- pub fn is_epsilon(&self) -> bool {
- self.consumes.is_none()
- }
-
- fn non_epsilon(self) -> Option<Transition<Resolved>> {
- let Self {
- to,
- assert,
- consumes,
- } = self;
- let consumes = consumes?;
- Some(Transition {
- consumes,
- to,
- assert,
- })
- }
-}
-
-impl<S: Stage> Transition<S> {
fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
self.to = f(self.to);
}
@@ -592,11 +567,6 @@ impl<S: Stage> Transition<S> {
fn reachable_states(&self) -> impl Iterator<Item = StateId> {
[self.to].into_iter()
}
-
- fn with_assertions(mut self, mut more: Vec<Assertion>) -> Self {
- self.assert.append(&mut more);
- self
- }
}
#[derive(Debug, Copy, Clone, PartialEq)]
@@ -607,41 +577,32 @@ pub enum Acceptance {
}
#[derive(Debug, Clone)]
-pub struct EState<S: Stage> {
- pub trans: HashSet<Transition<S>>,
+pub struct EState {
+ pub trans: HashSet<Transition>,
+ pub assert: Option<Assertion>,
pub accept: Acceptance,
}
-impl EState<Epsilon> {
- fn remove_epsilon(self) -> EState<Resolved> {
- let trans = self
- .trans
- .into_iter()
- .filter_map(Transition::non_epsilon)
- .collect();
- let accept = self.accept;
- EState { trans, accept }
- }
-
- fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition<Epsilon>>) {
+impl EState {
+ fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) {
self.trans.retain(|t| t.consumes.is_some());
for transition in trans.into_iter() {
assert!(transition.consumes.is_none());
self.trans.insert(transition);
}
}
-}
-impl<S: Stage> EState<S> {
fn start() -> Self {
Self {
trans: HashSet::new(),
+ assert: None,
accept: Acceptance::NotYet,
}
}
fn terminal() -> Self {
Self {
trans: HashSet::new(),
+ assert: None,
accept: Acceptance::Accept,
}
}
@@ -656,10 +617,16 @@ impl<S: Stage> EState<S> {
t
})
.collect();
+ if let Some(a) = self.assert.as_mut() {
+ a.to = f(a.to);
+ }
}
fn reachable_states(&self) -> impl Iterator<Item = StateId> {
- self.trans.iter().flat_map(|t| t.reachable_states())
+ self.trans
+ .iter()
+ .flat_map(|t| t.reachable_states())
+ .chain(self.assert.iter().map(|a| a.to))
}
}
@@ -668,7 +635,7 @@ pub enum EnfaTranslationError {
AssertionsNotSupported,
}
-impl TryFrom<Pattern> for ENFA<Epsilon> {
+impl TryFrom<Pattern> for ENFA {
type Error = EnfaTranslationError;
fn try_from(value: Pattern) -> Result<Self, Self::Error> {
@@ -678,13 +645,14 @@ impl TryFrom<Pattern> for ENFA<Epsilon> {
states: vec![
EState {
trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)],
+ assert: None,
accept: Acceptance::NotYet,
},
EState::terminal(),
],
},
Pattern::Alt(alts) => {
- let nfas: Vec<ENFA<Epsilon>> = alts
+ let nfas: Vec<ENFA> = alts
.into_iter()
.map(Self::try_from)
.collect::<Result<_, _>>()?;
@@ -739,11 +707,8 @@ impl TryFrom<Pattern> for ENFA<Epsilon> {
let mut regex = regex.shift(1);
let mut states = Vec::with_capacity(regex.len() + 2);
states.push(EState {
- trans: set![Transition {
- to: regex.len() + 1,
- assert: vec![Assertion { to: 1, polarity }],
- consumes: None
- }],
+ trans: set![Transition::epsilon(regex.len() + 1)],
+ assert: Some(Assertion { to: 1, polarity }),
accept: Acceptance::NotYet,
});
states.append(&mut regex);