aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-02 16:19:05 +0200
committerJonas Maier <jonas@x77.dev>2026-06-02 16:19:05 +0200
commite1f64a93c0246d7832a308c756b654965fe3710d (patch)
treed3b5024f84eb1e15149ddbd493cdc7d30e86ea54 /src/parse/regex
parent93a7ccabdf85ab2733b2b67810750a97bf3509cb (diff)
downloadpish-e1f64a93c0246d7832a308c756b654965fe3710d.tar.gz
assertion propagation in enfa
Diffstat (limited to 'src/parse/regex')
-rw-r--r--src/parse/regex/enfa.rs130
-rw-r--r--src/parse/regex/mod.rs2
2 files changed, 101 insertions, 31 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs
index 88f4536..76b5e63 100644
--- a/src/parse/regex/enfa.rs
+++ b/src/parse/regex/enfa.rs
@@ -3,7 +3,7 @@ use std::{
hash::{DefaultHasher, Hash, Hasher},
};
-pub trait Stage : Clone + std::fmt::Debug + std::hash::Hash + Eq {
+pub trait Stage: Clone + std::fmt::Debug + std::hash::Hash + Eq {
type Consume: Clone + std::fmt::Debug + std::hash::Hash + Eq;
}
@@ -19,6 +19,8 @@ impl Stage for Resolved {
type Consume = ByteRange;
}
+use crate::parse::regex::{LookDirection, LookPolarity};
+
use super::Pattern;
use super::byte_range::ByteRange;
@@ -53,7 +55,9 @@ impl<'a> MultiState<'a> {
states.dedup();
states.shrink_to_fit();
- let accept = states.iter().any(|&x| nfa.states[x].accept);
+ let accept = states
+ .iter()
+ .any(|&x| nfa.states[x].accept == Acceptance::Accept);
let mut hasher = DefaultHasher::new();
states.hash(&mut hasher);
let hash = hasher.finish();
@@ -128,7 +132,9 @@ impl<S: Stage> ENFA<S> {
for state in s.iter_mut() {
state.remap(|i| i + amt);
- state.accept = false;
+ if state.accept == Acceptance::Accept {
+ state.accept = Acceptance::NotYet;
+ }
}
s
@@ -163,28 +169,30 @@ impl<S: Stage> ENFA<S> {
}
impl ENFA<Epsilon> {
- fn epsilon_dfs(&self, i: StateId, visited: &mut [bool]) {
- if visited[i] {
+ fn epsilon_dfs(&self, i: StateId, visited: &mut [Option<Vec<Assertion>>], path: Vec<Assertion>) {
+ if visited[i].is_some() {
return;
}
- visited[i] = true;
+ visited[i] = Some(path.clone());
for t in self.states[i].trans.iter() {
if t.is_epsilon() {
- self.epsilon_dfs(t.to, visited);
+ let mut path = path.clone();
+ path.append(&mut t.assert.clone());
+ self.epsilon_dfs(t.to, visited, path);
}
}
}
pub fn resolve_epsilon(self) -> ENFA<Resolved> {
// state X --> { state Y, Z, W which get inlined into X }
- let includes: Vec<Vec<StateId>> = (0..self.states.len())
+ let includes: Vec<Vec<(StateId, Vec<Assertion>)>> = (0..self.states.len())
.map(|i| {
- let mut reach = vec![false; self.states.len()];
- self.epsilon_dfs(i, &mut reach);
+ let mut reach = vec![None; self.states.len()];
+ self.epsilon_dfs(i, &mut reach, Vec::new());
reach
.into_iter()
.enumerate()
- .filter_map(|(x, r)| if r { Some(x) } else { None })
+ .filter_map(|(x, r)| r.map(|r| (x, r)))
.collect()
})
.collect();
@@ -198,11 +206,16 @@ impl ENFA<Epsilon> {
// inline real transitions
for i in 0..states.len() {
- for &k in includes[i].iter() {
- let new = states[k].trans.clone();
+ for (k, assertions) in includes[i].iter() {
+ let new: Vec<Transition<_>> = states[*k]
+ .trans
+ .iter()
+ .cloned()
+ .map(|t| t.with_assertions(assertions.clone()))
+ .collect();
states[i].trans.extend(new);
- if states[k].accept {
- states[i].accept = true;
+ if states[*k].accept != Acceptance::NotYet {
+ states[i].accept = states[*k].accept;
}
}
}
@@ -257,7 +270,7 @@ impl ENFA<Epsilon> {
}
let len = states.len();
- states[len - 1].accept = true;
+ states[len - 1].accept = Acceptance::Accept;
Self { states }
}
@@ -306,8 +319,10 @@ impl std::fmt::Debug for ENFA<Epsilon> {
write!(f, "~>{k} ")?;
}
}
- if s.accept {
- write!(f, "accept")?;
+ match s.accept {
+ Acceptance::Accept => write!(f, "accept")?,
+ Acceptance::Assertion => write!(f, "assert")?,
+ Acceptance::NotYet => {}
}
writeln!(f)?;
}
@@ -318,19 +333,33 @@ impl std::fmt::Debug for ENFA<Epsilon> {
pub type StateId = usize;
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct Assertion {
+ to: StateId,
+ polarity: LookPolarity,
+}
+
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct Transition<S: Stage> {
to: StateId,
+ assert: Vec<Assertion>,
consumes: S::Consume,
}
impl Transition<Epsilon> {
fn new(consumes: ByteRange, to: StateId) -> Self {
- let consumes = Some(consumes);
- Self { to, consumes }
+ Self {
+ to,
+ consumes: Some(consumes),
+ assert: Vec::new(),
+ }
}
fn epsilon(to: StateId) -> Self {
- Self { to, consumes: None }
+ Self {
+ to,
+ consumes: None,
+ assert: Vec::new(),
+ }
}
pub fn is_epsilon(&self) -> bool {
@@ -338,9 +367,17 @@ impl Transition<Epsilon> {
}
fn non_epsilon(self) -> Option<Transition<Resolved>> {
- let consumes = self.consumes?;
- let to = self.to;
- Some(Transition { consumes, to })
+ let Self {
+ to,
+ assert,
+ consumes,
+ } = self;
+ let consumes = consumes?;
+ Some(Transition {
+ consumes,
+ to,
+ assert,
+ })
}
}
@@ -352,12 +389,24 @@ impl<S: Stage> Transition<S> {
fn reachable_states(&self) -> impl Iterator<Item = StateId> {
[self.to].into_iter()
}
+
+ fn with_assertions(mut self, mut more: Vec<Assertion>) -> Self {
+ self.assert.append(&mut more);
+ self
+ }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum Acceptance {
+ Accept,
+ Assertion,
+ NotYet,
}
#[derive(Debug, Clone)]
pub struct EState<S: Stage> {
pub trans: HashSet<Transition<S>>,
- pub accept: bool,
+ pub accept: Acceptance,
}
impl EState<Epsilon> {
@@ -384,13 +433,13 @@ impl<S: Stage> EState<S> {
fn start() -> Self {
Self {
trans: HashSet::new(),
- accept: false,
+ accept: Acceptance::NotYet,
}
}
fn terminal() -> Self {
Self {
trans: HashSet::new(),
- accept: true,
+ accept: Acceptance::Accept,
}
}
@@ -426,7 +475,7 @@ impl TryFrom<Pattern> for ENFA<Epsilon> {
states: vec![
EState {
trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)],
- accept: false,
+ accept: Acceptance::NotYet,
},
EState::terminal(),
],
@@ -474,8 +523,29 @@ impl TryFrom<Pattern> for ENFA<Epsilon> {
Pattern::Nothing => Self {
states: vec![EState::terminal()],
},
- Pattern::Assertion(..) => {
- return Err(EnfaTranslationError::AssertionsNotSupported);
+ Pattern::Assertion(dir, polarity, pat) => {
+ if dir == LookDirection::Behind {
+ return Err(EnfaTranslationError::AssertionsNotSupported);
+ }
+ let mut regex = Self::try_from(*pat)?;
+ for s in regex.states.iter_mut() {
+ if s.accept == Acceptance::Accept {
+ s.accept = Acceptance::Assertion;
+ }
+ }
+ let mut regex = regex.shift(1);
+ let mut states = Vec::with_capacity(regex.len() + 2);
+ states.push(EState {
+ trans: set![Transition {
+ to: regex.len() + 1,
+ assert: vec![Assertion { to: 1, polarity }],
+ consumes: None
+ }],
+ accept: Acceptance::NotYet,
+ });
+ states.append(&mut regex);
+ states.push(EState::terminal());
+ Self { states }
}
})
}
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
index 6add7a8..2f7c223 100644
--- a/src/parse/regex/mod.rs
+++ b/src/parse/regex/mod.rs
@@ -12,7 +12,7 @@ pub enum LookDirection {
Behind,
}
-#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
pub enum LookPolarity {
Positive,
Negative,