aboutsummaryrefslogtreecommitdiffstats
path: root/src/regex/enfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex/enfa.rs')
-rw-r--r--src/regex/enfa.rs741
1 files changed, 741 insertions, 0 deletions
diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs
new file mode 100644
index 0000000..8392642
--- /dev/null
+++ b/src/regex/enfa.rs
@@ -0,0 +1,741 @@
+use std::{
+ collections::HashSet,
+ hash::{DefaultHasher, Hash, Hasher},
+};
+
+use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange};
+
+/// NFA with epsilon transitions
+#[derive(Clone)]
+#[allow(clippy::upper_case_acronyms)]
+pub struct ENFA {
+ pub states: Vec<EState>,
+ pub has_submatches: bool,
+}
+
+fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> {
+ let mut result = vec![Vec::new()];
+
+ for xs in x {
+ let mut next = Vec::new();
+
+ for prefix in &result {
+ for item in &xs {
+ let mut v = prefix.clone();
+ v.push(item.clone());
+ next.push(v);
+ }
+ }
+
+ result = next;
+ }
+
+ result
+}
+
+#[cfg(test)]
+mod product_tests {
+ use super::cartesian_product;
+
+ #[test]
+ fn basic_case() {
+ let x = vec![vec![1, 2], vec![10, 20]];
+
+ let out = cartesian_product(x);
+
+ assert_eq!(
+ out,
+ vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],]
+ );
+ }
+
+ #[test]
+ fn single_dimension() {
+ let x = vec![vec![1, 2, 3]];
+
+ let out = cartesian_product(x);
+
+ assert_eq!(out, vec![vec![1], vec![2], vec![3],]);
+ }
+
+ #[test]
+ fn empty_outer_vector() {
+ let x: Vec<Vec<i32>> = vec![];
+
+ let out: Vec<Vec<i32>> = cartesian_product(x);
+ let r: Vec<Vec<i32>> = vec![vec![]];
+
+ // One empty combination.
+ assert_eq!(out, r);
+ }
+
+ #[test]
+ fn empty_inner_vector() {
+ let x = vec![vec![1, 2], vec![], vec![3, 4]];
+
+ let out = cartesian_product(x);
+
+ assert!(out.is_empty());
+ }
+
+ #[test]
+ fn output_size_matches_product() {
+ let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
+
+ let out = cartesian_product(x);
+
+ assert_eq!(out.len(), 3 * 2 * 4);
+ }
+
+ #[test]
+ fn every_output_has_correct_length() {
+ let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']];
+
+ let out = cartesian_product(x);
+
+ assert!(out.iter().all(|v| v.len() == 3));
+ }
+
+ #[test]
+ fn works_with_strings() {
+ let x = vec![
+ vec!["a".to_string(), "b".to_string()],
+ vec!["x".to_string()],
+ ];
+
+ let out = cartesian_product(x);
+
+ assert_eq!(
+ out,
+ vec![
+ vec!["a".to_string(), "x".to_string()],
+ vec!["b".to_string(), "x".to_string()],
+ ]
+ );
+ }
+}
+
+#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct Thread {
+ state: StateId,
+ positives: Vec<Thread>,
+ negatives: Vec<Thread>,
+}
+
+impl Thread {
+ fn new_simple(state: StateId) -> Self {
+ Self {
+ state,
+ positives: Vec::new(),
+ negatives: Vec::new(),
+ }
+ }
+
+ fn new(
+ enfa: &ENFA,
+ state: StateId,
+ mut positives: Vec<Thread>,
+ mut negatives: Vec<Thread>,
+ ) -> Option<Self> {
+ positives.sort();
+ positives.dedup();
+ positives.retain(|t| !t.accept(enfa));
+ positives.shrink_to_fit();
+
+ negatives.sort();
+ negatives.dedup();
+ negatives.shrink_to_fit();
+
+ if negatives.iter().any(|t| t.accept(enfa)) {
+ return None;
+ }
+
+ Some(Self {
+ state,
+ positives,
+ negatives,
+ })
+ }
+
+ fn accept(&self, enfa: &ENFA) -> bool {
+ let pos = self.positives.iter().all(|t| t.accept(enfa));
+ let neg = self.negatives.iter().all(|t| !t.accept(enfa));
+ let this = match enfa.states[self.state].accept {
+ Acceptance::Accept => true,
+ Acceptance::Assertion => true,
+ Acceptance::NotYet => false,
+ };
+ pos && neg && this
+ }
+
+ fn step_epsilon0(
+ self,
+ enfa: &ENFA,
+ ret: &mut impl FnMut(Thread),
+ visited: &mut [bool],
+ new_assertions: Vec<Assertion>,
+ ) {
+ if visited[self.state] {
+ return;
+ }
+ visited[self.state] = true;
+
+ for t in enfa.states[self.state].trans.iter() {
+ if t.consumes.is_some() {
+ continue;
+ }
+
+ let mut new_assertions = new_assertions.clone();
+ if let Some(assertion) = enfa.states[self.state].assert.as_ref() {
+ new_assertions.push(assertion.clone());
+ }
+
+ Self {
+ state: t.to,
+ ..self.clone()
+ }
+ .step_epsilon0(enfa, ret, visited, new_assertions);
+ }
+
+ let Self {
+ state,
+ positives,
+ negatives,
+ } = self;
+ let mut p = Vec::new();
+ let mut n = Vec::new();
+ for assertion in new_assertions {
+ let threads = Self::new_simple(assertion.to).step_epsilon(enfa);
+ let vec = match assertion.polarity {
+ LookPolarity::Positive => &mut p,
+ LookPolarity::Negative => &mut n,
+ };
+ vec.push(threads);
+ }
+ let p = cartesian_product(p);
+ for mut p in p {
+ let mut positives = positives.clone();
+ let mut negatives = negatives.clone();
+ positives.append(&mut p);
+ negatives.append(&mut n.iter().flatten().cloned().collect());
+ if let Some(thread) = Self::new(enfa, state, positives, negatives) {
+ ret(thread);
+ }
+ }
+ }
+
+ fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> {
+ let mut vec = Vec::new();
+ self.step_epsilon0(
+ enfa,
+ &mut |t| vec.push(t),
+ &mut vec![false; enfa.states.len()],
+ Vec::new(),
+ );
+ vec
+ }
+
+ fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) {
+ let positives = self
+ .positives
+ .clone()
+ .into_iter()
+ .map(|t| t.step(enfa, input))
+ .collect();
+ let negatives: Vec<_> = self
+ .negatives
+ .into_iter()
+ .flat_map(|t| t.step(enfa, input))
+ .collect();
+ let positives = cartesian_product(positives);
+ let next_states: Vec<StateId> = enfa.states[self.state]
+ .trans
+ .iter()
+ .filter_map(|t| {
+ if let Some(ch) = t.consumes
+ && ch.overlaps(input)
+ {
+ Some(t.to)
+ } else {
+ None
+ }
+ })
+ .collect();
+
+ for s in next_states {
+ for p in positives.clone() {
+ if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) {
+ thread.step_epsilon0(
+ enfa,
+ ret,
+ &mut vec![false; enfa.states.len()],
+ Vec::new(),
+ );
+ }
+ }
+ }
+ }
+
+ fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> {
+ let mut vec = Vec::new();
+ self.step0(enfa, input, &mut |x| vec.push(x));
+ vec
+ }
+
+ fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) {
+ for t in enfa.states[self.state].trans.iter() {
+ if let Some(ch) = t.consumes {
+ f(ch);
+ }
+ }
+ for t in self.positives.iter() {
+ t.possible_transitions(enfa, f);
+ }
+ for t in self.negatives.iter() {
+ t.possible_transitions(enfa, f);
+ }
+ }
+}
+
+#[derive(Clone)]
+pub struct MultiState<'a> {
+ nfa: &'a ENFA,
+ threads: Vec<Thread>,
+ accept: bool,
+ hash: u64,
+}
+
+impl<'a> PartialEq for MultiState<'a> {
+ fn eq(&self, other: &Self) -> bool {
+ (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64)
+ && self.threads == other.threads
+ && self.accept == other.accept
+ && self.hash == other.hash
+ }
+}
+impl<'a> Eq for MultiState<'a> {}
+
+impl<'a> MultiState<'a> {
+ fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self {
+ threads.sort();
+ threads.dedup();
+ threads.shrink_to_fit();
+
+ let accept = threads.iter().any(|t| t.accept(nfa));
+ let mut hasher = DefaultHasher::new();
+ threads.hash(&mut hasher);
+ let hash = hasher.finish();
+
+ Self {
+ nfa,
+ threads,
+ accept,
+ hash,
+ }
+ }
+
+ /// all the chars that will make an interesting transition
+ pub fn possible_transitions(&self) -> Vec<ByteRange> {
+ let mut vec = Vec::new();
+ for t in self.threads.iter() {
+ t.possible_transitions(self.nfa, &mut |x| vec.push(x));
+ }
+ vec = ByteRange::split_to_disjoint(vec);
+ vec.sort();
+ vec.dedup();
+ vec.shrink_to_fit();
+ vec
+ }
+
+ pub fn transition(&self, ch: ByteRange) -> Self {
+ let new_states = self
+ .threads
+ .iter()
+ .flat_map(|t| t.clone().step(self.nfa, ch))
+ .collect();
+
+ Self::new(self.nfa, new_states)
+ }
+
+ pub fn accept(&self) -> bool {
+ self.accept
+ }
+}
+
+impl<'a> Hash for MultiState<'a> {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.hash.hash(state)
+ }
+}
+
+macro_rules! set {
+ () => {
+ std::collections::HashSet::new()
+ };
+ ( $( $x:expr ),* ) => {{
+ let mut set = std::collections::HashSet::new();
+ $(
+ set.insert($x);
+ )*
+ set
+ }};
+}
+
+impl ENFA {
+ fn shift(self, amt: usize) -> Vec<EState> {
+ let mut s = self.states;
+
+ for state in s.iter_mut() {
+ state.remap(|i| i + amt);
+ if state.accept == Acceptance::Accept {
+ state.accept = Acceptance::NotYet;
+ }
+ }
+
+ s
+ }
+
+ pub fn remove_unreachable(&mut self) {
+ let mut used = vec![false; self.states.len()];
+ used[0] = true;
+ for s in self.states.iter() {
+ for i in s.reachable_states() {
+ used[i] = true;
+ }
+ }
+ let mut remap = vec![0; self.states.len()];
+ let mut shift = 0;
+ for i in 0..self.states.len() {
+ if used[i] {
+ remap[i] = i - shift;
+ } else {
+ shift += 1;
+ }
+ }
+ for i in (0..self.states.len()).rev() {
+ if !used[i] {
+ self.states.remove(i);
+ }
+ }
+ for s in self.states.iter_mut() {
+ s.remap(|i| remap[i]);
+ }
+ }
+}
+
+impl ENFA {
+ fn looping(self) -> Self {
+ let has_submatches = self.has_submatches;
+ let mut states = vec![EState::start()];
+ states.append(&mut self.shift(1));
+ let len = states.len();
+ states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]);
+ states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]);
+ states.push(EState::terminal());
+ Self { states, has_submatches }
+ }
+
+ fn repeat(self, times: usize) -> Self {
+ let reps = vec![self; times];
+ Self::concat(reps)
+ }
+
+ /// between 0 and x repetitions
+ fn optx(self, x: usize) -> Self {
+ let len = self.states.len();
+ let mut repped = self.repeat(x);
+ assert_eq!(repped.states.len(), x * len);
+ for i in 1..=x {
+ repped.states[0]
+ .trans
+ .insert(Transition::epsilon(i * len - 1));
+ }
+ repped
+ }
+
+ fn concat(nfas: Vec<Self>) -> Self {
+ if nfas.is_empty() {
+ return Self {
+ states: vec![EState::terminal()],
+ has_submatches: false,
+ };
+ }
+
+ let mut has_submatches = false;
+ let mut states: Vec<EState> = Vec::new();
+ for nfa in nfas.into_iter() {
+ has_submatches = has_submatches || nfa.has_submatches;
+ let len = states.len();
+ let mut ns = nfa.shift(len);
+ if let Some(n) = states.last_mut() {
+ n.trans.retain(|t| t.consumes.is_some());
+ n.trans.insert(Transition::epsilon(len));
+ }
+ states.append(&mut ns);
+ }
+
+ let len = states.len();
+ states[len - 1].accept = Acceptance::Accept;
+
+ Self { states, has_submatches }
+ }
+}
+
+impl ENFA {
+ pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> {
+ let threads = Thread::new_simple(0).step_epsilon(self);
+ MultiState::new(self, threads)
+ }
+
+ pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> {
+ MultiState::new(self, vec![])
+ }
+
+ pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> {
+ let mut states = set![self.start_multi_state()];
+ let mut q = vec![self.start_multi_state()];
+
+ while let Some(state) = q.pop() {
+ let chars = state.possible_transitions();
+
+ for chr in chars {
+ let new = state.transition(chr);
+
+ if !states.contains(&new) {
+ states.insert(new.clone());
+ q.push(new);
+ }
+ }
+ }
+
+ states
+ }
+}
+
+impl std::fmt::Debug for ENFA {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ writeln!(f, "NFA {{")?;
+ for (i, s) in self.states.iter().enumerate() {
+ write!(f, " {i}: ")?;
+ if let Some(a) = s.assert.as_ref() {
+ match a.polarity {
+ LookPolarity::Positive => write!(f, "+{} ", a.to)?,
+ LookPolarity::Negative => write!(f, "-{} ", a.to)?,
+ }
+ }
+ for t in s.trans.iter() {
+ let k = t.to;
+ if let Some(c) = t.consumes {
+ write!(f, "{c:?}=>{k} ")?;
+ } else {
+ write!(f, "~>{k} ")?;
+ }
+ }
+ match s.accept {
+ Acceptance::Accept => write!(f, "accept")?,
+ Acceptance::Assertion => write!(f, "assert")?,
+ Acceptance::NotYet => {}
+ }
+ writeln!(f)?;
+ }
+ write!(f, "}}")
+ }
+}
+
+pub type StateId = usize;
+
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct Assertion {
+ to: StateId,
+ polarity: LookPolarity,
+}
+
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct Transition {
+ to: StateId,
+ consumes: Option<ByteRange>,
+}
+
+impl Transition {
+ fn new(consumes: ByteRange, to: StateId) -> Self {
+ Self {
+ to,
+ consumes: Some(consumes),
+ }
+ }
+
+ fn epsilon(to: StateId) -> Self {
+ Self { to, consumes: None }
+ }
+
+ fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
+ self.to = f(self.to);
+ }
+
+ fn reachable_states(&self) -> impl Iterator<Item = StateId> {
+ [self.to].into_iter()
+ }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum Acceptance {
+ Accept,
+ Assertion,
+ NotYet,
+}
+
+#[derive(Debug, Clone)]
+pub struct EState {
+ pub trans: HashSet<Transition>,
+ pub assert: Option<Assertion>,
+ pub accept: Acceptance,
+}
+
+impl EState {
+ fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) {
+ self.trans.retain(|t| t.consumes.is_some());
+ for transition in trans.into_iter() {
+ assert!(transition.consumes.is_none());
+ self.trans.insert(transition);
+ }
+ }
+
+ fn start() -> Self {
+ Self {
+ trans: HashSet::new(),
+ assert: None,
+ accept: Acceptance::NotYet,
+ }
+ }
+ fn terminal() -> Self {
+ Self {
+ trans: HashSet::new(),
+ assert: None,
+ accept: Acceptance::Accept,
+ }
+ }
+
+ fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
+ self.trans = self
+ .trans
+ .iter()
+ .cloned()
+ .map(|mut t| {
+ t.remap(&mut f);
+ t
+ })
+ .collect();
+ if let Some(a) = self.assert.as_mut() {
+ a.to = f(a.to);
+ }
+ }
+
+ fn reachable_states(&self) -> impl Iterator<Item = StateId> {
+ self.trans
+ .iter()
+ .flat_map(|t| t.reachable_states())
+ .chain(self.assert.iter().map(|a| a.to))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub enum EnfaTranslationError {
+ CharacterClassNotSupported,
+ AssertionsNotSupported,
+}
+
+impl TryFrom<Pattern> for ENFA {
+ type Error = EnfaTranslationError;
+
+ fn try_from(value: Pattern) -> Result<Self, Self::Error> {
+ Ok(match value {
+ Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?,
+ Pattern::Range(c1, c2) => Self {
+ states: vec![
+ EState {
+ trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)],
+ assert: None,
+ accept: Acceptance::NotYet,
+ },
+ EState::terminal(),
+ ],
+ has_submatches: false,
+ },
+ Pattern::CharacterClass(_) => {
+ return Err(EnfaTranslationError::CharacterClassNotSupported);
+ }
+ Pattern::Alt(alts) => {
+ let nfas: Vec<ENFA> = alts
+ .into_iter()
+ .map(Self::try_from)
+ .collect::<Result<_, _>>()?;
+ let mut states = vec![EState::start()];
+ let mut ends = vec![];
+ let mut has_submatches = false;
+ for nfa in nfas.into_iter() {
+ has_submatches = has_submatches || nfa.has_submatches;
+ let len = states.len();
+ states[0].trans.insert(Transition::epsilon(len));
+ states.append(&mut (nfa.shift(len)));
+ ends.push(states.len() - 1);
+ }
+ states.push(EState::terminal());
+ for end in ends.into_iter() {
+ let last = states.len() - 1;
+ states[end].trans.insert(Transition::epsilon(last));
+ }
+ Self { states, has_submatches }
+ }
+ Pattern::Concat(seq) => {
+ let nfas: Vec<Self> = seq
+ .into_iter()
+ .map(Self::try_from)
+ .collect::<Result<_, _>>()?;
+ Self::concat(nfas)
+ }
+ Pattern::Rep(regex, min, None, _) => {
+ let nfa = ENFA::try_from(*regex)?;
+ let base = nfa.clone().repeat(min as usize);
+ let tail = nfa.looping();
+ Self::concat(vec![base, tail])
+ }
+ Pattern::Rep(regex, min, Some(max), _) => {
+ assert!(min < max);
+ let nfa = Self::try_from(*regex)?;
+ let base = nfa.clone().repeat(min as usize);
+ let tail = nfa.optx((max - min) as usize);
+ Self::concat(vec![base, tail])
+ }
+ Pattern::Nothing => Self {
+ states: vec![EState::terminal()],
+ has_submatches: false,
+ },
+ Pattern::Assertion(dir, polarity, pat) => {
+ if dir == LookDirection::Behind {
+ return Err(EnfaTranslationError::AssertionsNotSupported);
+ }
+ let mut regex = Self::try_from(*pat)?;
+ for s in regex.states.iter_mut() {
+ if s.accept == Acceptance::Accept {
+ s.accept = Acceptance::Assertion;
+ }
+ }
+ let mut regex = regex.shift(1);
+ let mut states = Vec::with_capacity(regex.len() + 2);
+ states.push(EState {
+ trans: set![Transition::epsilon(regex.len() + 1)],
+ assert: Some(Assertion { to: 1, polarity }),
+ accept: Acceptance::NotYet,
+ });
+ states.append(&mut regex);
+ states.push(EState::terminal());
+ Self { states, has_submatches: false, }
+ }
+ Pattern::Submatch(pat) => {
+ let mut this = Self::try_from(*pat)?;
+ this.has_submatches = true;
+ this
+ }
+ })
+ }
+}