use core::fmt; use std::collections::{BinaryHeap, HashMap, HashSet}; use crate::regex::{Match, RegexEngine, enfa::EnfaTranslationError}; use super::{ Pattern, byte_range::ByteRange, enfa::{ENFA, MultiState}, }; pub type StateId = usize; pub struct State { trans: HashMap, default_trans: StateId, accept: bool, } #[allow(clippy::upper_case_acronyms)] pub struct DFA { start: StateId, states: Vec, } impl fmt::Debug for DFA { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(f, "DFA {{")?; for (i, s) in self.states.iter().enumerate() { if self.start == i { write!(f, "-> {i}: ")?; } else { write!(f, " {i}: ")?; } for (chr, to) in s.trans.iter() { write!(f, "{chr:?} to {to}, ")?; } write!(f, "dfl to {}", s.default_trans)?; if s.accept { write!(f, ", accept")?; } writeln!(f)?; } writeln!(f, "}}") } } impl From for DFA { fn from(mut nfa: ENFA) -> Self { nfa.remove_unreachable(); let mut multi_states = nfa.all_multi_states(); multi_states.insert(nfa.void_multi_state()); let mut len = 0; let multi_to_dfa: HashMap = multi_states .clone() .into_iter() .map(|ms| { len += 1; (ms, len - 1) }) .collect(); let void = multi_to_dfa[&nfa.void_multi_state()]; let mut states: Vec = (0..len) .map(|_| State { trans: HashMap::new(), default_trans: void, accept: false, }) .collect(); for ms in multi_states.iter() { let i: usize = multi_to_dfa[ms]; states[i].accept = ms.accept(); for t in ms.possible_transitions() { let k = multi_to_dfa[&ms.transition(t)]; states[i].trans.insert(t, k); } } let mut this = Self { start: multi_to_dfa[&nfa.start_multi_state()], states, }; this.minify(); this } } #[derive(Clone, Debug)] pub enum DFACompileError { SubmatchesNotSupported, #[allow(unused)] NFAError(EnfaTranslationError), } impl RegexEngine for DFA { type CompileError = DFACompileError; fn compile(pat: Pattern) -> Result { match ENFA::try_from(pat) { Ok(nfa) => { if nfa.has_submatches { Err(DFACompileError::SubmatchesNotSupported) } else { Ok(Self::from(nfa)) } } Err(e) => Err(DFACompileError::NFAError(e)), } } fn run(&self, input: &[u8]) -> Option { if self.matches(input) { Some(Match::new_empty()) } else { None } } } impl DFA { pub fn matches(&self, x: &[u8]) -> bool { let mut state = self.start; 'next_byte: for &b in x.iter() { for (range, &next_state) in self.states[state].trans.iter() { if range.contains(b) { state = next_state; continue 'next_byte; } } state = self.states[state].default_trans; } self.states[state].accept } } mod state_set { #[derive(Hash, Clone, PartialEq, Eq)] pub struct StateSet { set: Vec, } impl StateSet { pub fn new(mut set: Vec) -> Self { set.sort(); set.dedup(); Self { set } } pub fn iter(&self) -> impl Iterator { self.set.iter().cloned() } pub fn intersection(&self, other: &Self) -> Self { let a = &self.set; let b = &other.set; let mut i = 0; let mut j = 0; let mut out = Vec::new(); while i < a.len() && j < b.len() { match a[i].cmp(&b[j]) { std::cmp::Ordering::Less => i += 1, std::cmp::Ordering::Greater => j += 1, std::cmp::Ordering::Equal => { out.push(a[i]); i += 1; j += 1; } } } Self::new(out) } pub fn difference(&self, other: &Self) -> Self { let a = &self.set; let b = &other.set; let mut i = 0; let mut j = 0; let mut out = Vec::new(); while i < a.len() && j < b.len() { match a[i].cmp(&b[j]) { std::cmp::Ordering::Less => { out.push(a[i]); i += 1; } std::cmp::Ordering::Greater => { j += 1; } std::cmp::Ordering::Equal => { i += 1; j += 1; } } } out.extend_from_slice(&a[i..]); Self::new(out) } pub fn is_empty(&self) -> bool { self.set.is_empty() } pub fn len(&self) -> usize { self.set.len() } pub fn primary_state(&self) -> Option { self.set.first().cloned() } } // custom implementation such that smaller sets come first in a BinaryHeap impl Ord for StateSet { fn cmp(&self, other: &Self) -> std::cmp::Ordering { match self.set.len().cmp(&other.set.len()) { std::cmp::Ordering::Equal => {} other => return other.reverse(), } self.set.cmp(&other.set) } } impl PartialOrd for StateSet { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } } use state_set::StateSet; trait GoesTo { fn goes_to(&self, to: usize) -> bool; } impl GoesTo for (&State, ByteRange) { fn goes_to(&self, target: usize) -> bool { let from = self.0; let ch = self.1; for (c, &to) in from.trans.iter() { if c.overlaps(ch) && to == target { return true; } } from.default_trans == target } } impl DFA { fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet { let states = self .states .iter() .enumerate() .filter_map(|(i, s)| if f(s) { Some(i) } else { None }) .collect(); StateSet::new(states) } /// https://en.wikipedia.org/wiki/DFA_minimization fn hopcroft_minimization(&mut self) { let mut partitions: HashSet = HashSet::new(); partitions.insert(self.states_where(|s| s.accept)); partitions.insert(self.states_where(|s| !s.accept)); let mut ranges: Vec = self .states .iter() .flat_map(|s| s.trans.iter().map(|t| *t.0)) .chain([ByteRange::new_range(u8::MIN, u8::MAX)]) .collect(); ranges.sort(); ranges.dedup(); let ranges = ByteRange::split_to_disjoint(ranges); let mut queue: BinaryHeap = partitions.iter().cloned().collect(); let mut queue_set = partitions.clone(); while let Some(a) = queue.pop() { if !queue_set.contains(&a) { continue; } for &c in ranges.iter() { let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a))); let mut del_list = HashSet::new(); let mut add_list = Vec::new(); for y in partitions.iter() { let i = x.intersection(y); let d = y.difference(&x); if !i.is_empty() && !d.is_empty() { del_list.insert(y.clone()); add_list.push(i.clone()); add_list.push(d.clone()); if queue_set.contains(y) { queue_set.remove(y); queue.push(i.clone()); queue_set.insert(i); queue.push(d.clone()); queue_set.insert(d); } else if i.len() < d.len() { queue.push(i.clone()); queue_set.insert(i); } else { queue.push(d.clone()); queue_set.insert(d); } } } partitions.retain(|i| !del_list.contains(i)); for x in add_list { partitions.insert(x); } } } let mut replacement = vec![None; self.states.len()]; for partition in partitions { if let Some(x) = partition.primary_state() { for state in partition.iter() { assert!(replacement[state].is_none()); replacement[state] = Some(x); } } } // replacement indices in original index space let replacement: Vec = replacement.into_iter().map(|x| x.unwrap()).collect(); let is_alive = |idx: usize| replacement[idx] == idx; // compact index space let mut compact: Vec = vec![usize::MAX; self.states.len()]; let mut next = 0; for i in 0..self.states.len() { if is_alive(i) { compact[i] = next; next += 1; } } // remap everything and skip all no-longer-needed states let remap = |idx: usize| compact[replacement[idx]]; let mut new_states = Vec::with_capacity(next); for i in 0..self.states.len() { if is_alive(i) { let s = &self.states[i]; new_states.push(State { trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(), default_trans: remap(s.default_trans), accept: s.accept, }); } } self.states = new_states; self.start = remap(self.start); } pub fn minify(&mut self) { for state in self.states.iter_mut() { state.trans.retain(|_, to| *to != state.default_trans); if state.trans.len() == 1 && state .trans .iter() .all(|t| *t.0 == ByteRange::new_range(0, 255)) { state.default_trans = state.trans.iter().map(|x| *x.1).next().unwrap(); state.trans.clear(); } } self.hopcroft_minimization(); } }