use core::fmt; use std::collections::{BinaryHeap, HashMap, HashSet}; use crate::regex::{Match, RegexEngine, decision_tree::DecisionTree, enfa::EnfaTranslationError}; use super::{ Pattern, byte_range::ByteRange, enfa::{ENFA, MultiState}, }; pub type StateId = usize; trait DfaDecider: From { fn decide(&self, byte: u8) -> StateId; } pub struct HashMapDecider { trans: HashMap, default_trans: StateId, } impl DfaDecider for HashMapDecider { fn decide(&self, byte: u8) -> StateId { for (range, to) in self.trans.iter() { if range.contains(byte) { return *to; } } self.default_trans } } impl From for DecisionTree { fn from(value: HashMapDecider) -> Self { DecisionTree::build(value.trans, value.default_trans) } } impl DfaDecider for DecisionTree { fn decide(&self, byte: u8) -> StateId { DecisionTree::decide(&self, byte) } } impl From for [u32; 256] { fn from(value: HashMapDecider) -> Self { let mut table = [0; 256]; for i in 0..256 { table[i] = value.decide(i as u8) as u32; } table } } impl DfaDecider for [u32; 256] { fn decide(&self, byte: u8) -> StateId { self[byte as usize] as StateId } } pub struct State { decision: D, accept: bool, } #[allow(clippy::upper_case_acronyms)] pub struct DFA { start: StateId, states: Vec>, } impl fmt::Debug for DFA { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { writeln!(f, "DFA {{")?; for (i, s) in self.states.iter().enumerate() { if self.start == i { write!(f, "-> {i}: ")?; } else { write!(f, " {i}: ")?; } for (chr, to) in s.decision.trans.iter() { write!(f, "{chr:?} to {to}, ")?; } write!(f, "dfl to {}", s.decision.default_trans)?; if s.accept { write!(f, ", accept")?; } writeln!(f)?; } writeln!(f, "}}") } } impl From for DFA { fn from(mut nfa: ENFA) -> Self { nfa.remove_unreachable(); let mut multi_states = nfa.all_multi_states(); multi_states.insert(nfa.void_multi_state()); let mut len = 0; let multi_to_dfa: HashMap = multi_states .clone() .into_iter() .map(|ms| { len += 1; (ms, len - 1) }) .collect(); let void = multi_to_dfa[&nfa.void_multi_state()]; let mut states: Vec> = (0..len) .map(|_| State { decision: HashMapDecider { trans: HashMap::new(), default_trans: void, }, accept: false, }) .collect(); for ms in multi_states.iter() { let i: usize = multi_to_dfa[ms]; states[i].accept = ms.accept(); for t in ms.possible_transitions() { let k = multi_to_dfa[&ms.transition(t)]; states[i].decision.trans.insert(t, k); } } let mut this = Self { start: multi_to_dfa[&nfa.start_multi_state()], states, }; this.minify(); this } } #[derive(Clone, Debug)] pub enum DFACompileError { SubmatchesNotSupported, #[allow(unused)] NFAError(EnfaTranslationError), } impl RegexEngine for DFA { type CompileError = DFACompileError; fn compile(pat: Pattern) -> Result { match ENFA::try_from(pat) { Ok(nfa) => { if nfa.has_submatches { Err(DFACompileError::SubmatchesNotSupported) } else { let this = DFA::from(nfa); Ok(Self { states: this .states .into_iter() .map(|s| State { decision: D::from(s.decision), accept: s.accept, }) .collect(), start: this.start, }) } } Err(e) => Err(DFACompileError::NFAError(e)), } } fn run(&self, input: &[u8]) -> Option { let mut state = self.start; let mut i = 0; while i + 7 < input.len() { state = self.states[state].decision.decide(input[i+0]); state = self.states[state].decision.decide(input[i+1]); state = self.states[state].decision.decide(input[i+2]); state = self.states[state].decision.decide(input[i+3]); state = self.states[state].decision.decide(input[i+4]); state = self.states[state].decision.decide(input[i+5]); state = self.states[state].decision.decide(input[i+6]); state = self.states[state].decision.decide(input[i+7]); i += 8; } while i < input.len() { state = self.states[state].decision.decide(input[i]); i += 1; } if self.states[state].accept { Some(Match::new_empty()) } else { None } } } mod state_set { #[derive(Hash, Clone, PartialEq, Eq)] pub struct StateSet { set: Vec, } impl StateSet { pub fn new(mut set: Vec) -> Self { set.sort(); set.dedup(); Self { set } } pub fn iter(&self) -> impl Iterator { self.set.iter().cloned() } pub fn intersection(&self, other: &Self) -> Self { let a = &self.set; let b = &other.set; let mut i = 0; let mut j = 0; let mut out = Vec::new(); while i < a.len() && j < b.len() { match a[i].cmp(&b[j]) { std::cmp::Ordering::Less => i += 1, std::cmp::Ordering::Greater => j += 1, std::cmp::Ordering::Equal => { out.push(a[i]); i += 1; j += 1; } } } Self::new(out) } pub fn difference(&self, other: &Self) -> Self { let a = &self.set; let b = &other.set; let mut i = 0; let mut j = 0; let mut out = Vec::new(); while i < a.len() && j < b.len() { match a[i].cmp(&b[j]) { std::cmp::Ordering::Less => { out.push(a[i]); i += 1; } std::cmp::Ordering::Greater => { j += 1; } std::cmp::Ordering::Equal => { i += 1; j += 1; } } } out.extend_from_slice(&a[i..]); Self::new(out) } pub fn is_empty(&self) -> bool { self.set.is_empty() } pub fn len(&self) -> usize { self.set.len() } pub fn primary_state(&self) -> Option { self.set.first().cloned() } } // custom implementation such that smaller sets come first in a BinaryHeap impl Ord for StateSet { fn cmp(&self, other: &Self) -> std::cmp::Ordering { match self.set.len().cmp(&other.set.len()) { std::cmp::Ordering::Equal => {} other => return other.reverse(), } self.set.cmp(&other.set) } } impl PartialOrd for StateSet { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } } use state_set::StateSet; trait GoesTo { fn goes_to(&self, to: usize) -> bool; } impl GoesTo for (&State, ByteRange) { fn goes_to(&self, target: usize) -> bool { let from = self.0; let ch = self.1; for (c, &to) in from.decision.trans.iter() { if c.overlaps(ch) && to == target { return true; } } from.decision.default_trans == target } } impl DFA { fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet { let states = self .states .iter() .enumerate() .filter_map(|(i, s)| if f(s) { Some(i) } else { None }) .collect(); StateSet::new(states) } /// https://en.wikipedia.org/wiki/DFA_minimization fn hopcroft_minimization(&mut self) { let mut partitions: HashSet = HashSet::new(); partitions.insert(self.states_where(|s| s.accept)); partitions.insert(self.states_where(|s| !s.accept)); let mut ranges: Vec = self .states .iter() .flat_map(|s| s.decision.trans.iter().map(|t| *t.0)) .chain([ByteRange::new_range(u8::MIN, u8::MAX)]) .collect(); ranges.sort(); ranges.dedup(); let ranges = ByteRange::split_to_disjoint(ranges); let mut queue: BinaryHeap = partitions.iter().cloned().collect(); let mut queue_set = partitions.clone(); while let Some(a) = queue.pop() { if !queue_set.contains(&a) { continue; } for &c in ranges.iter() { let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a))); let mut del_list = HashSet::new(); let mut add_list = Vec::new(); for y in partitions.iter() { let i = x.intersection(y); let d = y.difference(&x); if !i.is_empty() && !d.is_empty() { del_list.insert(y.clone()); add_list.push(i.clone()); add_list.push(d.clone()); if queue_set.contains(y) { queue_set.remove(y); queue.push(i.clone()); queue_set.insert(i); queue.push(d.clone()); queue_set.insert(d); } else if i.len() < d.len() { queue.push(i.clone()); queue_set.insert(i); } else { queue.push(d.clone()); queue_set.insert(d); } } } partitions.retain(|i| !del_list.contains(i)); for x in add_list { partitions.insert(x); } } } let mut replacement = vec![None; self.states.len()]; for partition in partitions { if let Some(x) = partition.primary_state() { for state in partition.iter() { assert!(replacement[state].is_none()); replacement[state] = Some(x); } } } // replacement indices in original index space let replacement: Vec = replacement.into_iter().map(|x| x.unwrap()).collect(); let is_alive = |idx: usize| replacement[idx] == idx; // compact index space let mut compact: Vec = vec![usize::MAX; self.states.len()]; let mut next = 0; for i in 0..self.states.len() { if is_alive(i) { compact[i] = next; next += 1; } } // remap everything and skip all no-longer-needed states let remap = |idx: usize| compact[replacement[idx]]; let mut new_states = Vec::with_capacity(next); for i in 0..self.states.len() { if is_alive(i) { let s = &self.states[i]; let trans: HashMap = s .decision .trans .iter() .map(|(&ch, &to)| (ch, remap(to))) .collect(); let default_trans = remap(s.decision.default_trans); new_states.push(State { decision: HashMapDecider { trans, default_trans, }, accept: s.accept, }); } } self.states = new_states; self.start = remap(self.start); } pub fn minify(&mut self) { for state in self.states.iter_mut() { state .decision .trans .retain(|_, to| *to != state.decision.default_trans); if state.decision.trans.len() == 1 && state .decision .trans .iter() .all(|t| *t.0 == ByteRange::new_range(0, 255)) { state.decision.default_trans = state.decision.trans.iter().map(|x| *x.1).next().unwrap(); state.decision.trans.clear(); } } self.hopcroft_minimization(); } }