diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
| commit | 53980774c327675e886179c0a2c140744dcf9b95 (patch) | |
| tree | ca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/regex/dfa.rs | |
| parent | 75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff) | |
| download | pish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz | |
special cased regex for performance
Diffstat (limited to 'src/regex/dfa.rs')
| -rw-r--r-- | src/regex/dfa.rs | 381 |
1 files changed, 381 insertions, 0 deletions
diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs new file mode 100644 index 0000000..c55d99d --- /dev/null +++ b/src/regex/dfa.rs @@ -0,0 +1,381 @@ +use core::fmt; +use std::collections::{BinaryHeap, HashMap, HashSet}; + +use crate::regex::{Match, RegexEngine, enfa::EnfaTranslationError}; + +use super::{ + Pattern, + byte_range::ByteRange, + enfa::{ENFA, MultiState}, +}; + +pub type StateId = usize; + +pub struct State { + trans: HashMap<ByteRange, StateId>, + default_trans: StateId, + accept: bool, +} + +#[allow(clippy::upper_case_acronyms)] +pub struct DFA { + start: StateId, + states: Vec<State>, +} + +impl fmt::Debug for DFA { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "DFA {{")?; + for (i, s) in self.states.iter().enumerate() { + if self.start == i { + write!(f, "-> {i}: ")?; + } else { + write!(f, " {i}: ")?; + } + + for (chr, to) in s.trans.iter() { + write!(f, "{chr:?} to {to}, ")?; + } + + write!(f, "dfl to {}", s.default_trans)?; + if s.accept { + write!(f, ", accept")?; + } + writeln!(f)?; + } + writeln!(f, "}}") + } +} + +impl From<ENFA> for DFA { + fn from(mut nfa: ENFA) -> Self { + nfa.remove_unreachable(); + + let mut multi_states = nfa.all_multi_states(); + multi_states.insert(nfa.void_multi_state()); + let mut len = 0; + let multi_to_dfa: HashMap<MultiState, StateId> = multi_states + .clone() + .into_iter() + .map(|ms| { + len += 1; + (ms, len - 1) + }) + .collect(); + + let void = multi_to_dfa[&nfa.void_multi_state()]; + + let mut states: Vec<State> = (0..len) + .map(|_| State { + trans: HashMap::new(), + default_trans: void, + accept: false, + }) + .collect(); + + for ms in multi_states.iter() { + let i: usize = multi_to_dfa[ms]; + states[i].accept = ms.accept(); + for t in ms.possible_transitions() { + let k = multi_to_dfa[&ms.transition(t)]; + states[i].trans.insert(t, k); + } + } + + let mut this = Self { + start: multi_to_dfa[&nfa.start_multi_state()], + states, + }; + this.minify(); + this + } +} + +#[derive(Clone, Debug)] +pub enum DFACompileError { + SubmatchesNotSupported, + #[allow(unused)] + NFAError(EnfaTranslationError), +} + +impl RegexEngine for DFA { + type CompileError = DFACompileError; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + match ENFA::try_from(pat) { + Ok(nfa) => { + if nfa.has_submatches { + Err(DFACompileError::SubmatchesNotSupported) + } else { + Ok(Self::from(nfa)) + } + } + Err(e) => Err(DFACompileError::NFAError(e)), + } + } + + fn run(&self, input: &[u8]) -> Option<super::Match> { + if self.matches(input) { + Some(Match::new_empty()) + } else { + None + } + } +} + +impl DFA { + pub fn matches(&self, x: &[u8]) -> bool { + let mut state = self.start; + 'next_byte: for &b in x.iter() { + for (range, &next_state) in self.states[state].trans.iter() { + if range.contains(b) { + state = next_state; + continue 'next_byte; + } + } + state = self.states[state].default_trans; + } + self.states[state].accept + } +} + +mod state_set { + #[derive(Hash, Clone, PartialEq, Eq)] + pub struct StateSet { + set: Vec<usize>, + } + + impl StateSet { + pub fn new(mut set: Vec<usize>) -> Self { + set.sort(); + set.dedup(); + Self { set } + } + + pub fn iter(&self) -> impl Iterator<Item = usize> { + self.set.iter().cloned() + } + + pub fn intersection(&self, other: &Self) -> Self { + let a = &self.set; + let b = &other.set; + + let mut i = 0; + let mut j = 0; + let mut out = Vec::new(); + + while i < a.len() && j < b.len() { + match a[i].cmp(&b[j]) { + std::cmp::Ordering::Less => i += 1, + std::cmp::Ordering::Greater => j += 1, + std::cmp::Ordering::Equal => { + out.push(a[i]); + i += 1; + j += 1; + } + } + } + + Self::new(out) + } + + pub fn difference(&self, other: &Self) -> Self { + let a = &self.set; + let b = &other.set; + + let mut i = 0; + let mut j = 0; + let mut out = Vec::new(); + + while i < a.len() && j < b.len() { + match a[i].cmp(&b[j]) { + std::cmp::Ordering::Less => { + out.push(a[i]); + i += 1; + } + std::cmp::Ordering::Greater => { + j += 1; + } + std::cmp::Ordering::Equal => { + i += 1; + j += 1; + } + } + } + + out.extend_from_slice(&a[i..]); + Self::new(out) + } + + pub fn is_empty(&self) -> bool { + self.set.is_empty() + } + + pub fn len(&self) -> usize { + self.set.len() + } + + pub fn primary_state(&self) -> Option<usize> { + self.set.first().cloned() + } + } + + // custom implementation such that smaller sets come first in a BinaryHeap + impl Ord for StateSet { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match self.set.len().cmp(&other.set.len()) { + std::cmp::Ordering::Equal => {} + other => return other.reverse(), + } + self.set.cmp(&other.set) + } + } + + impl PartialOrd for StateSet { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } + } +} + +use state_set::StateSet; + +trait GoesTo { + fn goes_to(&self, to: usize) -> bool; +} +impl GoesTo for (&State, ByteRange) { + fn goes_to(&self, target: usize) -> bool { + let from = self.0; + let ch = self.1; + for (c, &to) in from.trans.iter() { + if c.overlaps(ch) && to == target { + return true; + } + } + from.default_trans == target + } +} + +impl DFA { + fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet { + let states = self + .states + .iter() + .enumerate() + .filter_map(|(i, s)| if f(s) { Some(i) } else { None }) + .collect(); + StateSet::new(states) + } + + /// https://en.wikipedia.org/wiki/DFA_minimization + fn hopcroft_minimization(&mut self) { + let mut partitions: HashSet<StateSet> = HashSet::new(); + partitions.insert(self.states_where(|s| s.accept)); + partitions.insert(self.states_where(|s| !s.accept)); + + let mut ranges: Vec<ByteRange> = self + .states + .iter() + .flat_map(|s| s.trans.iter().map(|t| *t.0)) + .chain([ByteRange::new_range(u8::MIN, u8::MAX)]) + .collect(); + ranges.sort(); + ranges.dedup(); + let ranges = ByteRange::split_to_disjoint(ranges); + + let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect(); + let mut queue_set = partitions.clone(); + + while let Some(a) = queue.pop() { + if !queue_set.contains(&a) { + continue; + } + + for &c in ranges.iter() { + let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a))); + + let mut del_list = HashSet::new(); + let mut add_list = Vec::new(); + for y in partitions.iter() { + let i = x.intersection(y); + let d = y.difference(&x); + + if !i.is_empty() && !d.is_empty() { + del_list.insert(y.clone()); + add_list.push(i.clone()); + add_list.push(d.clone()); + + if queue_set.contains(y) { + queue_set.remove(y); + + queue.push(i.clone()); + queue_set.insert(i); + + queue.push(d.clone()); + queue_set.insert(d); + } else if i.len() < d.len() { + queue.push(i.clone()); + queue_set.insert(i); + } else { + queue.push(d.clone()); + queue_set.insert(d); + } + } + } + + partitions.retain(|i| !del_list.contains(i)); + for x in add_list { + partitions.insert(x); + } + } + } + + let mut replacement = vec![None; self.states.len()]; + for partition in partitions { + if let Some(x) = partition.primary_state() { + for state in partition.iter() { + assert!(replacement[state].is_none()); + replacement[state] = Some(x); + } + } + } + + // replacement indices in original index space + let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect(); + let is_alive = |idx: usize| replacement[idx] == idx; + + // compact index space + let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()]; + let mut next = 0; + for i in 0..self.states.len() { + if is_alive(i) { + compact[i] = next; + next += 1; + } + } + + // remap everything and skip all no-longer-needed states + let remap = |idx: usize| compact[replacement[idx]]; + let mut new_states = Vec::with_capacity(next); + for i in 0..self.states.len() { + if is_alive(i) { + let s = &self.states[i]; + new_states.push(State { + trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(), + default_trans: remap(s.default_trans), + accept: s.accept, + }); + } + } + self.states = new_states; + self.start = remap(self.start); + } + + pub fn minify(&mut self) { + for state in self.states.iter_mut() { + state.trans.retain(|_, to| *to != state.default_trans); + } + + self.hopcroft_minimization(); + } +} |
