diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
| commit | 53980774c327675e886179c0a2c140744dcf9b95 (patch) | |
| tree | ca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/parse/regex/dfa.rs | |
| parent | 75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff) | |
| download | pish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz | |
special cased regex for performance
Diffstat (limited to 'src/parse/regex/dfa.rs')
| -rw-r--r-- | src/parse/regex/dfa.rs | 346 |
1 files changed, 0 insertions, 346 deletions
diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs deleted file mode 100644 index 35f726c..0000000 --- a/src/parse/regex/dfa.rs +++ /dev/null @@ -1,346 +0,0 @@ -use core::fmt; -use std::collections::{BinaryHeap, HashMap, HashSet}; - -use super::{ - byte_range::ByteRange, - enfa::{ENFA, MultiState}, -}; - -pub type StateId = usize; - -pub struct State { - trans: HashMap<ByteRange, StateId>, - default_trans: StateId, - accept: bool, -} - -#[allow(clippy::upper_case_acronyms)] -pub struct DFA { - start: StateId, - states: Vec<State>, -} - -impl fmt::Debug for DFA { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "DFA {{")?; - for (i, s) in self.states.iter().enumerate() { - if self.start == i { - write!(f, "-> {i}: ")?; - } else { - write!(f, " {i}: ")?; - } - - for (chr, to) in s.trans.iter() { - write!(f, "{chr:?} to {to}, ")?; - } - - write!(f, "dfl to {}", s.default_trans)?; - if s.accept { - write!(f, ", accept")?; - } - writeln!(f)?; - } - writeln!(f, "}}") - } -} - -impl From<ENFA> for DFA { - fn from(mut nfa: ENFA) -> Self { - nfa.remove_unreachable(); - - let mut multi_states = nfa.all_multi_states(); - multi_states.insert(nfa.void_multi_state()); - let mut len = 0; - let multi_to_dfa: HashMap<MultiState, StateId> = multi_states - .clone() - .into_iter() - .map(|ms| { - len += 1; - (ms, len - 1) - }) - .collect(); - - let void = multi_to_dfa[&nfa.void_multi_state()]; - - let mut states: Vec<State> = (0..len) - .map(|_| State { - trans: HashMap::new(), - default_trans: void, - accept: false, - }) - .collect(); - - for ms in multi_states.iter() { - let i: usize = multi_to_dfa[ms]; - states[i].accept = ms.accept(); - for t in ms.possible_transitions() { - let k = multi_to_dfa[&ms.transition(t)]; - states[i].trans.insert(t, k); - } - } - - let mut this = Self { - start: multi_to_dfa[&nfa.start_multi_state()], - states, - }; - this.minify(); - this - } -} - -impl DFA { - pub fn matches(&self, x: &[u8]) -> bool { - let mut state = self.start; - 'next_byte: for &b in x.iter() { - for (range, &next_state) in self.states[state].trans.iter() { - if range.contains(b) { - state = next_state; - continue 'next_byte; - } - } - state = self.states[state].default_trans; - } - self.states[state].accept - } -} - -mod state_set { - #[derive(Hash, Clone, PartialEq, Eq)] - pub struct StateSet { - set: Vec<usize>, - } - - impl StateSet { - pub fn new(mut set: Vec<usize>) -> Self { - set.sort(); - set.dedup(); - Self { set } - } - - pub fn iter(&self) -> impl Iterator<Item = usize> { - self.set.iter().cloned() - } - - pub fn intersection(&self, other: &Self) -> Self { - let a = &self.set; - let b = &other.set; - - let mut i = 0; - let mut j = 0; - let mut out = Vec::new(); - - while i < a.len() && j < b.len() { - match a[i].cmp(&b[j]) { - std::cmp::Ordering::Less => i += 1, - std::cmp::Ordering::Greater => j += 1, - std::cmp::Ordering::Equal => { - out.push(a[i]); - i += 1; - j += 1; - } - } - } - - Self::new(out) - } - - pub fn difference(&self, other: &Self) -> Self { - let a = &self.set; - let b = &other.set; - - let mut i = 0; - let mut j = 0; - let mut out = Vec::new(); - - while i < a.len() && j < b.len() { - match a[i].cmp(&b[j]) { - std::cmp::Ordering::Less => { - out.push(a[i]); - i += 1; - } - std::cmp::Ordering::Greater => { - j += 1; - } - std::cmp::Ordering::Equal => { - i += 1; - j += 1; - } - } - } - - out.extend_from_slice(&a[i..]); - Self::new(out) - } - - pub fn is_empty(&self) -> bool { - self.set.is_empty() - } - - pub fn len(&self) -> usize { - self.set.len() - } - - pub fn primary_state(&self) -> Option<usize> { - self.set.first().cloned() - } - } - - // custom implementation such that smaller sets come first in a BinaryHeap - impl Ord for StateSet { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match self.set.len().cmp(&other.set.len()) { - std::cmp::Ordering::Equal => {} - other => return other.reverse(), - } - self.set.cmp(&other.set) - } - } - - impl PartialOrd for StateSet { - fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.cmp(other)) - } - } -} - -use state_set::StateSet; - -trait GoesTo { - fn goes_to(&self, to: usize) -> bool; -} -impl GoesTo for (&State, ByteRange) { - fn goes_to(&self, target: usize) -> bool { - let from = self.0; - let ch = self.1; - for (c, &to) in from.trans.iter() { - if c.overlaps(ch) && to == target { - return true; - } - } - from.default_trans == target - } -} - -impl DFA { - fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet { - let states = self - .states - .iter() - .enumerate() - .filter_map(|(i, s)| if f(s) { Some(i) } else { None }) - .collect(); - StateSet::new(states) - } - - /// https://en.wikipedia.org/wiki/DFA_minimization - fn hopcroft_minimization(&mut self) { - let mut partitions: HashSet<StateSet> = HashSet::new(); - partitions.insert(self.states_where(|s| s.accept)); - partitions.insert(self.states_where(|s| !s.accept)); - - let mut ranges: Vec<ByteRange> = self - .states - .iter() - .flat_map(|s| s.trans.iter().map(|t| *t.0)) - .chain([ByteRange::new_range(u8::MIN, u8::MAX)]) - .collect(); - ranges.sort(); - ranges.dedup(); - let ranges = ByteRange::split_to_disjoint(ranges); - - let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect(); - let mut queue_set = partitions.clone(); - - while let Some(a) = queue.pop() { - if !queue_set.contains(&a) { - continue; - } - - for &c in ranges.iter() { - let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a))); - - let mut del_list = HashSet::new(); - let mut add_list = Vec::new(); - for y in partitions.iter() { - let i = x.intersection(y); - let d = y.difference(&x); - - if !i.is_empty() && !d.is_empty() { - del_list.insert(y.clone()); - add_list.push(i.clone()); - add_list.push(d.clone()); - - if queue_set.contains(y) { - queue_set.remove(y); - - queue.push(i.clone()); - queue_set.insert(i); - - queue.push(d.clone()); - queue_set.insert(d); - } else if i.len() < d.len() { - queue.push(i.clone()); - queue_set.insert(i); - } else { - queue.push(d.clone()); - queue_set.insert(d); - } - } - } - - partitions.retain(|i| !del_list.contains(i)); - for x in add_list { - partitions.insert(x); - } - } - } - - let mut replacement = vec![None; self.states.len()]; - for partition in partitions { - if let Some(x) = partition.primary_state() { - for state in partition.iter() { - assert!(replacement[state].is_none()); - replacement[state] = Some(x); - } - } - } - - // replacement indices in original index space - let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect(); - let is_alive = |idx: usize| replacement[idx] == idx; - - // compact index space - let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()]; - let mut next = 0; - for i in 0..self.states.len() { - if is_alive(i) { - compact[i] = next; - next += 1; - } - } - - // remap everything and skip all no-longer-needed states - let remap = |idx: usize| compact[replacement[idx]]; - let mut new_states = Vec::with_capacity(next); - for i in 0..self.states.len() { - if is_alive(i) { - let s = &self.states[i]; - new_states.push(State { - trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(), - default_trans: remap(s.default_trans), - accept: s.accept, - }); - } - } - self.states = new_states; - self.start = remap(self.start); - } - - pub fn minify(&mut self) { - for state in self.states.iter_mut() { - state.trans.retain(|_, to| *to != state.default_trans); - } - - self.hopcroft_minimization(); - } -} |
