diff options
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/regex/dfa.rs | 283 |
1 files changed, 259 insertions, 24 deletions
diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs index 0a5c78d..db63953 100644 --- a/src/parse/regex/dfa.rs +++ b/src/parse/regex/dfa.rs @@ -1,5 +1,5 @@ use core::fmt; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap, HashSet}; use super::{ byte_range::ByteRange, @@ -44,28 +44,6 @@ impl fmt::Debug for DFA { } } -impl DFA { - pub fn matches(&self, x: &[u8]) -> bool { - let mut state = self.start; - 'next_byte: for &b in x.iter() { - for (range, &next_state) in self.states[state].trans.iter() { - if range.contains(b) { - state = next_state; - continue 'next_byte; - } - } - state = self.states[state].default_trans; - } - self.states[state].accept - } - - pub fn simplify(&mut self) { - for state in self.states.iter_mut() { - state.trans.retain(|_, to| *to != state.default_trans); - } - } -} - impl From<ENFA> for DFA { fn from(mut nfa: ENFA) -> Self { nfa.remove_unreachable(); @@ -105,7 +83,264 @@ impl From<ENFA> for DFA { start: multi_to_dfa[&nfa.start_multi_state()], states, }; - this.simplify(); + this.minify(); this } } + +impl DFA { + pub fn matches(&self, x: &[u8]) -> bool { + let mut state = self.start; + 'next_byte: for &b in x.iter() { + for (range, &next_state) in self.states[state].trans.iter() { + if range.contains(b) { + state = next_state; + continue 'next_byte; + } + } + state = self.states[state].default_trans; + } + self.states[state].accept + } +} + +mod state_set { + #[derive(Hash, Clone, PartialEq, Eq)] + pub struct StateSet { + set: Vec<usize>, + } + + impl StateSet { + pub fn new(mut set: Vec<usize>) -> Self { + set.sort(); + set.dedup(); + Self { set } + } + + pub fn iter(&self) -> impl Iterator<Item = usize> { + self.set.iter().cloned() + } + + pub fn intersection(&self, other: &Self) -> Self { + let a = &self.set; + let b = &other.set; + + let mut i = 0; + let mut j = 0; + let mut out = Vec::new(); + + while i < a.len() && j < b.len() { + match a[i].cmp(&b[j]) { + std::cmp::Ordering::Less => i += 1, + std::cmp::Ordering::Greater => j += 1, + std::cmp::Ordering::Equal => { + out.push(a[i]); + i += 1; + j += 1; + } + } + } + + Self::new(out) + } + + pub fn difference(&self, other: &Self) -> Self { + let a = &self.set; + let b = &other.set; + + let mut i = 0; + let mut j = 0; + let mut out = Vec::new(); + + while i < a.len() && j < b.len() { + match a[i].cmp(&b[j]) { + std::cmp::Ordering::Less => { + out.push(a[i]); + i += 1; + } + std::cmp::Ordering::Greater => { + j += 1; + } + std::cmp::Ordering::Equal => { + i += 1; + j += 1; + } + } + } + + out.extend_from_slice(&a[i..]); + Self::new(out) + } + + pub fn is_empty(&self) -> bool { + self.set.is_empty() + } + + pub fn len(&self) -> usize { + self.set.len() + } + + pub fn primary_state(&self) -> Option<usize> { + self.set.first().cloned() + } + } + + // custom implementation such that smaller sets come first in a BinaryHeap + impl Ord for StateSet { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match self.set.len().cmp(&other.set.len()) { + std::cmp::Ordering::Equal => {} + other => return other.reverse(), + } + self.set.cmp(&other.set) + } + } + + impl PartialOrd for StateSet { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } + } +} + +use state_set::StateSet; + +trait GoesTo { + fn goes_to(&self, to: usize) -> bool; +} +impl<'a> GoesTo for (&'a State, ByteRange) { + fn goes_to(&self, target: usize) -> bool { + let from = self.0; + let ch = self.1; + for (c, &to) in from.trans.iter() { + if c.overlaps(ch) && to == target { + return true; + } + } + from.default_trans == target + } +} + +impl DFA { + fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet { + let states = self + .states + .iter() + .enumerate() + .filter_map(|(i, s)| if f(s) { Some(i) } else { None }) + .collect(); + StateSet::new(states) + } + + /// https://en.wikipedia.org/wiki/DFA_minimization + fn hopcroft_minimization(&mut self) { + let mut partitions: HashSet<StateSet> = HashSet::new(); + partitions.insert(self.states_where(|s| s.accept)); + partitions.insert(self.states_where(|s| !s.accept)); + + let mut ranges: Vec<ByteRange> = self + .states + .iter() + .flat_map(|s| s.trans.iter().map(|t| *t.0)) + .chain([ByteRange::new_range(u8::MIN, u8::MAX)]) + .collect(); + ranges.sort(); + ranges.dedup(); + let ranges = ByteRange::split_to_disjoint(ranges); + + let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect(); + let mut queue_set = partitions.clone(); + + while let Some(a) = queue.pop() { + if !queue_set.contains(&a) { + continue; + } + + for &c in ranges.iter() { + let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a))); + + let mut del_list = HashSet::new(); + let mut add_list = Vec::new(); + for y in partitions.iter() { + let i = x.intersection(y); + let d = y.difference(&x); + + if !i.is_empty() && !d.is_empty() { + del_list.insert(y.clone()); + add_list.push(i.clone()); + add_list.push(d.clone()); + + if queue_set.contains(y) { + queue_set.remove(y); + + queue.push(i.clone()); + queue_set.insert(i); + + queue.push(d.clone()); + queue_set.insert(d); + } else if i.len() < d.len() { + queue.push(i.clone()); + queue_set.insert(i); + } else { + queue.push(d.clone()); + queue_set.insert(d); + } + } + } + + partitions.retain(|i| !del_list.contains(i)); + for x in add_list { + partitions.insert(x); + } + } + } + + let mut replacement = vec![None; self.states.len()]; + for partition in partitions { + if let Some(x) = partition.primary_state() { + for state in partition.iter() { + assert!(replacement[state].is_none()); + replacement[state] = Some(x); + } + } + } + + // replacement indices in original index space + let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect(); + let is_alive = |idx: usize| replacement[idx] == idx; + + // compact index space + let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()]; + let mut next = 0; + for i in 0..self.states.len() { + if is_alive(i) { + compact[i] = next; + next += 1; + } + } + + // remap everything and skip all no-longer-needed states + let remap = |idx: usize| compact[replacement[idx]]; + let mut new_states = Vec::with_capacity(next); + for i in 0..self.states.len() { + if is_alive(i) { + let s = &self.states[i]; + new_states.push(State { + trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(), + default_trans: remap(s.default_trans), + accept: s.accept, + }); + } + } + self.states = new_states; + self.start = remap(self.start); + } + + pub fn minify(&mut self) { + for state in self.states.iter_mut() { + state.trans.retain(|_, to| *to != state.default_trans); + } + + self.hopcroft_minimization(); + } +} |
