aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/dfa.rs
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-04 22:05:09 +0200
committerJonas Maier <jonas@x77.dev>2026-06-04 22:05:09 +0200
commit2184c1cab66771594051197451c5e0f4dd2567a2 (patch)
treeade6775f02bd571f03885b8bd8e41a8bd7d550a2 /src/parse/regex/dfa.rs
parent7c7a10dd58d50bacc712e91e2344c466a22a2d25 (diff)
downloadpish-2184c1cab66771594051197451c5e0f4dd2567a2.tar.gz
DFA minimization
Diffstat (limited to 'src/parse/regex/dfa.rs')
-rw-r--r--src/parse/regex/dfa.rs283
1 files changed, 259 insertions, 24 deletions
diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs
index 0a5c78d..db63953 100644
--- a/src/parse/regex/dfa.rs
+++ b/src/parse/regex/dfa.rs
@@ -1,5 +1,5 @@
use core::fmt;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap, HashSet};
use super::{
byte_range::ByteRange,
@@ -44,28 +44,6 @@ impl fmt::Debug for DFA {
}
}
-impl DFA {
- pub fn matches(&self, x: &[u8]) -> bool {
- let mut state = self.start;
- 'next_byte: for &b in x.iter() {
- for (range, &next_state) in self.states[state].trans.iter() {
- if range.contains(b) {
- state = next_state;
- continue 'next_byte;
- }
- }
- state = self.states[state].default_trans;
- }
- self.states[state].accept
- }
-
- pub fn simplify(&mut self) {
- for state in self.states.iter_mut() {
- state.trans.retain(|_, to| *to != state.default_trans);
- }
- }
-}
-
impl From<ENFA> for DFA {
fn from(mut nfa: ENFA) -> Self {
nfa.remove_unreachable();
@@ -105,7 +83,264 @@ impl From<ENFA> for DFA {
start: multi_to_dfa[&nfa.start_multi_state()],
states,
};
- this.simplify();
+ this.minify();
this
}
}
+
+impl DFA {
+ pub fn matches(&self, x: &[u8]) -> bool {
+ let mut state = self.start;
+ 'next_byte: for &b in x.iter() {
+ for (range, &next_state) in self.states[state].trans.iter() {
+ if range.contains(b) {
+ state = next_state;
+ continue 'next_byte;
+ }
+ }
+ state = self.states[state].default_trans;
+ }
+ self.states[state].accept
+ }
+}
+
+mod state_set {
+ #[derive(Hash, Clone, PartialEq, Eq)]
+ pub struct StateSet {
+ set: Vec<usize>,
+ }
+
+ impl StateSet {
+ pub fn new(mut set: Vec<usize>) -> Self {
+ set.sort();
+ set.dedup();
+ Self { set }
+ }
+
+ pub fn iter(&self) -> impl Iterator<Item = usize> {
+ self.set.iter().cloned()
+ }
+
+ pub fn intersection(&self, other: &Self) -> Self {
+ let a = &self.set;
+ let b = &other.set;
+
+ let mut i = 0;
+ let mut j = 0;
+ let mut out = Vec::new();
+
+ while i < a.len() && j < b.len() {
+ match a[i].cmp(&b[j]) {
+ std::cmp::Ordering::Less => i += 1,
+ std::cmp::Ordering::Greater => j += 1,
+ std::cmp::Ordering::Equal => {
+ out.push(a[i]);
+ i += 1;
+ j += 1;
+ }
+ }
+ }
+
+ Self::new(out)
+ }
+
+ pub fn difference(&self, other: &Self) -> Self {
+ let a = &self.set;
+ let b = &other.set;
+
+ let mut i = 0;
+ let mut j = 0;
+ let mut out = Vec::new();
+
+ while i < a.len() && j < b.len() {
+ match a[i].cmp(&b[j]) {
+ std::cmp::Ordering::Less => {
+ out.push(a[i]);
+ i += 1;
+ }
+ std::cmp::Ordering::Greater => {
+ j += 1;
+ }
+ std::cmp::Ordering::Equal => {
+ i += 1;
+ j += 1;
+ }
+ }
+ }
+
+ out.extend_from_slice(&a[i..]);
+ Self::new(out)
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.set.is_empty()
+ }
+
+ pub fn len(&self) -> usize {
+ self.set.len()
+ }
+
+ pub fn primary_state(&self) -> Option<usize> {
+ self.set.first().cloned()
+ }
+ }
+
+ // custom implementation such that smaller sets come first in a BinaryHeap
+ impl Ord for StateSet {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ match self.set.len().cmp(&other.set.len()) {
+ std::cmp::Ordering::Equal => {}
+ other => return other.reverse(),
+ }
+ self.set.cmp(&other.set)
+ }
+ }
+
+ impl PartialOrd for StateSet {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+ }
+}
+
+use state_set::StateSet;
+
+trait GoesTo {
+ fn goes_to(&self, to: usize) -> bool;
+}
+impl<'a> GoesTo for (&'a State, ByteRange) {
+ fn goes_to(&self, target: usize) -> bool {
+ let from = self.0;
+ let ch = self.1;
+ for (c, &to) in from.trans.iter() {
+ if c.overlaps(ch) && to == target {
+ return true;
+ }
+ }
+ from.default_trans == target
+ }
+}
+
+impl DFA {
+ fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet {
+ let states = self
+ .states
+ .iter()
+ .enumerate()
+ .filter_map(|(i, s)| if f(s) { Some(i) } else { None })
+ .collect();
+ StateSet::new(states)
+ }
+
+ /// https://en.wikipedia.org/wiki/DFA_minimization
+ fn hopcroft_minimization(&mut self) {
+ let mut partitions: HashSet<StateSet> = HashSet::new();
+ partitions.insert(self.states_where(|s| s.accept));
+ partitions.insert(self.states_where(|s| !s.accept));
+
+ let mut ranges: Vec<ByteRange> = self
+ .states
+ .iter()
+ .flat_map(|s| s.trans.iter().map(|t| *t.0))
+ .chain([ByteRange::new_range(u8::MIN, u8::MAX)])
+ .collect();
+ ranges.sort();
+ ranges.dedup();
+ let ranges = ByteRange::split_to_disjoint(ranges);
+
+ let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect();
+ let mut queue_set = partitions.clone();
+
+ while let Some(a) = queue.pop() {
+ if !queue_set.contains(&a) {
+ continue;
+ }
+
+ for &c in ranges.iter() {
+ let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a)));
+
+ let mut del_list = HashSet::new();
+ let mut add_list = Vec::new();
+ for y in partitions.iter() {
+ let i = x.intersection(y);
+ let d = y.difference(&x);
+
+ if !i.is_empty() && !d.is_empty() {
+ del_list.insert(y.clone());
+ add_list.push(i.clone());
+ add_list.push(d.clone());
+
+ if queue_set.contains(y) {
+ queue_set.remove(y);
+
+ queue.push(i.clone());
+ queue_set.insert(i);
+
+ queue.push(d.clone());
+ queue_set.insert(d);
+ } else if i.len() < d.len() {
+ queue.push(i.clone());
+ queue_set.insert(i);
+ } else {
+ queue.push(d.clone());
+ queue_set.insert(d);
+ }
+ }
+ }
+
+ partitions.retain(|i| !del_list.contains(i));
+ for x in add_list {
+ partitions.insert(x);
+ }
+ }
+ }
+
+ let mut replacement = vec![None; self.states.len()];
+ for partition in partitions {
+ if let Some(x) = partition.primary_state() {
+ for state in partition.iter() {
+ assert!(replacement[state].is_none());
+ replacement[state] = Some(x);
+ }
+ }
+ }
+
+ // replacement indices in original index space
+ let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect();
+ let is_alive = |idx: usize| replacement[idx] == idx;
+
+ // compact index space
+ let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()];
+ let mut next = 0;
+ for i in 0..self.states.len() {
+ if is_alive(i) {
+ compact[i] = next;
+ next += 1;
+ }
+ }
+
+ // remap everything and skip all no-longer-needed states
+ let remap = |idx: usize| compact[replacement[idx]];
+ let mut new_states = Vec::with_capacity(next);
+ for i in 0..self.states.len() {
+ if is_alive(i) {
+ let s = &self.states[i];
+ new_states.push(State {
+ trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(),
+ default_trans: remap(s.default_trans),
+ accept: s.accept,
+ });
+ }
+ }
+ self.states = new_states;
+ self.start = remap(self.start);
+ }
+
+ pub fn minify(&mut self) {
+ for state in self.states.iter_mut() {
+ state.trans.retain(|_, to| *to != state.default_trans);
+ }
+
+ self.hopcroft_minimization();
+ }
+}