aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/dfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse/regex/dfa.rs')
-rw-r--r--src/parse/regex/dfa.rs346
1 files changed, 0 insertions, 346 deletions
diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs
deleted file mode 100644
index 35f726c..0000000
--- a/src/parse/regex/dfa.rs
+++ /dev/null
@@ -1,346 +0,0 @@
-use core::fmt;
-use std::collections::{BinaryHeap, HashMap, HashSet};
-
-use super::{
- byte_range::ByteRange,
- enfa::{ENFA, MultiState},
-};
-
-pub type StateId = usize;
-
-pub struct State {
- trans: HashMap<ByteRange, StateId>,
- default_trans: StateId,
- accept: bool,
-}
-
-#[allow(clippy::upper_case_acronyms)]
-pub struct DFA {
- start: StateId,
- states: Vec<State>,
-}
-
-impl fmt::Debug for DFA {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- writeln!(f, "DFA {{")?;
- for (i, s) in self.states.iter().enumerate() {
- if self.start == i {
- write!(f, "-> {i}: ")?;
- } else {
- write!(f, " {i}: ")?;
- }
-
- for (chr, to) in s.trans.iter() {
- write!(f, "{chr:?} to {to}, ")?;
- }
-
- write!(f, "dfl to {}", s.default_trans)?;
- if s.accept {
- write!(f, ", accept")?;
- }
- writeln!(f)?;
- }
- writeln!(f, "}}")
- }
-}
-
-impl From<ENFA> for DFA {
- fn from(mut nfa: ENFA) -> Self {
- nfa.remove_unreachable();
-
- let mut multi_states = nfa.all_multi_states();
- multi_states.insert(nfa.void_multi_state());
- let mut len = 0;
- let multi_to_dfa: HashMap<MultiState, StateId> = multi_states
- .clone()
- .into_iter()
- .map(|ms| {
- len += 1;
- (ms, len - 1)
- })
- .collect();
-
- let void = multi_to_dfa[&nfa.void_multi_state()];
-
- let mut states: Vec<State> = (0..len)
- .map(|_| State {
- trans: HashMap::new(),
- default_trans: void,
- accept: false,
- })
- .collect();
-
- for ms in multi_states.iter() {
- let i: usize = multi_to_dfa[ms];
- states[i].accept = ms.accept();
- for t in ms.possible_transitions() {
- let k = multi_to_dfa[&ms.transition(t)];
- states[i].trans.insert(t, k);
- }
- }
-
- let mut this = Self {
- start: multi_to_dfa[&nfa.start_multi_state()],
- states,
- };
- this.minify();
- this
- }
-}
-
-impl DFA {
- pub fn matches(&self, x: &[u8]) -> bool {
- let mut state = self.start;
- 'next_byte: for &b in x.iter() {
- for (range, &next_state) in self.states[state].trans.iter() {
- if range.contains(b) {
- state = next_state;
- continue 'next_byte;
- }
- }
- state = self.states[state].default_trans;
- }
- self.states[state].accept
- }
-}
-
-mod state_set {
- #[derive(Hash, Clone, PartialEq, Eq)]
- pub struct StateSet {
- set: Vec<usize>,
- }
-
- impl StateSet {
- pub fn new(mut set: Vec<usize>) -> Self {
- set.sort();
- set.dedup();
- Self { set }
- }
-
- pub fn iter(&self) -> impl Iterator<Item = usize> {
- self.set.iter().cloned()
- }
-
- pub fn intersection(&self, other: &Self) -> Self {
- let a = &self.set;
- let b = &other.set;
-
- let mut i = 0;
- let mut j = 0;
- let mut out = Vec::new();
-
- while i < a.len() && j < b.len() {
- match a[i].cmp(&b[j]) {
- std::cmp::Ordering::Less => i += 1,
- std::cmp::Ordering::Greater => j += 1,
- std::cmp::Ordering::Equal => {
- out.push(a[i]);
- i += 1;
- j += 1;
- }
- }
- }
-
- Self::new(out)
- }
-
- pub fn difference(&self, other: &Self) -> Self {
- let a = &self.set;
- let b = &other.set;
-
- let mut i = 0;
- let mut j = 0;
- let mut out = Vec::new();
-
- while i < a.len() && j < b.len() {
- match a[i].cmp(&b[j]) {
- std::cmp::Ordering::Less => {
- out.push(a[i]);
- i += 1;
- }
- std::cmp::Ordering::Greater => {
- j += 1;
- }
- std::cmp::Ordering::Equal => {
- i += 1;
- j += 1;
- }
- }
- }
-
- out.extend_from_slice(&a[i..]);
- Self::new(out)
- }
-
- pub fn is_empty(&self) -> bool {
- self.set.is_empty()
- }
-
- pub fn len(&self) -> usize {
- self.set.len()
- }
-
- pub fn primary_state(&self) -> Option<usize> {
- self.set.first().cloned()
- }
- }
-
- // custom implementation such that smaller sets come first in a BinaryHeap
- impl Ord for StateSet {
- fn cmp(&self, other: &Self) -> std::cmp::Ordering {
- match self.set.len().cmp(&other.set.len()) {
- std::cmp::Ordering::Equal => {}
- other => return other.reverse(),
- }
- self.set.cmp(&other.set)
- }
- }
-
- impl PartialOrd for StateSet {
- fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
- Some(self.cmp(other))
- }
- }
-}
-
-use state_set::StateSet;
-
-trait GoesTo {
- fn goes_to(&self, to: usize) -> bool;
-}
-impl GoesTo for (&State, ByteRange) {
- fn goes_to(&self, target: usize) -> bool {
- let from = self.0;
- let ch = self.1;
- for (c, &to) in from.trans.iter() {
- if c.overlaps(ch) && to == target {
- return true;
- }
- }
- from.default_trans == target
- }
-}
-
-impl DFA {
- fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet {
- let states = self
- .states
- .iter()
- .enumerate()
- .filter_map(|(i, s)| if f(s) { Some(i) } else { None })
- .collect();
- StateSet::new(states)
- }
-
- /// https://en.wikipedia.org/wiki/DFA_minimization
- fn hopcroft_minimization(&mut self) {
- let mut partitions: HashSet<StateSet> = HashSet::new();
- partitions.insert(self.states_where(|s| s.accept));
- partitions.insert(self.states_where(|s| !s.accept));
-
- let mut ranges: Vec<ByteRange> = self
- .states
- .iter()
- .flat_map(|s| s.trans.iter().map(|t| *t.0))
- .chain([ByteRange::new_range(u8::MIN, u8::MAX)])
- .collect();
- ranges.sort();
- ranges.dedup();
- let ranges = ByteRange::split_to_disjoint(ranges);
-
- let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect();
- let mut queue_set = partitions.clone();
-
- while let Some(a) = queue.pop() {
- if !queue_set.contains(&a) {
- continue;
- }
-
- for &c in ranges.iter() {
- let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a)));
-
- let mut del_list = HashSet::new();
- let mut add_list = Vec::new();
- for y in partitions.iter() {
- let i = x.intersection(y);
- let d = y.difference(&x);
-
- if !i.is_empty() && !d.is_empty() {
- del_list.insert(y.clone());
- add_list.push(i.clone());
- add_list.push(d.clone());
-
- if queue_set.contains(y) {
- queue_set.remove(y);
-
- queue.push(i.clone());
- queue_set.insert(i);
-
- queue.push(d.clone());
- queue_set.insert(d);
- } else if i.len() < d.len() {
- queue.push(i.clone());
- queue_set.insert(i);
- } else {
- queue.push(d.clone());
- queue_set.insert(d);
- }
- }
- }
-
- partitions.retain(|i| !del_list.contains(i));
- for x in add_list {
- partitions.insert(x);
- }
- }
- }
-
- let mut replacement = vec![None; self.states.len()];
- for partition in partitions {
- if let Some(x) = partition.primary_state() {
- for state in partition.iter() {
- assert!(replacement[state].is_none());
- replacement[state] = Some(x);
- }
- }
- }
-
- // replacement indices in original index space
- let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect();
- let is_alive = |idx: usize| replacement[idx] == idx;
-
- // compact index space
- let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()];
- let mut next = 0;
- for i in 0..self.states.len() {
- if is_alive(i) {
- compact[i] = next;
- next += 1;
- }
- }
-
- // remap everything and skip all no-longer-needed states
- let remap = |idx: usize| compact[replacement[idx]];
- let mut new_states = Vec::with_capacity(next);
- for i in 0..self.states.len() {
- if is_alive(i) {
- let s = &self.states[i];
- new_states.push(State {
- trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(),
- default_trans: remap(s.default_trans),
- accept: s.accept,
- });
- }
- }
- self.states = new_states;
- self.start = remap(self.start);
- }
-
- pub fn minify(&mut self) {
- for state in self.states.iter_mut() {
- state.trans.retain(|_, to| *to != state.default_trans);
- }
-
- self.hopcroft_minimization();
- }
-}