aboutsummaryrefslogtreecommitdiffstats
path: root/src/regex/dfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex/dfa.rs')
-rw-r--r--src/regex/dfa.rs381
1 files changed, 381 insertions, 0 deletions
diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs
new file mode 100644
index 0000000..c55d99d
--- /dev/null
+++ b/src/regex/dfa.rs
@@ -0,0 +1,381 @@
+use core::fmt;
+use std::collections::{BinaryHeap, HashMap, HashSet};
+
+use crate::regex::{Match, RegexEngine, enfa::EnfaTranslationError};
+
+use super::{
+ Pattern,
+ byte_range::ByteRange,
+ enfa::{ENFA, MultiState},
+};
+
+pub type StateId = usize;
+
+pub struct State {
+ trans: HashMap<ByteRange, StateId>,
+ default_trans: StateId,
+ accept: bool,
+}
+
+#[allow(clippy::upper_case_acronyms)]
+pub struct DFA {
+ start: StateId,
+ states: Vec<State>,
+}
+
+impl fmt::Debug for DFA {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ writeln!(f, "DFA {{")?;
+ for (i, s) in self.states.iter().enumerate() {
+ if self.start == i {
+ write!(f, "-> {i}: ")?;
+ } else {
+ write!(f, " {i}: ")?;
+ }
+
+ for (chr, to) in s.trans.iter() {
+ write!(f, "{chr:?} to {to}, ")?;
+ }
+
+ write!(f, "dfl to {}", s.default_trans)?;
+ if s.accept {
+ write!(f, ", accept")?;
+ }
+ writeln!(f)?;
+ }
+ writeln!(f, "}}")
+ }
+}
+
+impl From<ENFA> for DFA {
+ fn from(mut nfa: ENFA) -> Self {
+ nfa.remove_unreachable();
+
+ let mut multi_states = nfa.all_multi_states();
+ multi_states.insert(nfa.void_multi_state());
+ let mut len = 0;
+ let multi_to_dfa: HashMap<MultiState, StateId> = multi_states
+ .clone()
+ .into_iter()
+ .map(|ms| {
+ len += 1;
+ (ms, len - 1)
+ })
+ .collect();
+
+ let void = multi_to_dfa[&nfa.void_multi_state()];
+
+ let mut states: Vec<State> = (0..len)
+ .map(|_| State {
+ trans: HashMap::new(),
+ default_trans: void,
+ accept: false,
+ })
+ .collect();
+
+ for ms in multi_states.iter() {
+ let i: usize = multi_to_dfa[ms];
+ states[i].accept = ms.accept();
+ for t in ms.possible_transitions() {
+ let k = multi_to_dfa[&ms.transition(t)];
+ states[i].trans.insert(t, k);
+ }
+ }
+
+ let mut this = Self {
+ start: multi_to_dfa[&nfa.start_multi_state()],
+ states,
+ };
+ this.minify();
+ this
+ }
+}
+
+#[derive(Clone, Debug)]
+pub enum DFACompileError {
+ SubmatchesNotSupported,
+ #[allow(unused)]
+ NFAError(EnfaTranslationError),
+}
+
+impl RegexEngine for DFA {
+ type CompileError = DFACompileError;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ match ENFA::try_from(pat) {
+ Ok(nfa) => {
+ if nfa.has_submatches {
+ Err(DFACompileError::SubmatchesNotSupported)
+ } else {
+ Ok(Self::from(nfa))
+ }
+ }
+ Err(e) => Err(DFACompileError::NFAError(e)),
+ }
+ }
+
+ fn run(&self, input: &[u8]) -> Option<super::Match> {
+ if self.matches(input) {
+ Some(Match::new_empty())
+ } else {
+ None
+ }
+ }
+}
+
+impl DFA {
+ pub fn matches(&self, x: &[u8]) -> bool {
+ let mut state = self.start;
+ 'next_byte: for &b in x.iter() {
+ for (range, &next_state) in self.states[state].trans.iter() {
+ if range.contains(b) {
+ state = next_state;
+ continue 'next_byte;
+ }
+ }
+ state = self.states[state].default_trans;
+ }
+ self.states[state].accept
+ }
+}
+
+mod state_set {
+ #[derive(Hash, Clone, PartialEq, Eq)]
+ pub struct StateSet {
+ set: Vec<usize>,
+ }
+
+ impl StateSet {
+ pub fn new(mut set: Vec<usize>) -> Self {
+ set.sort();
+ set.dedup();
+ Self { set }
+ }
+
+ pub fn iter(&self) -> impl Iterator<Item = usize> {
+ self.set.iter().cloned()
+ }
+
+ pub fn intersection(&self, other: &Self) -> Self {
+ let a = &self.set;
+ let b = &other.set;
+
+ let mut i = 0;
+ let mut j = 0;
+ let mut out = Vec::new();
+
+ while i < a.len() && j < b.len() {
+ match a[i].cmp(&b[j]) {
+ std::cmp::Ordering::Less => i += 1,
+ std::cmp::Ordering::Greater => j += 1,
+ std::cmp::Ordering::Equal => {
+ out.push(a[i]);
+ i += 1;
+ j += 1;
+ }
+ }
+ }
+
+ Self::new(out)
+ }
+
+ pub fn difference(&self, other: &Self) -> Self {
+ let a = &self.set;
+ let b = &other.set;
+
+ let mut i = 0;
+ let mut j = 0;
+ let mut out = Vec::new();
+
+ while i < a.len() && j < b.len() {
+ match a[i].cmp(&b[j]) {
+ std::cmp::Ordering::Less => {
+ out.push(a[i]);
+ i += 1;
+ }
+ std::cmp::Ordering::Greater => {
+ j += 1;
+ }
+ std::cmp::Ordering::Equal => {
+ i += 1;
+ j += 1;
+ }
+ }
+ }
+
+ out.extend_from_slice(&a[i..]);
+ Self::new(out)
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.set.is_empty()
+ }
+
+ pub fn len(&self) -> usize {
+ self.set.len()
+ }
+
+ pub fn primary_state(&self) -> Option<usize> {
+ self.set.first().cloned()
+ }
+ }
+
+ // custom implementation such that smaller sets come first in a BinaryHeap
+ impl Ord for StateSet {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ match self.set.len().cmp(&other.set.len()) {
+ std::cmp::Ordering::Equal => {}
+ other => return other.reverse(),
+ }
+ self.set.cmp(&other.set)
+ }
+ }
+
+ impl PartialOrd for StateSet {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+ }
+}
+
+use state_set::StateSet;
+
+trait GoesTo {
+ fn goes_to(&self, to: usize) -> bool;
+}
+impl GoesTo for (&State, ByteRange) {
+ fn goes_to(&self, target: usize) -> bool {
+ let from = self.0;
+ let ch = self.1;
+ for (c, &to) in from.trans.iter() {
+ if c.overlaps(ch) && to == target {
+ return true;
+ }
+ }
+ from.default_trans == target
+ }
+}
+
+impl DFA {
+ fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet {
+ let states = self
+ .states
+ .iter()
+ .enumerate()
+ .filter_map(|(i, s)| if f(s) { Some(i) } else { None })
+ .collect();
+ StateSet::new(states)
+ }
+
+ /// https://en.wikipedia.org/wiki/DFA_minimization
+ fn hopcroft_minimization(&mut self) {
+ let mut partitions: HashSet<StateSet> = HashSet::new();
+ partitions.insert(self.states_where(|s| s.accept));
+ partitions.insert(self.states_where(|s| !s.accept));
+
+ let mut ranges: Vec<ByteRange> = self
+ .states
+ .iter()
+ .flat_map(|s| s.trans.iter().map(|t| *t.0))
+ .chain([ByteRange::new_range(u8::MIN, u8::MAX)])
+ .collect();
+ ranges.sort();
+ ranges.dedup();
+ let ranges = ByteRange::split_to_disjoint(ranges);
+
+ let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect();
+ let mut queue_set = partitions.clone();
+
+ while let Some(a) = queue.pop() {
+ if !queue_set.contains(&a) {
+ continue;
+ }
+
+ for &c in ranges.iter() {
+ let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a)));
+
+ let mut del_list = HashSet::new();
+ let mut add_list = Vec::new();
+ for y in partitions.iter() {
+ let i = x.intersection(y);
+ let d = y.difference(&x);
+
+ if !i.is_empty() && !d.is_empty() {
+ del_list.insert(y.clone());
+ add_list.push(i.clone());
+ add_list.push(d.clone());
+
+ if queue_set.contains(y) {
+ queue_set.remove(y);
+
+ queue.push(i.clone());
+ queue_set.insert(i);
+
+ queue.push(d.clone());
+ queue_set.insert(d);
+ } else if i.len() < d.len() {
+ queue.push(i.clone());
+ queue_set.insert(i);
+ } else {
+ queue.push(d.clone());
+ queue_set.insert(d);
+ }
+ }
+ }
+
+ partitions.retain(|i| !del_list.contains(i));
+ for x in add_list {
+ partitions.insert(x);
+ }
+ }
+ }
+
+ let mut replacement = vec![None; self.states.len()];
+ for partition in partitions {
+ if let Some(x) = partition.primary_state() {
+ for state in partition.iter() {
+ assert!(replacement[state].is_none());
+ replacement[state] = Some(x);
+ }
+ }
+ }
+
+ // replacement indices in original index space
+ let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect();
+ let is_alive = |idx: usize| replacement[idx] == idx;
+
+ // compact index space
+ let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()];
+ let mut next = 0;
+ for i in 0..self.states.len() {
+ if is_alive(i) {
+ compact[i] = next;
+ next += 1;
+ }
+ }
+
+ // remap everything and skip all no-longer-needed states
+ let remap = |idx: usize| compact[replacement[idx]];
+ let mut new_states = Vec::with_capacity(next);
+ for i in 0..self.states.len() {
+ if is_alive(i) {
+ let s = &self.states[i];
+ new_states.push(State {
+ trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(),
+ default_trans: remap(s.default_trans),
+ accept: s.accept,
+ });
+ }
+ }
+ self.states = new_states;
+ self.start = remap(self.start);
+ }
+
+ pub fn minify(&mut self) {
+ for state in self.states.iter_mut() {
+ state.trans.retain(|_, to| *to != state.default_trans);
+ }
+
+ self.hopcroft_minimization();
+ }
+}