diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 14:29:21 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 14:29:21 +0200 |
| commit | 24d41bb4daf081bb9cd63a2107b28b1878594ed3 (patch) | |
| tree | e571d21edd4e20a3f7246bebf805b461c5ed7a43 /src | |
| parent | d39ed8fc77981f937c35fa84a7ff5d288d0c7181 (diff) | |
| download | pish-24d41bb4daf081bb9cd63a2107b28b1878594ed3.tar.gz | |
tried to implement a decision tree for faster dfa, but it is not faster
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex/byte_range.rs | 8 | ||||
| -rw-r--r-- | src/regex/decision_tree.rs | 112 | ||||
| -rw-r--r-- | src/regex/dfa.rs | 26 | ||||
| -rw-r--r-- | src/regex/mod.rs | 1 |
4 files changed, 136 insertions, 11 deletions
diff --git a/src/regex/byte_range.rs b/src/regex/byte_range.rs index d549a55..66c58ad 100644 --- a/src/regex/byte_range.rs +++ b/src/regex/byte_range.rs @@ -36,6 +36,14 @@ impl ByteRange { self.from.max(other.from) <= self.to.min(other.to) } + pub fn lower_bound(&self) -> u8 { + self.from + } + + pub fn upper_bound(&self) -> u8 { + self.to + } + pub fn split_to_disjoint(ranges: Vec<ByteRange>) -> Vec<ByteRange> { if ranges.is_empty() { return vec![]; diff --git a/src/regex/decision_tree.rs b/src/regex/decision_tree.rs new file mode 100644 index 0000000..fed6e70 --- /dev/null +++ b/src/regex/decision_tree.rs @@ -0,0 +1,112 @@ +use std::collections::HashMap; + +use crate::regex::byte_range::ByteRange; + +pub struct DecisionTree<T> { + instrs: Vec<Instr>, + ret_vals: Vec<T>, +} + +#[repr(u32)] +#[derive(Copy, Clone)] +enum Instr { + JumpIfGe(u8, u8), + Ret(u8), +} + +impl<T: Clone> DecisionTree<T> { + pub fn decide(&self, val: u8) -> T { + let mut pc = 0; + loop { + match self.instrs[pc] { + Instr::JumpIfGe(pivot, rela) => { + if val >= pivot { + pc += rela as usize; + } else { + pc += 1; + } + } + Instr::Ret(idx) => return self.ret_vals[idx as usize].clone(), + } + } + } +} + +impl<T: PartialEq + Clone> DecisionTree<T> { + fn add_retval(&mut self, searched_val: &T) -> u8 { + for (i, val) in self.ret_vals.iter().enumerate() { + if val == searched_val { + return i as u8; + } + } + self.ret_vals.push(searched_val.clone()); + (self.ret_vals.len() - 1) as u8 + } + + fn merge(mut self, other: Self) -> Self { + for ins in other.instrs.into_iter() { + match ins { + Instr::JumpIfGe(_, _) => self.instrs.push(ins), + Instr::Ret(i) => { + let i = self.add_retval(&other.ret_vals[i as usize]); + self.instrs.push(Instr::Ret(i)); + } + } + } + self + } + + fn build_inner(ranges: &[(ByteRange, T)]) -> Self { + assert!(!ranges.is_empty()); + if ranges.len() == 1 { + Self { + instrs: vec![Instr::Ret(0)], + ret_vals: vec![ranges[0].1.clone()], + } + } else { + let pivot = ranges.len() / 2; + let a = Self::build_inner(&ranges[..pivot]); + let b = Self::build_inner(&ranges[pivot..]); + let decider = Self { + instrs: vec![Instr::JumpIfGe( + ranges[pivot].0.lower_bound(), + a.instrs.len() as u8 + 1, + )], + ret_vals: Vec::new(), + }; + decider.merge(a).merge(b) + } + } + + pub fn build(map: HashMap<ByteRange, T>, default: T) -> Self { + let mut ranges: Vec<ByteRange> = map.keys().cloned().collect(); + ranges.push(ByteRange::all()); + let ranges = ByteRange::split_to_disjoint(ranges); + for i in 0..ranges.len() - 1 { + assert_eq!(ranges[i].upper_bound() + 1, ranges[i + 1].lower_bound()); + } + + let ranges: Vec<(ByteRange, T)> = ranges + .into_iter() + .map(|r| { + let maps_to = map + .iter() + .filter_map(|(x, t)| if x.overlaps(r) { Some(t.clone()) } else { None }) + .next() + .unwrap_or(default.clone()); + (r, maps_to) + }) + .collect(); + + Self::build_inner(&ranges) + } +} + +impl<T> Default for DecisionTree<T> { + fn default() -> Self { + Self { + instrs: Default::default(), + ret_vals: Default::default(), + } + } +} diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs index 78a216c..5687c8b 100644 --- a/src/regex/dfa.rs +++ b/src/regex/dfa.rs @@ -1,7 +1,7 @@ use core::fmt; use std::collections::{BinaryHeap, HashMap, HashSet}; -use crate::regex::{Match, RegexEngine, enfa::EnfaTranslationError}; +use crate::regex::{Match, RegexEngine, decision_tree::DecisionTree, enfa::EnfaTranslationError}; use super::{ Pattern, @@ -13,6 +13,7 @@ pub type StateId = usize; pub struct State { trans: HashMap<ByteRange, StateId>, + fast_trans: DecisionTree<StateId>, default_trans: StateId, accept: bool, } @@ -67,6 +68,7 @@ impl From<ENFA> for DFA { let mut states: Vec<State> = (0..len) .map(|_| State { + fast_trans: DecisionTree::default(), trans: HashMap::new(), default_trans: void, accept: false, @@ -82,6 +84,10 @@ impl From<ENFA> for DFA { } } + for s in states.iter_mut() { + s.fast_trans = DecisionTree::build(s.trans.clone(), s.default_trans.clone()); + } + let mut this = Self { start: multi_to_dfa[&nfa.start_multi_state()], states, @@ -126,14 +132,8 @@ impl RegexEngine for DFA { impl DFA { pub fn matches(&self, x: &[u8]) -> bool { let mut state = self.start; - 'next_byte: for &b in x.iter() { - for (range, &next_state) in self.states[state].trans.iter() { - if range.contains(b) { - state = next_state; - continue 'next_byte; - } - } - state = self.states[state].default_trans; + for &b in x.iter() { + state = self.states[state].fast_trans.decide(b); } self.states[state].accept } @@ -360,9 +360,13 @@ impl DFA { for i in 0..self.states.len() { if is_alive(i) { let s = &self.states[i]; + let trans: HashMap<ByteRange, usize> = + s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(); + let default_trans = remap(s.default_trans); new_states.push(State { - trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(), - default_trans: remap(s.default_trans), + fast_trans: DecisionTree::build(trans.clone(), default_trans), + trans, + default_trans, accept: s.accept, }); } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index 438361a..f10a92d 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -3,6 +3,7 @@ mod byte_range; pub mod dfa; pub mod enfa; pub mod simple; +mod decision_tree; #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookDirection { |
