aboutsummaryrefslogtreecommitdiffstats
path: root/src/regex/dfa.rs
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-06 20:49:07 +0200
committerJonas Maier <jonas@x77.dev>2026-06-06 20:49:07 +0200
commit150c732eefce15f142681b99ccdc8e22e9a05e04 (patch)
tree249d6ef2b88876aab24d38b0136493072758e9b8 /src/regex/dfa.rs
parent24d41bb4daf081bb9cd63a2107b28b1878594ed3 (diff)
downloadpish-150c732eefce15f142681b99ccdc8e22e9a05e04.tar.gz
regex: perf
Diffstat (limited to 'src/regex/dfa.rs')
-rw-r--r--src/regex/dfa.rs156
1 files changed, 112 insertions, 44 deletions
diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs
index 5687c8b..0e47b89 100644
--- a/src/regex/dfa.rs
+++ b/src/regex/dfa.rs
@@ -11,20 +11,64 @@ use super::{
pub type StateId = usize;
-pub struct State {
+trait DfaDecider: From<HashMapDecider> {
+ fn decide(&self, byte: u8) -> StateId;
+}
+
+pub struct HashMapDecider {
trans: HashMap<ByteRange, StateId>,
- fast_trans: DecisionTree<StateId>,
default_trans: StateId,
+}
+
+impl DfaDecider for HashMapDecider {
+ fn decide(&self, byte: u8) -> StateId {
+ for (range, to) in self.trans.iter() {
+ if range.contains(byte) {
+ return *to;
+ }
+ }
+ self.default_trans
+ }
+}
+
+impl From<HashMapDecider> for DecisionTree<StateId> {
+ fn from(value: HashMapDecider) -> Self {
+ DecisionTree::build(value.trans, value.default_trans)
+ }
+}
+impl DfaDecider for DecisionTree<StateId> {
+ fn decide(&self, byte: u8) -> StateId {
+ DecisionTree::decide(&self, byte)
+ }
+}
+
+impl From<HashMapDecider> for [u32; 256] {
+ fn from(value: HashMapDecider) -> Self {
+ let mut table = [0; 256];
+ for i in 0..256 {
+ table[i] = value.decide(i as u8) as u32;
+ }
+ table
+ }
+}
+impl DfaDecider for [u32; 256] {
+ fn decide(&self, byte: u8) -> StateId {
+ self[byte as usize] as StateId
+ }
+}
+
+pub struct State<D> {
+ decision: D,
accept: bool,
}
#[allow(clippy::upper_case_acronyms)]
-pub struct DFA {
+pub struct DFA<D = HashMapDecider> {
start: StateId,
- states: Vec<State>,
+ states: Vec<State<D>>,
}
-impl fmt::Debug for DFA {
+impl fmt::Debug for DFA<HashMapDecider> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "DFA {{")?;
for (i, s) in self.states.iter().enumerate() {
@@ -34,11 +78,11 @@ impl fmt::Debug for DFA {
write!(f, " {i}: ")?;
}
- for (chr, to) in s.trans.iter() {
+ for (chr, to) in s.decision.trans.iter() {
write!(f, "{chr:?} to {to}, ")?;
}
- write!(f, "dfl to {}", s.default_trans)?;
+ write!(f, "dfl to {}", s.decision.default_trans)?;
if s.accept {
write!(f, ", accept")?;
}
@@ -66,11 +110,12 @@ impl From<ENFA> for DFA {
let void = multi_to_dfa[&nfa.void_multi_state()];
- let mut states: Vec<State> = (0..len)
+ let mut states: Vec<State<_>> = (0..len)
.map(|_| State {
- fast_trans: DecisionTree::default(),
- trans: HashMap::new(),
- default_trans: void,
+ decision: HashMapDecider {
+ trans: HashMap::new(),
+ default_trans: void,
+ },
accept: false,
})
.collect();
@@ -80,14 +125,10 @@ impl From<ENFA> for DFA {
states[i].accept = ms.accept();
for t in ms.possible_transitions() {
let k = multi_to_dfa[&ms.transition(t)];
- states[i].trans.insert(t, k);
+ states[i].decision.trans.insert(t, k);
}
}
- for s in states.iter_mut() {
- s.fast_trans = DecisionTree::build(s.trans.clone(), s.default_trans.clone());
- }
-
let mut this = Self {
start: multi_to_dfa[&nfa.start_multi_state()],
states,
@@ -104,7 +145,7 @@ pub enum DFACompileError {
NFAError(EnfaTranslationError),
}
-impl RegexEngine for DFA {
+impl<D: DfaDecider> RegexEngine for DFA<D> {
type CompileError = DFACompileError;
fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
@@ -113,7 +154,18 @@ impl RegexEngine for DFA {
if nfa.has_submatches {
Err(DFACompileError::SubmatchesNotSupported)
} else {
- Ok(Self::from(nfa))
+ let this = DFA::from(nfa);
+ Ok(Self {
+ states: this
+ .states
+ .into_iter()
+ .map(|s| State {
+ decision: D::from(s.decision),
+ accept: s.accept,
+ })
+ .collect(),
+ start: this.start,
+ })
}
}
Err(e) => Err(DFACompileError::NFAError(e)),
@@ -121,7 +173,24 @@ impl RegexEngine for DFA {
}
fn run(&self, input: &[u8]) -> Option<super::Match> {
- if self.matches(input) {
+ let mut state = self.start;
+ let mut i = 0;
+ while i + 7 < input.len() {
+ state = self.states[state].decision.decide(input[i+0]);
+ state = self.states[state].decision.decide(input[i+1]);
+ state = self.states[state].decision.decide(input[i+2]);
+ state = self.states[state].decision.decide(input[i+3]);
+ state = self.states[state].decision.decide(input[i+4]);
+ state = self.states[state].decision.decide(input[i+5]);
+ state = self.states[state].decision.decide(input[i+6]);
+ state = self.states[state].decision.decide(input[i+7]);
+ i += 8;
+ }
+ while i < input.len() {
+ state = self.states[state].decision.decide(input[i]);
+ i += 1;
+ }
+ if self.states[state].accept {
Some(Match::new_empty())
} else {
None
@@ -129,16 +198,6 @@ impl RegexEngine for DFA {
}
}
-impl DFA {
- pub fn matches(&self, x: &[u8]) -> bool {
- let mut state = self.start;
- for &b in x.iter() {
- state = self.states[state].fast_trans.decide(b);
- }
- self.states[state].accept
- }
-}
-
mod state_set {
#[derive(Hash, Clone, PartialEq, Eq)]
pub struct StateSet {
@@ -243,21 +302,21 @@ use state_set::StateSet;
trait GoesTo {
fn goes_to(&self, to: usize) -> bool;
}
-impl GoesTo for (&State, ByteRange) {
+impl GoesTo for (&State<HashMapDecider>, ByteRange) {
fn goes_to(&self, target: usize) -> bool {
let from = self.0;
let ch = self.1;
- for (c, &to) in from.trans.iter() {
+ for (c, &to) in from.decision.trans.iter() {
if c.overlaps(ch) && to == target {
return true;
}
}
- from.default_trans == target
+ from.decision.default_trans == target
}
}
impl DFA {
- fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet {
+ fn states_where(&self, mut f: impl FnMut(&State<HashMapDecider>) -> bool) -> StateSet {
let states = self
.states
.iter()
@@ -276,7 +335,7 @@ impl DFA {
let mut ranges: Vec<ByteRange> = self
.states
.iter()
- .flat_map(|s| s.trans.iter().map(|t| *t.0))
+ .flat_map(|s| s.decision.trans.iter().map(|t| *t.0))
.chain([ByteRange::new_range(u8::MIN, u8::MAX)])
.collect();
ranges.sort();
@@ -360,13 +419,18 @@ impl DFA {
for i in 0..self.states.len() {
if is_alive(i) {
let s = &self.states[i];
- let trans: HashMap<ByteRange, usize> =
- s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect();
- let default_trans = remap(s.default_trans);
+ let trans: HashMap<ByteRange, usize> = s
+ .decision
+ .trans
+ .iter()
+ .map(|(&ch, &to)| (ch, remap(to)))
+ .collect();
+ let default_trans = remap(s.decision.default_trans);
new_states.push(State {
- fast_trans: DecisionTree::build(trans.clone(), default_trans),
- trans,
- default_trans,
+ decision: HashMapDecider {
+ trans,
+ default_trans,
+ },
accept: s.accept,
});
}
@@ -377,15 +441,19 @@ impl DFA {
pub fn minify(&mut self) {
for state in self.states.iter_mut() {
- state.trans.retain(|_, to| *to != state.default_trans);
- if state.trans.len() == 1
+ state
+ .decision
+ .trans
+ .retain(|_, to| *to != state.decision.default_trans);
+ if state.decision.trans.len() == 1
&& state
+ .decision
.trans
.iter()
.all(|t| *t.0 == ByteRange::new_range(0, 255))
{
- state.default_trans = state.trans.iter().map(|x| *x.1).next().unwrap();
- state.trans.clear();
+ state.decision.default_trans = state.decision.trans.iter().map(|x| *x.1).next().unwrap();
+ state.decision.trans.clear();
}
}