aboutsummaryrefslogtreecommitdiffstats
path: root/src/regex
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-06 20:49:07 +0200
committerJonas Maier <jonas@x77.dev>2026-06-06 20:49:07 +0200
commit150c732eefce15f142681b99ccdc8e22e9a05e04 (patch)
tree249d6ef2b88876aab24d38b0136493072758e9b8 /src/regex
parent24d41bb4daf081bb9cd63a2107b28b1878594ed3 (diff)
downloadpish-150c732eefce15f142681b99ccdc8e22e9a05e04.tar.gz
regex: perf
Diffstat (limited to 'src/regex')
-rw-r--r--src/regex/bc.rs63
-rw-r--r--src/regex/dfa.rs156
-rw-r--r--src/regex/mod.rs3
3 files changed, 152 insertions, 70 deletions
diff --git a/src/regex/bc.rs b/src/regex/bc.rs
index 6e55c7d..5de1513 100644
--- a/src/regex/bc.rs
+++ b/src/regex/bc.rs
@@ -147,34 +147,45 @@ impl<'p, F: Flavor> VM<'p, F> {
}};
}
+ macro_rules! continue_thread {
+ ($t:expr) => {{
+ let bit = $t.pc as usize;
+ if !self.warm.get(bit) {
+ self.warm.set(bit, true);
+ continue;
+ }
+ }};
+ }
+
while let Some(mut thread) = self.active_threads.pop_front() {
- match self.instr[thread.pc as usize] {
- Instr::Class(_) | Instr::Consume(_) => {
- if !self.hot.get(thread.pc as usize) {
- self.hot.set(thread.pc as usize, true);
- self.passive_threads.push_back(thread);
+ loop {
+ match self.instr[thread.pc as usize] {
+ Instr::Class(_) | Instr::Consume(_) => {
+ if !self.hot.get(thread.pc as usize) {
+ self.hot.set(thread.pc as usize, true);
+ self.passive_threads.push_back(thread);
+ }
}
- }
- Instr::Jump(j) => {
- thread.pc = j;
- add_thread!(thread);
- }
- Instr::Fork(a, b) => {
- add_thread!(Thread {
- pc: b,
- data: thread.data.clone(),
- });
- add_thread!(Thread {
- pc: a,
- data: thread.data.clone(),
- });
- }
- Instr::Custom(instr) => {
- if F::accepts(&mut thread, instr, sd) {
- thread.pc += 1;
- add_thread!(thread);
+ Instr::Jump(j) => {
+ thread.pc = j;
+ continue_thread!(thread);
+ }
+ Instr::Fork(a, b) => {
+ add_thread!(Thread {
+ pc: b,
+ data: thread.data.clone(),
+ });
+ thread.pc = a;
+ continue_thread!(thread);
+ }
+ Instr::Custom(instr) => {
+ if F::accepts(&mut thread, instr, sd) {
+ thread.pc += 1;
+ continue_thread!(thread);
+ }
}
}
+ break;
}
}
}
@@ -257,21 +268,25 @@ struct VirtualMachine<'a> {
}
impl<'a> VirtualMachine<'a> {
+ #[inline]
fn step_epsilon_1(&mut self, loc: usize) {
self.vm1
.step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2));
}
+ #[inline]
fn step_epsilon(&mut self, loc: usize) {
self.vm0.step_epsilon(&mut ());
self.step_epsilon_1(loc);
}
+ #[inline]
fn step_consume(&mut self, byte: u8) {
self.vm0.step_consume(byte);
self.vm1.step_consume(byte);
}
+ #[inline]
fn step(&mut self, byte: u8, loc: usize) {
self.step_epsilon(loc);
self.step_consume(byte);
diff --git a/src/regex/dfa.rs b/src/regex/dfa.rs
index 5687c8b..0e47b89 100644
--- a/src/regex/dfa.rs
+++ b/src/regex/dfa.rs
@@ -11,20 +11,64 @@ use super::{
pub type StateId = usize;
-pub struct State {
+trait DfaDecider: From<HashMapDecider> {
+ fn decide(&self, byte: u8) -> StateId;
+}
+
+pub struct HashMapDecider {
trans: HashMap<ByteRange, StateId>,
- fast_trans: DecisionTree<StateId>,
default_trans: StateId,
+}
+
+impl DfaDecider for HashMapDecider {
+ fn decide(&self, byte: u8) -> StateId {
+ for (range, to) in self.trans.iter() {
+ if range.contains(byte) {
+ return *to;
+ }
+ }
+ self.default_trans
+ }
+}
+
+impl From<HashMapDecider> for DecisionTree<StateId> {
+ fn from(value: HashMapDecider) -> Self {
+ DecisionTree::build(value.trans, value.default_trans)
+ }
+}
+impl DfaDecider for DecisionTree<StateId> {
+ fn decide(&self, byte: u8) -> StateId {
+ DecisionTree::decide(&self, byte)
+ }
+}
+
+impl From<HashMapDecider> for [u32; 256] {
+ fn from(value: HashMapDecider) -> Self {
+ let mut table = [0; 256];
+ for i in 0..256 {
+ table[i] = value.decide(i as u8) as u32;
+ }
+ table
+ }
+}
+impl DfaDecider for [u32; 256] {
+ fn decide(&self, byte: u8) -> StateId {
+ self[byte as usize] as StateId
+ }
+}
+
+pub struct State<D> {
+ decision: D,
accept: bool,
}
#[allow(clippy::upper_case_acronyms)]
-pub struct DFA {
+pub struct DFA<D = HashMapDecider> {
start: StateId,
- states: Vec<State>,
+ states: Vec<State<D>>,
}
-impl fmt::Debug for DFA {
+impl fmt::Debug for DFA<HashMapDecider> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(f, "DFA {{")?;
for (i, s) in self.states.iter().enumerate() {
@@ -34,11 +78,11 @@ impl fmt::Debug for DFA {
write!(f, " {i}: ")?;
}
- for (chr, to) in s.trans.iter() {
+ for (chr, to) in s.decision.trans.iter() {
write!(f, "{chr:?} to {to}, ")?;
}
- write!(f, "dfl to {}", s.default_trans)?;
+ write!(f, "dfl to {}", s.decision.default_trans)?;
if s.accept {
write!(f, ", accept")?;
}
@@ -66,11 +110,12 @@ impl From<ENFA> for DFA {
let void = multi_to_dfa[&nfa.void_multi_state()];
- let mut states: Vec<State> = (0..len)
+ let mut states: Vec<State<_>> = (0..len)
.map(|_| State {
- fast_trans: DecisionTree::default(),
- trans: HashMap::new(),
- default_trans: void,
+ decision: HashMapDecider {
+ trans: HashMap::new(),
+ default_trans: void,
+ },
accept: false,
})
.collect();
@@ -80,14 +125,10 @@ impl From<ENFA> for DFA {
states[i].accept = ms.accept();
for t in ms.possible_transitions() {
let k = multi_to_dfa[&ms.transition(t)];
- states[i].trans.insert(t, k);
+ states[i].decision.trans.insert(t, k);
}
}
- for s in states.iter_mut() {
- s.fast_trans = DecisionTree::build(s.trans.clone(), s.default_trans.clone());
- }
-
let mut this = Self {
start: multi_to_dfa[&nfa.start_multi_state()],
states,
@@ -104,7 +145,7 @@ pub enum DFACompileError {
NFAError(EnfaTranslationError),
}
-impl RegexEngine for DFA {
+impl<D: DfaDecider> RegexEngine for DFA<D> {
type CompileError = DFACompileError;
fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
@@ -113,7 +154,18 @@ impl RegexEngine for DFA {
if nfa.has_submatches {
Err(DFACompileError::SubmatchesNotSupported)
} else {
- Ok(Self::from(nfa))
+ let this = DFA::from(nfa);
+ Ok(Self {
+ states: this
+ .states
+ .into_iter()
+ .map(|s| State {
+ decision: D::from(s.decision),
+ accept: s.accept,
+ })
+ .collect(),
+ start: this.start,
+ })
}
}
Err(e) => Err(DFACompileError::NFAError(e)),
@@ -121,7 +173,24 @@ impl RegexEngine for DFA {
}
fn run(&self, input: &[u8]) -> Option<super::Match> {
- if self.matches(input) {
+ let mut state = self.start;
+ let mut i = 0;
+ while i + 7 < input.len() {
+ state = self.states[state].decision.decide(input[i+0]);
+ state = self.states[state].decision.decide(input[i+1]);
+ state = self.states[state].decision.decide(input[i+2]);
+ state = self.states[state].decision.decide(input[i+3]);
+ state = self.states[state].decision.decide(input[i+4]);
+ state = self.states[state].decision.decide(input[i+5]);
+ state = self.states[state].decision.decide(input[i+6]);
+ state = self.states[state].decision.decide(input[i+7]);
+ i += 8;
+ }
+ while i < input.len() {
+ state = self.states[state].decision.decide(input[i]);
+ i += 1;
+ }
+ if self.states[state].accept {
Some(Match::new_empty())
} else {
None
@@ -129,16 +198,6 @@ impl RegexEngine for DFA {
}
}
-impl DFA {
- pub fn matches(&self, x: &[u8]) -> bool {
- let mut state = self.start;
- for &b in x.iter() {
- state = self.states[state].fast_trans.decide(b);
- }
- self.states[state].accept
- }
-}
-
mod state_set {
#[derive(Hash, Clone, PartialEq, Eq)]
pub struct StateSet {
@@ -243,21 +302,21 @@ use state_set::StateSet;
trait GoesTo {
fn goes_to(&self, to: usize) -> bool;
}
-impl GoesTo for (&State, ByteRange) {
+impl GoesTo for (&State<HashMapDecider>, ByteRange) {
fn goes_to(&self, target: usize) -> bool {
let from = self.0;
let ch = self.1;
- for (c, &to) in from.trans.iter() {
+ for (c, &to) in from.decision.trans.iter() {
if c.overlaps(ch) && to == target {
return true;
}
}
- from.default_trans == target
+ from.decision.default_trans == target
}
}
impl DFA {
- fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet {
+ fn states_where(&self, mut f: impl FnMut(&State<HashMapDecider>) -> bool) -> StateSet {
let states = self
.states
.iter()
@@ -276,7 +335,7 @@ impl DFA {
let mut ranges: Vec<ByteRange> = self
.states
.iter()
- .flat_map(|s| s.trans.iter().map(|t| *t.0))
+ .flat_map(|s| s.decision.trans.iter().map(|t| *t.0))
.chain([ByteRange::new_range(u8::MIN, u8::MAX)])
.collect();
ranges.sort();
@@ -360,13 +419,18 @@ impl DFA {
for i in 0..self.states.len() {
if is_alive(i) {
let s = &self.states[i];
- let trans: HashMap<ByteRange, usize> =
- s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect();
- let default_trans = remap(s.default_trans);
+ let trans: HashMap<ByteRange, usize> = s
+ .decision
+ .trans
+ .iter()
+ .map(|(&ch, &to)| (ch, remap(to)))
+ .collect();
+ let default_trans = remap(s.decision.default_trans);
new_states.push(State {
- fast_trans: DecisionTree::build(trans.clone(), default_trans),
- trans,
- default_trans,
+ decision: HashMapDecider {
+ trans,
+ default_trans,
+ },
accept: s.accept,
});
}
@@ -377,15 +441,19 @@ impl DFA {
pub fn minify(&mut self) {
for state in self.states.iter_mut() {
- state.trans.retain(|_, to| *to != state.default_trans);
- if state.trans.len() == 1
+ state
+ .decision
+ .trans
+ .retain(|_, to| *to != state.decision.default_trans);
+ if state.decision.trans.len() == 1
&& state
+ .decision
.trans
.iter()
.all(|t| *t.0 == ByteRange::new_range(0, 255))
{
- state.default_trans = state.trans.iter().map(|x| *x.1).next().unwrap();
- state.trans.clear();
+ state.decision.default_trans = state.decision.trans.iter().map(|x| *x.1).next().unwrap();
+ state.decision.trans.clear();
}
}
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
index f10a92d..cdbfe0d 100644
--- a/src/regex/mod.rs
+++ b/src/regex/mod.rs
@@ -3,7 +3,7 @@ mod byte_range;
pub mod dfa;
pub mod enfa;
pub mod simple;
-mod decision_tree;
+pub mod decision_tree;
#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
pub enum LookDirection {
@@ -284,7 +284,6 @@ impl RegexEngine for CompiledPattern {
macro_rules! all_engines {
($ty_name:ident, $($x:ident : $ty:ty,)*) => {
- #[derive(Debug)]
pub struct $ty_name {
$(pub $x: Option<$ty>,)*
}