From 1ff733a821a947880baaad42aab43a398285418c Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Sat, 6 Jun 2026 10:15:16 +0200 Subject: regex: benchmark and optimizations --- src/parse/regex/bc.rs | 82 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 24 deletions(-) (limited to 'src/parse/regex') diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs index 72ba21b..cb41607 100644 --- a/src/parse/regex/bc.rs +++ b/src/parse/regex/bc.rs @@ -115,41 +115,45 @@ struct Thread { struct VM<'p, F: Flavor> { instr: &'p [Instr], - threads: Vec>, + passive_threads: VecDeque>, + active_threads: VecDeque>, hot: BitSet, + warm: BitSet, } impl<'p, F: Flavor> VM<'p, F> { fn new(instr: &'p [Instr], starting_thread: Thread) -> Self { Self { instr, - threads: vec![starting_thread], + passive_threads: vec![starting_thread].into(), + active_threads: VecDeque::new(), hot: BitSet::new(instr.len()), + warm: BitSet::new(instr.len()), } } fn step_epsilon<'a>(&mut self, sd: &mut F::StepData<'a, 'p>) { - let mut threads: VecDeque<_> = self.threads.drain(..).collect(); + std::mem::swap(&mut self.active_threads, &mut self.passive_threads); self.hot.set_all(false); + self.warm.set_all(false); - let mut warm = self.hot.clone(); macro_rules! add_thread { ($t:expr) => {{ let t = $t; let bit = t.pc as usize; - if !warm.get(bit) { - warm.set(bit, true); - threads.push_front(t); + if !self.warm.get(bit) { + self.warm.set(bit, true); + self.active_threads.push_front(t); } }}; } - while let Some(mut thread) = threads.pop_front() { + while let Some(mut thread) = self.active_threads.pop_front() { match self.instr[thread.pc as usize] { Instr::Class(_) | Instr::Consume(_) => { if !self.hot.get(thread.pc as usize) { self.hot.set(thread.pc as usize, true); - self.threads.push(thread); + self.passive_threads.push_back(thread); } } Instr::Jump(j) => { @@ -178,7 +182,7 @@ impl<'p, F: Flavor> VM<'p, F> { fn step_consume(&mut self, byte: u8) { self.hot.set_all(false); - self.threads + self.passive_threads .retain_mut(|thread| match self.instr[thread.pc as usize] { Instr::Class(class) => { if class.matches(byte) { @@ -254,12 +258,16 @@ struct VirtualMachine<'a> { } impl<'a> VirtualMachine<'a> { - fn step_epsilon(&mut self, loc: usize) { - self.vm0.step_epsilon(&mut ()); + fn step_epsilon_1(&mut self, loc: usize) { self.vm1 .step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2)); } + fn step_epsilon(&mut self, loc: usize) { + self.vm0.step_epsilon(&mut ()); + self.step_epsilon_1(loc); + } + fn step_consume(&mut self, byte: u8) { self.vm0.step_consume(byte); self.vm1.step_consume(byte); @@ -272,7 +280,7 @@ impl<'a> VirtualMachine<'a> { fn extract_match(&self) -> Option { self.vm1 - .threads + .passive_threads .iter() .filter(|t| self.accepting.get(t.pc as usize)) .map(|t| { @@ -302,6 +310,7 @@ pub struct BytecodeCompiledRegex { instrs0: Box<[Instr]>, instrs1: Box<[Instr]>, instrs2: Box<[Instr]>, + no_lookbehind: bool, submatch_count: usize, accepting: BitSet, } @@ -333,10 +342,18 @@ impl BytecodeCompiledRegex { vm2, accepting: &self.accepting, }; - for (i, ch) in data.iter().cloned().enumerate() { - vm.step(ch, i); + if self.no_lookbehind { + for (i, ch) in data.iter().cloned().enumerate() { + vm.step_epsilon_1(i); + vm.vm1.step_consume(ch); + } + vm.step_epsilon(data.len()); + } else { + for (i, ch) in data.iter().cloned().enumerate() { + vm.step(ch, i); + } + vm.step_epsilon(data.len()); } - vm.step_epsilon(data.len()); vm.extract_match() } @@ -602,6 +619,7 @@ impl TryFrom for BytecodeCompiledRegex { accepting.set(final_state, true); Ok(Self { + no_lookbehind: neg.map.is_empty(), instrs0: neg.instrs.into(), instrs1: instrs.into(), instrs2: pos.instrs.into(), @@ -640,7 +658,9 @@ mod tests { 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..1 ); } @@ -653,7 +673,9 @@ mod tests { 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..5 ); } @@ -662,11 +684,15 @@ mod tests { fn nongreedy_plus() { let re = regex("(ab+?)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..2 ); } @@ -679,7 +705,9 @@ mod tests { 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..5 ); } @@ -688,11 +716,15 @@ mod tests { fn nongreedy_qm() { let re = regex("(ab??)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..1 ); } @@ -705,7 +737,9 @@ mod tests { 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbbbb").unwrap().submatches[0] + .clone() + .unwrap(), 0..2 ); } -- cgit v1.2.3