aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-06 10:15:16 +0200
committerJonas Maier <jonas@x77.dev>2026-06-06 10:15:16 +0200
commit1ff733a821a947880baaad42aab43a398285418c (patch)
treed698de85301a7f9a49f72d0e6dd2ee0a145a7fde /src/parse/regex
parent4eeb79c190bd08e761d097cd10382f09a9ee4348 (diff)
downloadpish-1ff733a821a947880baaad42aab43a398285418c.tar.gz
regex: benchmark and optimizations
Diffstat (limited to 'src/parse/regex')
-rw-r--r--src/parse/regex/bc.rs82
1 files changed, 58 insertions, 24 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs
index 72ba21b..cb41607 100644
--- a/src/parse/regex/bc.rs
+++ b/src/parse/regex/bc.rs
@@ -115,41 +115,45 @@ struct Thread<F: Flavor> {
struct VM<'p, F: Flavor> {
instr: &'p [Instr<F>],
- threads: Vec<Thread<F>>,
+ passive_threads: VecDeque<Thread<F>>,
+ active_threads: VecDeque<Thread<F>>,
hot: BitSet,
+ warm: BitSet,
}
impl<'p, F: Flavor> VM<'p, F> {
fn new(instr: &'p [Instr<F>], starting_thread: Thread<F>) -> Self {
Self {
instr,
- threads: vec![starting_thread],
+ passive_threads: vec![starting_thread].into(),
+ active_threads: VecDeque::new(),
hot: BitSet::new(instr.len()),
+ warm: BitSet::new(instr.len()),
}
}
fn step_epsilon<'a>(&mut self, sd: &mut F::StepData<'a, 'p>) {
- let mut threads: VecDeque<_> = self.threads.drain(..).collect();
+ std::mem::swap(&mut self.active_threads, &mut self.passive_threads);
self.hot.set_all(false);
+ self.warm.set_all(false);
- let mut warm = self.hot.clone();
macro_rules! add_thread {
($t:expr) => {{
let t = $t;
let bit = t.pc as usize;
- if !warm.get(bit) {
- warm.set(bit, true);
- threads.push_front(t);
+ if !self.warm.get(bit) {
+ self.warm.set(bit, true);
+ self.active_threads.push_front(t);
}
}};
}
- while let Some(mut thread) = threads.pop_front() {
+ while let Some(mut thread) = self.active_threads.pop_front() {
match self.instr[thread.pc as usize] {
Instr::Class(_) | Instr::Consume(_) => {
if !self.hot.get(thread.pc as usize) {
self.hot.set(thread.pc as usize, true);
- self.threads.push(thread);
+ self.passive_threads.push_back(thread);
}
}
Instr::Jump(j) => {
@@ -178,7 +182,7 @@ impl<'p, F: Flavor> VM<'p, F> {
fn step_consume(&mut self, byte: u8) {
self.hot.set_all(false);
- self.threads
+ self.passive_threads
.retain_mut(|thread| match self.instr[thread.pc as usize] {
Instr::Class(class) => {
if class.matches(byte) {
@@ -254,12 +258,16 @@ struct VirtualMachine<'a> {
}
impl<'a> VirtualMachine<'a> {
- fn step_epsilon(&mut self, loc: usize) {
- self.vm0.step_epsilon(&mut ());
+ fn step_epsilon_1(&mut self, loc: usize) {
self.vm1
.step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2));
}
+ fn step_epsilon(&mut self, loc: usize) {
+ self.vm0.step_epsilon(&mut ());
+ self.step_epsilon_1(loc);
+ }
+
fn step_consume(&mut self, byte: u8) {
self.vm0.step_consume(byte);
self.vm1.step_consume(byte);
@@ -272,7 +280,7 @@ impl<'a> VirtualMachine<'a> {
fn extract_match(&self) -> Option<Match> {
self.vm1
- .threads
+ .passive_threads
.iter()
.filter(|t| self.accepting.get(t.pc as usize))
.map(|t| {
@@ -302,6 +310,7 @@ pub struct BytecodeCompiledRegex {
instrs0: Box<[Instr<AssertionFlavor>]>,
instrs1: Box<[Instr<MainFlavor>]>,
instrs2: Box<[Instr<AssertionFlavor>]>,
+ no_lookbehind: bool,
submatch_count: usize,
accepting: BitSet,
}
@@ -333,10 +342,18 @@ impl BytecodeCompiledRegex {
vm2,
accepting: &self.accepting,
};
- for (i, ch) in data.iter().cloned().enumerate() {
- vm.step(ch, i);
+ if self.no_lookbehind {
+ for (i, ch) in data.iter().cloned().enumerate() {
+ vm.step_epsilon_1(i);
+ vm.vm1.step_consume(ch);
+ }
+ vm.step_epsilon(data.len());
+ } else {
+ for (i, ch) in data.iter().cloned().enumerate() {
+ vm.step(ch, i);
+ }
+ vm.step_epsilon(data.len());
}
- vm.step_epsilon(data.len());
vm.extract_match()
}
@@ -602,6 +619,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
accepting.set(final_state, true);
Ok(Self {
+ no_lookbehind: neg.map.is_empty(),
instrs0: neg.instrs.into(),
instrs1: instrs.into(),
instrs2: pos.instrs.into(),
@@ -640,7 +658,9 @@ mod tests {
0..1
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..1
);
}
@@ -653,7 +673,9 @@ mod tests {
0..3
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..5
);
}
@@ -662,11 +684,15 @@ mod tests {
fn nongreedy_plus() {
let re = regex("(ab+?)bb*");
assert_eq!(
- re.re_match(b"abbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..2
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..2
);
}
@@ -679,7 +705,9 @@ mod tests {
0..3
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..5
);
}
@@ -688,11 +716,15 @@ mod tests {
fn nongreedy_qm() {
let re = regex("(ab??)bb*");
assert_eq!(
- re.re_match(b"abbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..1
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..1
);
}
@@ -705,7 +737,9 @@ mod tests {
0..2
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches[0]
+ .clone()
+ .unwrap(),
0..2
);
}