diff options
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/regex/bc.rs | 92 |
1 files changed, 76 insertions, 16 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs index 75a6d5d..c0eff8b 100644 --- a/src/parse/regex/bc.rs +++ b/src/parse/regex/bc.rs @@ -2,13 +2,17 @@ use std::collections::{HashMap, VecDeque}; use crate::{ bitset::BitSet, - parse::regex::{ - CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange, + parse::{ + Parse, + regex::{ + CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, + byte_range::ByteRange, + }, }, }; trait Flavor: Clone { - type CustomInstr: Copy + Clone; + type CustomInstr: Copy + Clone + std::fmt::Debug; type ThreadData: Clone; type StepData<'a, 'b> where @@ -21,7 +25,7 @@ trait Flavor: Clone { ) -> bool; } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] struct MainFlavor; impl Flavor for MainFlavor { type CustomInstr = MainInstr; @@ -54,10 +58,10 @@ impl Flavor for MainFlavor { } } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] enum Nothing {} -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] struct AssertionFlavor; impl Flavor for AssertionFlavor { type CustomInstr = Nothing; @@ -75,14 +79,14 @@ impl Flavor for AssertionFlavor { type JumpTarget = u32; type Register = u32; -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] struct Assertion { target: JumpTarget, dir: LookDirection, pol: LookPolarity, } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] enum Instr<F: Flavor> { Class(CharacterClass), Consume(ByteRange), @@ -91,7 +95,7 @@ enum Instr<F: Flavor> { Custom(F::CustomInstr), } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] enum MainInstr { Save(Register), Join(Assertion), @@ -122,6 +126,18 @@ impl<'p, F: Flavor> VM<'p, F> { let mut threads: VecDeque<_> = self.threads.drain(..).collect(); self.hot.set_all(false); + let mut warm = self.hot.clone(); + macro_rules! add_thread { + ($t:expr) => {{ + let t = $t; + let bit = t.pc as usize; + if !warm.get(bit) { + warm.set(bit, true); + threads.push_back(t); + } + }}; + } + while let Some(mut thread) = threads.pop_front() { match self.instr[thread.pc as usize] { Instr::Class(_) | Instr::Consume(_) => { @@ -132,14 +148,14 @@ impl<'p, F: Flavor> VM<'p, F> { } Instr::Jump(j) => { thread.pc = j; - threads.push_back(thread); + add_thread!(thread); } Instr::Fork(a, b) => { - threads.push_back(Thread { + add_thread!(Thread { pc: a, data: thread.data.clone(), }); - threads.push_back(Thread { + add_thread!(Thread { pc: b, data: thread.data, }); @@ -147,7 +163,7 @@ impl<'p, F: Flavor> VM<'p, F> { Instr::Custom(instr) => { if F::accepts(&mut thread, instr, sd) { thread.pc += 1; - threads.push_back(thread); + add_thread!(thread); } } } @@ -205,9 +221,11 @@ impl<'a> LookaheadVM<'a> { assert!(self.cache_data.is_empty()); assert_eq!(self.loc_offset, 0); self.loc_offset = loc; + self.vm.step_epsilon(&mut ()); + self.cache_data.push(self.vm.hot.clone()); for i in (loc..self.data.len()).rev() { - self.vm.step_epsilon(&mut ()); self.vm.step_consume(self.data[i]); + self.vm.step_epsilon(&mut ()); self.cache_data.push(self.vm.hot.clone()); } self.cache_data.reverse(); @@ -230,14 +248,22 @@ struct VirtualMachine<'a> { } impl<'a> VirtualMachine<'a> { - fn step(&mut self, byte: u8, loc: usize) { + fn step_epsilon(&mut self, loc: usize) { self.vm0.step_epsilon(&mut ()); self.vm1 .step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2)); + } + + fn step_consume(&mut self, byte: u8) { self.vm0.step_consume(byte); self.vm1.step_consume(byte); } + fn step(&mut self, byte: u8, loc: usize) { + self.step_epsilon(loc); + self.step_consume(byte); + } + fn extract_match(&self) -> Option<Match> { self.vm1 .threads @@ -249,6 +275,19 @@ impl<'a> VirtualMachine<'a> { .next() } } + +fn fmt_instructions<F: std::fmt::Debug + Flavor>( + f: &mut std::fmt::Formatter<'_>, + label: &str, + insns: &[Instr<F>], +) -> std::fmt::Result { + writeln!(f, "# {label}")?; + for (idx, ins) in insns.iter().enumerate() { + writeln!(f, "{idx}: {ins:?}")?; + } + Ok(()) +} + pub struct BytecodeCompiledRegex { instrs0: Box<[Instr<AssertionFlavor>]>, instrs1: Box<[Instr<MainFlavor>]>, @@ -256,6 +295,15 @@ pub struct BytecodeCompiledRegex { accepting: BitSet, } +impl std::fmt::Debug for BytecodeCompiledRegex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fmt_instructions(f, "behind", &self.instrs0)?; + fmt_instructions(f, "ahead", &self.instrs2)?; + fmt_instructions(f, "main", &self.instrs1)?; + writeln!(f, "accepting: {:?}", self.accepting) + } +} + impl BytecodeCompiledRegex { pub fn re_match(&self, data: &[u8]) -> Option<Match> { let vm0 = VM::new(&self.instrs0, Thread { pc: 0, data: () }); @@ -277,6 +325,7 @@ impl BytecodeCompiledRegex { for (i, ch) in data.iter().cloned().enumerate() { vm.step(ch, i); } + vm.step_epsilon(data.len()); vm.extract_match() } @@ -349,7 +398,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { self.compile(pat)?; let fork_pos = self.instrs.len(); let after = fork_pos + 1; - self.instrs.push(fork(base, after, greedy)); + self.instrs.push(fork(base + 1, after, greedy)); self.instrs[base] = Instr::Jump(fork_pos as JumpTarget); Ok(()) } @@ -536,3 +585,14 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { }) } } + +#[test] +fn print_compiled_vm() { + let pat = Pattern::parse_from_bytes(b"a?b?").unwrap(); + let compiled = BytecodeCompiledRegex::try_from(pat).unwrap(); + println!("{compiled:#?}"); + assert_eq!(compiled.matches(b"ab"), true); + assert_eq!(compiled.matches(b"a"), true); + assert_eq!(compiled.matches(b"b"), true); + assert_eq!(compiled.matches(b""), true); +} |
