From a464bec4d558533776329634504ab0c1bee84bba Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Fri, 5 Jun 2026 15:03:44 +0200 Subject: some bytecode implementation, does not work yet --- src/parse/regex/bc.rs | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 src/parse/regex/bc.rs (limited to 'src/parse/regex/bc.rs') diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs new file mode 100644 index 0000000..90761fd --- /dev/null +++ b/src/parse/regex/bc.rs @@ -0,0 +1,252 @@ +use std::collections::VecDeque; + +use crate::{ + bitset::BitSet, + parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange}, +}; + +trait Flavor: Clone { + type CustomInstr: Copy + Clone; + type ThreadData: Clone; + type StepData<'a>; + + fn accepts<'a, 'b>( + thread: &mut Thread, + instr: Self::CustomInstr, + sd: &mut Self::StepData<'a>, + ) -> bool; +} + +#[derive(Copy, Clone)] +struct MainFlavor; +impl Flavor for MainFlavor { + type CustomInstr = MainInstr; + type ThreadData = Box<[usize]>; + type StepData<'a> = (usize, &'a BitSet, &'a mut LookaheadVM<'a>); + + fn accepts<'a>( + thread: &mut Thread, + instr: Self::CustomInstr, + data: &mut Self::StepData<'a>, + ) -> bool { + match instr { + MainInstr::Save(reg) => { + thread.data[reg as usize] = data.0; + true + } + MainInstr::Join(assertion) => { + let should_match = assertion.pol == LookPolarity::Positive; + let state = assertion.what as usize; + let is_matching = match assertion.dir { + LookDirection::Ahead => data.2.get_state(data.0, state), + LookDirection::Behind => data.1.get(state), + }; + is_matching == should_match + } + } + } +} + +#[derive(Copy, Clone)] +enum Nothing {} + +#[derive(Copy, Clone)] +struct AssertionFlavor; +impl Flavor for AssertionFlavor { + type CustomInstr = Nothing; + type ThreadData = (); + type StepData<'a> = (); + + fn accepts(_thread: &mut Thread, instr: Self::CustomInstr, _sd: &mut ()) -> bool { + match instr {} + } +} + +type JumpTarget = u32; +type Register = u32; +type AssertionRef = u32; + +#[derive(Copy, Clone)] +struct Assertion { + what: JumpTarget, + dir: LookDirection, + pol: LookPolarity, +} + +#[derive(Copy, Clone)] +enum Instr { + Consume(ByteRange), + Jump(JumpTarget), + Fork(JumpTarget, JumpTarget), + Custom(F::CustomInstr), +} + +#[derive(Copy, Clone)] +enum MainInstr { + Save(Register), + Join(Assertion), +} + +#[derive(Copy, Clone)] +enum Look {} + +type Registers = Box<[usize]>; + +#[derive(Clone)] +struct Thread { + pc: JumpTarget, + data: F::ThreadData, +} + +struct CachedLookahead {} + +struct VM<'p, F: Flavor> { + instr: &'p [Instr], + threads: Vec>, + hot: BitSet, +} + +impl<'p, F: Flavor> VM<'p, F> { + fn new(instr: &'p [Instr]) -> Self { + Self { + instr, + threads: todo!("determine starting thread"), + hot: BitSet::new(instr.len()), + } + } + + fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a>) { + let mut threads: VecDeque<_> = self.threads.drain(..).collect(); + self.hot.set_all(false); + + while let Some(mut thread) = threads.pop_front() { + match self.instr[thread.pc as usize] { + Instr::Consume(bytes) => { + if bytes.contains(byte) { + thread.pc += 1; + if !self.hot.get(thread.pc as usize) { + self.hot.set(thread.pc as usize, true); + self.threads.push(thread); + } + } + } + Instr::Jump(j) => { + thread.pc = j; + threads.push_back(thread); + } + Instr::Fork(a, b) => { + threads.push_back(Thread { + pc: a, + data: thread.data.clone(), + }); + threads.push_back(Thread { + pc: b, + data: thread.data, + }); + } + Instr::Custom(instr) => { + if F::accepts(&mut thread, instr, sd) { + thread.pc += 1; + threads.push_back(thread); + } + } + } + } + } +} + +struct LookaheadVM<'a> { + vm: VM<'a, AssertionFlavor>, + data: &'a [u8], + cached: bool, + cache_data: Vec, + loc_offset: usize, +} + +impl<'a> LookaheadVM<'a> { + fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self { + Self { + vm, + data, + cached: false, + cache_data: Vec::new(), + loc_offset: 0, + } + } + + fn get_state(&mut self, loc: usize, state: usize) -> bool { + if !self.cached { + assert!(self.cache_data.is_empty()); + assert_eq!(self.loc_offset, 0); + self.loc_offset = loc; + for i in (loc..self.data.len()).rev() { + self.vm.step(self.data[i], &mut ()); + self.cache_data.push(self.vm.hot.clone()); + } + self.cache_data.reverse(); + self.cached = true; + } + + assert!( + loc >= self.loc_offset, + "get_state must be called with non-decreasing arguments." + ); + self.cache_data[loc - self.loc_offset].get(state) + } +} + +struct VirtualMachine<'a> { + vm0: VM<'a, AssertionFlavor>, + vm1: VM<'a, MainFlavor>, + vm2: LookaheadVM<'a>, + accepting: &'a BitSet, +} + +impl<'a> VirtualMachine<'a> { + fn step(&'a mut self, byte: u8, loc: usize) { + self.vm0.step(byte, &mut ()); + self.vm1 + .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2)); + } + + fn extract_match(&self) -> Option { + self.vm1 + .threads + .iter() + .filter(|t| self.accepting.get(t.pc as usize)) + .map(|t| Match { + registers: t.data.clone(), + }) + .next() + } +} + +pub struct ByteCodeCompiledRegex { + instrs0: Box<[Instr]>, + instrs1: Box<[Instr]>, + instrs2: Box<[Instr]>, + accepting: BitSet, +} + +impl ByteCodeCompiledRegex { + pub fn re_match(&self, data: &[u8]) -> Option { + let vm0 = VM::new(&self.instrs0); + let vm1 = VM::new(&self.instrs1); + let vm2 = VM::new(&self.instrs2); + let vm2 = LookaheadVM::new(vm2, data); + let mut vm = VirtualMachine { + vm0, + vm1, + vm2, + accepting: &self.accepting, + }; + for (i, ch) in data.iter().cloned().enumerate() { + vm.step(ch, i); + } + vm.extract_match() + } +} + +pub struct Match { + pub registers: Box<[usize]>, +} -- cgit v1.2.3