diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-05 15:03:44 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-05 15:03:44 +0200 |
| commit | a464bec4d558533776329634504ab0c1bee84bba (patch) | |
| tree | ae19c073bf41ad52d97f951a1ffe10429d341199 /src/parse/regex/bc.rs | |
| parent | 2184c1cab66771594051197451c5e0f4dd2567a2 (diff) | |
| download | pish-a464bec4d558533776329634504ab0c1bee84bba.tar.gz | |
some bytecode implementation, does not work yet
Diffstat (limited to 'src/parse/regex/bc.rs')
| -rw-r--r-- | src/parse/regex/bc.rs | 252 |
1 files changed, 252 insertions, 0 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs new file mode 100644 index 0000000..90761fd --- /dev/null +++ b/src/parse/regex/bc.rs @@ -0,0 +1,252 @@ +use std::collections::VecDeque; + +use crate::{ + bitset::BitSet, + parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange}, +}; + +trait Flavor: Clone { + type CustomInstr: Copy + Clone; + type ThreadData: Clone; + type StepData<'a>; + + fn accepts<'a, 'b>( + thread: &mut Thread<Self>, + instr: Self::CustomInstr, + sd: &mut Self::StepData<'a>, + ) -> bool; +} + +#[derive(Copy, Clone)] +struct MainFlavor; +impl Flavor for MainFlavor { + type CustomInstr = MainInstr; + type ThreadData = Box<[usize]>; + type StepData<'a> = (usize, &'a BitSet, &'a mut LookaheadVM<'a>); + + fn accepts<'a>( + thread: &mut Thread<Self>, + instr: Self::CustomInstr, + data: &mut Self::StepData<'a>, + ) -> bool { + match instr { + MainInstr::Save(reg) => { + thread.data[reg as usize] = data.0; + true + } + MainInstr::Join(assertion) => { + let should_match = assertion.pol == LookPolarity::Positive; + let state = assertion.what as usize; + let is_matching = match assertion.dir { + LookDirection::Ahead => data.2.get_state(data.0, state), + LookDirection::Behind => data.1.get(state), + }; + is_matching == should_match + } + } + } +} + +#[derive(Copy, Clone)] +enum Nothing {} + +#[derive(Copy, Clone)] +struct AssertionFlavor; +impl Flavor for AssertionFlavor { + type CustomInstr = Nothing; + type ThreadData = (); + type StepData<'a> = (); + + fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool { + match instr {} + } +} + +type JumpTarget = u32; +type Register = u32; +type AssertionRef = u32; + +#[derive(Copy, Clone)] +struct Assertion { + what: JumpTarget, + dir: LookDirection, + pol: LookPolarity, +} + +#[derive(Copy, Clone)] +enum Instr<F: Flavor> { + Consume(ByteRange), + Jump(JumpTarget), + Fork(JumpTarget, JumpTarget), + Custom(F::CustomInstr), +} + +#[derive(Copy, Clone)] +enum MainInstr { + Save(Register), + Join(Assertion), +} + +#[derive(Copy, Clone)] +enum Look {} + +type Registers = Box<[usize]>; + +#[derive(Clone)] +struct Thread<F: Flavor> { + pc: JumpTarget, + data: F::ThreadData, +} + +struct CachedLookahead {} + +struct VM<'p, F: Flavor> { + instr: &'p [Instr<F>], + threads: Vec<Thread<F>>, + hot: BitSet, +} + +impl<'p, F: Flavor> VM<'p, F> { + fn new(instr: &'p [Instr<F>]) -> Self { + Self { + instr, + threads: todo!("determine starting thread"), + hot: BitSet::new(instr.len()), + } + } + + fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a>) { + let mut threads: VecDeque<_> = self.threads.drain(..).collect(); + self.hot.set_all(false); + + while let Some(mut thread) = threads.pop_front() { + match self.instr[thread.pc as usize] { + Instr::Consume(bytes) => { + if bytes.contains(byte) { + thread.pc += 1; + if !self.hot.get(thread.pc as usize) { + self.hot.set(thread.pc as usize, true); + self.threads.push(thread); + } + } + } + Instr::Jump(j) => { + thread.pc = j; + threads.push_back(thread); + } + Instr::Fork(a, b) => { + threads.push_back(Thread { + pc: a, + data: thread.data.clone(), + }); + threads.push_back(Thread { + pc: b, + data: thread.data, + }); + } + Instr::Custom(instr) => { + if F::accepts(&mut thread, instr, sd) { + thread.pc += 1; + threads.push_back(thread); + } + } + } + } + } +} + +struct LookaheadVM<'a> { + vm: VM<'a, AssertionFlavor>, + data: &'a [u8], + cached: bool, + cache_data: Vec<BitSet>, + loc_offset: usize, +} + +impl<'a> LookaheadVM<'a> { + fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self { + Self { + vm, + data, + cached: false, + cache_data: Vec::new(), + loc_offset: 0, + } + } + + fn get_state(&mut self, loc: usize, state: usize) -> bool { + if !self.cached { + assert!(self.cache_data.is_empty()); + assert_eq!(self.loc_offset, 0); + self.loc_offset = loc; + for i in (loc..self.data.len()).rev() { + self.vm.step(self.data[i], &mut ()); + self.cache_data.push(self.vm.hot.clone()); + } + self.cache_data.reverse(); + self.cached = true; + } + + assert!( + loc >= self.loc_offset, + "get_state must be called with non-decreasing arguments." + ); + self.cache_data[loc - self.loc_offset].get(state) + } +} + +struct VirtualMachine<'a> { + vm0: VM<'a, AssertionFlavor>, + vm1: VM<'a, MainFlavor>, + vm2: LookaheadVM<'a>, + accepting: &'a BitSet, +} + +impl<'a> VirtualMachine<'a> { + fn step(&'a mut self, byte: u8, loc: usize) { + self.vm0.step(byte, &mut ()); + self.vm1 + .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2)); + } + + fn extract_match(&self) -> Option<Match> { + self.vm1 + .threads + .iter() + .filter(|t| self.accepting.get(t.pc as usize)) + .map(|t| Match { + registers: t.data.clone(), + }) + .next() + } +} + +pub struct ByteCodeCompiledRegex { + instrs0: Box<[Instr<AssertionFlavor>]>, + instrs1: Box<[Instr<MainFlavor>]>, + instrs2: Box<[Instr<AssertionFlavor>]>, + accepting: BitSet, +} + +impl ByteCodeCompiledRegex { + pub fn re_match(&self, data: &[u8]) -> Option<Match> { + let vm0 = VM::new(&self.instrs0); + let vm1 = VM::new(&self.instrs1); + let vm2 = VM::new(&self.instrs2); + let vm2 = LookaheadVM::new(vm2, data); + let mut vm = VirtualMachine { + vm0, + vm1, + vm2, + accepting: &self.accepting, + }; + for (i, ch) in data.iter().cloned().enumerate() { + vm.step(ch, i); + } + vm.extract_match() + } +} + +pub struct Match { + pub registers: Box<[usize]>, +} |
