use std::collections::VecDeque; use crate::{ bitset::BitSet, parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange}, }; trait Flavor: Clone { type CustomInstr: Copy + Clone; type ThreadData: Clone; type StepData<'a, 'b> where 'b : 'a; fn accepts<'a, 'b>( thread: &mut Thread, instr: Self::CustomInstr, sd: &mut Self::StepData<'a, 'b>, ) -> bool; } #[derive(Copy, Clone)] struct MainFlavor; impl Flavor for MainFlavor { type CustomInstr = MainInstr; type ThreadData = Box<[usize]>; type StepData<'a, 'b> = (usize, &'a BitSet, &'a mut LookaheadVM<'b>) where 'b : 'a; fn accepts<'a, 'b>( thread: &mut Thread, instr: Self::CustomInstr, data: &mut Self::StepData<'a, 'b>, ) -> bool { match instr { MainInstr::Save(reg) => { thread.data[reg as usize] = data.0; true } MainInstr::Join(assertion) => { let should_match = assertion.pol == LookPolarity::Positive; let state = assertion.what as usize; let is_matching = match assertion.dir { LookDirection::Ahead => data.2.get_state(data.0, state), LookDirection::Behind => data.1.get(state), }; is_matching == should_match } } } } #[derive(Copy, Clone)] enum Nothing {} #[derive(Copy, Clone)] struct AssertionFlavor; impl Flavor for AssertionFlavor { type CustomInstr = Nothing; type ThreadData = (); type StepData<'a, 'b> = () where 'b : 'a; fn accepts(_thread: &mut Thread, instr: Self::CustomInstr, _sd: &mut ()) -> bool { match instr {} } } type JumpTarget = u32; type Register = u32; type AssertionRef = u32; #[derive(Copy, Clone)] struct Assertion { what: JumpTarget, dir: LookDirection, pol: LookPolarity, } #[derive(Copy, Clone)] enum Instr { Consume(ByteRange), Jump(JumpTarget), Fork(JumpTarget, JumpTarget), Custom(F::CustomInstr), } #[derive(Copy, Clone)] enum MainInstr { Save(Register), Join(Assertion), } #[derive(Copy, Clone)] enum Look {} type Registers = Box<[usize]>; #[derive(Clone)] struct Thread { pc: JumpTarget, data: F::ThreadData, } struct CachedLookahead {} struct VM<'p, F: Flavor> { instr: &'p [Instr], threads: Vec>, hot: BitSet, } impl<'p, F: Flavor> VM<'p, F> { fn new(instr: &'p [Instr]) -> Self { Self { instr, threads: todo!("determine starting thread"), hot: BitSet::new(instr.len()), } } fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a, 'p>) { let mut threads: VecDeque<_> = self.threads.drain(..).collect(); self.hot.set_all(false); while let Some(mut thread) = threads.pop_front() { match self.instr[thread.pc as usize] { Instr::Consume(bytes) => { if bytes.contains(byte) { thread.pc += 1; if !self.hot.get(thread.pc as usize) { self.hot.set(thread.pc as usize, true); self.threads.push(thread); } } } Instr::Jump(j) => { thread.pc = j; threads.push_back(thread); } Instr::Fork(a, b) => { threads.push_back(Thread { pc: a, data: thread.data.clone(), }); threads.push_back(Thread { pc: b, data: thread.data, }); } Instr::Custom(instr) => { if F::accepts(&mut thread, instr, sd) { thread.pc += 1; threads.push_back(thread); } } } } } } struct LookaheadVM<'a> { vm: VM<'a, AssertionFlavor>, data: &'a [u8], cached: bool, cache_data: Vec, loc_offset: usize, } impl<'a> LookaheadVM<'a> { fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self { Self { vm, data, cached: false, cache_data: Vec::new(), loc_offset: 0, } } fn get_state(&mut self, loc: usize, state: usize) -> bool { if !self.cached { assert!(self.cache_data.is_empty()); assert_eq!(self.loc_offset, 0); self.loc_offset = loc; for i in (loc..self.data.len()).rev() { self.vm.step(self.data[i], &mut ()); self.cache_data.push(self.vm.hot.clone()); } self.cache_data.reverse(); self.cached = true; } assert!( loc >= self.loc_offset, "get_state must be called with non-decreasing arguments." ); self.cache_data[loc - self.loc_offset].get(state) } } struct VirtualMachine<'a> { vm0: VM<'a, AssertionFlavor>, vm1: VM<'a, MainFlavor>, vm2: LookaheadVM<'a>, accepting: &'a BitSet, } impl<'a> VirtualMachine<'a> { fn step(&mut self, byte: u8, loc: usize) { self.vm0.step(byte, &mut ()); self.vm1 .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2)); } fn extract_match(&self) -> Option { self.vm1 .threads .iter() .filter(|t| self.accepting.get(t.pc as usize)) .map(|t| Match { registers: t.data.clone(), }) .next() } } pub struct ByteCodeCompiledRegex { instrs0: Box<[Instr]>, instrs1: Box<[Instr]>, instrs2: Box<[Instr]>, accepting: BitSet, } impl ByteCodeCompiledRegex { pub fn re_match(&self, data: &[u8]) -> Option { let vm0 = VM::new(&self.instrs0); let vm1 = VM::new(&self.instrs1); let vm2 = VM::new(&self.instrs2); let vm2 = LookaheadVM::new(vm2, data); let mut vm = VirtualMachine { vm0, vm1, vm2, accepting: &self.accepting, }; for (i, ch) in data.iter().cloned().enumerate() { vm.step(ch, i); } vm.extract_match() } } pub struct Match { pub registers: Box<[usize]>, }