diff options
Diffstat (limited to 'src/parse/regex')
| -rw-r--r-- | src/parse/regex/bc.rs | 252 | ||||
| -rw-r--r-- | src/parse/regex/mod.rs | 104 |
2 files changed, 356 insertions, 0 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs new file mode 100644 index 0000000..90761fd --- /dev/null +++ b/src/parse/regex/bc.rs @@ -0,0 +1,252 @@ +use std::collections::VecDeque; + +use crate::{ + bitset::BitSet, + parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange}, +}; + +trait Flavor: Clone { + type CustomInstr: Copy + Clone; + type ThreadData: Clone; + type StepData<'a>; + + fn accepts<'a, 'b>( + thread: &mut Thread<Self>, + instr: Self::CustomInstr, + sd: &mut Self::StepData<'a>, + ) -> bool; +} + +#[derive(Copy, Clone)] +struct MainFlavor; +impl Flavor for MainFlavor { + type CustomInstr = MainInstr; + type ThreadData = Box<[usize]>; + type StepData<'a> = (usize, &'a BitSet, &'a mut LookaheadVM<'a>); + + fn accepts<'a>( + thread: &mut Thread<Self>, + instr: Self::CustomInstr, + data: &mut Self::StepData<'a>, + ) -> bool { + match instr { + MainInstr::Save(reg) => { + thread.data[reg as usize] = data.0; + true + } + MainInstr::Join(assertion) => { + let should_match = assertion.pol == LookPolarity::Positive; + let state = assertion.what as usize; + let is_matching = match assertion.dir { + LookDirection::Ahead => data.2.get_state(data.0, state), + LookDirection::Behind => data.1.get(state), + }; + is_matching == should_match + } + } + } +} + +#[derive(Copy, Clone)] +enum Nothing {} + +#[derive(Copy, Clone)] +struct AssertionFlavor; +impl Flavor for AssertionFlavor { + type CustomInstr = Nothing; + type ThreadData = (); + type StepData<'a> = (); + + fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool { + match instr {} + } +} + +type JumpTarget = u32; +type Register = u32; +type AssertionRef = u32; + +#[derive(Copy, Clone)] +struct Assertion { + what: JumpTarget, + dir: LookDirection, + pol: LookPolarity, +} + +#[derive(Copy, Clone)] +enum Instr<F: Flavor> { + Consume(ByteRange), + Jump(JumpTarget), + Fork(JumpTarget, JumpTarget), + Custom(F::CustomInstr), +} + +#[derive(Copy, Clone)] +enum MainInstr { + Save(Register), + Join(Assertion), +} + +#[derive(Copy, Clone)] +enum Look {} + +type Registers = Box<[usize]>; + +#[derive(Clone)] +struct Thread<F: Flavor> { + pc: JumpTarget, + data: F::ThreadData, +} + +struct CachedLookahead {} + +struct VM<'p, F: Flavor> { + instr: &'p [Instr<F>], + threads: Vec<Thread<F>>, + hot: BitSet, +} + +impl<'p, F: Flavor> VM<'p, F> { + fn new(instr: &'p [Instr<F>]) -> Self { + Self { + instr, + threads: todo!("determine starting thread"), + hot: BitSet::new(instr.len()), + } + } + + fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a>) { + let mut threads: VecDeque<_> = self.threads.drain(..).collect(); + self.hot.set_all(false); + + while let Some(mut thread) = threads.pop_front() { + match self.instr[thread.pc as usize] { + Instr::Consume(bytes) => { + if bytes.contains(byte) { + thread.pc += 1; + if !self.hot.get(thread.pc as usize) { + self.hot.set(thread.pc as usize, true); + self.threads.push(thread); + } + } + } + Instr::Jump(j) => { + thread.pc = j; + threads.push_back(thread); + } + Instr::Fork(a, b) => { + threads.push_back(Thread { + pc: a, + data: thread.data.clone(), + }); + threads.push_back(Thread { + pc: b, + data: thread.data, + }); + } + Instr::Custom(instr) => { + if F::accepts(&mut thread, instr, sd) { + thread.pc += 1; + threads.push_back(thread); + } + } + } + } + } +} + +struct LookaheadVM<'a> { + vm: VM<'a, AssertionFlavor>, + data: &'a [u8], + cached: bool, + cache_data: Vec<BitSet>, + loc_offset: usize, +} + +impl<'a> LookaheadVM<'a> { + fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self { + Self { + vm, + data, + cached: false, + cache_data: Vec::new(), + loc_offset: 0, + } + } + + fn get_state(&mut self, loc: usize, state: usize) -> bool { + if !self.cached { + assert!(self.cache_data.is_empty()); + assert_eq!(self.loc_offset, 0); + self.loc_offset = loc; + for i in (loc..self.data.len()).rev() { + self.vm.step(self.data[i], &mut ()); + self.cache_data.push(self.vm.hot.clone()); + } + self.cache_data.reverse(); + self.cached = true; + } + + assert!( + loc >= self.loc_offset, + "get_state must be called with non-decreasing arguments." + ); + self.cache_data[loc - self.loc_offset].get(state) + } +} + +struct VirtualMachine<'a> { + vm0: VM<'a, AssertionFlavor>, + vm1: VM<'a, MainFlavor>, + vm2: LookaheadVM<'a>, + accepting: &'a BitSet, +} + +impl<'a> VirtualMachine<'a> { + fn step(&'a mut self, byte: u8, loc: usize) { + self.vm0.step(byte, &mut ()); + self.vm1 + .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2)); + } + + fn extract_match(&self) -> Option<Match> { + self.vm1 + .threads + .iter() + .filter(|t| self.accepting.get(t.pc as usize)) + .map(|t| Match { + registers: t.data.clone(), + }) + .next() + } +} + +pub struct ByteCodeCompiledRegex { + instrs0: Box<[Instr<AssertionFlavor>]>, + instrs1: Box<[Instr<MainFlavor>]>, + instrs2: Box<[Instr<AssertionFlavor>]>, + accepting: BitSet, +} + +impl ByteCodeCompiledRegex { + pub fn re_match(&self, data: &[u8]) -> Option<Match> { + let vm0 = VM::new(&self.instrs0); + let vm1 = VM::new(&self.instrs1); + let vm2 = VM::new(&self.instrs2); + let vm2 = LookaheadVM::new(vm2, data); + let mut vm = VirtualMachine { + vm0, + vm1, + vm2, + accepting: &self.accepting, + }; + for (i, ch) in data.iter().cloned().enumerate() { + vm.step(ch, i); + } + vm.extract_match() + } +} + +pub struct Match { + pub registers: Box<[usize]>, +} diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs index 63808cd..1bcf18c 100644 --- a/src/parse/regex/mod.rs +++ b/src/parse/regex/mod.rs @@ -5,6 +5,7 @@ use super::{Parse, ParseError, Result}; mod byte_range; pub mod dfa; pub mod enfa; +pub mod bc; #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub enum LookDirection { @@ -12,6 +13,15 @@ pub enum LookDirection { Behind, } +impl LookDirection { + pub fn reverse(self) -> Self { + match self { + Self::Ahead => Self::Behind, + Self::Behind => Self::Ahead, + } + } +} + #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookPolarity { Positive, @@ -29,6 +39,100 @@ pub enum Pattern { Nothing, } +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum ByteConsumption { + Bounded(usize), + Unbounded, +} + +impl ByteConsumption { + pub fn zero() -> Self { + Self::Bounded(0) + } + pub fn one() -> Self { + Self::Bounded(1) + } +} + +impl Ord for ByteConsumption { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match (self, other) { + (Self::Bounded(a), Self::Bounded(b)) => a.cmp(&b), + (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, + (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, + (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, + } + } +} +impl PartialOrd for ByteConsumption { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl std::ops::Add for ByteConsumption { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + if let Self::Bounded(a) = self + && let Self::Bounded(b) = rhs + { + Self::Bounded(a + b) + } else { + Self::Unbounded + } + } +} + +impl std::ops::Mul<usize> for ByteConsumption { + type Output = Self; + + fn mul(self, rhs: usize) -> Self::Output { + match self { + Self::Bounded(x) => Self::Bounded(x * rhs), + Self::Unbounded => Self::Unbounded, + } + } +} + +impl std::iter::Sum for ByteConsumption { + fn sum<I: Iterator<Item = Self>>(iter: I) -> Self { + iter.fold(Self::zero(), |a, b| a + b) + } +} + +impl Pattern { + pub fn max_byte_consumption(&self) -> ByteConsumption { + match self { + Pattern::Byte(_) => ByteConsumption::one(), + Pattern::Range(_, _) => ByteConsumption::one(), + Pattern::Alt(patterns) => patterns + .iter() + .map(Self::max_byte_consumption) + .max() + .unwrap_or(ByteConsumption::zero()), + Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), + Pattern::Rep(pattern, _, Some(max_reps)) => { + pattern.max_byte_consumption() * (*max_reps as usize) + } + Pattern::Rep(_, _, None) => ByteConsumption::Unbounded, + Pattern::Assertion(_, _, _) => ByteConsumption::zero(), + Pattern::Nothing => ByteConsumption::zero(), + } + } + + pub fn reverse(self) -> Self { + use Pattern::*; + match self { + Byte(_) | Nothing | Range(..) => self, + Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), + Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), + Rep(pattern, min, max) => Rep(Box::new(pattern.reverse()), min, max), + Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), + } + } +} + impl Parse for Pattern { fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> { let begin = b.loc(); |
