aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse')
-rw-r--r--src/parse/regex/bc.rs252
-rw-r--r--src/parse/regex/mod.rs104
2 files changed, 356 insertions, 0 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs
new file mode 100644
index 0000000..90761fd
--- /dev/null
+++ b/src/parse/regex/bc.rs
@@ -0,0 +1,252 @@
+use std::collections::VecDeque;
+
+use crate::{
+ bitset::BitSet,
+ parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange},
+};
+
+trait Flavor: Clone {
+ type CustomInstr: Copy + Clone;
+ type ThreadData: Clone;
+ type StepData<'a>;
+
+ fn accepts<'a, 'b>(
+ thread: &mut Thread<Self>,
+ instr: Self::CustomInstr,
+ sd: &mut Self::StepData<'a>,
+ ) -> bool;
+}
+
+#[derive(Copy, Clone)]
+struct MainFlavor;
+impl Flavor for MainFlavor {
+ type CustomInstr = MainInstr;
+ type ThreadData = Box<[usize]>;
+ type StepData<'a> = (usize, &'a BitSet, &'a mut LookaheadVM<'a>);
+
+ fn accepts<'a>(
+ thread: &mut Thread<Self>,
+ instr: Self::CustomInstr,
+ data: &mut Self::StepData<'a>,
+ ) -> bool {
+ match instr {
+ MainInstr::Save(reg) => {
+ thread.data[reg as usize] = data.0;
+ true
+ }
+ MainInstr::Join(assertion) => {
+ let should_match = assertion.pol == LookPolarity::Positive;
+ let state = assertion.what as usize;
+ let is_matching = match assertion.dir {
+ LookDirection::Ahead => data.2.get_state(data.0, state),
+ LookDirection::Behind => data.1.get(state),
+ };
+ is_matching == should_match
+ }
+ }
+ }
+}
+
+#[derive(Copy, Clone)]
+enum Nothing {}
+
+#[derive(Copy, Clone)]
+struct AssertionFlavor;
+impl Flavor for AssertionFlavor {
+ type CustomInstr = Nothing;
+ type ThreadData = ();
+ type StepData<'a> = ();
+
+ fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool {
+ match instr {}
+ }
+}
+
+type JumpTarget = u32;
+type Register = u32;
+type AssertionRef = u32;
+
+#[derive(Copy, Clone)]
+struct Assertion {
+ what: JumpTarget,
+ dir: LookDirection,
+ pol: LookPolarity,
+}
+
+#[derive(Copy, Clone)]
+enum Instr<F: Flavor> {
+ Consume(ByteRange),
+ Jump(JumpTarget),
+ Fork(JumpTarget, JumpTarget),
+ Custom(F::CustomInstr),
+}
+
+#[derive(Copy, Clone)]
+enum MainInstr {
+ Save(Register),
+ Join(Assertion),
+}
+
+#[derive(Copy, Clone)]
+enum Look {}
+
+type Registers = Box<[usize]>;
+
+#[derive(Clone)]
+struct Thread<F: Flavor> {
+ pc: JumpTarget,
+ data: F::ThreadData,
+}
+
+struct CachedLookahead {}
+
+struct VM<'p, F: Flavor> {
+ instr: &'p [Instr<F>],
+ threads: Vec<Thread<F>>,
+ hot: BitSet,
+}
+
+impl<'p, F: Flavor> VM<'p, F> {
+ fn new(instr: &'p [Instr<F>]) -> Self {
+ Self {
+ instr,
+ threads: todo!("determine starting thread"),
+ hot: BitSet::new(instr.len()),
+ }
+ }
+
+ fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a>) {
+ let mut threads: VecDeque<_> = self.threads.drain(..).collect();
+ self.hot.set_all(false);
+
+ while let Some(mut thread) = threads.pop_front() {
+ match self.instr[thread.pc as usize] {
+ Instr::Consume(bytes) => {
+ if bytes.contains(byte) {
+ thread.pc += 1;
+ if !self.hot.get(thread.pc as usize) {
+ self.hot.set(thread.pc as usize, true);
+ self.threads.push(thread);
+ }
+ }
+ }
+ Instr::Jump(j) => {
+ thread.pc = j;
+ threads.push_back(thread);
+ }
+ Instr::Fork(a, b) => {
+ threads.push_back(Thread {
+ pc: a,
+ data: thread.data.clone(),
+ });
+ threads.push_back(Thread {
+ pc: b,
+ data: thread.data,
+ });
+ }
+ Instr::Custom(instr) => {
+ if F::accepts(&mut thread, instr, sd) {
+ thread.pc += 1;
+ threads.push_back(thread);
+ }
+ }
+ }
+ }
+ }
+}
+
+struct LookaheadVM<'a> {
+ vm: VM<'a, AssertionFlavor>,
+ data: &'a [u8],
+ cached: bool,
+ cache_data: Vec<BitSet>,
+ loc_offset: usize,
+}
+
+impl<'a> LookaheadVM<'a> {
+ fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self {
+ Self {
+ vm,
+ data,
+ cached: false,
+ cache_data: Vec::new(),
+ loc_offset: 0,
+ }
+ }
+
+ fn get_state(&mut self, loc: usize, state: usize) -> bool {
+ if !self.cached {
+ assert!(self.cache_data.is_empty());
+ assert_eq!(self.loc_offset, 0);
+ self.loc_offset = loc;
+ for i in (loc..self.data.len()).rev() {
+ self.vm.step(self.data[i], &mut ());
+ self.cache_data.push(self.vm.hot.clone());
+ }
+ self.cache_data.reverse();
+ self.cached = true;
+ }
+
+ assert!(
+ loc >= self.loc_offset,
+ "get_state must be called with non-decreasing arguments."
+ );
+ self.cache_data[loc - self.loc_offset].get(state)
+ }
+}
+
+struct VirtualMachine<'a> {
+ vm0: VM<'a, AssertionFlavor>,
+ vm1: VM<'a, MainFlavor>,
+ vm2: LookaheadVM<'a>,
+ accepting: &'a BitSet,
+}
+
+impl<'a> VirtualMachine<'a> {
+ fn step(&'a mut self, byte: u8, loc: usize) {
+ self.vm0.step(byte, &mut ());
+ self.vm1
+ .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2));
+ }
+
+ fn extract_match(&self) -> Option<Match> {
+ self.vm1
+ .threads
+ .iter()
+ .filter(|t| self.accepting.get(t.pc as usize))
+ .map(|t| Match {
+ registers: t.data.clone(),
+ })
+ .next()
+ }
+}
+
+pub struct ByteCodeCompiledRegex {
+ instrs0: Box<[Instr<AssertionFlavor>]>,
+ instrs1: Box<[Instr<MainFlavor>]>,
+ instrs2: Box<[Instr<AssertionFlavor>]>,
+ accepting: BitSet,
+}
+
+impl ByteCodeCompiledRegex {
+ pub fn re_match(&self, data: &[u8]) -> Option<Match> {
+ let vm0 = VM::new(&self.instrs0);
+ let vm1 = VM::new(&self.instrs1);
+ let vm2 = VM::new(&self.instrs2);
+ let vm2 = LookaheadVM::new(vm2, data);
+ let mut vm = VirtualMachine {
+ vm0,
+ vm1,
+ vm2,
+ accepting: &self.accepting,
+ };
+ for (i, ch) in data.iter().cloned().enumerate() {
+ vm.step(ch, i);
+ }
+ vm.extract_match()
+ }
+}
+
+pub struct Match {
+ pub registers: Box<[usize]>,
+}
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
index 63808cd..1bcf18c 100644
--- a/src/parse/regex/mod.rs
+++ b/src/parse/regex/mod.rs
@@ -5,6 +5,7 @@ use super::{Parse, ParseError, Result};
mod byte_range;
pub mod dfa;
pub mod enfa;
+pub mod bc;
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum LookDirection {
@@ -12,6 +13,15 @@ pub enum LookDirection {
Behind,
}
+impl LookDirection {
+ pub fn reverse(self) -> Self {
+ match self {
+ Self::Ahead => Self::Behind,
+ Self::Behind => Self::Ahead,
+ }
+ }
+}
+
#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
pub enum LookPolarity {
Positive,
@@ -29,6 +39,100 @@ pub enum Pattern {
Nothing,
}
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum ByteConsumption {
+ Bounded(usize),
+ Unbounded,
+}
+
+impl ByteConsumption {
+ pub fn zero() -> Self {
+ Self::Bounded(0)
+ }
+ pub fn one() -> Self {
+ Self::Bounded(1)
+ }
+}
+
+impl Ord for ByteConsumption {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ match (self, other) {
+ (Self::Bounded(a), Self::Bounded(b)) => a.cmp(&b),
+ (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less,
+ (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater,
+ (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal,
+ }
+ }
+}
+impl PartialOrd for ByteConsumption {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl std::ops::Add for ByteConsumption {
+ type Output = Self;
+
+ fn add(self, rhs: Self) -> Self::Output {
+ if let Self::Bounded(a) = self
+ && let Self::Bounded(b) = rhs
+ {
+ Self::Bounded(a + b)
+ } else {
+ Self::Unbounded
+ }
+ }
+}
+
+impl std::ops::Mul<usize> for ByteConsumption {
+ type Output = Self;
+
+ fn mul(self, rhs: usize) -> Self::Output {
+ match self {
+ Self::Bounded(x) => Self::Bounded(x * rhs),
+ Self::Unbounded => Self::Unbounded,
+ }
+ }
+}
+
+impl std::iter::Sum for ByteConsumption {
+ fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+ iter.fold(Self::zero(), |a, b| a + b)
+ }
+}
+
+impl Pattern {
+ pub fn max_byte_consumption(&self) -> ByteConsumption {
+ match self {
+ Pattern::Byte(_) => ByteConsumption::one(),
+ Pattern::Range(_, _) => ByteConsumption::one(),
+ Pattern::Alt(patterns) => patterns
+ .iter()
+ .map(Self::max_byte_consumption)
+ .max()
+ .unwrap_or(ByteConsumption::zero()),
+ Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(),
+ Pattern::Rep(pattern, _, Some(max_reps)) => {
+ pattern.max_byte_consumption() * (*max_reps as usize)
+ }
+ Pattern::Rep(_, _, None) => ByteConsumption::Unbounded,
+ Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
+ Pattern::Nothing => ByteConsumption::zero(),
+ }
+ }
+
+ pub fn reverse(self) -> Self {
+ use Pattern::*;
+ match self {
+ Byte(_) | Nothing | Range(..) => self,
+ Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()),
+ Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
+ Rep(pattern, min, max) => Rep(Box::new(pattern.reverse()), min, max),
+ Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
+ }
+ }
+}
+
impl Parse for Pattern {
fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> {
let begin = b.loc();