aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/bc.rs
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-05 15:03:44 +0200
committerJonas Maier <jonas@x77.dev>2026-06-05 15:03:44 +0200
commita464bec4d558533776329634504ab0c1bee84bba (patch)
treeae19c073bf41ad52d97f951a1ffe10429d341199 /src/parse/regex/bc.rs
parent2184c1cab66771594051197451c5e0f4dd2567a2 (diff)
downloadpish-a464bec4d558533776329634504ab0c1bee84bba.tar.gz
some bytecode implementation, does not work yet
Diffstat (limited to 'src/parse/regex/bc.rs')
-rw-r--r--src/parse/regex/bc.rs252
1 files changed, 252 insertions, 0 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs
new file mode 100644
index 0000000..90761fd
--- /dev/null
+++ b/src/parse/regex/bc.rs
@@ -0,0 +1,252 @@
+use std::collections::VecDeque;
+
+use crate::{
+ bitset::BitSet,
+ parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange},
+};
+
+trait Flavor: Clone {
+ type CustomInstr: Copy + Clone;
+ type ThreadData: Clone;
+ type StepData<'a>;
+
+ fn accepts<'a, 'b>(
+ thread: &mut Thread<Self>,
+ instr: Self::CustomInstr,
+ sd: &mut Self::StepData<'a>,
+ ) -> bool;
+}
+
+#[derive(Copy, Clone)]
+struct MainFlavor;
+impl Flavor for MainFlavor {
+ type CustomInstr = MainInstr;
+ type ThreadData = Box<[usize]>;
+ type StepData<'a> = (usize, &'a BitSet, &'a mut LookaheadVM<'a>);
+
+ fn accepts<'a>(
+ thread: &mut Thread<Self>,
+ instr: Self::CustomInstr,
+ data: &mut Self::StepData<'a>,
+ ) -> bool {
+ match instr {
+ MainInstr::Save(reg) => {
+ thread.data[reg as usize] = data.0;
+ true
+ }
+ MainInstr::Join(assertion) => {
+ let should_match = assertion.pol == LookPolarity::Positive;
+ let state = assertion.what as usize;
+ let is_matching = match assertion.dir {
+ LookDirection::Ahead => data.2.get_state(data.0, state),
+ LookDirection::Behind => data.1.get(state),
+ };
+ is_matching == should_match
+ }
+ }
+ }
+}
+
+#[derive(Copy, Clone)]
+enum Nothing {}
+
+#[derive(Copy, Clone)]
+struct AssertionFlavor;
+impl Flavor for AssertionFlavor {
+ type CustomInstr = Nothing;
+ type ThreadData = ();
+ type StepData<'a> = ();
+
+ fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool {
+ match instr {}
+ }
+}
+
+type JumpTarget = u32;
+type Register = u32;
+type AssertionRef = u32;
+
+#[derive(Copy, Clone)]
+struct Assertion {
+ what: JumpTarget,
+ dir: LookDirection,
+ pol: LookPolarity,
+}
+
+#[derive(Copy, Clone)]
+enum Instr<F: Flavor> {
+ Consume(ByteRange),
+ Jump(JumpTarget),
+ Fork(JumpTarget, JumpTarget),
+ Custom(F::CustomInstr),
+}
+
+#[derive(Copy, Clone)]
+enum MainInstr {
+ Save(Register),
+ Join(Assertion),
+}
+
+#[derive(Copy, Clone)]
+enum Look {}
+
+type Registers = Box<[usize]>;
+
+#[derive(Clone)]
+struct Thread<F: Flavor> {
+ pc: JumpTarget,
+ data: F::ThreadData,
+}
+
+struct CachedLookahead {}
+
+struct VM<'p, F: Flavor> {
+ instr: &'p [Instr<F>],
+ threads: Vec<Thread<F>>,
+ hot: BitSet,
+}
+
+impl<'p, F: Flavor> VM<'p, F> {
+ fn new(instr: &'p [Instr<F>]) -> Self {
+ Self {
+ instr,
+ threads: todo!("determine starting thread"),
+ hot: BitSet::new(instr.len()),
+ }
+ }
+
+ fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a>) {
+ let mut threads: VecDeque<_> = self.threads.drain(..).collect();
+ self.hot.set_all(false);
+
+ while let Some(mut thread) = threads.pop_front() {
+ match self.instr[thread.pc as usize] {
+ Instr::Consume(bytes) => {
+ if bytes.contains(byte) {
+ thread.pc += 1;
+ if !self.hot.get(thread.pc as usize) {
+ self.hot.set(thread.pc as usize, true);
+ self.threads.push(thread);
+ }
+ }
+ }
+ Instr::Jump(j) => {
+ thread.pc = j;
+ threads.push_back(thread);
+ }
+ Instr::Fork(a, b) => {
+ threads.push_back(Thread {
+ pc: a,
+ data: thread.data.clone(),
+ });
+ threads.push_back(Thread {
+ pc: b,
+ data: thread.data,
+ });
+ }
+ Instr::Custom(instr) => {
+ if F::accepts(&mut thread, instr, sd) {
+ thread.pc += 1;
+ threads.push_back(thread);
+ }
+ }
+ }
+ }
+ }
+}
+
+struct LookaheadVM<'a> {
+ vm: VM<'a, AssertionFlavor>,
+ data: &'a [u8],
+ cached: bool,
+ cache_data: Vec<BitSet>,
+ loc_offset: usize,
+}
+
+impl<'a> LookaheadVM<'a> {
+ fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self {
+ Self {
+ vm,
+ data,
+ cached: false,
+ cache_data: Vec::new(),
+ loc_offset: 0,
+ }
+ }
+
+ fn get_state(&mut self, loc: usize, state: usize) -> bool {
+ if !self.cached {
+ assert!(self.cache_data.is_empty());
+ assert_eq!(self.loc_offset, 0);
+ self.loc_offset = loc;
+ for i in (loc..self.data.len()).rev() {
+ self.vm.step(self.data[i], &mut ());
+ self.cache_data.push(self.vm.hot.clone());
+ }
+ self.cache_data.reverse();
+ self.cached = true;
+ }
+
+ assert!(
+ loc >= self.loc_offset,
+ "get_state must be called with non-decreasing arguments."
+ );
+ self.cache_data[loc - self.loc_offset].get(state)
+ }
+}
+
+struct VirtualMachine<'a> {
+ vm0: VM<'a, AssertionFlavor>,
+ vm1: VM<'a, MainFlavor>,
+ vm2: LookaheadVM<'a>,
+ accepting: &'a BitSet,
+}
+
+impl<'a> VirtualMachine<'a> {
+ fn step(&'a mut self, byte: u8, loc: usize) {
+ self.vm0.step(byte, &mut ());
+ self.vm1
+ .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2));
+ }
+
+ fn extract_match(&self) -> Option<Match> {
+ self.vm1
+ .threads
+ .iter()
+ .filter(|t| self.accepting.get(t.pc as usize))
+ .map(|t| Match {
+ registers: t.data.clone(),
+ })
+ .next()
+ }
+}
+
+pub struct ByteCodeCompiledRegex {
+ instrs0: Box<[Instr<AssertionFlavor>]>,
+ instrs1: Box<[Instr<MainFlavor>]>,
+ instrs2: Box<[Instr<AssertionFlavor>]>,
+ accepting: BitSet,
+}
+
+impl ByteCodeCompiledRegex {
+ pub fn re_match(&self, data: &[u8]) -> Option<Match> {
+ let vm0 = VM::new(&self.instrs0);
+ let vm1 = VM::new(&self.instrs1);
+ let vm2 = VM::new(&self.instrs2);
+ let vm2 = LookaheadVM::new(vm2, data);
+ let mut vm = VirtualMachine {
+ vm0,
+ vm1,
+ vm2,
+ accepting: &self.accepting,
+ };
+ for (i, ch) in data.iter().cloned().enumerate() {
+ vm.step(ch, i);
+ }
+ vm.extract_match()
+ }
+}
+
+pub struct Match {
+ pub registers: Box<[usize]>,
+}