diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-05 21:22:44 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-05 21:22:44 +0200 |
| commit | 05f3f381d0066b2e6116470f4e6251ae191aaefe (patch) | |
| tree | 063d31bdc10e25a9e97d15cdaddb625a9fcd199f /src/parse/regex/bc.rs | |
| parent | 65013cb7441dcd56d362613e1ebc469756613b24 (diff) | |
| download | pish-05f3f381d0066b2e6116470f4e6251ae191aaefe.tar.gz | |
regex compiler compiles
Diffstat (limited to 'src/parse/regex/bc.rs')
| -rw-r--r-- | src/parse/regex/bc.rs | 354 |
1 files changed, 320 insertions, 34 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs index eec5628..75a6d5d 100644 --- a/src/parse/regex/bc.rs +++ b/src/parse/regex/bc.rs @@ -1,14 +1,18 @@ -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use crate::{ bitset::BitSet, - parse::regex::{LookDirection, LookPolarity, byte_range::ByteRange}, + parse::regex::{ + CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange, + }, }; trait Flavor: Clone { type CustomInstr: Copy + Clone; type ThreadData: Clone; - type StepData<'a, 'b> where 'b : 'a; + type StepData<'a, 'b> + where + 'b: 'a; fn accepts<'a, 'b>( thread: &mut Thread<Self>, @@ -22,7 +26,10 @@ struct MainFlavor; impl Flavor for MainFlavor { type CustomInstr = MainInstr; type ThreadData = Box<[usize]>; - type StepData<'a, 'b> = (usize, &'a BitSet, &'a mut LookaheadVM<'b>) where 'b : 'a; + type StepData<'a, 'b> + = (usize, &'a BitSet, &'a mut LookaheadVM<'b>) + where + 'b: 'a; fn accepts<'a, 'b>( thread: &mut Thread<Self>, @@ -36,7 +43,7 @@ impl Flavor for MainFlavor { } MainInstr::Join(assertion) => { let should_match = assertion.pol == LookPolarity::Positive; - let state = assertion.what as usize; + let state = assertion.target as usize; let is_matching = match assertion.dir { LookDirection::Ahead => data.2.get_state(data.0, state), LookDirection::Behind => data.1.get(state), @@ -55,7 +62,10 @@ struct AssertionFlavor; impl Flavor for AssertionFlavor { type CustomInstr = Nothing; type ThreadData = (); - type StepData<'a, 'b> = () where 'b : 'a; + type StepData<'a, 'b> + = () + where + 'b: 'a; fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool { match instr {} @@ -64,17 +74,17 @@ impl Flavor for AssertionFlavor { type JumpTarget = u32; type Register = u32; -type AssertionRef = u32; #[derive(Copy, Clone)] struct Assertion { - what: JumpTarget, + target: JumpTarget, dir: LookDirection, pol: LookPolarity, } #[derive(Copy, Clone)] enum Instr<F: Flavor> { + Class(CharacterClass), Consume(ByteRange), Jump(JumpTarget), Fork(JumpTarget, JumpTarget), @@ -87,19 +97,12 @@ enum MainInstr { Join(Assertion), } -#[derive(Copy, Clone)] -enum Look {} - -type Registers = Box<[usize]>; - #[derive(Clone)] struct Thread<F: Flavor> { pc: JumpTarget, data: F::ThreadData, } -struct CachedLookahead {} - struct VM<'p, F: Flavor> { instr: &'p [Instr<F>], threads: Vec<Thread<F>>, @@ -107,27 +110,24 @@ struct VM<'p, F: Flavor> { } impl<'p, F: Flavor> VM<'p, F> { - fn new(instr: &'p [Instr<F>]) -> Self { + fn new(instr: &'p [Instr<F>], starting_thread: Thread<F>) -> Self { Self { instr, - threads: todo!("determine starting thread"), + threads: vec![starting_thread], hot: BitSet::new(instr.len()), } } - fn step<'a>(&mut self, byte: u8, sd: &mut F::StepData<'a, 'p>) { + fn step_epsilon<'a>(&mut self, sd: &mut F::StepData<'a, 'p>) { let mut threads: VecDeque<_> = self.threads.drain(..).collect(); self.hot.set_all(false); while let Some(mut thread) = threads.pop_front() { match self.instr[thread.pc as usize] { - Instr::Consume(bytes) => { - if bytes.contains(byte) { - thread.pc += 1; - if !self.hot.get(thread.pc as usize) { - self.hot.set(thread.pc as usize, true); - self.threads.push(thread); - } + Instr::Class(_) | Instr::Consume(_) => { + if !self.hot.get(thread.pc as usize) { + self.hot.set(thread.pc as usize, true); + self.threads.push(thread); } } Instr::Jump(j) => { @@ -153,6 +153,32 @@ impl<'p, F: Flavor> VM<'p, F> { } } } + + fn step_consume(&mut self, byte: u8) { + self.hot.set_all(false); + self.threads + .retain_mut(|thread| match self.instr[thread.pc as usize] { + Instr::Class(class) => { + if class.matches(byte) { + thread.pc += 1; + self.hot.set(thread.pc as usize, true); + true + } else { + false + } + } + Instr::Consume(bytes) => { + if bytes.contains(byte) { + thread.pc += 1; + self.hot.set(thread.pc as usize, true); + true + } else { + false + } + } + _ => false, + }); + } } struct LookaheadVM<'a> { @@ -180,7 +206,8 @@ impl<'a> LookaheadVM<'a> { assert_eq!(self.loc_offset, 0); self.loc_offset = loc; for i in (loc..self.data.len()).rev() { - self.vm.step(self.data[i], &mut ()); + self.vm.step_epsilon(&mut ()); + self.vm.step_consume(self.data[i]); self.cache_data.push(self.vm.hot.clone()); } self.cache_data.reverse(); @@ -204,9 +231,11 @@ struct VirtualMachine<'a> { impl<'a> VirtualMachine<'a> { fn step(&mut self, byte: u8, loc: usize) { - self.vm0.step(byte, &mut ()); + self.vm0.step_epsilon(&mut ()); self.vm1 - .step(byte, &mut (loc, &self.vm0.hot, &mut self.vm2)); + .step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2)); + self.vm0.step_consume(byte); + self.vm1.step_consume(byte); } fn extract_match(&self) -> Option<Match> { @@ -220,19 +249,24 @@ impl<'a> VirtualMachine<'a> { .next() } } - -pub struct ByteCodeCompiledRegex { +pub struct BytecodeCompiledRegex { instrs0: Box<[Instr<AssertionFlavor>]>, instrs1: Box<[Instr<MainFlavor>]>, instrs2: Box<[Instr<AssertionFlavor>]>, accepting: BitSet, } -impl ByteCodeCompiledRegex { +impl BytecodeCompiledRegex { pub fn re_match(&self, data: &[u8]) -> Option<Match> { - let vm0 = VM::new(&self.instrs0); - let vm1 = VM::new(&self.instrs1); - let vm2 = VM::new(&self.instrs2); + let vm0 = VM::new(&self.instrs0, Thread { pc: 0, data: () }); + let vm1 = VM::new( + &self.instrs1, + Thread { + pc: 0, + data: Vec::new().into(), // TODO: submatches + }, + ); + let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); let vm2 = LookaheadVM::new(vm2, data); let mut vm = VirtualMachine { vm0, @@ -245,8 +279,260 @@ impl ByteCodeCompiledRegex { } vm.extract_match() } + + pub fn matches(&self, data: &[u8]) -> bool { + self.re_match(data).is_some() + } } pub struct Match { pub registers: Box<[usize]>, } + +type AssertionHandler<'a, F> = + Box<dyn 'a + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>>; + +#[derive(Copy, Clone)] +struct CompiledSnippet { + begin: JumpTarget, + end: JumpTarget, +} + +struct Compiler<'a, F: Flavor> { + instrs: Vec<Instr<F>>, + map: HashMap<Pattern, CompiledSnippet>, + assertion_handler: AssertionHandler<'a, F>, + assertion_fork_base: usize, +} + +fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> { + let repeat = repeat as JumpTarget; + let exit = exit as JumpTarget; + match greedy { + GreedyBehavior::Greedy => Instr::Fork(repeat, exit), + GreedyBehavior::NonGreedy => Instr::Fork(exit, repeat), + } +} + +impl<'a, F: Flavor> Compiler<'a, F> { + fn new( + assertion_handler: impl 'a + + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>, + ) -> Self { + Self { + instrs: Vec::new(), + map: HashMap::new(), + assertion_handler: Box::new(assertion_handler), + assertion_fork_base: usize::MAX, + } + } + + fn rep_1_or_more(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult { + let base = self.instrs.len(); + self.compile(pat)?; + let exit = self.instrs.len() + 1; + self.instrs.push(fork(base, exit, greedy)); + Ok(()) + } + + fn rep_0_or_1(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult { + let base = self.instrs.len(); + self.instrs.push(Instr::Jump(u32::MAX)); + self.compile(pat)?; + self.instrs[base] = fork(base + 1, self.instrs.len(), greedy); + Ok(()) + } + + fn rep_any_amt(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult { + let base = self.instrs.len(); + self.instrs.push(Instr::Jump(u32::MAX)); + self.compile(pat)?; + let fork_pos = self.instrs.len(); + let after = fork_pos + 1; + self.instrs.push(fork(base, after, greedy)); + self.instrs[base] = Instr::Jump(fork_pos as JumpTarget); + Ok(()) + } + + fn compile(&mut self, pat: Pattern) -> CompileResult { + match pat { + Pattern::Byte(x) => self.instrs.push(Instr::Consume(ByteRange::new_single(x))), + Pattern::Range(a, b) => self.instrs.push(Instr::Consume(ByteRange::new_range(a, b))), + Pattern::CharacterClass(cc) => { + self.instrs.push(Instr::Class(cc)); + } + Pattern::Alt(patterns) => { + let branch_factor = patterns.len(); + assert!(branch_factor > 0); + + let base = self.instrs.len(); + + // placeholders to later place in forks + for _ in 0..patterns.len() - 1 { + self.instrs.push(Instr::Jump(u32::MAX)); + } + + let mut enter_pats = Vec::new(); + let mut leave_pats = Vec::new(); + for pat in patterns.into_iter() { + enter_pats.push(self.instrs.len()); + self.compile(pat)?; + leave_pats.push(self.instrs.len()); + + // placeholder to place in join + self.instrs.push(Instr::Jump(u32::MAX)); + } + + self.instrs.pop(); // remove last jump + let join_point = self.instrs.len(); + + // link forks + for i in 0..branch_factor - 1 { + let a = enter_pats[i]; + let b = if i == branch_factor - 2 { + enter_pats[i + 1] + } else { + base + i + 1 + }; + self.instrs[base + i] = Instr::Fork(a as JumpTarget, b as JumpTarget); + } + + // link joins + for i in 0..branch_factor - 1 { + self.instrs[leave_pats[i]] = Instr::Jump(join_point as JumpTarget); + } + } + Pattern::Concat(patterns) => { + for pat in patterns.into_iter() { + self.compile(pat)?; + } + } + Pattern::Rep(pat, 0, None, greed) => { + self.rep_any_amt(*pat, greed)?; + } + Pattern::Rep(pat, min, None, greed) => { + let pat = *pat; + for _ in 1..min { + self.compile(pat.clone())?; + } + self.rep_1_or_more(pat, greed)?; + } + Pattern::Rep(pat, min, Some(max), greed) => { + let pat = *pat; + let opt = max - min; + for _ in 0..min { + self.compile(pat.clone())?; + } + for _ in 0..opt { + self.rep_0_or_1(pat.clone(), greed)?; + } + } + Pattern::Assertion(look_direction, look_polarity, pattern) => { + let ins = (self.assertion_handler)(look_direction, look_polarity, *pattern)?; + self.instrs.push(ins); + } + Pattern::Nothing => {} + } + Ok(()) + } + + fn compile_and_memoize(&mut self, pat: Pattern) -> CompileResult<CompiledSnippet> { + if let Some(&jt) = self.map.get(&pat) { + return Ok(jt); + } + let begin = self.instrs.len() as JumpTarget; + self.compile(pat.clone())?; + let end = self.instrs.len() as JumpTarget; + self.instrs.push(Instr::Class(CharacterClass::Nothing)); + let bounds = CompiledSnippet { begin, end }; + self.map.insert(pat, bounds); + Ok(bounds) + } + + fn finalize_assertion_forks(&mut self) { + let fork_targets: Vec<JumpTarget> = self.map.values().map(|v| v.begin).collect(); + let fork_begin = self.instrs.len() as JumpTarget; + match fork_targets.len() { + 0 => { + self.instrs[self.assertion_fork_base] = Instr::Class(CharacterClass::Nothing); + } + 1 => { + self.instrs[self.assertion_fork_base] = Instr::Jump(fork_targets[0]); + } + 2 => { + self.instrs[self.assertion_fork_base] = + Instr::Fork(fork_targets[0], fork_targets[1]); + } + n => { + self.instrs[self.assertion_fork_base] = Instr::Fork(fork_targets[0], fork_begin); + for i in 1..n - 1 { + let fork = if i == n - 2 { + Instr::Fork(fork_targets[i], fork_targets[i + 1]) + } else { + Instr::Fork(fork_targets[i], self.instrs.len() as JumpTarget + 1) + }; + self.instrs.push(fork); + } + } + } + } +} + +fn assertion_compiler() -> Compiler<'static, AssertionFlavor> { + let mut c = Compiler::new(|_, _, _| Err(RegexCompilationError::NestedLookaroundNotSupported)); + c.rep_any_amt( + Pattern::CharacterClass(CharacterClass::Everything), + GreedyBehavior::NonGreedy, + ) + .expect("characterclass should always compile"); + c.assertion_fork_base = c.instrs.len(); + c.instrs.push(Instr::Jump(u32::MAX)); // in the end this gets replaced by a jump to a fork-list for all the assertions + c +} + +#[derive(Clone, Debug)] +pub enum RegexCompilationError { + NestedLookaroundNotSupported, +} + +pub type CompileResult<T = ()> = Result<T, RegexCompilationError>; + +impl TryFrom<Pattern> for BytecodeCompiledRegex { + type Error = RegexCompilationError; + + fn try_from(value: Pattern) -> Result<Self, Self::Error> { + let mut neg = assertion_compiler(); + let mut pos = assertion_compiler(); + let (final_state, instrs) = { + let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| { + let target = match dir { + LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), + LookDirection::Behind => neg.compile_and_memoize(pat), + }? + .end; + + Ok(Instr::Custom(MainInstr::Join(Assertion { + target, + dir, + pol, + }))) + }); + main.compile(value)?; + let end = main.instrs.len(); + main.instrs.push(Instr::Class(CharacterClass::Nothing)); + (end, main.instrs) + }; + neg.finalize_assertion_forks(); + pos.finalize_assertion_forks(); + + let mut accepting = BitSet::new(instrs.len()); + accepting.set(final_state, true); + + Ok(Self { + instrs0: neg.instrs.into(), + instrs1: instrs.into(), + instrs2: pos.instrs.into(), + accepting, + }) + } +} |
