From de6474983156b108874432238174f7c3b60b2945 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Fri, 5 Jun 2026 22:12:07 +0200 Subject: submatches --- src/parse/regex/bc.rs | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'src/parse/regex/bc.rs') diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs index 6b3ca1b..571386e 100644 --- a/src/parse/regex/bc.rs +++ b/src/parse/regex/bc.rs @@ -19,13 +19,15 @@ trait Flavor: Clone { instr: Self::CustomInstr, sd: &mut Self::StepData<'a, 'b>, ) -> bool; + + fn save(x: u32) -> Option; } #[derive(Copy, Clone, Debug)] struct MainFlavor; impl Flavor for MainFlavor { type CustomInstr = MainInstr; - type ThreadData = Box<[usize]>; + type ThreadData = Box<[Option]>; type StepData<'a, 'b> = (usize, &'a BitSet, &'a mut LookaheadVM<'b>) where @@ -38,7 +40,7 @@ impl Flavor for MainFlavor { ) -> bool { match instr { MainInstr::Save(reg) => { - thread.data[reg as usize] = data.0; + thread.data[reg as usize] = Some(data.0); true } MainInstr::Join(assertion) => { @@ -52,6 +54,10 @@ impl Flavor for MainFlavor { } } } + + fn save(x: u32) -> Option { + Some(MainInstr::Save(x)) + } } #[derive(Copy, Clone, Debug)] @@ -70,6 +76,10 @@ impl Flavor for AssertionFlavor { fn accepts(_thread: &mut Thread, instr: Self::CustomInstr, _sd: &mut ()) -> bool { match instr {} } + + fn save(_: u32) -> Option { + None + } } type JumpTarget = u32; @@ -265,8 +275,12 @@ impl<'a> VirtualMachine<'a> { .threads .iter() .filter(|t| self.accepting.get(t.pc as usize)) - .map(|t| Match { - registers: t.data.clone(), + .map(|t| { + let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect(); + + Match { + submatches: submatches.into(), + } }) .next() } @@ -288,6 +302,7 @@ pub struct BytecodeCompiledRegex { instrs0: Box<[Instr]>, instrs1: Box<[Instr]>, instrs2: Box<[Instr]>, + submatch_count: usize, accepting: BitSet, } @@ -307,7 +322,7 @@ impl BytecodeCompiledRegex { &self.instrs1, Thread { pc: 0, - data: Vec::new().into(), // TODO: submatches + data: vec![None; 2 * self.submatch_count].into(), }, ); let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); @@ -331,7 +346,7 @@ impl BytecodeCompiledRegex { } pub struct Match { - pub registers: Box<[usize]>, + pub submatches: Box<[Option>]>, } type AssertionHandler<'a, F> = @@ -348,6 +363,7 @@ struct Compiler<'a, F: Flavor> { map: HashMap, assertion_handler: AssertionHandler<'a, F>, assertion_fork_base: usize, + submatch_count: usize, } fn fork(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr { @@ -369,6 +385,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { map: HashMap::new(), assertion_handler: Box::new(assertion_handler), assertion_fork_base: usize::MAX, + submatch_count: 0, } } @@ -477,6 +494,17 @@ impl<'a, F: Flavor> Compiler<'a, F> { self.instrs.push(ins); } Pattern::Nothing => {} + Pattern::Submatch(pat) => { + let i = self.submatch_count as u32 * 2; + self.submatch_count += 1; + if let Some(ins) = F::save(i) { + self.instrs.push(Instr::Custom(ins)); + } + self.compile(*pat)?; + if let Some(ins) = F::save(i + 1) { + self.instrs.push(Instr::Custom(ins)); + } + } } Ok(()) } @@ -548,7 +576,7 @@ impl TryFrom for BytecodeCompiledRegex { fn try_from(value: Pattern) -> Result { let mut neg = assertion_compiler(); let mut pos = assertion_compiler(); - let (final_state, instrs) = { + let (final_state, instrs, submatch_count) = { let mut main: Compiler = Compiler::new(|dir, pol, pat| { let target = match dir { LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), @@ -565,7 +593,7 @@ impl TryFrom for BytecodeCompiledRegex { main.compile(value)?; let end = main.instrs.len(); main.instrs.push(Instr::Class(CharacterClass::Nothing)); - (end, main.instrs) + (end, main.instrs, main.submatch_count) }; neg.finalize_assertion_forks(); pos.finalize_assertion_forks(); @@ -578,6 +606,7 @@ impl TryFrom for BytecodeCompiledRegex { instrs1: instrs.into(), instrs2: pos.instrs.into(), accepting, + submatch_count, }) } } -- cgit v1.2.3