diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/parse/regex/bc.rs | 45 | ||||
| -rw-r--r-- | src/parse/regex/enfa.rs | 4 | ||||
| -rw-r--r-- | src/parse/regex/mod.rs | 5 |
3 files changed, 45 insertions, 9 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs index 6b3ca1b..571386e 100644 --- a/src/parse/regex/bc.rs +++ b/src/parse/regex/bc.rs @@ -19,13 +19,15 @@ trait Flavor: Clone { instr: Self::CustomInstr, sd: &mut Self::StepData<'a, 'b>, ) -> bool; + + fn save(x: u32) -> Option<Self::CustomInstr>; } #[derive(Copy, Clone, Debug)] struct MainFlavor; impl Flavor for MainFlavor { type CustomInstr = MainInstr; - type ThreadData = Box<[usize]>; + type ThreadData = Box<[Option<usize>]>; type StepData<'a, 'b> = (usize, &'a BitSet, &'a mut LookaheadVM<'b>) where @@ -38,7 +40,7 @@ impl Flavor for MainFlavor { ) -> bool { match instr { MainInstr::Save(reg) => { - thread.data[reg as usize] = data.0; + thread.data[reg as usize] = Some(data.0); true } MainInstr::Join(assertion) => { @@ -52,6 +54,10 @@ impl Flavor for MainFlavor { } } } + + fn save(x: u32) -> Option<Self::CustomInstr> { + Some(MainInstr::Save(x)) + } } #[derive(Copy, Clone, Debug)] @@ -70,6 +76,10 @@ impl Flavor for AssertionFlavor { fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool { match instr {} } + + fn save(_: u32) -> Option<Self::CustomInstr> { + None + } } type JumpTarget = u32; @@ -265,8 +275,12 @@ impl<'a> VirtualMachine<'a> { .threads .iter() .filter(|t| self.accepting.get(t.pc as usize)) - .map(|t| Match { - registers: t.data.clone(), + .map(|t| { + let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect(); + + Match { + submatches: submatches.into(), + } }) .next() } @@ -288,6 +302,7 @@ pub struct BytecodeCompiledRegex { instrs0: Box<[Instr<AssertionFlavor>]>, instrs1: Box<[Instr<MainFlavor>]>, instrs2: Box<[Instr<AssertionFlavor>]>, + submatch_count: usize, accepting: BitSet, } @@ -307,7 +322,7 @@ impl BytecodeCompiledRegex { &self.instrs1, Thread { pc: 0, - data: Vec::new().into(), // TODO: submatches + data: vec![None; 2 * self.submatch_count].into(), }, ); let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); @@ -331,7 +346,7 @@ impl BytecodeCompiledRegex { } pub struct Match { - pub registers: Box<[usize]>, + pub submatches: Box<[Option<core::ops::Range<usize>>]>, } type AssertionHandler<'a, F> = @@ -348,6 +363,7 @@ struct Compiler<'a, F: Flavor> { map: HashMap<Pattern, CompiledSnippet>, assertion_handler: AssertionHandler<'a, F>, assertion_fork_base: usize, + submatch_count: usize, } fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> { @@ -369,6 +385,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { map: HashMap::new(), assertion_handler: Box::new(assertion_handler), assertion_fork_base: usize::MAX, + submatch_count: 0, } } @@ -477,6 +494,17 @@ impl<'a, F: Flavor> Compiler<'a, F> { self.instrs.push(ins); } Pattern::Nothing => {} + Pattern::Submatch(pat) => { + let i = self.submatch_count as u32 * 2; + self.submatch_count += 1; + if let Some(ins) = F::save(i) { + self.instrs.push(Instr::Custom(ins)); + } + self.compile(*pat)?; + if let Some(ins) = F::save(i + 1) { + self.instrs.push(Instr::Custom(ins)); + } + } } Ok(()) } @@ -548,7 +576,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { fn try_from(value: Pattern) -> Result<Self, Self::Error> { let mut neg = assertion_compiler(); let mut pos = assertion_compiler(); - let (final_state, instrs) = { + let (final_state, instrs, submatch_count) = { let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| { let target = match dir { LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), @@ -565,7 +593,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { main.compile(value)?; let end = main.instrs.len(); main.instrs.push(Instr::Class(CharacterClass::Nothing)); - (end, main.instrs) + (end, main.instrs, main.submatch_count) }; neg.finalize_assertion_forks(); pos.finalize_assertion_forks(); @@ -578,6 +606,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { instrs1: instrs.into(), instrs2: pos.instrs.into(), accepting, + submatch_count, }) } } diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index ff742e0..dd4cf6e 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -724,6 +724,10 @@ impl TryFrom<Pattern> for ENFA { states.push(EState::terminal()); Self { states } } + Pattern::Submatch(pat) => { + // NFA does not track submatches, so we are skipping it. + Self::try_from(*pat)? + } }) } } diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs index fed792c..486b0d9 100644 --- a/src/parse/regex/mod.rs +++ b/src/parse/regex/mod.rs @@ -58,6 +58,7 @@ pub enum Pattern { Concat(Vec<Pattern>), Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior), Assertion(LookDirection, LookPolarity, Box<Pattern>), + Submatch(Box<Pattern>), Nothing, } @@ -141,6 +142,7 @@ impl Pattern { Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, Pattern::Assertion(_, _, _) => ByteConsumption::zero(), Pattern::Nothing => ByteConsumption::zero(), + Pattern::Submatch(pat) => pat.max_byte_consumption(), } } @@ -152,6 +154,7 @@ impl Pattern { Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), + Submatch(pat) => Submatch(Box::new(pat.reverse())) } } } @@ -342,7 +345,7 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { if let Some((dir, pol)) = assertion { Ok(Pattern::Assertion(dir, pol, Box::new(inner))) } else { - Ok(inner) + Ok(Pattern::Submatch(Box::new(inner))) } } b'.' => { |
