diff options
Diffstat (limited to 'src/regex/bc.rs')
| -rw-r--r-- | src/regex/bc.rs | 94 |
1 files changed, 43 insertions, 51 deletions
diff --git a/src/regex/bc.rs b/src/regex/bc.rs index ea1d086..b94433e 100644 --- a/src/regex/bc.rs +++ b/src/regex/bc.rs @@ -1,10 +1,10 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use super::{ Class, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine, byte_range::ByteRange, }; -use crate::bitset::BitSet; +use crate::{BString, bitset::BitSet}; trait Flavor: Clone { type CustomInstr: Copy + Clone + std::fmt::Debug; @@ -267,6 +267,7 @@ struct VirtualMachine<'a> { vm1: VM<'a, MainFlavor>, vm2: LookaheadVM<'a>, accepting: &'a BitSet, + submatches: &'a [BString], } impl<'a> VirtualMachine<'a> { @@ -300,11 +301,17 @@ impl<'a> VirtualMachine<'a> { .iter() .filter(|t| self.accepting.get(t.pc as usize)) .map(|t| { - let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect(); - - Match { - submatches: submatches.into(), - } + let submatches = self + .submatches + .iter() + .enumerate() + .filter_map(|(idx, name)| { + let begin = t.data[2 * idx]?; + let end = t.data[2 * idx + 1]?; + Some((name.clone(), begin..end)) + }) + .collect(); + Match { submatches } }) .next() } @@ -327,7 +334,7 @@ pub struct BytecodeCompiledRegex { instrs1: Box<[Instr<MainFlavor>]>, instrs2: Box<[Instr<AssertionFlavor>]>, no_lookbehind: bool, - submatch_count: usize, + submatches: Vec<BString>, accepting: BitSet, } @@ -347,7 +354,7 @@ impl BytecodeCompiledRegex { &self.instrs1, Thread { pc: 0, - data: vec![None; 2 * self.submatch_count].into(), + data: vec![None; 2 * self.submatches.len()].into(), }, ); let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); @@ -357,6 +364,7 @@ impl BytecodeCompiledRegex { vm1, vm2, accepting: &self.accepting, + submatches: &self.submatches, }; if self.no_lookbehind { for (i, ch) in data.iter().cloned().enumerate() { @@ -392,7 +400,7 @@ struct Compiler<'a, F: Flavor> { map: HashMap<Pattern, CompiledSnippet>, assertion_handler: AssertionHandler<'a, F>, assertion_fork_base: usize, - submatch_count: usize, + submatches: Vec<BString>, } fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> { @@ -414,7 +422,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { map: HashMap::new(), assertion_handler: Box::new(assertion_handler), assertion_fork_base: usize::MAX, - submatch_count: 0, + submatches: Vec::new(), } } @@ -523,9 +531,9 @@ impl<'a, F: Flavor> Compiler<'a, F> { self.instrs.push(ins); } Pattern::Nothing => {} - Pattern::Submatch(pat) => { - let i = self.submatch_count as u32 * 2; - self.submatch_count += 1; + Pattern::Submatch(match_name, pat) => { + let i = self.submatches.len() as u32 * 2; + self.submatches.push(match_name); if let Some(ins) = F::save(i) { self.instrs.push(Instr::Custom(ins)); } @@ -605,7 +613,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { fn try_from(value: Pattern) -> Result<Self, Self::Error> { let mut neg = assertion_compiler(); let mut pos = assertion_compiler(); - let (final_state, instrs, submatch_count) = { + let (final_state, instrs, submatches) = { let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| { let target = match dir { LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), @@ -622,7 +630,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { main.compile(value)?; let end = main.instrs.len(); main.instrs.push(Instr::Class(Class::Nothing)); - (end, main.instrs, main.submatch_count) + (end, main.instrs, main.submatches) }; neg.finalize_assertion_forks(); pos.finalize_assertion_forks(); @@ -636,7 +644,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { instrs1: instrs.into(), instrs2: pos.instrs.into(), accepting, - submatch_count, + submatches, }) } } @@ -675,94 +683,78 @@ mod tests { #[test] fn nongreedy_star() { - let re = regex("(ab*?)bb*"); + let re = regex("(?<x>ab*?)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..1 ); } #[test] fn greedy_star() { - let re = regex("(ab*)bb*"); + let re = regex("(?<x>ab*)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..5 ); } #[test] fn nongreedy_plus() { - let re = regex("(ab+?)bb*"); + let re = regex("(?<x>ab+?)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()], 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..2 ); } #[test] fn greedy_plus() { - let re = regex("(ab+)bb*"); + let re = regex("(?<x>ab+)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..5 ); } #[test] fn nongreedy_qm() { - let re = regex("(ab??)bb*"); + let re = regex("(?<x>ab??)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()], 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..1 ); } #[test] fn greedy_qm() { - let re = regex("(ab?)bb*"); + let re = regex("(?<x>ab?)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..2 ); } |
