From 6b452e817090dba6c80454a42079fa5c9f83b1cd Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Sat, 6 Jun 2026 22:35:32 +0200 Subject: regex: only named matches now --- src/regex/bc.rs | 94 ++++++++++++++++++++++++----------------------------- src/regex/enfa.rs | 2 +- src/regex/mod.rs | 16 +++++---- src/regex/simple.rs | 4 +-- 4 files changed, 56 insertions(+), 60 deletions(-) (limited to 'src/regex') diff --git a/src/regex/bc.rs b/src/regex/bc.rs index ea1d086..b94433e 100644 --- a/src/regex/bc.rs +++ b/src/regex/bc.rs @@ -1,10 +1,10 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use super::{ Class, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine, byte_range::ByteRange, }; -use crate::bitset::BitSet; +use crate::{BString, bitset::BitSet}; trait Flavor: Clone { type CustomInstr: Copy + Clone + std::fmt::Debug; @@ -267,6 +267,7 @@ struct VirtualMachine<'a> { vm1: VM<'a, MainFlavor>, vm2: LookaheadVM<'a>, accepting: &'a BitSet, + submatches: &'a [BString], } impl<'a> VirtualMachine<'a> { @@ -300,11 +301,17 @@ impl<'a> VirtualMachine<'a> { .iter() .filter(|t| self.accepting.get(t.pc as usize)) .map(|t| { - let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect(); - - Match { - submatches: submatches.into(), - } + let submatches = self + .submatches + .iter() + .enumerate() + .filter_map(|(idx, name)| { + let begin = t.data[2 * idx]?; + let end = t.data[2 * idx + 1]?; + Some((name.clone(), begin..end)) + }) + .collect(); + Match { submatches } }) .next() } @@ -327,7 +334,7 @@ pub struct BytecodeCompiledRegex { instrs1: Box<[Instr]>, instrs2: Box<[Instr]>, no_lookbehind: bool, - submatch_count: usize, + submatches: Vec, accepting: BitSet, } @@ -347,7 +354,7 @@ impl BytecodeCompiledRegex { &self.instrs1, Thread { pc: 0, - data: vec![None; 2 * self.submatch_count].into(), + data: vec![None; 2 * self.submatches.len()].into(), }, ); let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); @@ -357,6 +364,7 @@ impl BytecodeCompiledRegex { vm1, vm2, accepting: &self.accepting, + submatches: &self.submatches, }; if self.no_lookbehind { for (i, ch) in data.iter().cloned().enumerate() { @@ -392,7 +400,7 @@ struct Compiler<'a, F: Flavor> { map: HashMap, assertion_handler: AssertionHandler<'a, F>, assertion_fork_base: usize, - submatch_count: usize, + submatches: Vec, } fn fork(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr { @@ -414,7 +422,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { map: HashMap::new(), assertion_handler: Box::new(assertion_handler), assertion_fork_base: usize::MAX, - submatch_count: 0, + submatches: Vec::new(), } } @@ -523,9 +531,9 @@ impl<'a, F: Flavor> Compiler<'a, F> { self.instrs.push(ins); } Pattern::Nothing => {} - Pattern::Submatch(pat) => { - let i = self.submatch_count as u32 * 2; - self.submatch_count += 1; + Pattern::Submatch(match_name, pat) => { + let i = self.submatches.len() as u32 * 2; + self.submatches.push(match_name); if let Some(ins) = F::save(i) { self.instrs.push(Instr::Custom(ins)); } @@ -605,7 +613,7 @@ impl TryFrom for BytecodeCompiledRegex { fn try_from(value: Pattern) -> Result { let mut neg = assertion_compiler(); let mut pos = assertion_compiler(); - let (final_state, instrs, submatch_count) = { + let (final_state, instrs, submatches) = { let mut main: Compiler = Compiler::new(|dir, pol, pat| { let target = match dir { LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), @@ -622,7 +630,7 @@ impl TryFrom for BytecodeCompiledRegex { main.compile(value)?; let end = main.instrs.len(); main.instrs.push(Instr::Class(Class::Nothing)); - (end, main.instrs, main.submatch_count) + (end, main.instrs, main.submatches) }; neg.finalize_assertion_forks(); pos.finalize_assertion_forks(); @@ -636,7 +644,7 @@ impl TryFrom for BytecodeCompiledRegex { instrs1: instrs.into(), instrs2: pos.instrs.into(), accepting, - submatch_count, + submatches, }) } } @@ -675,94 +683,78 @@ mod tests { #[test] fn nongreedy_star() { - let re = regex("(ab*?)bb*"); + let re = regex("(?ab*?)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..1 ); } #[test] fn greedy_star() { - let re = regex("(ab*)bb*"); + let re = regex("(?ab*)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..5 ); } #[test] fn nongreedy_plus() { - let re = regex("(ab+?)bb*"); + let re = regex("(?ab+?)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()], 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..2 ); } #[test] fn greedy_plus() { - let re = regex("(ab+)bb*"); + let re = regex("(?ab+)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..5 ); } #[test] fn nongreedy_qm() { - let re = regex("(ab??)bb*"); + let re = regex("(?ab??)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()], 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..1 ); } #[test] fn greedy_qm() { - let re = regex("(ab?)bb*"); + let re = regex("(?ab?)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..2 ); } diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs index ea3bc93..2947f58 100644 --- a/src/regex/enfa.rs +++ b/src/regex/enfa.rs @@ -736,7 +736,7 @@ impl TryFrom for ENFA { states.push(EState::terminal()); Self { states, has_submatches: false, } } - Pattern::Submatch(pat) => { + Pattern::Submatch(_, pat) => { let mut this = Self::try_from(*pat)?; this.has_submatches = true; this diff --git a/src/regex/mod.rs b/src/regex/mod.rs index cdbfe0d..c42c0ad 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,3 +1,7 @@ +use std::collections::HashMap; + +use crate::BString; + pub mod bc; mod byte_range; pub mod dfa; @@ -56,7 +60,7 @@ pub enum Pattern { Concat(Vec), Rep(Box, u32, Option, GreedyBehavior), Assertion(LookDirection, LookPolarity, Box), - Submatch(Box), + Submatch(BString, Box), Nothing, } @@ -140,7 +144,7 @@ impl Pattern { Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, Pattern::Assertion(_, _, _) => ByteConsumption::zero(), Pattern::Nothing => ByteConsumption::zero(), - Pattern::Submatch(pat) => pat.max_byte_consumption(), + Pattern::Submatch(_, pat) => pat.max_byte_consumption(), } } @@ -152,7 +156,7 @@ impl Pattern { Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), - Submatch(pat) => Submatch(Box::new(pat.reverse())), + Submatch(name, pat) => Submatch(name, Box::new(pat.reverse())), } } @@ -196,7 +200,7 @@ impl Pattern { Rep(pat, 1, Some(1), _) => pat.simplify(), Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed), Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())), - Submatch(pattern) => Submatch(Box::new(pattern.simplify())), + Submatch(name, pattern) => Submatch(name, Box::new(pattern.simplify())), CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self, } } @@ -231,7 +235,7 @@ impl CompiledPattern { #[derive(PartialEq, Eq, Debug)] pub struct Match { - pub submatches: Box<[Option>]>, + pub submatches: HashMap>, } impl Match { @@ -367,7 +371,7 @@ mod tests { #[test] fn match_is_bytecode() { - let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") + let x = Pattern::parse_from_bytes(b".*(?ele.*phant).*") .unwrap() .try_compile() .unwrap(); diff --git a/src/regex/simple.rs b/src/regex/simple.rs index 4286322..3f0bf30 100644 --- a/src/regex/simple.rs +++ b/src/regex/simple.rs @@ -69,7 +69,7 @@ impl RegexEngine for Nothing { Pattern::Rep(_, x, Some(y), _) if y < x => Ok(Nothing), Pattern::Rep(_, 0, None, _) => Err(NotASimpleNothing), Pattern::Rep(pat, _gt_0, _, _) => Self::compile(*pat), - Pattern::Submatch(pat) => Self::compile(*pat), + Pattern::Submatch(_, pat) => Self::compile(*pat), _ => Err(NotASimpleNothing), } } @@ -109,7 +109,7 @@ fn ce(pat: Pattern) -> Option> { None } } - Pattern::Submatch(_) => None, // TODO: submatches could be stored as constant offsets + Pattern::Submatch(_, _) => None, // TODO: submatches could be stored as constant offsets Pattern::Nothing => Some(Vec::new()), _ => None, } -- cgit v1.2.3