diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 22:35:32 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 22:35:32 +0200 |
| commit | 6b452e817090dba6c80454a42079fa5c9f83b1cd (patch) | |
| tree | 70645faa7aef2068ad07ade092e168613727713a | |
| parent | a0376d361b83360c6f9eb971419726995f4c9c19 (diff) | |
| download | pish-6b452e817090dba6c80454a42079fa5c9f83b1cd.tar.gz | |
regex: only named matches now
| -rw-r--r-- | src/parse/regex.rs | 16 | ||||
| -rw-r--r-- | src/regex/bc.rs | 94 | ||||
| -rw-r--r-- | src/regex/enfa.rs | 2 | ||||
| -rw-r--r-- | src/regex/mod.rs | 16 | ||||
| -rw-r--r-- | src/regex/simple.rs | 4 |
5 files changed, 70 insertions, 62 deletions
diff --git a/src/parse/regex.rs b/src/parse/regex.rs index 22c3b9c..4134c88 100644 --- a/src/parse/regex.rs +++ b/src/parse/regex.rs @@ -1,5 +1,5 @@ use super::{Cursor, OtherHighlights, Parse, ParseError, Result}; -use crate::regex::{Class, GreedyBehavior, LookDirection, LookPolarity, Pattern}; +use crate::{parse::VarName, regex::{Class, GreedyBehavior, LookDirection, LookPolarity, Pattern}}; const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; fn is_symbol(x: u8) -> bool { @@ -155,6 +155,7 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { b'(' => { s.adv(); + let mut match_name = None; let mut assertion = None; if s.buf.starts_with(b"?=") { s.advance(2); @@ -168,6 +169,15 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { } else if s.buf.starts_with(b"?<!") { s.advance(3); assertion = Some((LookDirection::Behind, LookPolarity::Negative)); + } else if s.buf.starts_with(b"?<") && s.buf.len() > 2 && s.buf[2].is_ascii_alphabetic() { + s.advance(2); + match_name = Some(VarName::parse(s)?.name); + if !s.has() { + return Err(ParseError::Eof); + } + if s.adv() != b'>' { + return Err(ParseError::Expected('>')); + } } s.highlight_from(begin, OtherHighlights::RegexSymbol); @@ -183,8 +193,10 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { if let Some((dir, pol)) = assertion { Ok(Pattern::Assertion(dir, pol, Box::new(inner))) + } else if let Some(match_name) = match_name { + Ok(Pattern::Submatch(match_name, Box::new(inner))) } else { - Ok(Pattern::Submatch(Box::new(inner))) + Ok(inner) } } b'.' => { diff --git a/src/regex/bc.rs b/src/regex/bc.rs index ea1d086..b94433e 100644 --- a/src/regex/bc.rs +++ b/src/regex/bc.rs @@ -1,10 +1,10 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use super::{ Class, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine, byte_range::ByteRange, }; -use crate::bitset::BitSet; +use crate::{BString, bitset::BitSet}; trait Flavor: Clone { type CustomInstr: Copy + Clone + std::fmt::Debug; @@ -267,6 +267,7 @@ struct VirtualMachine<'a> { vm1: VM<'a, MainFlavor>, vm2: LookaheadVM<'a>, accepting: &'a BitSet, + submatches: &'a [BString], } impl<'a> VirtualMachine<'a> { @@ -300,11 +301,17 @@ impl<'a> VirtualMachine<'a> { .iter() .filter(|t| self.accepting.get(t.pc as usize)) .map(|t| { - let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect(); - - Match { - submatches: submatches.into(), - } + let submatches = self + .submatches + .iter() + .enumerate() + .filter_map(|(idx, name)| { + let begin = t.data[2 * idx]?; + let end = t.data[2 * idx + 1]?; + Some((name.clone(), begin..end)) + }) + .collect(); + Match { submatches } }) .next() } @@ -327,7 +334,7 @@ pub struct BytecodeCompiledRegex { instrs1: Box<[Instr<MainFlavor>]>, instrs2: Box<[Instr<AssertionFlavor>]>, no_lookbehind: bool, - submatch_count: usize, + submatches: Vec<BString>, accepting: BitSet, } @@ -347,7 +354,7 @@ impl BytecodeCompiledRegex { &self.instrs1, Thread { pc: 0, - data: vec![None; 2 * self.submatch_count].into(), + data: vec![None; 2 * self.submatches.len()].into(), }, ); let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); @@ -357,6 +364,7 @@ impl BytecodeCompiledRegex { vm1, vm2, accepting: &self.accepting, + submatches: &self.submatches, }; if self.no_lookbehind { for (i, ch) in data.iter().cloned().enumerate() { @@ -392,7 +400,7 @@ struct Compiler<'a, F: Flavor> { map: HashMap<Pattern, CompiledSnippet>, assertion_handler: AssertionHandler<'a, F>, assertion_fork_base: usize, - submatch_count: usize, + submatches: Vec<BString>, } fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> { @@ -414,7 +422,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { map: HashMap::new(), assertion_handler: Box::new(assertion_handler), assertion_fork_base: usize::MAX, - submatch_count: 0, + submatches: Vec::new(), } } @@ -523,9 +531,9 @@ impl<'a, F: Flavor> Compiler<'a, F> { self.instrs.push(ins); } Pattern::Nothing => {} - Pattern::Submatch(pat) => { - let i = self.submatch_count as u32 * 2; - self.submatch_count += 1; + Pattern::Submatch(match_name, pat) => { + let i = self.submatches.len() as u32 * 2; + self.submatches.push(match_name); if let Some(ins) = F::save(i) { self.instrs.push(Instr::Custom(ins)); } @@ -605,7 +613,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { fn try_from(value: Pattern) -> Result<Self, Self::Error> { let mut neg = assertion_compiler(); let mut pos = assertion_compiler(); - let (final_state, instrs, submatch_count) = { + let (final_state, instrs, submatches) = { let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| { let target = match dir { LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), @@ -622,7 +630,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { main.compile(value)?; let end = main.instrs.len(); main.instrs.push(Instr::Class(Class::Nothing)); - (end, main.instrs, main.submatch_count) + (end, main.instrs, main.submatches) }; neg.finalize_assertion_forks(); pos.finalize_assertion_forks(); @@ -636,7 +644,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { instrs1: instrs.into(), instrs2: pos.instrs.into(), accepting, - submatch_count, + submatches, }) } } @@ -675,94 +683,78 @@ mod tests { #[test] fn nongreedy_star() { - let re = regex("(ab*?)bb*"); + let re = regex("(?<x>ab*?)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..1 ); } #[test] fn greedy_star() { - let re = regex("(ab*)bb*"); + let re = regex("(?<x>ab*)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..5 ); } #[test] fn nongreedy_plus() { - let re = regex("(ab+?)bb*"); + let re = regex("(?<x>ab+?)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()], 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..2 ); } #[test] fn greedy_plus() { - let re = regex("(ab+)bb*"); + let re = regex("(?<x>ab+)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..3 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..5 ); } #[test] fn nongreedy_qm() { - let re = regex("(ab??)bb*"); + let re = regex("(?<x>ab??)bb*"); assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()], 0..1 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..1 ); } #[test] fn greedy_qm() { - let re = regex("(ab?)bb*"); + let re = regex("(?<x>ab?)bb*"); assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), + re.re_match(b"abbb").unwrap().submatches["x".as_bytes()], 0..2 ); assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), + re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()], 0..2 ); } diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs index ea3bc93..2947f58 100644 --- a/src/regex/enfa.rs +++ b/src/regex/enfa.rs @@ -736,7 +736,7 @@ impl TryFrom<Pattern> for ENFA { states.push(EState::terminal()); Self { states, has_submatches: false, } } - Pattern::Submatch(pat) => { + Pattern::Submatch(_, pat) => { let mut this = Self::try_from(*pat)?; this.has_submatches = true; this diff --git a/src/regex/mod.rs b/src/regex/mod.rs index cdbfe0d..c42c0ad 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,3 +1,7 @@ +use std::collections::HashMap; + +use crate::BString; + pub mod bc; mod byte_range; pub mod dfa; @@ -56,7 +60,7 @@ pub enum Pattern { Concat(Vec<Pattern>), Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior), Assertion(LookDirection, LookPolarity, Box<Pattern>), - Submatch(Box<Pattern>), + Submatch(BString, Box<Pattern>), Nothing, } @@ -140,7 +144,7 @@ impl Pattern { Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, Pattern::Assertion(_, _, _) => ByteConsumption::zero(), Pattern::Nothing => ByteConsumption::zero(), - Pattern::Submatch(pat) => pat.max_byte_consumption(), + Pattern::Submatch(_, pat) => pat.max_byte_consumption(), } } @@ -152,7 +156,7 @@ impl Pattern { Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), - Submatch(pat) => Submatch(Box::new(pat.reverse())), + Submatch(name, pat) => Submatch(name, Box::new(pat.reverse())), } } @@ -196,7 +200,7 @@ impl Pattern { Rep(pat, 1, Some(1), _) => pat.simplify(), Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed), Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())), - Submatch(pattern) => Submatch(Box::new(pattern.simplify())), + Submatch(name, pattern) => Submatch(name, Box::new(pattern.simplify())), CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self, } } @@ -231,7 +235,7 @@ impl CompiledPattern { #[derive(PartialEq, Eq, Debug)] pub struct Match { - pub submatches: Box<[Option<core::ops::Range<usize>>]>, + pub submatches: HashMap<BString, core::ops::Range<usize>>, } impl Match { @@ -367,7 +371,7 @@ mod tests { #[test] fn match_is_bytecode() { - let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") + let x = Pattern::parse_from_bytes(b".*(?<match_name>ele.*phant).*") .unwrap() .try_compile() .unwrap(); diff --git a/src/regex/simple.rs b/src/regex/simple.rs index 4286322..3f0bf30 100644 --- a/src/regex/simple.rs +++ b/src/regex/simple.rs @@ -69,7 +69,7 @@ impl RegexEngine for Nothing { Pattern::Rep(_, x, Some(y), _) if y < x => Ok(Nothing), Pattern::Rep(_, 0, None, _) => Err(NotASimpleNothing), Pattern::Rep(pat, _gt_0, _, _) => Self::compile(*pat), - Pattern::Submatch(pat) => Self::compile(*pat), + Pattern::Submatch(_, pat) => Self::compile(*pat), _ => Err(NotASimpleNothing), } } @@ -109,7 +109,7 @@ fn ce(pat: Pattern) -> Option<Vec<u8>> { None } } - Pattern::Submatch(_) => None, // TODO: submatches could be stored as constant offsets + Pattern::Submatch(_, _) => None, // TODO: submatches could be stored as constant offsets Pattern::Nothing => Some(Vec::new()), _ => None, } |
