about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/parse/regex.rs 16
-rw-r--r-- src/regex/bc.rs 94
-rw-r--r-- src/regex/enfa.rs 2
-rw-r--r-- src/regex/mod.rs 16
-rw-r--r-- src/regex/simple.rs 4
5 files changed, 70 insertions, 62 deletions
diff --git a/src/parse/regex.rs b/src/parse/regex.rs
index 22c3b9c..4134c88 100644
--- a/src/parse/regex.rs
+++ b/src/parse/regex.rs
@@ -1,5 +1,5 @@
use super::{Cursor, OtherHighlights, Parse, ParseError, Result};
-use crate::regex::{Class, GreedyBehavior, LookDirection, LookPolarity, Pattern};
+use crate::{parse::VarName, regex::{Class, GreedyBehavior, LookDirection, LookPolarity, Pattern}};
const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
fn is_symbol(x: u8) -> bool {
@@ -155,6 +155,7 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
b'(' => {
s.adv();
+ let mut match_name = None;
let mut assertion = None;
if s.buf.starts_with(b"?=") {
s.advance(2);
@@ -168,6 +169,15 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
} else if s.buf.starts_with(b"?<!") {
s.advance(3);
assertion = Some((LookDirection::Behind, LookPolarity::Negative));
+ } else if s.buf.starts_with(b"?<") && s.buf.len() > 2 && s.buf[2].is_ascii_alphabetic() {
+ s.advance(2);
+ match_name = Some(VarName::parse(s)?.name);
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+ if s.adv() != b'>' {
+ return Err(ParseError::Expected('>'));
+ }
}
s.highlight_from(begin, OtherHighlights::RegexSymbol);
@@ -183,8 +193,10 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
if let Some((dir, pol)) = assertion {
Ok(Pattern::Assertion(dir, pol, Box::new(inner)))
+ } else if let Some(match_name) = match_name {
+ Ok(Pattern::Submatch(match_name, Box::new(inner)))
} else {
- Ok(Pattern::Submatch(Box::new(inner)))
+ Ok(inner)
}
}
b'.' => {
diff --git a/src/regex/bc.rs b/src/regex/bc.rs
index ea1d086..b94433e 100644
--- a/src/regex/bc.rs
+++ b/src/regex/bc.rs
@@ -1,10 +1,10 @@
-use std::collections::{HashMap, VecDeque};
+use std::collections::HashMap;
use super::{
Class, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine,
byte_range::ByteRange,
};
-use crate::bitset::BitSet;
+use crate::{BString, bitset::BitSet};
trait Flavor: Clone {
type CustomInstr: Copy + Clone + std::fmt::Debug;
@@ -267,6 +267,7 @@ struct VirtualMachine<'a> {
vm1: VM<'a, MainFlavor>,
vm2: LookaheadVM<'a>,
accepting: &'a BitSet,
+ submatches: &'a [BString],
}
impl<'a> VirtualMachine<'a> {
@@ -300,11 +301,17 @@ impl<'a> VirtualMachine<'a> {
.iter()
.filter(|t| self.accepting.get(t.pc as usize))
.map(|t| {
- let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect();
-
- Match {
- submatches: submatches.into(),
- }
+ let submatches = self
+ .submatches
+ .iter()
+ .enumerate()
+ .filter_map(|(idx, name)| {
+ let begin = t.data[2 * idx]?;
+ let end = t.data[2 * idx + 1]?;
+ Some((name.clone(), begin..end))
+ })
+ .collect();
+ Match { submatches }
})
.next()
}
@@ -327,7 +334,7 @@ pub struct BytecodeCompiledRegex {
instrs1: Box<[Instr<MainFlavor>]>,
instrs2: Box<[Instr<AssertionFlavor>]>,
no_lookbehind: bool,
- submatch_count: usize,
+ submatches: Vec<BString>,
accepting: BitSet,
}
@@ -347,7 +354,7 @@ impl BytecodeCompiledRegex {
&self.instrs1,
Thread {
pc: 0,
- data: vec![None; 2 * self.submatch_count].into(),
+ data: vec![None; 2 * self.submatches.len()].into(),
},
);
let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () });
@@ -357,6 +364,7 @@ impl BytecodeCompiledRegex {
vm1,
vm2,
accepting: &self.accepting,
+ submatches: &self.submatches,
};
if self.no_lookbehind {
for (i, ch) in data.iter().cloned().enumerate() {
@@ -392,7 +400,7 @@ struct Compiler<'a, F: Flavor> {
map: HashMap<Pattern, CompiledSnippet>,
assertion_handler: AssertionHandler<'a, F>,
assertion_fork_base: usize,
- submatch_count: usize,
+ submatches: Vec<BString>,
}
fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> {
@@ -414,7 +422,7 @@ impl<'a, F: Flavor> Compiler<'a, F> {
map: HashMap::new(),
assertion_handler: Box::new(assertion_handler),
assertion_fork_base: usize::MAX,
- submatch_count: 0,
+ submatches: Vec::new(),
}
}
@@ -523,9 +531,9 @@ impl<'a, F: Flavor> Compiler<'a, F> {
self.instrs.push(ins);
}
Pattern::Nothing => {}
- Pattern::Submatch(pat) => {
- let i = self.submatch_count as u32 * 2;
- self.submatch_count += 1;
+ Pattern::Submatch(match_name, pat) => {
+ let i = self.submatches.len() as u32 * 2;
+ self.submatches.push(match_name);
if let Some(ins) = F::save(i) {
self.instrs.push(Instr::Custom(ins));
}
@@ -605,7 +613,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
fn try_from(value: Pattern) -> Result<Self, Self::Error> {
let mut neg = assertion_compiler();
let mut pos = assertion_compiler();
- let (final_state, instrs, submatch_count) = {
+ let (final_state, instrs, submatches) = {
let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| {
let target = match dir {
LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()),
@@ -622,7 +630,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
main.compile(value)?;
let end = main.instrs.len();
main.instrs.push(Instr::Class(Class::Nothing));
- (end, main.instrs, main.submatch_count)
+ (end, main.instrs, main.submatches)
};
neg.finalize_assertion_forks();
pos.finalize_assertion_forks();
@@ -636,7 +644,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
instrs1: instrs.into(),
instrs2: pos.instrs.into(),
accepting,
- submatch_count,
+ submatches,
})
}
}
@@ -675,94 +683,78 @@ mod tests {
#[test]
fn nongreedy_star() {
- let re = regex("(ab*?)bb*");
+ let re = regex("(?<x>ab*?)bb*");
assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbb").unwrap().submatches["x".as_bytes()],
0..1
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()],
0..1
);
}
#[test]
fn greedy_star() {
- let re = regex("(ab*)bb*");
+ let re = regex("(?<x>ab*)bb*");
assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbb").unwrap().submatches["x".as_bytes()],
0..3
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()],
0..5
);
}
#[test]
fn nongreedy_plus() {
- let re = regex("(ab+?)bb*");
+ let re = regex("(?<x>ab+?)bb*");
assert_eq!(
- re.re_match(b"abbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()],
0..2
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()],
0..2
);
}
#[test]
fn greedy_plus() {
- let re = regex("(ab+)bb*");
+ let re = regex("(?<x>ab+)bb*");
assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbb").unwrap().submatches["x".as_bytes()],
0..3
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()],
0..5
);
}
#[test]
fn nongreedy_qm() {
- let re = regex("(ab??)bb*");
+ let re = regex("(?<x>ab??)bb*");
assert_eq!(
- re.re_match(b"abbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbb").unwrap().submatches["x".as_bytes()],
0..1
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()],
0..1
);
}
#[test]
fn greedy_qm() {
- let re = regex("(ab?)bb*");
+ let re = regex("(?<x>ab?)bb*");
assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
+ re.re_match(b"abbb").unwrap().submatches["x".as_bytes()],
0..2
);
assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
+ re.re_match(b"abbbbb").unwrap().submatches["x".as_bytes()],
0..2
);
}
diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs
index ea3bc93..2947f58 100644
--- a/src/regex/enfa.rs
+++ b/src/regex/enfa.rs
@@ -736,7 +736,7 @@ impl TryFrom<Pattern> for ENFA {
states.push(EState::terminal());
Self { states, has_submatches: false, }
}
- Pattern::Submatch(pat) => {
+ Pattern::Submatch(_, pat) => {
let mut this = Self::try_from(*pat)?;
this.has_submatches = true;
this
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
index cdbfe0d..c42c0ad 100644
--- a/src/regex/mod.rs
+++ b/src/regex/mod.rs
@@ -1,3 +1,7 @@
+use std::collections::HashMap;
+
+use crate::BString;
+
pub mod bc;
mod byte_range;
pub mod dfa;
@@ -56,7 +60,7 @@ pub enum Pattern {
Concat(Vec<Pattern>),
Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
Assertion(LookDirection, LookPolarity, Box<Pattern>),
- Submatch(Box<Pattern>),
+ Submatch(BString, Box<Pattern>),
Nothing,
}
@@ -140,7 +144,7 @@ impl Pattern {
Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded,
Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
Pattern::Nothing => ByteConsumption::zero(),
- Pattern::Submatch(pat) => pat.max_byte_consumption(),
+ Pattern::Submatch(_, pat) => pat.max_byte_consumption(),
}
}
@@ -152,7 +156,7 @@ impl Pattern {
Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy),
Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
- Submatch(pat) => Submatch(Box::new(pat.reverse())),
+ Submatch(name, pat) => Submatch(name, Box::new(pat.reverse())),
}
}
@@ -196,7 +200,7 @@ impl Pattern {
Rep(pat, 1, Some(1), _) => pat.simplify(),
Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed),
Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())),
- Submatch(pattern) => Submatch(Box::new(pattern.simplify())),
+ Submatch(name, pattern) => Submatch(name, Box::new(pattern.simplify())),
CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self,
}
}
@@ -231,7 +235,7 @@ impl CompiledPattern {
#[derive(PartialEq, Eq, Debug)]
pub struct Match {
- pub submatches: Box<[Option<core::ops::Range<usize>>]>,
+ pub submatches: HashMap<BString, core::ops::Range<usize>>,
}
impl Match {
@@ -367,7 +371,7 @@ mod tests {
#[test]
fn match_is_bytecode() {
- let x = Pattern::parse_from_bytes(b".*(ele.*phant).*")
+ let x = Pattern::parse_from_bytes(b".*(?<match_name>ele.*phant).*")
.unwrap()
.try_compile()
.unwrap();
diff --git a/src/regex/simple.rs b/src/regex/simple.rs
index 4286322..3f0bf30 100644
--- a/src/regex/simple.rs
+++ b/src/regex/simple.rs
@@ -69,7 +69,7 @@ impl RegexEngine for Nothing {
Pattern::Rep(_, x, Some(y), _) if y < x => Ok(Nothing),
Pattern::Rep(_, 0, None, _) => Err(NotASimpleNothing),
Pattern::Rep(pat, _gt_0, _, _) => Self::compile(*pat),
- Pattern::Submatch(pat) => Self::compile(*pat),
+ Pattern::Submatch(_, pat) => Self::compile(*pat),
_ => Err(NotASimpleNothing),
}
}
@@ -109,7 +109,7 @@ fn ce(pat: Pattern) -> Option<Vec<u8>> {
None
}
}
- Pattern::Submatch(_) => None, // TODO: submatches could be stored as constant offsets
+ Pattern::Submatch(_, _) => None, // TODO: submatches could be stored as constant offsets
Pattern::Nothing => Some(Vec::new()),
_ => None,
}