aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse/regex')
-rw-r--r--src/parse/regex/bc.rs45
-rw-r--r--src/parse/regex/enfa.rs4
-rw-r--r--src/parse/regex/mod.rs5
3 files changed, 45 insertions, 9 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs
index 6b3ca1b..571386e 100644
--- a/src/parse/regex/bc.rs
+++ b/src/parse/regex/bc.rs
@@ -19,13 +19,15 @@ trait Flavor: Clone {
instr: Self::CustomInstr,
sd: &mut Self::StepData<'a, 'b>,
) -> bool;
+
+ fn save(x: u32) -> Option<Self::CustomInstr>;
}
#[derive(Copy, Clone, Debug)]
struct MainFlavor;
impl Flavor for MainFlavor {
type CustomInstr = MainInstr;
- type ThreadData = Box<[usize]>;
+ type ThreadData = Box<[Option<usize>]>;
type StepData<'a, 'b>
= (usize, &'a BitSet, &'a mut LookaheadVM<'b>)
where
@@ -38,7 +40,7 @@ impl Flavor for MainFlavor {
) -> bool {
match instr {
MainInstr::Save(reg) => {
- thread.data[reg as usize] = data.0;
+ thread.data[reg as usize] = Some(data.0);
true
}
MainInstr::Join(assertion) => {
@@ -52,6 +54,10 @@ impl Flavor for MainFlavor {
}
}
}
+
+ fn save(x: u32) -> Option<Self::CustomInstr> {
+ Some(MainInstr::Save(x))
+ }
}
#[derive(Copy, Clone, Debug)]
@@ -70,6 +76,10 @@ impl Flavor for AssertionFlavor {
fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool {
match instr {}
}
+
+ fn save(_: u32) -> Option<Self::CustomInstr> {
+ None
+ }
}
type JumpTarget = u32;
@@ -265,8 +275,12 @@ impl<'a> VirtualMachine<'a> {
.threads
.iter()
.filter(|t| self.accepting.get(t.pc as usize))
- .map(|t| Match {
- registers: t.data.clone(),
+ .map(|t| {
+ let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect();
+
+ Match {
+ submatches: submatches.into(),
+ }
})
.next()
}
@@ -288,6 +302,7 @@ pub struct BytecodeCompiledRegex {
instrs0: Box<[Instr<AssertionFlavor>]>,
instrs1: Box<[Instr<MainFlavor>]>,
instrs2: Box<[Instr<AssertionFlavor>]>,
+ submatch_count: usize,
accepting: BitSet,
}
@@ -307,7 +322,7 @@ impl BytecodeCompiledRegex {
&self.instrs1,
Thread {
pc: 0,
- data: Vec::new().into(), // TODO: submatches
+ data: vec![None; 2 * self.submatch_count].into(),
},
);
let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () });
@@ -331,7 +346,7 @@ impl BytecodeCompiledRegex {
}
pub struct Match {
- pub registers: Box<[usize]>,
+ pub submatches: Box<[Option<core::ops::Range<usize>>]>,
}
type AssertionHandler<'a, F> =
@@ -348,6 +363,7 @@ struct Compiler<'a, F: Flavor> {
map: HashMap<Pattern, CompiledSnippet>,
assertion_handler: AssertionHandler<'a, F>,
assertion_fork_base: usize,
+ submatch_count: usize,
}
fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> {
@@ -369,6 +385,7 @@ impl<'a, F: Flavor> Compiler<'a, F> {
map: HashMap::new(),
assertion_handler: Box::new(assertion_handler),
assertion_fork_base: usize::MAX,
+ submatch_count: 0,
}
}
@@ -477,6 +494,17 @@ impl<'a, F: Flavor> Compiler<'a, F> {
self.instrs.push(ins);
}
Pattern::Nothing => {}
+ Pattern::Submatch(pat) => {
+ let i = self.submatch_count as u32 * 2;
+ self.submatch_count += 1;
+ if let Some(ins) = F::save(i) {
+ self.instrs.push(Instr::Custom(ins));
+ }
+ self.compile(*pat)?;
+ if let Some(ins) = F::save(i + 1) {
+ self.instrs.push(Instr::Custom(ins));
+ }
+ }
}
Ok(())
}
@@ -548,7 +576,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
fn try_from(value: Pattern) -> Result<Self, Self::Error> {
let mut neg = assertion_compiler();
let mut pos = assertion_compiler();
- let (final_state, instrs) = {
+ let (final_state, instrs, submatch_count) = {
let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| {
let target = match dir {
LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()),
@@ -565,7 +593,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
main.compile(value)?;
let end = main.instrs.len();
main.instrs.push(Instr::Class(CharacterClass::Nothing));
- (end, main.instrs)
+ (end, main.instrs, main.submatch_count)
};
neg.finalize_assertion_forks();
pos.finalize_assertion_forks();
@@ -578,6 +606,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
instrs1: instrs.into(),
instrs2: pos.instrs.into(),
accepting,
+ submatch_count,
})
}
}
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs
index ff742e0..dd4cf6e 100644
--- a/src/parse/regex/enfa.rs
+++ b/src/parse/regex/enfa.rs
@@ -724,6 +724,10 @@ impl TryFrom<Pattern> for ENFA {
states.push(EState::terminal());
Self { states }
}
+ Pattern::Submatch(pat) => {
+ // NFA does not track submatches, so we are skipping it.
+ Self::try_from(*pat)?
+ }
})
}
}
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
index fed792c..486b0d9 100644
--- a/src/parse/regex/mod.rs
+++ b/src/parse/regex/mod.rs
@@ -58,6 +58,7 @@ pub enum Pattern {
Concat(Vec<Pattern>),
Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
Assertion(LookDirection, LookPolarity, Box<Pattern>),
+ Submatch(Box<Pattern>),
Nothing,
}
@@ -141,6 +142,7 @@ impl Pattern {
Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded,
Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
Pattern::Nothing => ByteConsumption::zero(),
+ Pattern::Submatch(pat) => pat.max_byte_consumption(),
}
}
@@ -152,6 +154,7 @@ impl Pattern {
Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy),
Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
+ Submatch(pat) => Submatch(Box::new(pat.reverse()))
}
}
}
@@ -342,7 +345,7 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
if let Some((dir, pol)) = assertion {
Ok(Pattern::Assertion(dir, pol, Box::new(inner)))
} else {
- Ok(inner)
+ Ok(Pattern::Submatch(Box::new(inner)))
}
}
b'.' => {