From 1a138779293823177613238591768077d781de05 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Tue, 2 Jun 2026 12:39:41 +0200 Subject: regex: parse lookahead and lookbehind --- src/parse/regex/enfa.rs | 34 ++++++++++++++++++++++--------- src/parse/regex/mod.rs | 53 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 13 deletions(-) (limited to 'src/parse/regex') diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index 3809595..b3d3c4a 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -328,10 +328,17 @@ impl EState { } } -impl From for ENFA { - fn from(value: Pattern) -> Self { - match value { - Pattern::Byte(c) => Self::from(Pattern::Range(c, c)), +#[derive(Debug)] +pub enum EnfaTranslationError { + AssertionsNotSupported, +} + +impl TryFrom for ENFA { + type Error = EnfaTranslationError; + + fn try_from(value: Pattern) -> Result { + Ok(match value { + Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?, Pattern::Range(c1, c2) => Self { states: vec![ EState { @@ -343,7 +350,10 @@ impl From for ENFA { ], }, Pattern::Alt(alts) => { - let nfas: Vec = alts.into_iter().map(ENFA::from).collect(); + let nfas: Vec = alts + .into_iter() + .map(Self::try_from) + .collect::>()?; let mut states = vec![EState::start()]; let mut ends = vec![]; for nfa in nfas.into_iter() { @@ -360,18 +370,21 @@ impl From for ENFA { Self { states } } Pattern::Concat(seq) => { - let nfas: Vec = seq.into_iter().map(ENFA::from).collect(); + let nfas: Vec = seq + .into_iter() + .map(Self::try_from) + .collect::>()?; Self::concat(nfas) } Pattern::Rep(regex, min, None) => { - let nfa = ENFA::from(*regex); + let nfa = ENFA::try_from(*regex)?; let base = nfa.clone().repeat(min as usize); let tail = nfa.looping(); Self::concat(vec![base, tail]) } Pattern::Rep(regex, min, Some(max)) => { assert!(min < max); - let nfa = Self::from(*regex); + let nfa = Self::try_from(*regex)?; let base = nfa.clone().repeat(min as usize); let tail = nfa.optx((max - min) as usize); Self::concat(vec![base, tail]) @@ -379,6 +392,9 @@ impl From for ENFA { Pattern::Nothing => Self { states: vec![EState::terminal()], }, - } + Pattern::Assertion(..) => { + return Err(EnfaTranslationError::AssertionsNotSupported); + } + }) } } diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs index 51527ae..10d332e 100644 --- a/src/parse/regex/mod.rs +++ b/src/parse/regex/mod.rs @@ -6,6 +6,18 @@ mod byte_range; mod dfa; mod enfa; +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +pub enum LookDirection { + Ahead, + Behind, +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +pub enum LookPolarity { + Positive, + Negative, +} + #[derive(PartialEq, Debug, Clone)] pub enum Pattern { Byte(u8), @@ -13,6 +25,7 @@ pub enum Pattern { Alt(Vec), Concat(Vec), Rep(Box, u32, Option), + Assertion(LookDirection, LookPolarity, Box), Nothing, } @@ -166,6 +179,22 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result { } b'(' => { s.adv(); + + let mut assertion = None; + if s.buf.starts_with(b"?=") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?!") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); + } else if s.buf.starts_with(b"?<=") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?) -> Result { return Err(ParseError::Expected(')')); } s.highlight_from(begin, OtherHighlights::RegexSymbol); - Ok(inner) + + if let Some((dir, pol)) = assertion { + Ok(Pattern::Assertion(dir, pol, Box::new(inner))) + } else { + Ok(inner) + } } b'.' => { s.adv(); @@ -217,11 +251,22 @@ impl std::fmt::Debug for CompiledPattern { } } +#[derive(Debug)] +pub enum CompilationError { + Enfa(enfa::EnfaTranslationError), +} + +impl From for CompilationError { + fn from(value: enfa::EnfaTranslationError) -> Self { + Self::Enfa(value) + } +} + impl Pattern { - pub fn compile(self) -> CompiledPattern { - let enfa = enfa::ENFA::from(self); + pub fn try_compile(self) -> std::result::Result { + let enfa = enfa::ENFA::try_from(self)?; let dfa = dfa::DFA::from(enfa); - CompiledPattern { dfa } + Ok(CompiledPattern { dfa }) } } -- cgit v1.2.3