From 53980774c327675e886179c0a2c140744dcf9b95 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Sat, 6 Jun 2026 12:15:52 +0200 Subject: special cased regex for performance --- src/parse/regex/mod.rs | 432 ------------------------------------------------- 1 file changed, 432 deletions(-) delete mode 100644 src/parse/regex/mod.rs (limited to 'src/parse/regex/mod.rs') diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs deleted file mode 100644 index 486b0d9..0000000 --- a/src/parse/regex/mod.rs +++ /dev/null @@ -1,432 +0,0 @@ -use crate::parse::OtherHighlights; - -use super::{Parse, ParseError, Result}; - -pub mod bc; -mod byte_range; -pub mod dfa; -pub mod enfa; - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum LookDirection { - Ahead, - Behind, -} - -impl LookDirection { - pub fn reverse(self) -> Self { - match self { - Self::Ahead => Self::Behind, - Self::Behind => Self::Ahead, - } - } -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum LookPolarity { - Positive, - Negative, -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum CharacterClass { - Everything, - Nothing, - Whitespace, - Alphabetic, - Alphanumeric, -} - -impl CharacterClass { - pub fn matches(self, byte: u8) -> bool { - match self { - CharacterClass::Everything => true, - CharacterClass::Nothing => false, - CharacterClass::Whitespace => byte.is_ascii_whitespace(), - CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), - CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), - } - } -} - -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum Pattern { - Byte(u8), - Range(u8, u8), - CharacterClass(CharacterClass), - Alt(Vec), - Concat(Vec), - Rep(Box, u32, Option, GreedyBehavior), - Assertion(LookDirection, LookPolarity, Box), - Submatch(Box), - Nothing, -} - -#[derive(Debug, PartialEq, Eq, Copy, Clone)] -pub enum ByteConsumption { - Bounded(usize), - Unbounded, -} - -impl ByteConsumption { - pub fn zero() -> Self { - Self::Bounded(0) - } - pub fn one() -> Self { - Self::Bounded(1) - } -} - -impl Ord for ByteConsumption { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match (self, other) { - (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b), - (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, - (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, - (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, - } - } -} -impl PartialOrd for ByteConsumption { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl std::ops::Add for ByteConsumption { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - if let Self::Bounded(a) = self - && let Self::Bounded(b) = rhs - { - Self::Bounded(a + b) - } else { - Self::Unbounded - } - } -} - -impl std::ops::Mul for ByteConsumption { - type Output = Self; - - fn mul(self, rhs: usize) -> Self::Output { - match self { - Self::Bounded(x) => Self::Bounded(x * rhs), - Self::Unbounded => Self::Unbounded, - } - } -} - -impl std::iter::Sum for ByteConsumption { - fn sum>(iter: I) -> Self { - iter.fold(Self::zero(), |a, b| a + b) - } -} - -impl Pattern { - pub fn max_byte_consumption(&self) -> ByteConsumption { - match self { - Pattern::Byte(_) => ByteConsumption::one(), - Pattern::Range(_, _) => ByteConsumption::one(), - Pattern::CharacterClass(_) => ByteConsumption::one(), - Pattern::Alt(patterns) => patterns - .iter() - .map(Self::max_byte_consumption) - .max() - .unwrap_or(ByteConsumption::zero()), - Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), - Pattern::Rep(pattern, _, Some(max_reps), _) => { - pattern.max_byte_consumption() * (*max_reps as usize) - } - Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, - Pattern::Assertion(_, _, _) => ByteConsumption::zero(), - Pattern::Nothing => ByteConsumption::zero(), - Pattern::Submatch(pat) => pat.max_byte_consumption(), - } - } - - pub fn reverse(self) -> Self { - use Pattern::*; - match self { - Byte(_) | Nothing | Range(..) | CharacterClass(_) => self, - Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), - Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), - Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), - Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), - Submatch(pat) => Submatch(Box::new(pat.reverse())) - } - } -} - -impl Parse for Pattern { - fn parse(b: &mut super::Cursor<'_>) -> super::Result { - let begin = b.loc(); - let result = parse0(b); - if result.is_ok() { - b.highlight_from(begin, OtherHighlights::Regex); - } - result - } -} - -fn parse0(s: &mut super::Cursor<'_>) -> Result { - parse_alt(s) -} - -fn parse_alt(s: &mut super::Cursor<'_>) -> Result { - let mut seqs = vec![]; - loop { - let seq = parse_seq(s)?; - seqs.push(seq); - let begin = s.loc(); - if s.has() && s.peek() == b'|' { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - } else { - break; - } - } - - Ok(match seqs.len() { - 0 => Pattern::Nothing, - 1 => seqs.into_iter().next().unwrap(), - _ => Pattern::Alt(seqs), - }) -} - -fn parse_seq(s: &mut super::Cursor<'_>) -> Result { - let mut reps = vec![]; - loop { - let rep = parse_rep(s)?; - if rep != Pattern::Nothing { - reps.push(rep); - } else { - break; - } - } - - Ok(match reps.len() { - 0 => Pattern::Nothing, - 1 => reps.into_iter().next().unwrap(), - _ => Pattern::Concat(reps), - }) -} - -fn parse_rep(s: &mut super::Cursor<'_>) -> Result { - let atom = parse_atom(s)?; - - if atom == Pattern::Nothing { - return Ok(atom); - } - - if !s.has() { - return Ok(atom); - } - - let begin = s.loc(); - - let rep = match s.peek() { - b'*' => Some((0, None)), - b'+' => Some((1, None)), - b'?' => Some((0, Some(1))), - _ => None, - }; - - if let Some((min_rep, max_rep)) = rep { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - let greed = if s.has() && s.peek() == b'?' { - s.adv(); - GreedyBehavior::NonGreedy - } else { - GreedyBehavior::Greedy - }; - - Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed)) - } else { - Ok(atom) - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub enum GreedyBehavior { - Greedy, - NonGreedy, -} - -const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; -fn is_symbol(x: u8) -> bool { - SYMBOLS.contains(&x) -} - -fn parse_atom(s: &mut super::Cursor<'_>) -> Result { - if !s.has() { - return Ok(Pattern::Nothing); - } - - let begin = s.loc(); - - match s.peek() { - b'[' => { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - let mut ranges = Vec::new(); - loop { - if !s.has() { - return Err(ParseError::Eof); - } - - let begin = s.loc(); - let tok = s.adv(); - - if tok == b']' { - if ranges.is_empty() { - todo!("error handling for empty alternative list"); - } - s.highlight_from(begin, OtherHighlights::RegexSymbol); - return Ok(Pattern::Alt(ranges)); - } - - let begin = s.loc(); - if s.has() && s.peek() == b'-' { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if !s.has() { - return Err(ParseError::Eof); - } - let tok2 = s.peek(); - - if tok2 == b']' { - ranges.push(Pattern::Byte(tok)); - ranges.push(Pattern::Byte(b'-')); - } else if is_symbol(tok2) { - return Err(ParseError::Unknown(tok2)); - } else { - s.adv(); - ranges.push(Pattern::Range(tok, tok2)); - } - } else { - ranges.push(Pattern::Byte(tok)); - } - } - } - b'(' => { - s.adv(); - - let mut assertion = None; - if s.buf.starts_with(b"?=") { - s.advance(2); - assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); - } else if s.buf.starts_with(b"?!") { - s.advance(2); - assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); - } else if s.buf.starts_with(b"?<=") { - s.advance(3); - assertion = Some((LookDirection::Behind, LookPolarity::Positive)); - } else if s.buf.starts_with(b"? { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - Ok(Pattern::Range(0, 127)) - } - b'\\' => { - s.adv(); - if s.has() { - let escaped = s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if is_symbol(escaped) { - Ok(Pattern::Byte(escaped)) - } else { - // TODO interpret \w and others - Err(ParseError::Unknown(escaped)) - } - } else { - Err(ParseError::Eof) - } - } - x if is_symbol(x) => Ok(Pattern::Nothing), - ch => { - s.adv(); - Ok(Pattern::Byte(ch)) - } - } -} - -pub struct CompiledPattern { - dfa: dfa::DFA, -} - -impl std::fmt::Debug for CompiledPattern { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.dfa.fmt(f) - } -} - -#[derive(Debug)] -pub enum CompilationError { - Enfa(enfa::EnfaTranslationError), -} - -impl From for CompilationError { - fn from(value: enfa::EnfaTranslationError) -> Self { - Self::Enfa(value) - } -} - -impl Pattern { - pub fn try_compile(self) -> std::result::Result { - let enfa = enfa::ENFA::try_from(self)?; - let dfa = dfa::DFA::from(enfa); - Ok(CompiledPattern { dfa }) - } -} - -impl CompiledPattern { - pub fn matches(&self, bytes: impl AsRef<[u8]>) -> bool { - self.dfa.matches(bytes.as_ref()) - } -} - -#[cfg(test)] -macro_rules! regex_matches { - ($regex:literal, $match:literal, $true:literal) => { - assert_eq!( - Pattern::parse_from_bytes($regex.as_bytes()) - .unwrap() - .try_compile() - .unwrap() - .matches($match.as_bytes()), - $true - ) - }; -} - -#[test] -fn foo_matches_foo() { - regex_matches!("foo", "foo", true); -} -- cgit v1.2.3