From 53980774c327675e886179c0a2c140744dcf9b95 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Sat, 6 Jun 2026 12:15:52 +0200 Subject: special cased regex for performance --- src/regex/simple.rs | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 src/regex/simple.rs (limited to 'src/regex/simple.rs') diff --git a/src/regex/simple.rs b/src/regex/simple.rs new file mode 100644 index 0000000..00bc9b4 --- /dev/null +++ b/src/regex/simple.rs @@ -0,0 +1,125 @@ +use crate::regex::CharacterClass; + +use super::{Match, Pattern, RegexEngine}; + +fn empty_match() -> Option { + Some(Match { + submatches: [].into(), + }) +} + +pub struct Anything; + +#[derive(Debug, Clone)] +pub struct NotASimpleWildcard; + +impl RegexEngine for Anything { + type CompileError = NotASimpleWildcard; + + fn compile(pat: Pattern) -> Result { + if let Pattern::Rep(pat, 0, None, _) = pat + && let Pattern::CharacterClass(CharacterClass::Everything) = *pat + { + Ok(Anything) + } else { + Err(NotASimpleWildcard) + } + } + + fn run(&self, _input: &[u8]) -> Option { + empty_match() + } +} + +pub struct Nothing; +#[derive(Debug, Clone)] +pub struct NotASimpleNothing; + +impl RegexEngine for Nothing { + type CompileError = NotASimpleNothing; + + fn compile(pat: Pattern) -> Result { + match pat { + Pattern::Range(a, b) if a > b => Ok(Nothing), + Pattern::CharacterClass(CharacterClass::Nothing) => Ok(Nothing), + Pattern::Alt(pats) => { + let all_impossible = pats.into_iter().map(Self::compile).all(|p| p.is_ok()); + if all_impossible { + Ok(Nothing) + } else { + Err(NotASimpleNothing) + } + } + Pattern::Concat(pats) => { + if let Some(pat) = pats.into_iter().next() { + Self::compile(pat) + } else { + Err(NotASimpleNothing) + } + } + Pattern::Rep(_, x, Some(y), _) if y < x => Ok(Nothing), + Pattern::Rep(_, 0, None, _) => Err(NotASimpleNothing), + Pattern::Rep(pat, _gt_0, _, _) => Self::compile(*pat), + Pattern::Submatch(pat) => Self::compile(*pat), + _ => Err(NotASimpleNothing), + } + } + + fn run(&self, _input: &[u8]) -> Option { + None + } +} + +pub struct Exact { + bytes: Vec, +} + +const MEM_LIMIT: usize = 25_000; + +#[derive(Debug, Clone)] +pub struct NotSimplyAString; + +fn ce(pat: Pattern) -> Option> { + match pat { + Pattern::Byte(b) => Some(vec![b]), + Pattern::Concat(patterns) => { + let mut pats = patterns.into_iter().map(ce).collect::>>()?; + let mut out = Vec::new(); + for p in pats.iter_mut() { + out.append(p); + } + Some(out) + } + Pattern::Rep(pat, min, Some(max), _) if min == max => { + if let Some(bytes) = ce(*pat) + && bytes.len() * (min as usize) < MEM_LIMIT + { + Some(bytes.repeat(min as usize)) + } else { + None + } + } + Pattern::Submatch(_) => None, // TODO: submatches could be stored as constant offsets + Pattern::Nothing => Some(Vec::new()), + _ => None, + } +} + +impl RegexEngine for Exact { + type CompileError = NotSimplyAString; + + fn compile(pat: Pattern) -> Result { + match ce(pat) { + Some(bytes) => Ok(Self { bytes }), + None => Err(NotSimplyAString), + } + } + + fn run(&self, input: &[u8]) -> Option { + if input == self.bytes { + empty_match() + } else { + None + } + } +} -- cgit v1.2.3