From a2e9e8647daa2622cf55a047c329027fcfc49bf8 Mon Sep 17 00:00:00 2001 From: Jonas Maier Date: Sat, 6 Jun 2026 12:34:22 +0200 Subject: some small opt --- src/regex/mod.rs | 161 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 57 deletions(-) (limited to 'src/regex/mod.rs') diff --git a/src/regex/mod.rs b/src/regex/mod.rs index be3026f..2c9f3d1 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,5 +1,3 @@ -use crate::parse::Parse; - pub mod bc; mod byte_range; pub mod dfa; @@ -28,7 +26,7 @@ pub enum LookPolarity { } #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum CharacterClass { +pub enum Class { Everything, Nothing, Whitespace, @@ -36,14 +34,14 @@ pub enum CharacterClass { Alphanumeric, } -impl CharacterClass { +impl Class { pub fn matches(self, byte: u8) -> bool { match self { - CharacterClass::Everything => true, - CharacterClass::Nothing => false, - CharacterClass::Whitespace => byte.is_ascii_whitespace(), - CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), - CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), + Class::Everything => true, + Class::Nothing => false, + Class::Whitespace => byte.is_ascii_whitespace(), + Class::Alphabetic => byte.is_ascii_alphabetic(), + Class::Alphanumeric => byte.is_ascii_alphanumeric(), } } } @@ -52,7 +50,7 @@ impl CharacterClass { pub enum Pattern { Byte(u8), Range(u8, u8), - CharacterClass(CharacterClass), + CharacterClass(Class), Alt(Vec), Concat(Vec), Rep(Box, u32, Option, GreedyBehavior), @@ -156,6 +154,51 @@ impl Pattern { Submatch(pat) => Submatch(Box::new(pat.reverse())), } } + + pub fn simplify(self) -> Self { + use Pattern::*; + + match self { + Range(a, b) if a > b => CharacterClass(Class::Nothing), + Range(a, b) if a == b => Byte(a), + Alt(pats) => { + let pats: Vec<_> = pats + .into_iter() + .map(Pattern::simplify) + .filter(|p| !matches!(p, CharacterClass(Class::Nothing))) + .collect(); + match pats.len() { + 0 => CharacterClass(Class::Nothing), + 1 => pats.into_iter().next().unwrap(), + _ => Alt(pats) + } + } + Concat(pats) => { + let mut out = Vec::new(); + for pat in pats { + let pat = pat.simplify(); + match pat { + Concat(mut pats) => out.append(&mut pats), + Nothing => (), + CharacterClass(Class::Nothing) => return CharacterClass(Class::Nothing), + x => out.push(x), + } + } + match out.len() { + 0 => Nothing, + 1 => out.into_iter().next().unwrap(), + _ => Concat(out), + } + } + Rep(_, 0, Some(0), _) => Nothing, + Rep(_, min, Some(max), _) if min > max => CharacterClass(Class::Nothing), + Rep(pat, 1, Some(1), _) => pat.simplify(), + Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed), + Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())), + Submatch(pattern) => Submatch(Box::new(pattern.simplify())), + CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self, + } + } } #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] @@ -172,6 +215,7 @@ pub enum CompiledPattern { Exact(simple::Exact), } +#[allow(unused)] impl CompiledPattern { fn is_dfa(&self) -> bool { matches!(self, Self::Dfa(_)) @@ -289,56 +333,59 @@ impl Pattern { } #[cfg(test)] -macro_rules! regex_matches { - ($regex:literal, $match:literal, $true:literal) => { - assert_eq!( - ::parse_from_bytes($regex.as_bytes()) - .unwrap() - .try_compile() - .unwrap() - .matches($match.as_bytes()), - $true - ) - }; -} +mod tests { + use super::*; + use crate::parse::Parse; + + macro_rules! regex_matches { + ($regex:literal, $match:literal, $true:literal) => { + assert_eq!( + ::parse_from_bytes($regex.as_bytes()) + .unwrap() + .try_compile() + .unwrap() + .matches($match.as_bytes()), + $true + ) + }; + } -#[test] -fn foo_matches_foo() { - regex_matches!("foo", "foo", true); -} + #[test] + fn foo_matches_foo() { + regex_matches!("foo", "foo", true); + } -#[test] -fn dot_star_is_simple() { - let x = Pattern::parse_from_bytes(b".*") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_simple()); -} + #[test] + fn dot_star_is_simple() { + let x = Pattern::parse_from_bytes(b".*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_simple()); + } -#[test] -fn match_is_bytecode() { - let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_bytecode()); -} + #[test] + fn match_is_bytecode() { + let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_bytecode()); + } -#[test] -fn simple_word_is_exact() { - let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_simple()); -} + #[test] + fn simple_word_is_exact() { + let parsed = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux").unwrap(); + let compiled = parsed.clone().try_compile().unwrap(); + assert!(compiled.is_simple(), "{parsed:?}"); + } -#[test] -fn no_match_is_dfa() { - let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_dfa()); + #[test] + fn no_match_is_dfa() { + let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_dfa()); + } } -- cgit v1.2.3