pub mod bc; mod byte_range; pub mod dfa; pub mod enfa; pub mod simple; mod decision_tree; #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookDirection { Ahead, Behind, } impl LookDirection { pub fn reverse(self) -> Self { match self { Self::Ahead => Self::Behind, Self::Behind => Self::Ahead, } } } #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookPolarity { Positive, Negative, } #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum Class { Everything, Nothing, Whitespace, Alphabetic, Alphanumeric, } impl Class { pub fn matches(self, byte: u8) -> bool { match self { Class::Everything => true, Class::Nothing => false, Class::Whitespace => byte.is_ascii_whitespace(), Class::Alphabetic => byte.is_ascii_alphabetic(), Class::Alphanumeric => byte.is_ascii_alphanumeric(), } } } #[derive(PartialEq, Eq, Hash, Debug, Clone)] pub enum Pattern { Byte(u8), Range(u8, u8), CharacterClass(Class), Alt(Vec), Concat(Vec), Rep(Box, u32, Option, GreedyBehavior), Assertion(LookDirection, LookPolarity, Box), Submatch(Box), Nothing, } #[derive(Debug, PartialEq, Eq, Copy, Clone)] pub enum ByteConsumption { Bounded(usize), Unbounded, } impl ByteConsumption { pub fn zero() -> Self { Self::Bounded(0) } pub fn one() -> Self { Self::Bounded(1) } } impl Ord for ByteConsumption { fn cmp(&self, other: &Self) -> std::cmp::Ordering { match (self, other) { (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b), (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, } } } impl PartialOrd for ByteConsumption { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl std::ops::Add for ByteConsumption { type Output = Self; fn add(self, rhs: Self) -> Self::Output { if let Self::Bounded(a) = self && let Self::Bounded(b) = rhs { Self::Bounded(a + b) } else { Self::Unbounded } } } impl std::ops::Mul for ByteConsumption { type Output = Self; fn mul(self, rhs: usize) -> Self::Output { match self { Self::Bounded(x) => Self::Bounded(x * rhs), Self::Unbounded => Self::Unbounded, } } } impl std::iter::Sum for ByteConsumption { fn sum>(iter: I) -> Self { iter.fold(Self::zero(), |a, b| a + b) } } impl Pattern { pub fn max_byte_consumption(&self) -> ByteConsumption { match self { Pattern::Byte(_) => ByteConsumption::one(), Pattern::Range(_, _) => ByteConsumption::one(), Pattern::CharacterClass(_) => ByteConsumption::one(), Pattern::Alt(patterns) => patterns .iter() .map(Self::max_byte_consumption) .max() .unwrap_or(ByteConsumption::zero()), Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), Pattern::Rep(pattern, _, Some(max_reps), _) => { pattern.max_byte_consumption() * (*max_reps as usize) } Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, Pattern::Assertion(_, _, _) => ByteConsumption::zero(), Pattern::Nothing => ByteConsumption::zero(), Pattern::Submatch(pat) => pat.max_byte_consumption(), } } pub fn reverse(self) -> Self { use Pattern::*; match self { Byte(_) | Nothing | Range(..) | CharacterClass(_) => self, Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), Submatch(pat) => Submatch(Box::new(pat.reverse())), } } pub fn simplify(self) -> Self { use Pattern::*; match self { Range(a, b) if a > b => CharacterClass(Class::Nothing), Range(a, b) if a == b => Byte(a), Alt(pats) => { let pats: Vec<_> = pats .into_iter() .map(Pattern::simplify) .filter(|p| !matches!(p, CharacterClass(Class::Nothing))) .collect(); match pats.len() { 0 => CharacterClass(Class::Nothing), 1 => pats.into_iter().next().unwrap(), _ => Alt(pats) } } Concat(pats) => { let mut out = Vec::new(); for pat in pats { let pat = pat.simplify(); match pat { Concat(mut pats) => out.append(&mut pats), Nothing => (), CharacterClass(Class::Nothing) => return CharacterClass(Class::Nothing), x => out.push(x), } } match out.len() { 0 => Nothing, 1 => out.into_iter().next().unwrap(), _ => Concat(out), } } Rep(_, 0, Some(0), _) => Nothing, Rep(_, min, Some(max), _) if min > max => CharacterClass(Class::Nothing), Rep(pat, 1, Some(1), _) => pat.simplify(), Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed), Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())), Submatch(pattern) => Submatch(Box::new(pattern.simplify())), CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self, } } } #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub enum GreedyBehavior { Greedy, NonGreedy, } pub enum CompiledPattern { Dfa(dfa::DFA), Bytecode(bc::BytecodeCompiledRegex), Anything(simple::Anything), Nothing(simple::Nothing), Exact(simple::Exact), } #[allow(unused)] impl CompiledPattern { fn is_dfa(&self) -> bool { matches!(self, Self::Dfa(_)) } fn is_bytecode(&self) -> bool { matches!(self, Self::Bytecode(_)) } fn is_simple(&self) -> bool { matches!(self, Self::Anything(_) | Self::Nothing(_) | Self::Exact(_)) } } #[derive(PartialEq, Eq, Debug)] pub struct Match { pub submatches: Box<[Option>]>, } impl Match { pub fn new_empty() -> Self { Self { submatches: [].into(), } } } pub trait RegexEngine: Sized { type CompileError: std::fmt::Debug + Clone; fn compile(pat: Pattern) -> Result; fn run(&self, input: &[u8]) -> Option; fn matches(&self, input: &[u8]) -> bool { self.run(input).is_some() } } impl RegexEngine for CompiledPattern { type CompileError = bc::RegexCompilationError; fn compile(pat: Pattern) -> Result { if let Ok(c) = simple::Anything::compile(pat.clone()) { Ok(Self::Anything(c)) } else if let Ok(c) = simple::Nothing::compile(pat.clone()) { Ok(Self::Nothing(c)) } else if let Ok(c) = simple::Exact::compile(pat.clone()) { Ok(Self::Exact(c)) } else if let Ok(c) = dfa::DFA::compile(pat.clone()) { Ok(Self::Dfa(c)) } else { bc::BytecodeCompiledRegex::compile(pat).map(Self::Bytecode) } } fn run(&self, input: &[u8]) -> Option { match self { CompiledPattern::Dfa(x) => x.run(input), CompiledPattern::Bytecode(x) => x.run(input), CompiledPattern::Anything(x) => x.run(input), CompiledPattern::Nothing(x) => x.run(input), CompiledPattern::Exact(x) => x.run(input), } } } macro_rules! all_engines { ($ty_name:ident, $($x:ident : $ty:ty,)*) => { #[derive(Debug)] pub struct $ty_name { $(pub $x: Option<$ty>,)* } impl RegexEngine for $ty_name { type CompileError = (); fn compile(pat: Pattern) -> Result { let x = Self { $($x: RegexEngine::compile(pat.clone()).ok(),)* }; if $(x.$x.is_none())&&* { Err(()) } else { Ok(x) } } fn run(&self, input: &[u8]) -> Option { $(let $x = self.$x.as_ref().map(|x| x.run(input));)* let mut result = None; $( if let Some(res) = $x { if let Some(result) = result { assert_eq!(res, result, concat!("engine ", stringify!($x), " does not agree with previously run engines.")); } result = Some(res) } )* result.unwrap() } } } } all_engines!( AllEngines, dfa: dfa::DFA, bc: bc::BytecodeCompiledRegex, any: simple::Anything, nothing: simple::Nothing, exact: simple::Exact, ); impl Pattern { pub fn try_compile(self) -> Result { CompiledPattern::compile(self) } } #[cfg(test)] mod tests { use super::*; use crate::parse::Parse; macro_rules! regex_matches { ($regex:literal, $match:literal, $true:literal) => { assert_eq!( ::parse_from_bytes($regex.as_bytes()) .unwrap() .try_compile() .unwrap() .matches($match.as_bytes()), $true ) }; } #[test] fn foo_matches_foo() { regex_matches!("foo", "foo", true); } #[test] fn dot_star_is_simple() { let x = Pattern::parse_from_bytes(b".*") .unwrap() .try_compile() .unwrap(); assert!(x.is_simple()); } #[test] fn match_is_bytecode() { let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") .unwrap() .try_compile() .unwrap(); assert!(x.is_bytecode()); } #[test] fn simple_word_is_exact() { let parsed = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux").unwrap(); let compiled = parsed.clone().try_compile().unwrap(); assert!(compiled.is_simple(), "{parsed:?}"); } #[test] fn no_match_is_dfa() { let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") .unwrap() .try_compile() .unwrap(); assert!(x.is_dfa()); } }