use crate::parse::Parse; pub mod bc; mod byte_range; pub mod dfa; pub mod enfa; pub mod simple; #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookDirection { Ahead, Behind, } impl LookDirection { pub fn reverse(self) -> Self { match self { Self::Ahead => Self::Behind, Self::Behind => Self::Ahead, } } } #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum LookPolarity { Positive, Negative, } #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum CharacterClass { Everything, Nothing, Whitespace, Alphabetic, Alphanumeric, } impl CharacterClass { pub fn matches(self, byte: u8) -> bool { match self { CharacterClass::Everything => true, CharacterClass::Nothing => false, CharacterClass::Whitespace => byte.is_ascii_whitespace(), CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), } } } #[derive(PartialEq, Eq, Hash, Debug, Clone)] pub enum Pattern { Byte(u8), Range(u8, u8), CharacterClass(CharacterClass), Alt(Vec), Concat(Vec), Rep(Box, u32, Option, GreedyBehavior), Assertion(LookDirection, LookPolarity, Box), Submatch(Box), Nothing, } #[derive(Debug, PartialEq, Eq, Copy, Clone)] pub enum ByteConsumption { Bounded(usize), Unbounded, } impl ByteConsumption { pub fn zero() -> Self { Self::Bounded(0) } pub fn one() -> Self { Self::Bounded(1) } } impl Ord for ByteConsumption { fn cmp(&self, other: &Self) -> std::cmp::Ordering { match (self, other) { (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b), (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, } } } impl PartialOrd for ByteConsumption { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl std::ops::Add for ByteConsumption { type Output = Self; fn add(self, rhs: Self) -> Self::Output { if let Self::Bounded(a) = self && let Self::Bounded(b) = rhs { Self::Bounded(a + b) } else { Self::Unbounded } } } impl std::ops::Mul for ByteConsumption { type Output = Self; fn mul(self, rhs: usize) -> Self::Output { match self { Self::Bounded(x) => Self::Bounded(x * rhs), Self::Unbounded => Self::Unbounded, } } } impl std::iter::Sum for ByteConsumption { fn sum>(iter: I) -> Self { iter.fold(Self::zero(), |a, b| a + b) } } impl Pattern { pub fn max_byte_consumption(&self) -> ByteConsumption { match self { Pattern::Byte(_) => ByteConsumption::one(), Pattern::Range(_, _) => ByteConsumption::one(), Pattern::CharacterClass(_) => ByteConsumption::one(), Pattern::Alt(patterns) => patterns .iter() .map(Self::max_byte_consumption) .max() .unwrap_or(ByteConsumption::zero()), Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), Pattern::Rep(pattern, _, Some(max_reps), _) => { pattern.max_byte_consumption() * (*max_reps as usize) } Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, Pattern::Assertion(_, _, _) => ByteConsumption::zero(), Pattern::Nothing => ByteConsumption::zero(), Pattern::Submatch(pat) => pat.max_byte_consumption(), } } pub fn reverse(self) -> Self { use Pattern::*; match self { Byte(_) | Nothing | Range(..) | CharacterClass(_) => self, Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), Submatch(pat) => Submatch(Box::new(pat.reverse())), } } } #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub enum GreedyBehavior { Greedy, NonGreedy, } pub enum CompiledPattern { Dfa(dfa::DFA), Bytecode(bc::BytecodeCompiledRegex), Anything(simple::Anything), Nothing(simple::Nothing), Exact(simple::Exact), } impl CompiledPattern { fn is_dfa(&self) -> bool { matches!(self, Self::Dfa(_)) } fn is_bytecode(&self) -> bool { matches!(self, Self::Bytecode(_)) } fn is_simple(&self) -> bool { matches!(self, Self::Anything(_) | Self::Nothing(_) | Self::Exact(_)) } } #[derive(PartialEq, Eq, Debug)] pub struct Match { pub submatches: Box<[Option>]>, } impl Match { pub fn new_empty() -> Self { Self { submatches: [].into(), } } } pub trait RegexEngine: Sized { type CompileError: std::fmt::Debug + Clone; fn compile(pat: Pattern) -> Result; fn run(&self, input: &[u8]) -> Option; fn matches(&self, input: &[u8]) -> bool { self.run(input).is_some() } } impl RegexEngine for CompiledPattern { type CompileError = bc::RegexCompilationError; fn compile(pat: Pattern) -> Result { if let Ok(c) = simple::Anything::compile(pat.clone()) { Ok(Self::Anything(c)) } else if let Ok(c) = simple::Nothing::compile(pat.clone()) { Ok(Self::Nothing(c)) } else if let Ok(c) = simple::Exact::compile(pat.clone()) { Ok(Self::Exact(c)) } else if let Ok(c) = dfa::DFA::compile(pat.clone()) { Ok(Self::Dfa(c)) } else { bc::BytecodeCompiledRegex::compile(pat).map(Self::Bytecode) } } fn run(&self, input: &[u8]) -> Option { match self { CompiledPattern::Dfa(x) => x.run(input), CompiledPattern::Bytecode(x) => x.run(input), CompiledPattern::Anything(x) => x.run(input), CompiledPattern::Nothing(x) => x.run(input), CompiledPattern::Exact(x) => x.run(input), } } } macro_rules! all_engines { ($ty_name:ident, $($x:ident : $ty:ty,)*) => { pub struct $ty_name { $($x: Option<$ty>,)* } impl RegexEngine for $ty_name { type CompileError = (); fn compile(pat: Pattern) -> Result { let x = Self { $($x: RegexEngine::compile(pat.clone()).ok(),)* }; if $(x.$x.is_none())&&* { Err(()) } else { Ok(x) } } fn run(&self, input: &[u8]) -> Option { $(let $x = self.$x.as_ref().map(|x| x.run(input));)* let mut result = None; $( if let Some(res) = $x { if let Some(result) = result { assert_eq!(res, result, concat!("engine ", stringify!($x), " does not agree with previously run engines.")); } result = Some(res) } )* result.unwrap() } } } } all_engines!( AllEngines, dfa: dfa::DFA, bc: bc::BytecodeCompiledRegex, any: simple::Anything, nothing: simple::Nothing, exact: simple::Exact, ); impl Pattern { pub fn try_compile(self) -> Result { CompiledPattern::compile(self) } } #[cfg(test)] macro_rules! regex_matches { ($regex:literal, $match:literal, $true:literal) => { assert_eq!( ::parse_from_bytes($regex.as_bytes()) .unwrap() .try_compile() .unwrap() .matches($match.as_bytes()), $true ) }; } #[test] fn foo_matches_foo() { regex_matches!("foo", "foo", true); } #[test] fn dot_star_is_simple() { let x = Pattern::parse_from_bytes(b".*") .unwrap() .try_compile() .unwrap(); assert!(x.is_simple()); } #[test] fn match_is_bytecode() { let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") .unwrap() .try_compile() .unwrap(); assert!(x.is_bytecode()); } #[test] fn simple_word_is_exact() { let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux") .unwrap() .try_compile() .unwrap(); assert!(x.is_simple()); } #[test] fn no_match_is_dfa() { let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") .unwrap() .try_compile() .unwrap(); assert!(x.is_dfa()); }