use super::{Parse, ParseError, Result}; mod byte_range; mod dfa; mod enfa; #[derive(PartialEq, Debug, Clone)] pub enum Pattern { Byte(u8), Range(u8, u8), Alt(Vec), Concat(Vec), Rep(Box, u32, Option), Nothing, } impl Parse for Pattern { fn parse(b: &mut super::Cursor<'_>) -> super::Result { parse_alt(b) } } fn parse_alt(s: &mut super::Cursor<'_>) -> Result { let mut seqs = vec![]; loop { let seq = parse_seq(s)?; if seq != Pattern::Nothing { seqs.push(seq); } if s.has() && s.peek() == b'|' { s.adv(); } else { break; } } Ok(match seqs.len() { 0 => Pattern::Nothing, 1 => seqs.into_iter().next().unwrap(), _ => Pattern::Alt(seqs), }) } fn parse_seq(s: &mut super::Cursor<'_>) -> Result { let mut reps = vec![]; loop { let rep = parse_rep(s)?; if rep != Pattern::Nothing { reps.push(rep); } else { break; } } Ok(match reps.len() { 0 => Pattern::Nothing, 1 => reps.into_iter().next().unwrap(), _ => Pattern::Concat(reps), }) } fn parse_rep(s: &mut super::Cursor<'_>) -> Result { let atom = parse_atom(s)?; if atom == Pattern::Nothing { return Ok(atom); } if !s.has() { return Ok(atom); } match s.peek() { b'*' => { s.adv(); Ok(Pattern::Rep(Box::new(atom), 0, None)) } b'+' => { s.adv(); Ok(Pattern::Rep(Box::new(atom), 1, None)) } b'?' => { s.adv(); Ok(Pattern::Rep(Box::new(atom), 0, Some(1))) } _ => Ok(atom), } // TODO: non-greedy } const SYMBOLS: &[u8] = b"{}[]()*+-?| "; fn is_symbol(x: u8) -> bool { SYMBOLS.contains(&x) } fn parse_atom(s: &mut super::Cursor<'_>) -> Result { if !s.has() { return Ok(Pattern::Nothing); } match s.peek() { b'[' => { s.adv(); let mut ranges = Vec::new(); loop { if !s.has() { return Err(ParseError::Eof); } let tok = s.adv(); if tok == b']' { if ranges.is_empty() { todo!("error handling for empty alternative list"); } return Ok(Pattern::Alt(ranges)); } if is_symbol(tok) { return Err(ParseError::Unknown(tok)); } if s.has() && s.peek() == b'-' { s.adv(); if !s.has() { return Err(ParseError::Eof); } let tok2 = s.adv(); if is_symbol(tok2) { return Err(ParseError::Unknown(tok2)); } ranges.push(Pattern::Range(tok, tok2)); } else { ranges.push(Pattern::Byte(tok)); } } } b'(' => { s.adv(); let inner = Pattern::parse(s)?; if !s.has() { return Err(ParseError::Eof); } if s.adv() != b')' { return Err(ParseError::Expected(')')); } Ok(inner) } x if is_symbol(x) => Ok(Pattern::Nothing), ch => { s.adv(); Ok(Pattern::Byte(ch)) } } } pub struct CompiledPattern { dfa: dfa::DFA, } impl std::fmt::Debug for CompiledPattern { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.dfa.fmt(f) } } impl Pattern { pub fn compile(self) -> CompiledPattern { let enfa = enfa::ENFA::from(self); let dfa = dfa::DFA::from(enfa); CompiledPattern { dfa } } } impl CompiledPattern { pub fn matches(&self, bytes: &[u8]) -> bool { self.dfa.matches(bytes) } } #[cfg(test)] macro_rules! regex_matches { ($regex:literal, $match:literal, $true:literal) => { assert_eq!( Pattern::parse_from_bytes($regex.as_bytes()) .unwrap() .compile() .matches($match.as_bytes()), $true ) }; } #[test] fn foo_matches_foo() { regex_matches!("foo", "foo", true); }