use crate::parse::OtherHighlights; use super::{Parse, ParseError, Result}; mod byte_range; mod dfa; mod enfa; #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub enum LookDirection { Ahead, Behind, } #[derive(PartialEq, Eq, Debug, Clone, Copy)] pub enum LookPolarity { Positive, Negative, } #[derive(PartialEq, Debug, Clone)] pub enum Pattern { Byte(u8), Range(u8, u8), Alt(Vec), Concat(Vec), Rep(Box, u32, Option), Assertion(LookDirection, LookPolarity, Box), Nothing, } impl Parse for Pattern { fn parse(b: &mut super::Cursor<'_>) -> super::Result { let begin = b.loc(); let result = parse0(b); if result.is_ok() { b.highlight_from(begin, OtherHighlights::Regex); } result } } fn parse0(s: &mut super::Cursor<'_>) -> Result { parse_alt(s) } fn parse_alt(s: &mut super::Cursor<'_>) -> Result { let mut seqs = vec![]; loop { let seq = parse_seq(s)?; if seq != Pattern::Nothing { seqs.push(seq); } let begin = s.loc(); if s.has() && s.peek() == b'|' { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); } else { break; } } Ok(match seqs.len() { 0 => Pattern::Nothing, 1 => seqs.into_iter().next().unwrap(), _ => Pattern::Alt(seqs), }) } fn parse_seq(s: &mut super::Cursor<'_>) -> Result { let mut reps = vec![]; loop { let rep = parse_rep(s)?; if rep != Pattern::Nothing { reps.push(rep); } else { break; } } Ok(match reps.len() { 0 => Pattern::Nothing, 1 => reps.into_iter().next().unwrap(), _ => Pattern::Concat(reps), }) } fn parse_rep(s: &mut super::Cursor<'_>) -> Result { let atom = parse_atom(s)?; if atom == Pattern::Nothing { return Ok(atom); } if !s.has() { return Ok(atom); } let begin = s.loc(); match s.peek() { b'*' => { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); Ok(Pattern::Rep(Box::new(atom), 0, None)) } b'+' => { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); Ok(Pattern::Rep(Box::new(atom), 1, None)) } b'?' => { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); Ok(Pattern::Rep(Box::new(atom), 0, Some(1))) } _ => Ok(atom), } // TODO: non-greedy } const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; fn is_symbol(x: u8) -> bool { SYMBOLS.contains(&x) } fn parse_atom(s: &mut super::Cursor<'_>) -> Result { if !s.has() { return Ok(Pattern::Nothing); } let begin = s.loc(); match s.peek() { b'[' => { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); let mut ranges = Vec::new(); loop { if !s.has() { return Err(ParseError::Eof); } let begin = s.loc(); let tok = s.adv(); if tok == b']' { if ranges.is_empty() { todo!("error handling for empty alternative list"); } s.highlight_from(begin, OtherHighlights::RegexSymbol); return Ok(Pattern::Alt(ranges)); } if is_symbol(tok) { return Err(ParseError::Unknown(tok)); } let begin = s.loc(); if s.has() && s.peek() == b'-' { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); if !s.has() { return Err(ParseError::Eof); } let tok2 = s.adv(); if is_symbol(tok2) { return Err(ParseError::Unknown(tok2)); } ranges.push(Pattern::Range(tok, tok2)); } else { ranges.push(Pattern::Byte(tok)); } } } b'(' => { s.adv(); let mut assertion = None; if s.buf.starts_with(b"?=") { s.advance(2); assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); } else if s.buf.starts_with(b"?!") { s.advance(2); assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); } else if s.buf.starts_with(b"?<=") { s.advance(3); assertion = Some((LookDirection::Behind, LookPolarity::Positive)); } else if s.buf.starts_with(b"? { s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); Ok(Pattern::Range(0, 127)) } b'\\' => { s.adv(); if s.has() { let escaped = s.adv(); s.highlight_from(begin, OtherHighlights::RegexSymbol); if is_symbol(escaped) { Ok(Pattern::Byte(escaped)) } else { // TODO interpret \w and others Err(ParseError::Unknown(escaped)) } } else { Err(ParseError::Eof) } } x if is_symbol(x) => Ok(Pattern::Nothing), ch => { s.adv(); Ok(Pattern::Byte(ch)) } } } pub struct CompiledPattern { dfa: dfa::DFA, } impl std::fmt::Debug for CompiledPattern { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.dfa.fmt(f) } } #[derive(Debug)] pub enum CompilationError { Enfa(enfa::EnfaTranslationError), } impl From for CompilationError { fn from(value: enfa::EnfaTranslationError) -> Self { Self::Enfa(value) } } impl Pattern { pub fn try_compile(self) -> std::result::Result { let enfa = enfa::ENFA::try_from(self)?; let dfa = dfa::DFA::from(enfa); Ok(CompiledPattern { dfa }) } } impl CompiledPattern { pub fn matches(&self, bytes: &[u8]) -> bool { self.dfa.matches(bytes) } } #[cfg(test)] macro_rules! regex_matches { ($regex:literal, $match:literal, $true:literal) => { assert_eq!( Pattern::parse_from_bytes($regex.as_bytes()) .unwrap() .try_compile() .unwrap() .matches($match.as_bytes()), $true ) }; } #[test] fn foo_matches_foo() { regex_matches!("foo", "foo", true); }