diff options
Diffstat (limited to 'src/parse/regex/mod.rs')
| -rw-r--r-- | src/parse/regex/mod.rs | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs new file mode 100644 index 0000000..1c761a1 --- /dev/null +++ b/src/parse/regex/mod.rs @@ -0,0 +1,201 @@ +use super::{Parse, ParseError, Result}; + +mod byte_range; +mod dfa; +mod enfa; + +#[derive(PartialEq, Debug, Clone)] +pub enum Pattern { + Byte(u8), + Range(u8, u8), + Alt(Vec<Pattern>), + Concat(Vec<Pattern>), + Rep(Box<Pattern>, u32, Option<u32>), + Nothing, +} + +impl Parse for Pattern { + fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> { + parse_alt(b) + } +} + +fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> { + let mut seqs = vec![]; + loop { + let seq = parse_seq(s)?; + if seq != Pattern::Nothing { + seqs.push(seq); + } + if s.has() && s.peek() == b'|' { + s.adv(); + } else { + break; + } + } + + Ok(match seqs.len() { + 0 => Pattern::Nothing, + 1 => seqs.into_iter().next().unwrap(), + _ => Pattern::Alt(seqs), + }) +} + +fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> { + let mut reps = vec![]; + loop { + let rep = parse_rep(s)?; + if rep != Pattern::Nothing { + reps.push(rep); + } else { + break; + } + } + + Ok(match reps.len() { + 0 => Pattern::Nothing, + 1 => reps.into_iter().next().unwrap(), + _ => Pattern::Concat(reps), + }) +} + +fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> { + let atom = parse_atom(s)?; + + if atom == Pattern::Nothing { + return Ok(atom); + } + + if !s.has() { + return Ok(atom); + } + + match s.peek() { + b'*' => { + s.adv(); + Ok(Pattern::Rep(Box::new(atom), 0, None)) + } + b'+' => { + s.adv(); + Ok(Pattern::Rep(Box::new(atom), 1, None)) + } + b'?' => { + s.adv(); + Ok(Pattern::Rep(Box::new(atom), 0, Some(1))) + } + _ => Ok(atom), + } + + // TODO: non-greedy +} + +const SYMBOLS: &[u8] = b"{}[]()*+-?| "; +fn is_symbol(x: u8) -> bool { + SYMBOLS.contains(&x) +} + +fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { + if !s.has() { + return Ok(Pattern::Nothing); + } + + match s.peek() { + b'[' => { + s.adv(); + let mut ranges = Vec::new(); + loop { + if !s.has() { + return Err(ParseError::Eof); + } + + let tok = s.adv(); + + if tok == b']' { + if ranges.is_empty() { + todo!("error handling for empty alternative list"); + } + return Ok(Pattern::Alt(ranges)); + } + + if is_symbol(tok) { + return Err(ParseError::Unknown(tok)); + } + + if s.has() && s.peek() == b'-' { + s.adv(); + + if !s.has() { + return Err(ParseError::Eof); + } + let tok2 = s.adv(); + + if is_symbol(tok2) { + return Err(ParseError::Unknown(tok2)); + } + + ranges.push(Pattern::Range(tok, tok2)); + } else { + ranges.push(Pattern::Byte(tok)); + } + } + } + b'(' => { + s.adv(); + let inner = Pattern::parse(s)?; + if !s.has() { + return Err(ParseError::Eof); + } + if s.adv() != b')' { + return Err(ParseError::Expected(')')); + } + Ok(inner) + } + x if is_symbol(x) => Ok(Pattern::Nothing), + ch => { + s.adv(); + Ok(Pattern::Byte(ch)) + } + } +} + +pub struct CompiledPattern { + dfa: dfa::DFA, +} + +impl std::fmt::Debug for CompiledPattern { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.dfa.fmt(f) + } +} + +impl Pattern { + pub fn compile(self) -> CompiledPattern { + let enfa = enfa::ENFA::from(self); + let dfa = dfa::DFA::from(enfa); + CompiledPattern { dfa } + } +} + +impl CompiledPattern { + pub fn matches(&self, bytes: &[u8]) -> bool { + self.dfa.matches(bytes) + } +} + +#[cfg(test)] +macro_rules! regex_matches { + ($regex:literal, $match:literal, $true:literal) => { + assert_eq!( + Pattern::parse_from_bytes($regex.as_bytes()) + .unwrap() + .compile() + .matches($match.as_bytes()), + $true + ) + }; +} + +#[test] +fn foo_matches_foo() { + regex_matches!("foo", "foo", true); +} |
