aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse/regex/mod.rs')
-rw-r--r--src/parse/regex/mod.rs201
1 files changed, 201 insertions, 0 deletions
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
new file mode 100644
index 0000000..1c761a1
--- /dev/null
+++ b/src/parse/regex/mod.rs
@@ -0,0 +1,201 @@
+use super::{Parse, ParseError, Result};
+
+mod byte_range;
+mod dfa;
+mod enfa;
+
+#[derive(PartialEq, Debug, Clone)]
+pub enum Pattern {
+ Byte(u8),
+ Range(u8, u8),
+ Alt(Vec<Pattern>),
+ Concat(Vec<Pattern>),
+ Rep(Box<Pattern>, u32, Option<u32>),
+ Nothing,
+}
+
+impl Parse for Pattern {
+ fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> {
+ parse_alt(b)
+ }
+}
+
+fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> {
+ let mut seqs = vec![];
+ loop {
+ let seq = parse_seq(s)?;
+ if seq != Pattern::Nothing {
+ seqs.push(seq);
+ }
+ if s.has() && s.peek() == b'|' {
+ s.adv();
+ } else {
+ break;
+ }
+ }
+
+ Ok(match seqs.len() {
+ 0 => Pattern::Nothing,
+ 1 => seqs.into_iter().next().unwrap(),
+ _ => Pattern::Alt(seqs),
+ })
+}
+
+fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> {
+ let mut reps = vec![];
+ loop {
+ let rep = parse_rep(s)?;
+ if rep != Pattern::Nothing {
+ reps.push(rep);
+ } else {
+ break;
+ }
+ }
+
+ Ok(match reps.len() {
+ 0 => Pattern::Nothing,
+ 1 => reps.into_iter().next().unwrap(),
+ _ => Pattern::Concat(reps),
+ })
+}
+
+fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> {
+ let atom = parse_atom(s)?;
+
+ if atom == Pattern::Nothing {
+ return Ok(atom);
+ }
+
+ if !s.has() {
+ return Ok(atom);
+ }
+
+ match s.peek() {
+ b'*' => {
+ s.adv();
+ Ok(Pattern::Rep(Box::new(atom), 0, None))
+ }
+ b'+' => {
+ s.adv();
+ Ok(Pattern::Rep(Box::new(atom), 1, None))
+ }
+ b'?' => {
+ s.adv();
+ Ok(Pattern::Rep(Box::new(atom), 0, Some(1)))
+ }
+ _ => Ok(atom),
+ }
+
+ // TODO: non-greedy
+}
+
+const SYMBOLS: &[u8] = b"{}[]()*+-?| ";
+fn is_symbol(x: u8) -> bool {
+ SYMBOLS.contains(&x)
+}
+
+fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
+ if !s.has() {
+ return Ok(Pattern::Nothing);
+ }
+
+ match s.peek() {
+ b'[' => {
+ s.adv();
+ let mut ranges = Vec::new();
+ loop {
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+
+ let tok = s.adv();
+
+ if tok == b']' {
+ if ranges.is_empty() {
+ todo!("error handling for empty alternative list");
+ }
+ return Ok(Pattern::Alt(ranges));
+ }
+
+ if is_symbol(tok) {
+ return Err(ParseError::Unknown(tok));
+ }
+
+ if s.has() && s.peek() == b'-' {
+ s.adv();
+
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+ let tok2 = s.adv();
+
+ if is_symbol(tok2) {
+ return Err(ParseError::Unknown(tok2));
+ }
+
+ ranges.push(Pattern::Range(tok, tok2));
+ } else {
+ ranges.push(Pattern::Byte(tok));
+ }
+ }
+ }
+ b'(' => {
+ s.adv();
+ let inner = Pattern::parse(s)?;
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+ if s.adv() != b')' {
+ return Err(ParseError::Expected(')'));
+ }
+ Ok(inner)
+ }
+ x if is_symbol(x) => Ok(Pattern::Nothing),
+ ch => {
+ s.adv();
+ Ok(Pattern::Byte(ch))
+ }
+ }
+}
+
+pub struct CompiledPattern {
+ dfa: dfa::DFA,
+}
+
+impl std::fmt::Debug for CompiledPattern {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ self.dfa.fmt(f)
+ }
+}
+
+impl Pattern {
+ pub fn compile(self) -> CompiledPattern {
+ let enfa = enfa::ENFA::from(self);
+ let dfa = dfa::DFA::from(enfa);
+ CompiledPattern { dfa }
+ }
+}
+
+impl CompiledPattern {
+ pub fn matches(&self, bytes: &[u8]) -> bool {
+ self.dfa.matches(bytes)
+ }
+}
+
+#[cfg(test)]
+macro_rules! regex_matches {
+ ($regex:literal, $match:literal, $true:literal) => {
+ assert_eq!(
+ Pattern::parse_from_bytes($regex.as_bytes())
+ .unwrap()
+ .compile()
+ .matches($match.as_bytes()),
+ $true
+ )
+ };
+}
+
+#[test]
+fn foo_matches_foo() {
+ regex_matches!("foo", "foo", true);
+}