diff options
Diffstat (limited to 'src/regex')
| -rw-r--r-- | src/regex/bc.rs | 12 | ||||
| -rw-r--r-- | src/regex/enfa.rs | 5 | ||||
| -rw-r--r-- | src/regex/mod.rs | 161 | ||||
| -rw-r--r-- | src/regex/simple.rs | 6 |
4 files changed, 118 insertions, 66 deletions
diff --git a/src/regex/bc.rs b/src/regex/bc.rs index 4a79485..6e55c7d 100644 --- a/src/regex/bc.rs +++ b/src/regex/bc.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, VecDeque}; use super::{ - CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine, + Class, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine, byte_range::ByteRange, }; use crate::bitset::BitSet; @@ -93,7 +93,7 @@ struct Assertion { #[derive(Copy, Clone, Debug)] enum Instr<F: Flavor> { - Class(CharacterClass), + Class(Class), Consume(ByteRange), Jump(JumpTarget), Fork(JumpTarget, JumpTarget), @@ -528,7 +528,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { let begin = self.instrs.len() as JumpTarget; self.compile(pat.clone())?; let end = self.instrs.len() as JumpTarget; - self.instrs.push(Instr::Class(CharacterClass::Nothing)); + self.instrs.push(Instr::Class(Class::Nothing)); let bounds = CompiledSnippet { begin, end }; self.map.insert(pat, bounds); Ok(bounds) @@ -539,7 +539,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { let fork_begin = self.instrs.len() as JumpTarget; match fork_targets.len() { 0 => { - self.instrs[self.assertion_fork_base] = Instr::Class(CharacterClass::Nothing); + self.instrs[self.assertion_fork_base] = Instr::Class(Class::Nothing); } 1 => { self.instrs[self.assertion_fork_base] = Instr::Jump(fork_targets[0]); @@ -566,7 +566,7 @@ impl<'a, F: Flavor> Compiler<'a, F> { fn assertion_compiler() -> Compiler<'static, AssertionFlavor> { let mut c = Compiler::new(|_, _, _| Err(RegexCompilationError::NestedLookaroundNotSupported)); c.rep_any_amt( - Pattern::CharacterClass(CharacterClass::Everything), + Pattern::CharacterClass(Class::Everything), GreedyBehavior::NonGreedy, ) .expect("characterclass should always compile"); @@ -604,7 +604,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { }); main.compile(value)?; let end = main.instrs.len(); - main.instrs.push(Instr::Class(CharacterClass::Nothing)); + main.instrs.push(Instr::Class(Class::Nothing)); (end, main.instrs, main.submatch_count) }; neg.finalize_assertion_forks(); diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs index 8392642..c740915 100644 --- a/src/regex/enfa.rs +++ b/src/regex/enfa.rs @@ -3,6 +3,8 @@ use std::{ hash::{DefaultHasher, Hash, Hasher}, }; +use crate::regex::Class; + use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange}; /// NFA with epsilon transitions @@ -661,6 +663,9 @@ impl TryFrom<Pattern> for ENFA { ], has_submatches: false, }, + Pattern::CharacterClass(Class::Everything) => { + Self::try_from(Pattern::Range(0, 254))? + } Pattern::CharacterClass(_) => { return Err(EnfaTranslationError::CharacterClassNotSupported); } diff --git a/src/regex/mod.rs b/src/regex/mod.rs index be3026f..2c9f3d1 100644 --- a/src/regex/mod.rs +++ b/src/regex/mod.rs @@ -1,5 +1,3 @@ -use crate::parse::Parse; - pub mod bc; mod byte_range; pub mod dfa; @@ -28,7 +26,7 @@ pub enum LookPolarity { } #[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum CharacterClass { +pub enum Class { Everything, Nothing, Whitespace, @@ -36,14 +34,14 @@ pub enum CharacterClass { Alphanumeric, } -impl CharacterClass { +impl Class { pub fn matches(self, byte: u8) -> bool { match self { - CharacterClass::Everything => true, - CharacterClass::Nothing => false, - CharacterClass::Whitespace => byte.is_ascii_whitespace(), - CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), - CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), + Class::Everything => true, + Class::Nothing => false, + Class::Whitespace => byte.is_ascii_whitespace(), + Class::Alphabetic => byte.is_ascii_alphabetic(), + Class::Alphanumeric => byte.is_ascii_alphanumeric(), } } } @@ -52,7 +50,7 @@ impl CharacterClass { pub enum Pattern { Byte(u8), Range(u8, u8), - CharacterClass(CharacterClass), + CharacterClass(Class), Alt(Vec<Pattern>), Concat(Vec<Pattern>), Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior), @@ -156,6 +154,51 @@ impl Pattern { Submatch(pat) => Submatch(Box::new(pat.reverse())), } } + + pub fn simplify(self) -> Self { + use Pattern::*; + + match self { + Range(a, b) if a > b => CharacterClass(Class::Nothing), + Range(a, b) if a == b => Byte(a), + Alt(pats) => { + let pats: Vec<_> = pats + .into_iter() + .map(Pattern::simplify) + .filter(|p| !matches!(p, CharacterClass(Class::Nothing))) + .collect(); + match pats.len() { + 0 => CharacterClass(Class::Nothing), + 1 => pats.into_iter().next().unwrap(), + _ => Alt(pats) + } + } + Concat(pats) => { + let mut out = Vec::new(); + for pat in pats { + let pat = pat.simplify(); + match pat { + Concat(mut pats) => out.append(&mut pats), + Nothing => (), + CharacterClass(Class::Nothing) => return CharacterClass(Class::Nothing), + x => out.push(x), + } + } + match out.len() { + 0 => Nothing, + 1 => out.into_iter().next().unwrap(), + _ => Concat(out), + } + } + Rep(_, 0, Some(0), _) => Nothing, + Rep(_, min, Some(max), _) if min > max => CharacterClass(Class::Nothing), + Rep(pat, 1, Some(1), _) => pat.simplify(), + Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed), + Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())), + Submatch(pattern) => Submatch(Box::new(pattern.simplify())), + CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self, + } + } } #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] @@ -172,6 +215,7 @@ pub enum CompiledPattern { Exact(simple::Exact), } +#[allow(unused)] impl CompiledPattern { fn is_dfa(&self) -> bool { matches!(self, Self::Dfa(_)) @@ -289,56 +333,59 @@ impl Pattern { } #[cfg(test)] -macro_rules! regex_matches { - ($regex:literal, $match:literal, $true:literal) => { - assert_eq!( - <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes()) - .unwrap() - .try_compile() - .unwrap() - .matches($match.as_bytes()), - $true - ) - }; -} +mod tests { + use super::*; + use crate::parse::Parse; + + macro_rules! regex_matches { + ($regex:literal, $match:literal, $true:literal) => { + assert_eq!( + <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes()) + .unwrap() + .try_compile() + .unwrap() + .matches($match.as_bytes()), + $true + ) + }; + } -#[test] -fn foo_matches_foo() { - regex_matches!("foo", "foo", true); -} + #[test] + fn foo_matches_foo() { + regex_matches!("foo", "foo", true); + } -#[test] -fn dot_star_is_simple() { - let x = Pattern::parse_from_bytes(b".*") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_simple()); -} + #[test] + fn dot_star_is_simple() { + let x = Pattern::parse_from_bytes(b".*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_simple()); + } -#[test] -fn match_is_bytecode() { - let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_bytecode()); -} + #[test] + fn match_is_bytecode() { + let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_bytecode()); + } -#[test] -fn simple_word_is_exact() { - let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_simple()); -} + #[test] + fn simple_word_is_exact() { + let parsed = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux").unwrap(); + let compiled = parsed.clone().try_compile().unwrap(); + assert!(compiled.is_simple(), "{parsed:?}"); + } -#[test] -fn no_match_is_dfa() { - let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") - .unwrap() - .try_compile() - .unwrap(); - assert!(x.is_dfa()); + #[test] + fn no_match_is_dfa() { + let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_dfa()); + } } diff --git a/src/regex/simple.rs b/src/regex/simple.rs index 00bc9b4..e75ff14 100644 --- a/src/regex/simple.rs +++ b/src/regex/simple.rs @@ -1,4 +1,4 @@ -use crate::regex::CharacterClass; +use crate::regex::Class; use super::{Match, Pattern, RegexEngine}; @@ -18,7 +18,7 @@ impl RegexEngine for Anything { fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { if let Pattern::Rep(pat, 0, None, _) = pat - && let Pattern::CharacterClass(CharacterClass::Everything) = *pat + && let Pattern::CharacterClass(Class::Everything) = *pat { Ok(Anything) } else { @@ -41,7 +41,7 @@ impl RegexEngine for Nothing { fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { match pat { Pattern::Range(a, b) if a > b => Ok(Nothing), - Pattern::CharacterClass(CharacterClass::Nothing) => Ok(Nothing), + Pattern::CharacterClass(Class::Nothing) => Ok(Nothing), Pattern::Alt(pats) => { let all_impossible = pats.into_iter().map(Self::compile).all(|p| p.is_ok()); if all_impossible { |
