diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
| commit | 53980774c327675e886179c0a2c140744dcf9b95 (patch) | |
| tree | ca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src | |
| parent | 75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff) | |
| download | pish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz | |
special cased regex for performance
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib.rs | 1 | ||||
| -rw-r--r-- | src/parse/mod.rs | 4 | ||||
| -rw-r--r-- | src/parse/regex.rs | 214 | ||||
| -rw-r--r-- | src/parse/regex/mod.rs | 432 | ||||
| -rw-r--r-- | src/regex/bc.rs (renamed from src/parse/regex/bc.rs) | 26 | ||||
| -rw-r--r-- | src/regex/byte_range.rs (renamed from src/parse/regex/byte_range.rs) | 0 | ||||
| -rw-r--r-- | src/regex/dfa.rs (renamed from src/parse/regex/dfa.rs) | 35 | ||||
| -rw-r--r-- | src/regex/enfa.rs (renamed from src/parse/regex/enfa.rs) | 34 | ||||
| -rw-r--r-- | src/regex/mod.rs | 344 | ||||
| -rw-r--r-- | src/regex/simple.rs | 125 | ||||
| -rw-r--r-- | src/run/builtin.rs | 12 | ||||
| -rw-r--r-- | src/run/mod.rs | 1 |
12 files changed, 765 insertions, 463 deletions
@@ -38,6 +38,7 @@ pub mod serialization; pub mod syntax_highlighting; pub mod variants; pub mod wait; +pub mod regex; mod bitset; use raw::*; diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 407cebd..8530aab 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1922,7 +1922,7 @@ impl Parse for Pipes<PreExpansion> { #[derive(Debug, Clone, PartialEq)] pub struct CaseBranch { - pub pattern: regex::Pattern, + pub pattern: crate::regex::Pattern, pub block: Block, } @@ -1953,7 +1953,7 @@ impl Parse for CaseBranch { fn parse(b: &mut Cursor<'_>) -> Result<Self> { b.spaces(); - let pattern = regex::Pattern::parse(b)?; + let pattern = crate::regex::Pattern::parse(b)?; let block = Block::parse(b)?; Ok(Self { pattern, block }) diff --git a/src/parse/regex.rs b/src/parse/regex.rs new file mode 100644 index 0000000..e5329e8 --- /dev/null +++ b/src/parse/regex.rs @@ -0,0 +1,214 @@ +use super::{Cursor, OtherHighlights, Parse, ParseError, Result}; +use crate::regex::{GreedyBehavior, Pattern, LookDirection, LookPolarity}; + +const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; +fn is_symbol(x: u8) -> bool { + SYMBOLS.contains(&x) +} + +impl Parse for Pattern { + fn parse(b: &mut Cursor<'_>) -> super::Result<Self> { + let begin = b.loc(); + let result = parse0(b); + if result.is_ok() { + b.highlight_from(begin, OtherHighlights::Regex); + } + result + } +} + +fn parse0(s: &mut Cursor<'_>) -> Result<Pattern> { + parse_alt(s) +} + +fn parse_alt(s: &mut Cursor<'_>) -> Result<Pattern> { + let mut seqs = vec![]; + loop { + let seq = parse_seq(s)?; + seqs.push(seq); + let begin = s.loc(); + if s.has() && s.peek() == b'|' { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + } else { + break; + } + } + + Ok(match seqs.len() { + 0 => Pattern::Nothing, + 1 => seqs.into_iter().next().unwrap(), + _ => Pattern::Alt(seqs), + }) +} + +fn parse_seq(s: &mut Cursor<'_>) -> Result<Pattern> { + let mut reps = vec![]; + loop { + let rep = parse_rep(s)?; + if rep != Pattern::Nothing { + reps.push(rep); + } else { + break; + } + } + + Ok(match reps.len() { + 0 => Pattern::Nothing, + 1 => reps.into_iter().next().unwrap(), + _ => Pattern::Concat(reps), + }) +} + +fn parse_rep(s: &mut Cursor<'_>) -> Result<Pattern> { + let atom = parse_atom(s)?; + + if atom == Pattern::Nothing { + return Ok(atom); + } + + if !s.has() { + return Ok(atom); + } + + let begin = s.loc(); + + let rep = match s.peek() { + b'*' => Some((0, None)), + b'+' => Some((1, None)), + b'?' => Some((0, Some(1))), + _ => None, + }; + + if let Some((min_rep, max_rep)) = rep { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + let greed = if s.has() && s.peek() == b'?' { + s.adv(); + GreedyBehavior::NonGreedy + } else { + GreedyBehavior::Greedy + }; + + Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed)) + } else { + Ok(atom) + } +} + +fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { + if !s.has() { + return Ok(Pattern::Nothing); + } + + let begin = s.loc(); + + match s.peek() { + b'[' => { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + let mut ranges = Vec::new(); + loop { + if !s.has() { + return Err(ParseError::Eof); + } + + let begin = s.loc(); + let tok = s.adv(); + + if tok == b']' { + if ranges.is_empty() { + todo!("error handling for empty alternative list"); + } + s.highlight_from(begin, OtherHighlights::RegexSymbol); + return Ok(Pattern::Alt(ranges)); + } + + let begin = s.loc(); + if s.has() && s.peek() == b'-' { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if !s.has() { + return Err(ParseError::Eof); + } + let tok2 = s.peek(); + + if tok2 == b']' { + ranges.push(Pattern::Byte(tok)); + ranges.push(Pattern::Byte(b'-')); + } else if is_symbol(tok2) { + return Err(ParseError::Unknown(tok2)); + } else { + s.adv(); + ranges.push(Pattern::Range(tok, tok2)); + } + } else { + ranges.push(Pattern::Byte(tok)); + } + } + } + b'(' => { + s.adv(); + + let mut assertion = None; + if s.buf.starts_with(b"?=") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?!") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); + } else if s.buf.starts_with(b"?<=") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?<!") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Negative)); + } + + s.highlight_from(begin, OtherHighlights::RegexSymbol); + let inner = parse0(s)?; + if !s.has() { + return Err(ParseError::Eof); + } + let begin = s.loc(); + if s.adv() != b')' { + return Err(ParseError::Expected(')')); + } + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if let Some((dir, pol)) = assertion { + Ok(Pattern::Assertion(dir, pol, Box::new(inner))) + } else { + Ok(Pattern::Submatch(Box::new(inner))) + } + } + b'.' => { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + Ok(Pattern::Range(0, 127)) + } + b'\\' => { + s.adv(); + if s.has() { + let escaped = s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if is_symbol(escaped) { + Ok(Pattern::Byte(escaped)) + } else { + // TODO interpret \w and others + Err(ParseError::Unknown(escaped)) + } + } else { + Err(ParseError::Eof) + } + } + x if is_symbol(x) => Ok(Pattern::Nothing), + ch => { + s.adv(); + Ok(Pattern::Byte(ch)) + } + } +} diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs deleted file mode 100644 index 486b0d9..0000000 --- a/src/parse/regex/mod.rs +++ /dev/null @@ -1,432 +0,0 @@ -use crate::parse::OtherHighlights; - -use super::{Parse, ParseError, Result}; - -pub mod bc; -mod byte_range; -pub mod dfa; -pub mod enfa; - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum LookDirection { - Ahead, - Behind, -} - -impl LookDirection { - pub fn reverse(self) -> Self { - match self { - Self::Ahead => Self::Behind, - Self::Behind => Self::Ahead, - } - } -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum LookPolarity { - Positive, - Negative, -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum CharacterClass { - Everything, - Nothing, - Whitespace, - Alphabetic, - Alphanumeric, -} - -impl CharacterClass { - pub fn matches(self, byte: u8) -> bool { - match self { - CharacterClass::Everything => true, - CharacterClass::Nothing => false, - CharacterClass::Whitespace => byte.is_ascii_whitespace(), - CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), - CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), - } - } -} - -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum Pattern { - Byte(u8), - Range(u8, u8), - CharacterClass(CharacterClass), - Alt(Vec<Pattern>), - Concat(Vec<Pattern>), - Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior), - Assertion(LookDirection, LookPolarity, Box<Pattern>), - Submatch(Box<Pattern>), - Nothing, -} - -#[derive(Debug, PartialEq, Eq, Copy, Clone)] -pub enum ByteConsumption { - Bounded(usize), - Unbounded, -} - -impl ByteConsumption { - pub fn zero() -> Self { - Self::Bounded(0) - } - pub fn one() -> Self { - Self::Bounded(1) - } -} - -impl Ord for ByteConsumption { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match (self, other) { - (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b), - (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, - (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, - (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, - } - } -} -impl PartialOrd for ByteConsumption { - fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.cmp(other)) - } -} - -impl std::ops::Add for ByteConsumption { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - if let Self::Bounded(a) = self - && let Self::Bounded(b) = rhs - { - Self::Bounded(a + b) - } else { - Self::Unbounded - } - } -} - -impl std::ops::Mul<usize> for ByteConsumption { - type Output = Self; - - fn mul(self, rhs: usize) -> Self::Output { - match self { - Self::Bounded(x) => Self::Bounded(x * rhs), - Self::Unbounded => Self::Unbounded, - } - } -} - -impl std::iter::Sum for ByteConsumption { - fn sum<I: Iterator<Item = Self>>(iter: I) -> Self { - iter.fold(Self::zero(), |a, b| a + b) - } -} - -impl Pattern { - pub fn max_byte_consumption(&self) -> ByteConsumption { - match self { - Pattern::Byte(_) => ByteConsumption::one(), - Pattern::Range(_, _) => ByteConsumption::one(), - Pattern::CharacterClass(_) => ByteConsumption::one(), - Pattern::Alt(patterns) => patterns - .iter() - .map(Self::max_byte_consumption) - .max() - .unwrap_or(ByteConsumption::zero()), - Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), - Pattern::Rep(pattern, _, Some(max_reps), _) => { - pattern.max_byte_consumption() * (*max_reps as usize) - } - Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, - Pattern::Assertion(_, _, _) => ByteConsumption::zero(), - Pattern::Nothing => ByteConsumption::zero(), - Pattern::Submatch(pat) => pat.max_byte_consumption(), - } - } - - pub fn reverse(self) -> Self { - use Pattern::*; - match self { - Byte(_) | Nothing | Range(..) | CharacterClass(_) => self, - Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), - Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), - Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), - Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), - Submatch(pat) => Submatch(Box::new(pat.reverse())) - } - } -} - -impl Parse for Pattern { - fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> { - let begin = b.loc(); - let result = parse0(b); - if result.is_ok() { - b.highlight_from(begin, OtherHighlights::Regex); - } - result - } -} - -fn parse0(s: &mut super::Cursor<'_>) -> Result<Pattern> { - parse_alt(s) -} - -fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> { - let mut seqs = vec![]; - loop { - let seq = parse_seq(s)?; - seqs.push(seq); - let begin = s.loc(); - if s.has() && s.peek() == b'|' { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - } else { - break; - } - } - - Ok(match seqs.len() { - 0 => Pattern::Nothing, - 1 => seqs.into_iter().next().unwrap(), - _ => Pattern::Alt(seqs), - }) -} - -fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> { - let mut reps = vec![]; - loop { - let rep = parse_rep(s)?; - if rep != Pattern::Nothing { - reps.push(rep); - } else { - break; - } - } - - Ok(match reps.len() { - 0 => Pattern::Nothing, - 1 => reps.into_iter().next().unwrap(), - _ => Pattern::Concat(reps), - }) -} - -fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> { - let atom = parse_atom(s)?; - - if atom == Pattern::Nothing { - return Ok(atom); - } - - if !s.has() { - return Ok(atom); - } - - let begin = s.loc(); - - let rep = match s.peek() { - b'*' => Some((0, None)), - b'+' => Some((1, None)), - b'?' => Some((0, Some(1))), - _ => None, - }; - - if let Some((min_rep, max_rep)) = rep { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - let greed = if s.has() && s.peek() == b'?' { - s.adv(); - GreedyBehavior::NonGreedy - } else { - GreedyBehavior::Greedy - }; - - Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed)) - } else { - Ok(atom) - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub enum GreedyBehavior { - Greedy, - NonGreedy, -} - -const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; -fn is_symbol(x: u8) -> bool { - SYMBOLS.contains(&x) -} - -fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { - if !s.has() { - return Ok(Pattern::Nothing); - } - - let begin = s.loc(); - - match s.peek() { - b'[' => { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - let mut ranges = Vec::new(); - loop { - if !s.has() { - return Err(ParseError::Eof); - } - - let begin = s.loc(); - let tok = s.adv(); - - if tok == b']' { - if ranges.is_empty() { - todo!("error handling for empty alternative list"); - } - s.highlight_from(begin, OtherHighlights::RegexSymbol); - return Ok(Pattern::Alt(ranges)); - } - - let begin = s.loc(); - if s.has() && s.peek() == b'-' { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if !s.has() { - return Err(ParseError::Eof); - } - let tok2 = s.peek(); - - if tok2 == b']' { - ranges.push(Pattern::Byte(tok)); - ranges.push(Pattern::Byte(b'-')); - } else if is_symbol(tok2) { - return Err(ParseError::Unknown(tok2)); - } else { - s.adv(); - ranges.push(Pattern::Range(tok, tok2)); - } - } else { - ranges.push(Pattern::Byte(tok)); - } - } - } - b'(' => { - s.adv(); - - let mut assertion = None; - if s.buf.starts_with(b"?=") { - s.advance(2); - assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); - } else if s.buf.starts_with(b"?!") { - s.advance(2); - assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); - } else if s.buf.starts_with(b"?<=") { - s.advance(3); - assertion = Some((LookDirection::Behind, LookPolarity::Positive)); - } else if s.buf.starts_with(b"?<!") { - s.advance(3); - assertion = Some((LookDirection::Behind, LookPolarity::Negative)); - } - - s.highlight_from(begin, OtherHighlights::RegexSymbol); - let inner = parse0(s)?; - if !s.has() { - return Err(ParseError::Eof); - } - let begin = s.loc(); - if s.adv() != b')' { - return Err(ParseError::Expected(')')); - } - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if let Some((dir, pol)) = assertion { - Ok(Pattern::Assertion(dir, pol, Box::new(inner))) - } else { - Ok(Pattern::Submatch(Box::new(inner))) - } - } - b'.' => { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - Ok(Pattern::Range(0, 127)) - } - b'\\' => { - s.adv(); - if s.has() { - let escaped = s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if is_symbol(escaped) { - Ok(Pattern::Byte(escaped)) - } else { - // TODO interpret \w and others - Err(ParseError::Unknown(escaped)) - } - } else { - Err(ParseError::Eof) - } - } - x if is_symbol(x) => Ok(Pattern::Nothing), - ch => { - s.adv(); - Ok(Pattern::Byte(ch)) - } - } -} - -pub struct CompiledPattern { - dfa: dfa::DFA, -} - -impl std::fmt::Debug for CompiledPattern { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.dfa.fmt(f) - } -} - -#[derive(Debug)] -pub enum CompilationError { - Enfa(enfa::EnfaTranslationError), -} - -impl From<enfa::EnfaTranslationError> for CompilationError { - fn from(value: enfa::EnfaTranslationError) -> Self { - Self::Enfa(value) - } -} - -impl Pattern { - pub fn try_compile(self) -> std::result::Result<CompiledPattern, CompilationError> { - let enfa = enfa::ENFA::try_from(self)?; - let dfa = dfa::DFA::from(enfa); - Ok(CompiledPattern { dfa }) - } -} - -impl CompiledPattern { - pub fn matches(&self, bytes: impl AsRef<[u8]>) -> bool { - self.dfa.matches(bytes.as_ref()) - } -} - -#[cfg(test)] -macro_rules! regex_matches { - ($regex:literal, $match:literal, $true:literal) => { - assert_eq!( - Pattern::parse_from_bytes($regex.as_bytes()) - .unwrap() - .try_compile() - .unwrap() - .matches($match.as_bytes()), - $true - ) - }; -} - -#[test] -fn foo_matches_foo() { - regex_matches!("foo", "foo", true); -} diff --git a/src/parse/regex/bc.rs b/src/regex/bc.rs index cb41607..4a79485 100644 --- a/src/parse/regex/bc.rs +++ b/src/regex/bc.rs @@ -1,11 +1,10 @@ use std::collections::{HashMap, VecDeque}; -use crate::{ - bitset::BitSet, - parse::regex::{ - CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange, - }, +use super::{ + CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine, + byte_range::ByteRange, }; +use crate::bitset::BitSet; trait Flavor: Clone { type CustomInstr: Copy + Clone + std::fmt::Debug; @@ -362,10 +361,6 @@ impl BytecodeCompiledRegex { } } -pub struct Match { - pub submatches: Box<[Option<core::ops::Range<usize>>]>, -} - type AssertionHandler<'a, F> = Box<dyn 'a + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>>; @@ -629,11 +624,22 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex { } } +impl RegexEngine for BytecodeCompiledRegex { + type CompileError = RegexCompilationError; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + Self::try_from(pat) + } + + fn run(&self, input: &[u8]) -> Option<Match> { + self.re_match(input) + } +} + #[cfg(test)] mod tests { use super::*; use crate::parse::Parse; - fn regex(s: &str) -> BytecodeCompiledRegex { let pat = Pattern::parse_from_bytes(s.as_bytes()).unwrap(); let compiled = BytecodeCompiledRegex::try_from(pat).unwrap(); diff --git a/src/parse/regex/byte_range.rs b/src/regex/byte_range.rs index b7642c1..b7642c1 100644 --- a/src/parse/regex/byte_range.rs +++ b/src/regex/byte_range.rs diff --git a/src/parse/regex/dfa.rs b/src/regex/dfa.rs index 35f726c..c55d99d 100644 --- a/src/parse/regex/dfa.rs +++ b/src/regex/dfa.rs @@ -1,7 +1,10 @@ use core::fmt; use std::collections::{BinaryHeap, HashMap, HashSet}; +use crate::regex::{Match, RegexEngine, enfa::EnfaTranslationError}; + use super::{ + Pattern, byte_range::ByteRange, enfa::{ENFA, MultiState}, }; @@ -88,6 +91,38 @@ impl From<ENFA> for DFA { } } +#[derive(Clone, Debug)] +pub enum DFACompileError { + SubmatchesNotSupported, + #[allow(unused)] + NFAError(EnfaTranslationError), +} + +impl RegexEngine for DFA { + type CompileError = DFACompileError; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + match ENFA::try_from(pat) { + Ok(nfa) => { + if nfa.has_submatches { + Err(DFACompileError::SubmatchesNotSupported) + } else { + Ok(Self::from(nfa)) + } + } + Err(e) => Err(DFACompileError::NFAError(e)), + } + } + + fn run(&self, input: &[u8]) -> Option<super::Match> { + if self.matches(input) { + Some(Match::new_empty()) + } else { + None + } + } +} + impl DFA { pub fn matches(&self, x: &[u8]) -> bool { let mut state = self.start; diff --git a/src/parse/regex/enfa.rs b/src/regex/enfa.rs index dd4cf6e..8392642 100644 --- a/src/parse/regex/enfa.rs +++ b/src/regex/enfa.rs @@ -3,16 +3,14 @@ use std::{ hash::{DefaultHasher, Hash, Hasher}, }; -use crate::parse::regex::{LookDirection, LookPolarity}; - -use super::Pattern; -use super::byte_range::ByteRange; +use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange}; /// NFA with epsilon transitions #[derive(Clone)] #[allow(clippy::upper_case_acronyms)] pub struct ENFA { pub states: Vec<EState>, + pub has_submatches: bool, } fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> { @@ -64,10 +62,11 @@ mod product_tests { fn empty_outer_vector() { let x: Vec<Vec<i32>> = vec![]; - let out = cartesian_product(x); + let out: Vec<Vec<i32>> = cartesian_product(x); + let r: Vec<Vec<i32>> = vec![vec![]]; // One empty combination. - assert_eq!(out, vec![vec![]]); + assert_eq!(out, r); } #[test] @@ -426,13 +425,14 @@ impl ENFA { impl ENFA { fn looping(self) -> Self { + let has_submatches = self.has_submatches; let mut states = vec![EState::start()]; states.append(&mut self.shift(1)); let len = states.len(); states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]); states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]); states.push(EState::terminal()); - Self { states } + Self { states, has_submatches } } fn repeat(self, times: usize) -> Self { @@ -457,11 +457,14 @@ impl ENFA { if nfas.is_empty() { return Self { states: vec![EState::terminal()], + has_submatches: false, }; } + let mut has_submatches = false; let mut states: Vec<EState> = Vec::new(); for nfa in nfas.into_iter() { + has_submatches = has_submatches || nfa.has_submatches; let len = states.len(); let mut ns = nfa.shift(len); if let Some(n) = states.last_mut() { @@ -474,7 +477,7 @@ impl ENFA { let len = states.len(); states[len - 1].accept = Acceptance::Accept; - Self { states } + Self { states, has_submatches } } } @@ -635,7 +638,7 @@ impl EState { } } -#[derive(Debug)] +#[derive(Clone, Debug)] pub enum EnfaTranslationError { CharacterClassNotSupported, AssertionsNotSupported, @@ -656,6 +659,7 @@ impl TryFrom<Pattern> for ENFA { }, EState::terminal(), ], + has_submatches: false, }, Pattern::CharacterClass(_) => { return Err(EnfaTranslationError::CharacterClassNotSupported); @@ -667,7 +671,9 @@ impl TryFrom<Pattern> for ENFA { .collect::<Result<_, _>>()?; let mut states = vec![EState::start()]; let mut ends = vec![]; + let mut has_submatches = false; for nfa in nfas.into_iter() { + has_submatches = has_submatches || nfa.has_submatches; let len = states.len(); states[0].trans.insert(Transition::epsilon(len)); states.append(&mut (nfa.shift(len))); @@ -678,7 +684,7 @@ impl TryFrom<Pattern> for ENFA { let last = states.len() - 1; states[end].trans.insert(Transition::epsilon(last)); } - Self { states } + Self { states, has_submatches } } Pattern::Concat(seq) => { let nfas: Vec<Self> = seq @@ -702,6 +708,7 @@ impl TryFrom<Pattern> for ENFA { } Pattern::Nothing => Self { states: vec![EState::terminal()], + has_submatches: false, }, Pattern::Assertion(dir, polarity, pat) => { if dir == LookDirection::Behind { @@ -722,11 +729,12 @@ impl TryFrom<Pattern> for ENFA { }); states.append(&mut regex); states.push(EState::terminal()); - Self { states } + Self { states, has_submatches: false, } } Pattern::Submatch(pat) => { - // NFA does not track submatches, so we are skipping it. - Self::try_from(*pat)? + let mut this = Self::try_from(*pat)?; + this.has_submatches = true; + this } }) } diff --git a/src/regex/mod.rs b/src/regex/mod.rs new file mode 100644 index 0000000..be3026f --- /dev/null +++ b/src/regex/mod.rs @@ -0,0 +1,344 @@ +use crate::parse::Parse; + +pub mod bc; +mod byte_range; +pub mod dfa; +pub mod enfa; +pub mod simple; + +#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] +pub enum LookDirection { + Ahead, + Behind, +} + +impl LookDirection { + pub fn reverse(self) -> Self { + match self { + Self::Ahead => Self::Behind, + Self::Behind => Self::Ahead, + } + } +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] +pub enum LookPolarity { + Positive, + Negative, +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] +pub enum CharacterClass { + Everything, + Nothing, + Whitespace, + Alphabetic, + Alphanumeric, +} + +impl CharacterClass { + pub fn matches(self, byte: u8) -> bool { + match self { + CharacterClass::Everything => true, + CharacterClass::Nothing => false, + CharacterClass::Whitespace => byte.is_ascii_whitespace(), + CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), + CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), + } + } +} + +#[derive(PartialEq, Eq, Hash, Debug, Clone)] +pub enum Pattern { + Byte(u8), + Range(u8, u8), + CharacterClass(CharacterClass), + Alt(Vec<Pattern>), + Concat(Vec<Pattern>), + Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior), + Assertion(LookDirection, LookPolarity, Box<Pattern>), + Submatch(Box<Pattern>), + Nothing, +} + +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum ByteConsumption { + Bounded(usize), + Unbounded, +} + +impl ByteConsumption { + pub fn zero() -> Self { + Self::Bounded(0) + } + pub fn one() -> Self { + Self::Bounded(1) + } +} + +impl Ord for ByteConsumption { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match (self, other) { + (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b), + (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, + (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, + (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, + } + } +} +impl PartialOrd for ByteConsumption { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl std::ops::Add for ByteConsumption { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + if let Self::Bounded(a) = self + && let Self::Bounded(b) = rhs + { + Self::Bounded(a + b) + } else { + Self::Unbounded + } + } +} + +impl std::ops::Mul<usize> for ByteConsumption { + type Output = Self; + + fn mul(self, rhs: usize) -> Self::Output { + match self { + Self::Bounded(x) => Self::Bounded(x * rhs), + Self::Unbounded => Self::Unbounded, + } + } +} + +impl std::iter::Sum for ByteConsumption { + fn sum<I: Iterator<Item = Self>>(iter: I) -> Self { + iter.fold(Self::zero(), |a, b| a + b) + } +} + +impl Pattern { + pub fn max_byte_consumption(&self) -> ByteConsumption { + match self { + Pattern::Byte(_) => ByteConsumption::one(), + Pattern::Range(_, _) => ByteConsumption::one(), + Pattern::CharacterClass(_) => ByteConsumption::one(), + Pattern::Alt(patterns) => patterns + .iter() + .map(Self::max_byte_consumption) + .max() + .unwrap_or(ByteConsumption::zero()), + Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), + Pattern::Rep(pattern, _, Some(max_reps), _) => { + pattern.max_byte_consumption() * (*max_reps as usize) + } + Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, + Pattern::Assertion(_, _, _) => ByteConsumption::zero(), + Pattern::Nothing => ByteConsumption::zero(), + Pattern::Submatch(pat) => pat.max_byte_consumption(), + } + } + + pub fn reverse(self) -> Self { + use Pattern::*; + match self { + Byte(_) | Nothing | Range(..) | CharacterClass(_) => self, + Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), + Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), + Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), + Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), + Submatch(pat) => Submatch(Box::new(pat.reverse())), + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum GreedyBehavior { + Greedy, + NonGreedy, +} + +pub enum CompiledPattern { + Dfa(dfa::DFA), + Bytecode(bc::BytecodeCompiledRegex), + Anything(simple::Anything), + Nothing(simple::Nothing), + Exact(simple::Exact), +} + +impl CompiledPattern { + fn is_dfa(&self) -> bool { + matches!(self, Self::Dfa(_)) + } + fn is_bytecode(&self) -> bool { + matches!(self, Self::Bytecode(_)) + } + fn is_simple(&self) -> bool { + matches!(self, Self::Anything(_) | Self::Nothing(_) | Self::Exact(_)) + } +} + +#[derive(PartialEq, Eq, Debug)] +pub struct Match { + pub submatches: Box<[Option<core::ops::Range<usize>>]>, +} + +impl Match { + pub fn new_empty() -> Self { + Self { + submatches: [].into(), + } + } +} + +pub trait RegexEngine: Sized { + type CompileError: std::fmt::Debug + Clone; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError>; + + fn run(&self, input: &[u8]) -> Option<Match>; + + fn matches(&self, input: &[u8]) -> bool { + self.run(input).is_some() + } +} + +impl RegexEngine for CompiledPattern { + type CompileError = bc::RegexCompilationError; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + if let Ok(c) = simple::Anything::compile(pat.clone()) { + Ok(Self::Anything(c)) + } else if let Ok(c) = simple::Nothing::compile(pat.clone()) { + Ok(Self::Nothing(c)) + } else if let Ok(c) = simple::Exact::compile(pat.clone()) { + Ok(Self::Exact(c)) + } else if let Ok(c) = dfa::DFA::compile(pat.clone()) { + Ok(Self::Dfa(c)) + } else { + bc::BytecodeCompiledRegex::compile(pat).map(Self::Bytecode) + } + } + + fn run(&self, input: &[u8]) -> Option<Match> { + match self { + CompiledPattern::Dfa(x) => x.run(input), + CompiledPattern::Bytecode(x) => x.run(input), + CompiledPattern::Anything(x) => x.run(input), + CompiledPattern::Nothing(x) => x.run(input), + CompiledPattern::Exact(x) => x.run(input), + } + } +} + +macro_rules! all_engines { + ($ty_name:ident, $($x:ident : $ty:ty,)*) => { + pub struct $ty_name { + $($x: Option<$ty>,)* + } + impl RegexEngine for $ty_name { + type CompileError = (); + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + let x = Self { + $($x: RegexEngine::compile(pat.clone()).ok(),)* + }; + if $(x.$x.is_none())&&* { + Err(()) + } else { + Ok(x) + } + } + + fn run(&self, input: &[u8]) -> Option<Match> { + $(let $x = self.$x.as_ref().map(|x| x.run(input));)* + let mut result = None; + $( + if let Some(res) = $x { + if let Some(result) = result { + assert_eq!(res, result, concat!("engine ", stringify!($x), " does not agree with previously run engines.")); + } + result = Some(res) + } + )* + result.unwrap() + } + } + } +} + +all_engines!( + AllEngines, + dfa: dfa::DFA, + bc: bc::BytecodeCompiledRegex, + any: simple::Anything, + nothing: simple::Nothing, + exact: simple::Exact, +); + +impl Pattern { + pub fn try_compile(self) -> Result<CompiledPattern, bc::RegexCompilationError> { + CompiledPattern::compile(self) + } +} + +#[cfg(test)] +macro_rules! regex_matches { + ($regex:literal, $match:literal, $true:literal) => { + assert_eq!( + <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes()) + .unwrap() + .try_compile() + .unwrap() + .matches($match.as_bytes()), + $true + ) + }; +} + +#[test] +fn foo_matches_foo() { + regex_matches!("foo", "foo", true); +} + +#[test] +fn dot_star_is_simple() { + let x = Pattern::parse_from_bytes(b".*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_simple()); +} + +#[test] +fn match_is_bytecode() { + let x = Pattern::parse_from_bytes(b".*(ele.*phant).*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_bytecode()); +} + +#[test] +fn simple_word_is_exact() { + let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_simple()); +} + +#[test] +fn no_match_is_dfa() { + let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*") + .unwrap() + .try_compile() + .unwrap(); + assert!(x.is_dfa()); +} diff --git a/src/regex/simple.rs b/src/regex/simple.rs new file mode 100644 index 0000000..00bc9b4 --- /dev/null +++ b/src/regex/simple.rs @@ -0,0 +1,125 @@ +use crate::regex::CharacterClass; + +use super::{Match, Pattern, RegexEngine}; + +fn empty_match() -> Option<Match> { + Some(Match { + submatches: [].into(), + }) +} + +pub struct Anything; + +#[derive(Debug, Clone)] +pub struct NotASimpleWildcard; + +impl RegexEngine for Anything { + type CompileError = NotASimpleWildcard; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + if let Pattern::Rep(pat, 0, None, _) = pat + && let Pattern::CharacterClass(CharacterClass::Everything) = *pat + { + Ok(Anything) + } else { + Err(NotASimpleWildcard) + } + } + + fn run(&self, _input: &[u8]) -> Option<Match> { + empty_match() + } +} + +pub struct Nothing; +#[derive(Debug, Clone)] +pub struct NotASimpleNothing; + +impl RegexEngine for Nothing { + type CompileError = NotASimpleNothing; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + match pat { + Pattern::Range(a, b) if a > b => Ok(Nothing), + Pattern::CharacterClass(CharacterClass::Nothing) => Ok(Nothing), + Pattern::Alt(pats) => { + let all_impossible = pats.into_iter().map(Self::compile).all(|p| p.is_ok()); + if all_impossible { + Ok(Nothing) + } else { + Err(NotASimpleNothing) + } + } + Pattern::Concat(pats) => { + if let Some(pat) = pats.into_iter().next() { + Self::compile(pat) + } else { + Err(NotASimpleNothing) + } + } + Pattern::Rep(_, x, Some(y), _) if y < x => Ok(Nothing), + Pattern::Rep(_, 0, None, _) => Err(NotASimpleNothing), + Pattern::Rep(pat, _gt_0, _, _) => Self::compile(*pat), + Pattern::Submatch(pat) => Self::compile(*pat), + _ => Err(NotASimpleNothing), + } + } + + fn run(&self, _input: &[u8]) -> Option<Match> { + None + } +} + +pub struct Exact { + bytes: Vec<u8>, +} + +const MEM_LIMIT: usize = 25_000; + +#[derive(Debug, Clone)] +pub struct NotSimplyAString; + +fn ce(pat: Pattern) -> Option<Vec<u8>> { + match pat { + Pattern::Byte(b) => Some(vec![b]), + Pattern::Concat(patterns) => { + let mut pats = patterns.into_iter().map(ce).collect::<Option<Vec<_>>>()?; + let mut out = Vec::new(); + for p in pats.iter_mut() { + out.append(p); + } + Some(out) + } + Pattern::Rep(pat, min, Some(max), _) if min == max => { + if let Some(bytes) = ce(*pat) + && bytes.len() * (min as usize) < MEM_LIMIT + { + Some(bytes.repeat(min as usize)) + } else { + None + } + } + Pattern::Submatch(_) => None, // TODO: submatches could be stored as constant offsets + Pattern::Nothing => Some(Vec::new()), + _ => None, + } +} + +impl RegexEngine for Exact { + type CompileError = NotSimplyAString; + + fn compile(pat: Pattern) -> Result<Self, Self::CompileError> { + match ce(pat) { + Some(bytes) => Ok(Self { bytes }), + None => Err(NotSimplyAString), + } + } + + fn run(&self, input: &[u8]) -> Option<Match> { + if input == self.bytes { + empty_match() + } else { + None + } + } +} diff --git a/src/run/builtin.rs b/src/run/builtin.rs index 87a0be0..45d33e2 100644 --- a/src/run/builtin.rs +++ b/src/run/builtin.rs @@ -1039,9 +1039,9 @@ impl Builtin for pish_theme { #[cfg(debug_assertions)] mod dbg { - use crate::parse::regex::{dfa::DFA, enfa::ENFA}; + use crate::regex::{dfa::DFA, enfa::ENFA}; -use super::*; + use super::*; #[derive(Copy, Clone)] pub struct debug; @@ -1107,7 +1107,7 @@ use super::*; fn name(&self) -> &str { "case_match" } - + fn io( &self, _session: Arc<Mutex<Session>>, @@ -1115,12 +1115,12 @@ use super::*; _stdin: &mut dyn Read, stdout: &mut dyn Write, ) -> Result { - let regex = match crate::parse::regex::Pattern::parse_from_bytes(&args[0]) { + let regex = match crate::regex::Pattern::parse_from_bytes(&args[0]) { Ok(r) => r, Err(e) => { writeln!(stdout, "parse error: {e:?}")?; return Err(Error::Exit(1)); - }, + } }; let nfa = match ENFA::try_from(regex) { @@ -1141,4 +1141,4 @@ use super::*; } #[cfg(debug_assertions)] -pub use dbg::{debug, re, case_match}; +pub use dbg::{case_match, debug, re}; diff --git a/src/run/mod.rs b/src/run/mod.rs index c730272..64522d9 100644 --- a/src/run/mod.rs +++ b/src/run/mod.rs @@ -1,3 +1,4 @@ +use crate::regex::RegexEngine; use crate::rw::*; use std::collections::HashMap; use std::path::PathBuf; |
