aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--benches/regex.rs10
-rw-r--r--src/lib.rs1
-rw-r--r--src/parse/mod.rs4
-rw-r--r--src/parse/regex.rs214
-rw-r--r--src/parse/regex/mod.rs432
-rw-r--r--src/regex/bc.rs (renamed from src/parse/regex/bc.rs)26
-rw-r--r--src/regex/byte_range.rs (renamed from src/parse/regex/byte_range.rs)0
-rw-r--r--src/regex/dfa.rs (renamed from src/parse/regex/dfa.rs)35
-rw-r--r--src/regex/enfa.rs (renamed from src/parse/regex/enfa.rs)34
-rw-r--r--src/regex/mod.rs344
-rw-r--r--src/regex/simple.rs125
-rw-r--r--src/run/builtin.rs12
-rw-r--r--src/run/mod.rs1
-rw-r--r--tests/common.rs34
-rwxr-xr-xtests/regex-gen.sh6
15 files changed, 778 insertions, 500 deletions
diff --git a/benches/regex.rs b/benches/regex.rs
index 577e44d..dd8f54a 100644
--- a/benches/regex.rs
+++ b/benches/regex.rs
@@ -1,8 +1,6 @@
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
-use pish::parse::{
- Parse,
- regex::{Pattern, bc::BytecodeCompiledRegex},
-};
+use pish::parse::Parse;
+use pish::regex::{Pattern, RegexEngine};
/// https://mattmahoney.net/dc/enwik8.zip
pub fn regex_throughput(c: &mut Criterion) {
@@ -14,11 +12,11 @@ pub fn regex_throughput(c: &mut Criterion) {
let mut re = |re: &str| {
let pat = Pattern::parse_from_bytes(re.as_bytes()).unwrap();
- let compiled = BytecodeCompiledRegex::try_from(pat).unwrap();
+ let compiled = pat.try_compile().unwrap();
group.bench_function(&format!("enwik6 {re}"), |b| {
b.iter(|| {
- let _ = compiled.re_match(&content);
+ compiled.matches(&content);
})
});
};
diff --git a/src/lib.rs b/src/lib.rs
index bc5191f..825e216 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -38,6 +38,7 @@ pub mod serialization;
pub mod syntax_highlighting;
pub mod variants;
pub mod wait;
+pub mod regex;
mod bitset;
use raw::*;
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index 407cebd..8530aab 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -1922,7 +1922,7 @@ impl Parse for Pipes<PreExpansion> {
#[derive(Debug, Clone, PartialEq)]
pub struct CaseBranch {
- pub pattern: regex::Pattern,
+ pub pattern: crate::regex::Pattern,
pub block: Block,
}
@@ -1953,7 +1953,7 @@ impl Parse for CaseBranch {
fn parse(b: &mut Cursor<'_>) -> Result<Self> {
b.spaces();
- let pattern = regex::Pattern::parse(b)?;
+ let pattern = crate::regex::Pattern::parse(b)?;
let block = Block::parse(b)?;
Ok(Self { pattern, block })
diff --git a/src/parse/regex.rs b/src/parse/regex.rs
new file mode 100644
index 0000000..e5329e8
--- /dev/null
+++ b/src/parse/regex.rs
@@ -0,0 +1,214 @@
+use super::{Cursor, OtherHighlights, Parse, ParseError, Result};
+use crate::regex::{GreedyBehavior, Pattern, LookDirection, LookPolarity};
+
+const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
+fn is_symbol(x: u8) -> bool {
+ SYMBOLS.contains(&x)
+}
+
+impl Parse for Pattern {
+ fn parse(b: &mut Cursor<'_>) -> super::Result<Self> {
+ let begin = b.loc();
+ let result = parse0(b);
+ if result.is_ok() {
+ b.highlight_from(begin, OtherHighlights::Regex);
+ }
+ result
+ }
+}
+
+fn parse0(s: &mut Cursor<'_>) -> Result<Pattern> {
+ parse_alt(s)
+}
+
+fn parse_alt(s: &mut Cursor<'_>) -> Result<Pattern> {
+ let mut seqs = vec![];
+ loop {
+ let seq = parse_seq(s)?;
+ seqs.push(seq);
+ let begin = s.loc();
+ if s.has() && s.peek() == b'|' {
+ s.adv();
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+ } else {
+ break;
+ }
+ }
+
+ Ok(match seqs.len() {
+ 0 => Pattern::Nothing,
+ 1 => seqs.into_iter().next().unwrap(),
+ _ => Pattern::Alt(seqs),
+ })
+}
+
+fn parse_seq(s: &mut Cursor<'_>) -> Result<Pattern> {
+ let mut reps = vec![];
+ loop {
+ let rep = parse_rep(s)?;
+ if rep != Pattern::Nothing {
+ reps.push(rep);
+ } else {
+ break;
+ }
+ }
+
+ Ok(match reps.len() {
+ 0 => Pattern::Nothing,
+ 1 => reps.into_iter().next().unwrap(),
+ _ => Pattern::Concat(reps),
+ })
+}
+
+fn parse_rep(s: &mut Cursor<'_>) -> Result<Pattern> {
+ let atom = parse_atom(s)?;
+
+ if atom == Pattern::Nothing {
+ return Ok(atom);
+ }
+
+ if !s.has() {
+ return Ok(atom);
+ }
+
+ let begin = s.loc();
+
+ let rep = match s.peek() {
+ b'*' => Some((0, None)),
+ b'+' => Some((1, None)),
+ b'?' => Some((0, Some(1))),
+ _ => None,
+ };
+
+ if let Some((min_rep, max_rep)) = rep {
+ s.adv();
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+ let greed = if s.has() && s.peek() == b'?' {
+ s.adv();
+ GreedyBehavior::NonGreedy
+ } else {
+ GreedyBehavior::Greedy
+ };
+
+ Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed))
+ } else {
+ Ok(atom)
+ }
+}
+
+fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
+ if !s.has() {
+ return Ok(Pattern::Nothing);
+ }
+
+ let begin = s.loc();
+
+ match s.peek() {
+ b'[' => {
+ s.adv();
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+ let mut ranges = Vec::new();
+ loop {
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+
+ let begin = s.loc();
+ let tok = s.adv();
+
+ if tok == b']' {
+ if ranges.is_empty() {
+ todo!("error handling for empty alternative list");
+ }
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+ return Ok(Pattern::Alt(ranges));
+ }
+
+ let begin = s.loc();
+ if s.has() && s.peek() == b'-' {
+ s.adv();
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+ let tok2 = s.peek();
+
+ if tok2 == b']' {
+ ranges.push(Pattern::Byte(tok));
+ ranges.push(Pattern::Byte(b'-'));
+ } else if is_symbol(tok2) {
+ return Err(ParseError::Unknown(tok2));
+ } else {
+ s.adv();
+ ranges.push(Pattern::Range(tok, tok2));
+ }
+ } else {
+ ranges.push(Pattern::Byte(tok));
+ }
+ }
+ }
+ b'(' => {
+ s.adv();
+
+ let mut assertion = None;
+ if s.buf.starts_with(b"?=") {
+ s.advance(2);
+ assertion = Some((LookDirection::Ahead, LookPolarity::Positive));
+ } else if s.buf.starts_with(b"?!") {
+ s.advance(2);
+ assertion = Some((LookDirection::Ahead, LookPolarity::Negative));
+ } else if s.buf.starts_with(b"?<=") {
+ s.advance(3);
+ assertion = Some((LookDirection::Behind, LookPolarity::Positive));
+ } else if s.buf.starts_with(b"?<!") {
+ s.advance(3);
+ assertion = Some((LookDirection::Behind, LookPolarity::Negative));
+ }
+
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+ let inner = parse0(s)?;
+ if !s.has() {
+ return Err(ParseError::Eof);
+ }
+ let begin = s.loc();
+ if s.adv() != b')' {
+ return Err(ParseError::Expected(')'));
+ }
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+ if let Some((dir, pol)) = assertion {
+ Ok(Pattern::Assertion(dir, pol, Box::new(inner)))
+ } else {
+ Ok(Pattern::Submatch(Box::new(inner)))
+ }
+ }
+ b'.' => {
+ s.adv();
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+ Ok(Pattern::Range(0, 127))
+ }
+ b'\\' => {
+ s.adv();
+ if s.has() {
+ let escaped = s.adv();
+ s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+ if is_symbol(escaped) {
+ Ok(Pattern::Byte(escaped))
+ } else {
+ // TODO interpret \w and others
+ Err(ParseError::Unknown(escaped))
+ }
+ } else {
+ Err(ParseError::Eof)
+ }
+ }
+ x if is_symbol(x) => Ok(Pattern::Nothing),
+ ch => {
+ s.adv();
+ Ok(Pattern::Byte(ch))
+ }
+ }
+}
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
deleted file mode 100644
index 486b0d9..0000000
--- a/src/parse/regex/mod.rs
+++ /dev/null
@@ -1,432 +0,0 @@
-use crate::parse::OtherHighlights;
-
-use super::{Parse, ParseError, Result};
-
-pub mod bc;
-mod byte_range;
-pub mod dfa;
-pub mod enfa;
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum LookDirection {
- Ahead,
- Behind,
-}
-
-impl LookDirection {
- pub fn reverse(self) -> Self {
- match self {
- Self::Ahead => Self::Behind,
- Self::Behind => Self::Ahead,
- }
- }
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum LookPolarity {
- Positive,
- Negative,
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum CharacterClass {
- Everything,
- Nothing,
- Whitespace,
- Alphabetic,
- Alphanumeric,
-}
-
-impl CharacterClass {
- pub fn matches(self, byte: u8) -> bool {
- match self {
- CharacterClass::Everything => true,
- CharacterClass::Nothing => false,
- CharacterClass::Whitespace => byte.is_ascii_whitespace(),
- CharacterClass::Alphabetic => byte.is_ascii_alphabetic(),
- CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(),
- }
- }
-}
-
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum Pattern {
- Byte(u8),
- Range(u8, u8),
- CharacterClass(CharacterClass),
- Alt(Vec<Pattern>),
- Concat(Vec<Pattern>),
- Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
- Assertion(LookDirection, LookPolarity, Box<Pattern>),
- Submatch(Box<Pattern>),
- Nothing,
-}
-
-#[derive(Debug, PartialEq, Eq, Copy, Clone)]
-pub enum ByteConsumption {
- Bounded(usize),
- Unbounded,
-}
-
-impl ByteConsumption {
- pub fn zero() -> Self {
- Self::Bounded(0)
- }
- pub fn one() -> Self {
- Self::Bounded(1)
- }
-}
-
-impl Ord for ByteConsumption {
- fn cmp(&self, other: &Self) -> std::cmp::Ordering {
- match (self, other) {
- (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b),
- (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less,
- (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater,
- (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal,
- }
- }
-}
-impl PartialOrd for ByteConsumption {
- fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl std::ops::Add for ByteConsumption {
- type Output = Self;
-
- fn add(self, rhs: Self) -> Self::Output {
- if let Self::Bounded(a) = self
- && let Self::Bounded(b) = rhs
- {
- Self::Bounded(a + b)
- } else {
- Self::Unbounded
- }
- }
-}
-
-impl std::ops::Mul<usize> for ByteConsumption {
- type Output = Self;
-
- fn mul(self, rhs: usize) -> Self::Output {
- match self {
- Self::Bounded(x) => Self::Bounded(x * rhs),
- Self::Unbounded => Self::Unbounded,
- }
- }
-}
-
-impl std::iter::Sum for ByteConsumption {
- fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
- iter.fold(Self::zero(), |a, b| a + b)
- }
-}
-
-impl Pattern {
- pub fn max_byte_consumption(&self) -> ByteConsumption {
- match self {
- Pattern::Byte(_) => ByteConsumption::one(),
- Pattern::Range(_, _) => ByteConsumption::one(),
- Pattern::CharacterClass(_) => ByteConsumption::one(),
- Pattern::Alt(patterns) => patterns
- .iter()
- .map(Self::max_byte_consumption)
- .max()
- .unwrap_or(ByteConsumption::zero()),
- Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(),
- Pattern::Rep(pattern, _, Some(max_reps), _) => {
- pattern.max_byte_consumption() * (*max_reps as usize)
- }
- Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded,
- Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
- Pattern::Nothing => ByteConsumption::zero(),
- Pattern::Submatch(pat) => pat.max_byte_consumption(),
- }
- }
-
- pub fn reverse(self) -> Self {
- use Pattern::*;
- match self {
- Byte(_) | Nothing | Range(..) | CharacterClass(_) => self,
- Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()),
- Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
- Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy),
- Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
- Submatch(pat) => Submatch(Box::new(pat.reverse()))
- }
- }
-}
-
-impl Parse for Pattern {
- fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> {
- let begin = b.loc();
- let result = parse0(b);
- if result.is_ok() {
- b.highlight_from(begin, OtherHighlights::Regex);
- }
- result
- }
-}
-
-fn parse0(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- parse_alt(s)
-}
-
-fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let mut seqs = vec![];
- loop {
- let seq = parse_seq(s)?;
- seqs.push(seq);
- let begin = s.loc();
- if s.has() && s.peek() == b'|' {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- } else {
- break;
- }
- }
-
- Ok(match seqs.len() {
- 0 => Pattern::Nothing,
- 1 => seqs.into_iter().next().unwrap(),
- _ => Pattern::Alt(seqs),
- })
-}
-
-fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let mut reps = vec![];
- loop {
- let rep = parse_rep(s)?;
- if rep != Pattern::Nothing {
- reps.push(rep);
- } else {
- break;
- }
- }
-
- Ok(match reps.len() {
- 0 => Pattern::Nothing,
- 1 => reps.into_iter().next().unwrap(),
- _ => Pattern::Concat(reps),
- })
-}
-
-fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let atom = parse_atom(s)?;
-
- if atom == Pattern::Nothing {
- return Ok(atom);
- }
-
- if !s.has() {
- return Ok(atom);
- }
-
- let begin = s.loc();
-
- let rep = match s.peek() {
- b'*' => Some((0, None)),
- b'+' => Some((1, None)),
- b'?' => Some((0, Some(1))),
- _ => None,
- };
-
- if let Some((min_rep, max_rep)) = rep {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- let greed = if s.has() && s.peek() == b'?' {
- s.adv();
- GreedyBehavior::NonGreedy
- } else {
- GreedyBehavior::Greedy
- };
-
- Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed))
- } else {
- Ok(atom)
- }
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub enum GreedyBehavior {
- Greedy,
- NonGreedy,
-}
-
-const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
-fn is_symbol(x: u8) -> bool {
- SYMBOLS.contains(&x)
-}
-
-fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- if !s.has() {
- return Ok(Pattern::Nothing);
- }
-
- let begin = s.loc();
-
- match s.peek() {
- b'[' => {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- let mut ranges = Vec::new();
- loop {
- if !s.has() {
- return Err(ParseError::Eof);
- }
-
- let begin = s.loc();
- let tok = s.adv();
-
- if tok == b']' {
- if ranges.is_empty() {
- todo!("error handling for empty alternative list");
- }
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- return Ok(Pattern::Alt(ranges));
- }
-
- let begin = s.loc();
- if s.has() && s.peek() == b'-' {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if !s.has() {
- return Err(ParseError::Eof);
- }
- let tok2 = s.peek();
-
- if tok2 == b']' {
- ranges.push(Pattern::Byte(tok));
- ranges.push(Pattern::Byte(b'-'));
- } else if is_symbol(tok2) {
- return Err(ParseError::Unknown(tok2));
- } else {
- s.adv();
- ranges.push(Pattern::Range(tok, tok2));
- }
- } else {
- ranges.push(Pattern::Byte(tok));
- }
- }
- }
- b'(' => {
- s.adv();
-
- let mut assertion = None;
- if s.buf.starts_with(b"?=") {
- s.advance(2);
- assertion = Some((LookDirection::Ahead, LookPolarity::Positive));
- } else if s.buf.starts_with(b"?!") {
- s.advance(2);
- assertion = Some((LookDirection::Ahead, LookPolarity::Negative));
- } else if s.buf.starts_with(b"?<=") {
- s.advance(3);
- assertion = Some((LookDirection::Behind, LookPolarity::Positive));
- } else if s.buf.starts_with(b"?<!") {
- s.advance(3);
- assertion = Some((LookDirection::Behind, LookPolarity::Negative));
- }
-
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- let inner = parse0(s)?;
- if !s.has() {
- return Err(ParseError::Eof);
- }
- let begin = s.loc();
- if s.adv() != b')' {
- return Err(ParseError::Expected(')'));
- }
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if let Some((dir, pol)) = assertion {
- Ok(Pattern::Assertion(dir, pol, Box::new(inner)))
- } else {
- Ok(Pattern::Submatch(Box::new(inner)))
- }
- }
- b'.' => {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- Ok(Pattern::Range(0, 127))
- }
- b'\\' => {
- s.adv();
- if s.has() {
- let escaped = s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if is_symbol(escaped) {
- Ok(Pattern::Byte(escaped))
- } else {
- // TODO interpret \w and others
- Err(ParseError::Unknown(escaped))
- }
- } else {
- Err(ParseError::Eof)
- }
- }
- x if is_symbol(x) => Ok(Pattern::Nothing),
- ch => {
- s.adv();
- Ok(Pattern::Byte(ch))
- }
- }
-}
-
-pub struct CompiledPattern {
- dfa: dfa::DFA,
-}
-
-impl std::fmt::Debug for CompiledPattern {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- self.dfa.fmt(f)
- }
-}
-
-#[derive(Debug)]
-pub enum CompilationError {
- Enfa(enfa::EnfaTranslationError),
-}
-
-impl From<enfa::EnfaTranslationError> for CompilationError {
- fn from(value: enfa::EnfaTranslationError) -> Self {
- Self::Enfa(value)
- }
-}
-
-impl Pattern {
- pub fn try_compile(self) -> std::result::Result<CompiledPattern, CompilationError> {
- let enfa = enfa::ENFA::try_from(self)?;
- let dfa = dfa::DFA::from(enfa);
- Ok(CompiledPattern { dfa })
- }
-}
-
-impl CompiledPattern {
- pub fn matches(&self, bytes: impl AsRef<[u8]>) -> bool {
- self.dfa.matches(bytes.as_ref())
- }
-}
-
-#[cfg(test)]
-macro_rules! regex_matches {
- ($regex:literal, $match:literal, $true:literal) => {
- assert_eq!(
- Pattern::parse_from_bytes($regex.as_bytes())
- .unwrap()
- .try_compile()
- .unwrap()
- .matches($match.as_bytes()),
- $true
- )
- };
-}
-
-#[test]
-fn foo_matches_foo() {
- regex_matches!("foo", "foo", true);
-}
diff --git a/src/parse/regex/bc.rs b/src/regex/bc.rs
index cb41607..4a79485 100644
--- a/src/parse/regex/bc.rs
+++ b/src/regex/bc.rs
@@ -1,11 +1,10 @@
use std::collections::{HashMap, VecDeque};
-use crate::{
- bitset::BitSet,
- parse::regex::{
- CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange,
- },
+use super::{
+ CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine,
+ byte_range::ByteRange,
};
+use crate::bitset::BitSet;
trait Flavor: Clone {
type CustomInstr: Copy + Clone + std::fmt::Debug;
@@ -362,10 +361,6 @@ impl BytecodeCompiledRegex {
}
}
-pub struct Match {
- pub submatches: Box<[Option<core::ops::Range<usize>>]>,
-}
-
type AssertionHandler<'a, F> =
Box<dyn 'a + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>>;
@@ -629,11 +624,22 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
}
}
+impl RegexEngine for BytecodeCompiledRegex {
+ type CompileError = RegexCompilationError;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ Self::try_from(pat)
+ }
+
+ fn run(&self, input: &[u8]) -> Option<Match> {
+ self.re_match(input)
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
use crate::parse::Parse;
-
fn regex(s: &str) -> BytecodeCompiledRegex {
let pat = Pattern::parse_from_bytes(s.as_bytes()).unwrap();
let compiled = BytecodeCompiledRegex::try_from(pat).unwrap();
diff --git a/src/parse/regex/byte_range.rs b/src/regex/byte_range.rs
index b7642c1..b7642c1 100644
--- a/src/parse/regex/byte_range.rs
+++ b/src/regex/byte_range.rs
diff --git a/src/parse/regex/dfa.rs b/src/regex/dfa.rs
index 35f726c..c55d99d 100644
--- a/src/parse/regex/dfa.rs
+++ b/src/regex/dfa.rs
@@ -1,7 +1,10 @@
use core::fmt;
use std::collections::{BinaryHeap, HashMap, HashSet};
+use crate::regex::{Match, RegexEngine, enfa::EnfaTranslationError};
+
use super::{
+ Pattern,
byte_range::ByteRange,
enfa::{ENFA, MultiState},
};
@@ -88,6 +91,38 @@ impl From<ENFA> for DFA {
}
}
+#[derive(Clone, Debug)]
+pub enum DFACompileError {
+ SubmatchesNotSupported,
+ #[allow(unused)]
+ NFAError(EnfaTranslationError),
+}
+
+impl RegexEngine for DFA {
+ type CompileError = DFACompileError;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ match ENFA::try_from(pat) {
+ Ok(nfa) => {
+ if nfa.has_submatches {
+ Err(DFACompileError::SubmatchesNotSupported)
+ } else {
+ Ok(Self::from(nfa))
+ }
+ }
+ Err(e) => Err(DFACompileError::NFAError(e)),
+ }
+ }
+
+ fn run(&self, input: &[u8]) -> Option<super::Match> {
+ if self.matches(input) {
+ Some(Match::new_empty())
+ } else {
+ None
+ }
+ }
+}
+
impl DFA {
pub fn matches(&self, x: &[u8]) -> bool {
let mut state = self.start;
diff --git a/src/parse/regex/enfa.rs b/src/regex/enfa.rs
index dd4cf6e..8392642 100644
--- a/src/parse/regex/enfa.rs
+++ b/src/regex/enfa.rs
@@ -3,16 +3,14 @@ use std::{
hash::{DefaultHasher, Hash, Hasher},
};
-use crate::parse::regex::{LookDirection, LookPolarity};
-
-use super::Pattern;
-use super::byte_range::ByteRange;
+use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange};
/// NFA with epsilon transitions
#[derive(Clone)]
#[allow(clippy::upper_case_acronyms)]
pub struct ENFA {
pub states: Vec<EState>,
+ pub has_submatches: bool,
}
fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> {
@@ -64,10 +62,11 @@ mod product_tests {
fn empty_outer_vector() {
let x: Vec<Vec<i32>> = vec![];
- let out = cartesian_product(x);
+ let out: Vec<Vec<i32>> = cartesian_product(x);
+ let r: Vec<Vec<i32>> = vec![vec![]];
// One empty combination.
- assert_eq!(out, vec![vec![]]);
+ assert_eq!(out, r);
}
#[test]
@@ -426,13 +425,14 @@ impl ENFA {
impl ENFA {
fn looping(self) -> Self {
+ let has_submatches = self.has_submatches;
let mut states = vec![EState::start()];
states.append(&mut self.shift(1));
let len = states.len();
states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]);
states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]);
states.push(EState::terminal());
- Self { states }
+ Self { states, has_submatches }
}
fn repeat(self, times: usize) -> Self {
@@ -457,11 +457,14 @@ impl ENFA {
if nfas.is_empty() {
return Self {
states: vec![EState::terminal()],
+ has_submatches: false,
};
}
+ let mut has_submatches = false;
let mut states: Vec<EState> = Vec::new();
for nfa in nfas.into_iter() {
+ has_submatches = has_submatches || nfa.has_submatches;
let len = states.len();
let mut ns = nfa.shift(len);
if let Some(n) = states.last_mut() {
@@ -474,7 +477,7 @@ impl ENFA {
let len = states.len();
states[len - 1].accept = Acceptance::Accept;
- Self { states }
+ Self { states, has_submatches }
}
}
@@ -635,7 +638,7 @@ impl EState {
}
}
-#[derive(Debug)]
+#[derive(Clone, Debug)]
pub enum EnfaTranslationError {
CharacterClassNotSupported,
AssertionsNotSupported,
@@ -656,6 +659,7 @@ impl TryFrom<Pattern> for ENFA {
},
EState::terminal(),
],
+ has_submatches: false,
},
Pattern::CharacterClass(_) => {
return Err(EnfaTranslationError::CharacterClassNotSupported);
@@ -667,7 +671,9 @@ impl TryFrom<Pattern> for ENFA {
.collect::<Result<_, _>>()?;
let mut states = vec![EState::start()];
let mut ends = vec![];
+ let mut has_submatches = false;
for nfa in nfas.into_iter() {
+ has_submatches = has_submatches || nfa.has_submatches;
let len = states.len();
states[0].trans.insert(Transition::epsilon(len));
states.append(&mut (nfa.shift(len)));
@@ -678,7 +684,7 @@ impl TryFrom<Pattern> for ENFA {
let last = states.len() - 1;
states[end].trans.insert(Transition::epsilon(last));
}
- Self { states }
+ Self { states, has_submatches }
}
Pattern::Concat(seq) => {
let nfas: Vec<Self> = seq
@@ -702,6 +708,7 @@ impl TryFrom<Pattern> for ENFA {
}
Pattern::Nothing => Self {
states: vec![EState::terminal()],
+ has_submatches: false,
},
Pattern::Assertion(dir, polarity, pat) => {
if dir == LookDirection::Behind {
@@ -722,11 +729,12 @@ impl TryFrom<Pattern> for ENFA {
});
states.append(&mut regex);
states.push(EState::terminal());
- Self { states }
+ Self { states, has_submatches: false, }
}
Pattern::Submatch(pat) => {
- // NFA does not track submatches, so we are skipping it.
- Self::try_from(*pat)?
+ let mut this = Self::try_from(*pat)?;
+ this.has_submatches = true;
+ this
}
})
}
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
new file mode 100644
index 0000000..be3026f
--- /dev/null
+++ b/src/regex/mod.rs
@@ -0,0 +1,344 @@
+use crate::parse::Parse;
+
+pub mod bc;
+mod byte_range;
+pub mod dfa;
+pub mod enfa;
+pub mod simple;
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
+pub enum LookDirection {
+ Ahead,
+ Behind,
+}
+
+impl LookDirection {
+ pub fn reverse(self) -> Self {
+ match self {
+ Self::Ahead => Self::Behind,
+ Self::Behind => Self::Ahead,
+ }
+ }
+}
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
+pub enum LookPolarity {
+ Positive,
+ Negative,
+}
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
+pub enum CharacterClass {
+ Everything,
+ Nothing,
+ Whitespace,
+ Alphabetic,
+ Alphanumeric,
+}
+
+impl CharacterClass {
+ pub fn matches(self, byte: u8) -> bool {
+ match self {
+ CharacterClass::Everything => true,
+ CharacterClass::Nothing => false,
+ CharacterClass::Whitespace => byte.is_ascii_whitespace(),
+ CharacterClass::Alphabetic => byte.is_ascii_alphabetic(),
+ CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(),
+ }
+ }
+}
+
+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub enum Pattern {
+ Byte(u8),
+ Range(u8, u8),
+ CharacterClass(CharacterClass),
+ Alt(Vec<Pattern>),
+ Concat(Vec<Pattern>),
+ Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
+ Assertion(LookDirection, LookPolarity, Box<Pattern>),
+ Submatch(Box<Pattern>),
+ Nothing,
+}
+
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum ByteConsumption {
+ Bounded(usize),
+ Unbounded,
+}
+
+impl ByteConsumption {
+ pub fn zero() -> Self {
+ Self::Bounded(0)
+ }
+ pub fn one() -> Self {
+ Self::Bounded(1)
+ }
+}
+
+impl Ord for ByteConsumption {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ match (self, other) {
+ (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b),
+ (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less,
+ (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater,
+ (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal,
+ }
+ }
+}
+impl PartialOrd for ByteConsumption {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl std::ops::Add for ByteConsumption {
+ type Output = Self;
+
+ fn add(self, rhs: Self) -> Self::Output {
+ if let Self::Bounded(a) = self
+ && let Self::Bounded(b) = rhs
+ {
+ Self::Bounded(a + b)
+ } else {
+ Self::Unbounded
+ }
+ }
+}
+
+impl std::ops::Mul<usize> for ByteConsumption {
+ type Output = Self;
+
+ fn mul(self, rhs: usize) -> Self::Output {
+ match self {
+ Self::Bounded(x) => Self::Bounded(x * rhs),
+ Self::Unbounded => Self::Unbounded,
+ }
+ }
+}
+
+impl std::iter::Sum for ByteConsumption {
+ fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+ iter.fold(Self::zero(), |a, b| a + b)
+ }
+}
+
+impl Pattern {
+ pub fn max_byte_consumption(&self) -> ByteConsumption {
+ match self {
+ Pattern::Byte(_) => ByteConsumption::one(),
+ Pattern::Range(_, _) => ByteConsumption::one(),
+ Pattern::CharacterClass(_) => ByteConsumption::one(),
+ Pattern::Alt(patterns) => patterns
+ .iter()
+ .map(Self::max_byte_consumption)
+ .max()
+ .unwrap_or(ByteConsumption::zero()),
+ Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(),
+ Pattern::Rep(pattern, _, Some(max_reps), _) => {
+ pattern.max_byte_consumption() * (*max_reps as usize)
+ }
+ Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded,
+ Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
+ Pattern::Nothing => ByteConsumption::zero(),
+ Pattern::Submatch(pat) => pat.max_byte_consumption(),
+ }
+ }
+
+ pub fn reverse(self) -> Self {
+ use Pattern::*;
+ match self {
+ Byte(_) | Nothing | Range(..) | CharacterClass(_) => self,
+ Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()),
+ Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
+ Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy),
+ Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
+ Submatch(pat) => Submatch(Box::new(pat.reverse())),
+ }
+ }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum GreedyBehavior {
+ Greedy,
+ NonGreedy,
+}
+
+pub enum CompiledPattern {
+ Dfa(dfa::DFA),
+ Bytecode(bc::BytecodeCompiledRegex),
+ Anything(simple::Anything),
+ Nothing(simple::Nothing),
+ Exact(simple::Exact),
+}
+
+impl CompiledPattern {
+ fn is_dfa(&self) -> bool {
+ matches!(self, Self::Dfa(_))
+ }
+ fn is_bytecode(&self) -> bool {
+ matches!(self, Self::Bytecode(_))
+ }
+ fn is_simple(&self) -> bool {
+ matches!(self, Self::Anything(_) | Self::Nothing(_) | Self::Exact(_))
+ }
+}
+
+#[derive(PartialEq, Eq, Debug)]
+pub struct Match {
+ pub submatches: Box<[Option<core::ops::Range<usize>>]>,
+}
+
+impl Match {
+ pub fn new_empty() -> Self {
+ Self {
+ submatches: [].into(),
+ }
+ }
+}
+
+pub trait RegexEngine: Sized {
+ type CompileError: std::fmt::Debug + Clone;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError>;
+
+ fn run(&self, input: &[u8]) -> Option<Match>;
+
+ fn matches(&self, input: &[u8]) -> bool {
+ self.run(input).is_some()
+ }
+}
+
+impl RegexEngine for CompiledPattern {
+ type CompileError = bc::RegexCompilationError;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ if let Ok(c) = simple::Anything::compile(pat.clone()) {
+ Ok(Self::Anything(c))
+ } else if let Ok(c) = simple::Nothing::compile(pat.clone()) {
+ Ok(Self::Nothing(c))
+ } else if let Ok(c) = simple::Exact::compile(pat.clone()) {
+ Ok(Self::Exact(c))
+ } else if let Ok(c) = dfa::DFA::compile(pat.clone()) {
+ Ok(Self::Dfa(c))
+ } else {
+ bc::BytecodeCompiledRegex::compile(pat).map(Self::Bytecode)
+ }
+ }
+
+ fn run(&self, input: &[u8]) -> Option<Match> {
+ match self {
+ CompiledPattern::Dfa(x) => x.run(input),
+ CompiledPattern::Bytecode(x) => x.run(input),
+ CompiledPattern::Anything(x) => x.run(input),
+ CompiledPattern::Nothing(x) => x.run(input),
+ CompiledPattern::Exact(x) => x.run(input),
+ }
+ }
+}
+
+macro_rules! all_engines {
+ ($ty_name:ident, $($x:ident : $ty:ty,)*) => {
+ pub struct $ty_name {
+ $($x: Option<$ty>,)*
+ }
+ impl RegexEngine for $ty_name {
+ type CompileError = ();
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ let x = Self {
+ $($x: RegexEngine::compile(pat.clone()).ok(),)*
+ };
+ if $(x.$x.is_none())&&* {
+ Err(())
+ } else {
+ Ok(x)
+ }
+ }
+
+ fn run(&self, input: &[u8]) -> Option<Match> {
+ $(let $x = self.$x.as_ref().map(|x| x.run(input));)*
+ let mut result = None;
+ $(
+ if let Some(res) = $x {
+ if let Some(result) = result {
+ assert_eq!(res, result, concat!("engine ", stringify!($x), " does not agree with previously run engines."));
+ }
+ result = Some(res)
+ }
+ )*
+ result.unwrap()
+ }
+ }
+ }
+}
+
+all_engines!(
+ AllEngines,
+ dfa: dfa::DFA,
+ bc: bc::BytecodeCompiledRegex,
+ any: simple::Anything,
+ nothing: simple::Nothing,
+ exact: simple::Exact,
+);
+
+impl Pattern {
+ pub fn try_compile(self) -> Result<CompiledPattern, bc::RegexCompilationError> {
+ CompiledPattern::compile(self)
+ }
+}
+
+#[cfg(test)]
+macro_rules! regex_matches {
+ ($regex:literal, $match:literal, $true:literal) => {
+ assert_eq!(
+ <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes())
+ .unwrap()
+ .try_compile()
+ .unwrap()
+ .matches($match.as_bytes()),
+ $true
+ )
+ };
+}
+
+#[test]
+fn foo_matches_foo() {
+ regex_matches!("foo", "foo", true);
+}
+
+#[test]
+fn dot_star_is_simple() {
+ let x = Pattern::parse_from_bytes(b".*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_simple());
+}
+
+#[test]
+fn match_is_bytecode() {
+ let x = Pattern::parse_from_bytes(b".*(ele.*phant).*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_bytecode());
+}
+
+#[test]
+fn simple_word_is_exact() {
+ let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_simple());
+}
+
+#[test]
+fn no_match_is_dfa() {
+ let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_dfa());
+}
diff --git a/src/regex/simple.rs b/src/regex/simple.rs
new file mode 100644
index 0000000..00bc9b4
--- /dev/null
+++ b/src/regex/simple.rs
@@ -0,0 +1,125 @@
+use crate::regex::CharacterClass;
+
+use super::{Match, Pattern, RegexEngine};
+
+fn empty_match() -> Option<Match> {
+ Some(Match {
+ submatches: [].into(),
+ })
+}
+
+pub struct Anything;
+
+#[derive(Debug, Clone)]
+pub struct NotASimpleWildcard;
+
+impl RegexEngine for Anything {
+ type CompileError = NotASimpleWildcard;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ if let Pattern::Rep(pat, 0, None, _) = pat
+ && let Pattern::CharacterClass(CharacterClass::Everything) = *pat
+ {
+ Ok(Anything)
+ } else {
+ Err(NotASimpleWildcard)
+ }
+ }
+
+ fn run(&self, _input: &[u8]) -> Option<Match> {
+ empty_match()
+ }
+}
+
+pub struct Nothing;
+#[derive(Debug, Clone)]
+pub struct NotASimpleNothing;
+
+impl RegexEngine for Nothing {
+ type CompileError = NotASimpleNothing;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ match pat {
+ Pattern::Range(a, b) if a > b => Ok(Nothing),
+ Pattern::CharacterClass(CharacterClass::Nothing) => Ok(Nothing),
+ Pattern::Alt(pats) => {
+ let all_impossible = pats.into_iter().map(Self::compile).all(|p| p.is_ok());
+ if all_impossible {
+ Ok(Nothing)
+ } else {
+ Err(NotASimpleNothing)
+ }
+ }
+ Pattern::Concat(pats) => {
+ if let Some(pat) = pats.into_iter().next() {
+ Self::compile(pat)
+ } else {
+ Err(NotASimpleNothing)
+ }
+ }
+ Pattern::Rep(_, x, Some(y), _) if y < x => Ok(Nothing),
+ Pattern::Rep(_, 0, None, _) => Err(NotASimpleNothing),
+ Pattern::Rep(pat, _gt_0, _, _) => Self::compile(*pat),
+ Pattern::Submatch(pat) => Self::compile(*pat),
+ _ => Err(NotASimpleNothing),
+ }
+ }
+
+ fn run(&self, _input: &[u8]) -> Option<Match> {
+ None
+ }
+}
+
+pub struct Exact {
+ bytes: Vec<u8>,
+}
+
+const MEM_LIMIT: usize = 25_000;
+
+#[derive(Debug, Clone)]
+pub struct NotSimplyAString;
+
+fn ce(pat: Pattern) -> Option<Vec<u8>> {
+ match pat {
+ Pattern::Byte(b) => Some(vec![b]),
+ Pattern::Concat(patterns) => {
+ let mut pats = patterns.into_iter().map(ce).collect::<Option<Vec<_>>>()?;
+ let mut out = Vec::new();
+ for p in pats.iter_mut() {
+ out.append(p);
+ }
+ Some(out)
+ }
+ Pattern::Rep(pat, min, Some(max), _) if min == max => {
+ if let Some(bytes) = ce(*pat)
+ && bytes.len() * (min as usize) < MEM_LIMIT
+ {
+ Some(bytes.repeat(min as usize))
+ } else {
+ None
+ }
+ }
+ Pattern::Submatch(_) => None, // TODO: submatches could be stored as constant offsets
+ Pattern::Nothing => Some(Vec::new()),
+ _ => None,
+ }
+}
+
+impl RegexEngine for Exact {
+ type CompileError = NotSimplyAString;
+
+ fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
+ match ce(pat) {
+ Some(bytes) => Ok(Self { bytes }),
+ None => Err(NotSimplyAString),
+ }
+ }
+
+ fn run(&self, input: &[u8]) -> Option<Match> {
+ if input == self.bytes {
+ empty_match()
+ } else {
+ None
+ }
+ }
+}
diff --git a/src/run/builtin.rs b/src/run/builtin.rs
index 87a0be0..45d33e2 100644
--- a/src/run/builtin.rs
+++ b/src/run/builtin.rs
@@ -1039,9 +1039,9 @@ impl Builtin for pish_theme {
#[cfg(debug_assertions)]
mod dbg {
- use crate::parse::regex::{dfa::DFA, enfa::ENFA};
+ use crate::regex::{dfa::DFA, enfa::ENFA};
-use super::*;
+ use super::*;
#[derive(Copy, Clone)]
pub struct debug;
@@ -1107,7 +1107,7 @@ use super::*;
fn name(&self) -> &str {
"case_match"
}
-
+
fn io(
&self,
_session: Arc<Mutex<Session>>,
@@ -1115,12 +1115,12 @@ use super::*;
_stdin: &mut dyn Read,
stdout: &mut dyn Write,
) -> Result {
- let regex = match crate::parse::regex::Pattern::parse_from_bytes(&args[0]) {
+ let regex = match crate::regex::Pattern::parse_from_bytes(&args[0]) {
Ok(r) => r,
Err(e) => {
writeln!(stdout, "parse error: {e:?}")?;
return Err(Error::Exit(1));
- },
+ }
};
let nfa = match ENFA::try_from(regex) {
@@ -1141,4 +1141,4 @@ use super::*;
}
#[cfg(debug_assertions)]
-pub use dbg::{debug, re, case_match};
+pub use dbg::{case_match, debug, re};
diff --git a/src/run/mod.rs b/src/run/mod.rs
index c730272..64522d9 100644
--- a/src/run/mod.rs
+++ b/src/run/mod.rs
@@ -1,3 +1,4 @@
+use crate::regex::RegexEngine;
use crate::rw::*;
use std::collections::HashMap;
use std::path::PathBuf;
diff --git a/tests/common.rs b/tests/common.rs
index ba4b292..0523906 100644
--- a/tests/common.rs
+++ b/tests/common.rs
@@ -1,9 +1,8 @@
#![allow(unused)]
-use pish::parse::{
- Parse,
- regex::{CompiledPattern, Pattern, bc::BytecodeCompiledRegex},
-};
+use pish::parse::Parse;
+use pish::regex::{AllEngines, Pattern};
+pub use pish::regex::RegexEngine;
pub fn test_case(_name: &str, script: &[u8], expected_output: &[u8]) {
use pish::parse::Parse;
@@ -30,28 +29,7 @@ pub fn test_case(_name: &str, script: &[u8], expected_output: &[u8]) {
);
}
-pub struct MultiTestCompiledPattern {
- dfa: CompiledPattern,
- vm: BytecodeCompiledRegex,
-}
-
-impl MultiTestCompiledPattern {
- pub fn matches(&self, string: impl Clone + AsRef<[u8]>) -> bool {
- let dfa_result = self.dfa.matches(string.clone());
- let vm_result = self.vm.matches(string.as_ref());
- assert_eq!(dfa_result, vm_result);
- dfa_result
- }
-}
-
-pub fn regex(pat: &str) -> MultiTestCompiledPattern {
- let parsed =
- Pattern::parse_from_bytes(pat.as_bytes()).expect(&format!("pattern {pat} does not parse"));
- let dfa = parsed
- .clone()
- .try_compile()
- .expect(&format!("pattern {pat} does not compile to DFA"));
- let vm = BytecodeCompiledRegex::try_from(parsed.clone())
- .expect(&format!("pattern {pat} does not compile to VM"));
- MultiTestCompiledPattern { dfa, vm }
+pub fn regex(pat: &str) -> impl RegexEngine {
+ let parsed = Pattern::parse_from_bytes(pat.as_bytes()).expect(&format!("pattern {pat} does not parse"));
+ AllEngines::compile(parsed).unwrap()
}
diff --git a/tests/regex-gen.sh b/tests/regex-gen.sh
index 8e2f0c8..ef5245a 100755
--- a/tests/regex-gen.sh
+++ b/tests/regex-gen.sh
@@ -8,7 +8,7 @@ cat <<"EOF"
//! Generated by `tests/regex-gen.sh` -- do not modify directly.
mod common;
-use common::regex;
+use common::{regex, RegexEngine};
EOF
@@ -25,12 +25,12 @@ while IFS= read -r line; do
jq -r --arg pat "$pattern" '
.accepts[]
- | " assert!(re.matches(r#\"\(.)\"#), \"pattern: \($pat) | expected accept: \(. )\");"
+ | " assert!(re.matches(br#\"\(.)\"#), \"pattern: \($pat) | expected accept: \(. )\");"
' <<<"$line"
jq -r --arg pat "$pattern" '
.rejects[]
- | " assert!(!re.matches(r#\"\(.)\"#), \"pattern: \($pat) | expected reject: \(. )\");"
+ | " assert!(!re.matches(br#\"\(.)\"#), \"pattern: \($pat) | expected reject: \(. )\");"
' <<<"$line"
echo "}"