aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/parse/regex.rs13
-rw-r--r--src/regex/bc.rs12
-rw-r--r--src/regex/enfa.rs5
-rw-r--r--src/regex/mod.rs161
-rw-r--r--src/regex/simple.rs6
5 files changed, 126 insertions, 71 deletions
diff --git a/src/parse/regex.rs b/src/parse/regex.rs
index e5329e8..22c3b9c 100644
--- a/src/parse/regex.rs
+++ b/src/parse/regex.rs
@@ -1,5 +1,5 @@
use super::{Cursor, OtherHighlights, Parse, ParseError, Result};
-use crate::regex::{GreedyBehavior, Pattern, LookDirection, LookPolarity};
+use crate::regex::{Class, GreedyBehavior, LookDirection, LookPolarity, Pattern};
const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
fn is_symbol(x: u8) -> bool {
@@ -10,10 +10,13 @@ impl Parse for Pattern {
fn parse(b: &mut Cursor<'_>) -> super::Result<Self> {
let begin = b.loc();
let result = parse0(b);
- if result.is_ok() {
- b.highlight_from(begin, OtherHighlights::Regex);
+ match result {
+ Ok(re) => {
+ b.highlight_from(begin, OtherHighlights::Regex);
+ Ok(re.simplify())
+ }
+ Err(e) => Err(e),
}
- result
}
}
@@ -187,7 +190,7 @@ fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
b'.' => {
s.adv();
s.highlight_from(begin, OtherHighlights::RegexSymbol);
- Ok(Pattern::Range(0, 127))
+ Ok(Pattern::CharacterClass(Class::Everything))
}
b'\\' => {
s.adv();
diff --git a/src/regex/bc.rs b/src/regex/bc.rs
index 4a79485..6e55c7d 100644
--- a/src/regex/bc.rs
+++ b/src/regex/bc.rs
@@ -1,7 +1,7 @@
use std::collections::{HashMap, VecDeque};
use super::{
- CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine,
+ Class, GreedyBehavior, LookDirection, LookPolarity, Match, Pattern, RegexEngine,
byte_range::ByteRange,
};
use crate::bitset::BitSet;
@@ -93,7 +93,7 @@ struct Assertion {
#[derive(Copy, Clone, Debug)]
enum Instr<F: Flavor> {
- Class(CharacterClass),
+ Class(Class),
Consume(ByteRange),
Jump(JumpTarget),
Fork(JumpTarget, JumpTarget),
@@ -528,7 +528,7 @@ impl<'a, F: Flavor> Compiler<'a, F> {
let begin = self.instrs.len() as JumpTarget;
self.compile(pat.clone())?;
let end = self.instrs.len() as JumpTarget;
- self.instrs.push(Instr::Class(CharacterClass::Nothing));
+ self.instrs.push(Instr::Class(Class::Nothing));
let bounds = CompiledSnippet { begin, end };
self.map.insert(pat, bounds);
Ok(bounds)
@@ -539,7 +539,7 @@ impl<'a, F: Flavor> Compiler<'a, F> {
let fork_begin = self.instrs.len() as JumpTarget;
match fork_targets.len() {
0 => {
- self.instrs[self.assertion_fork_base] = Instr::Class(CharacterClass::Nothing);
+ self.instrs[self.assertion_fork_base] = Instr::Class(Class::Nothing);
}
1 => {
self.instrs[self.assertion_fork_base] = Instr::Jump(fork_targets[0]);
@@ -566,7 +566,7 @@ impl<'a, F: Flavor> Compiler<'a, F> {
fn assertion_compiler() -> Compiler<'static, AssertionFlavor> {
let mut c = Compiler::new(|_, _, _| Err(RegexCompilationError::NestedLookaroundNotSupported));
c.rep_any_amt(
- Pattern::CharacterClass(CharacterClass::Everything),
+ Pattern::CharacterClass(Class::Everything),
GreedyBehavior::NonGreedy,
)
.expect("characterclass should always compile");
@@ -604,7 +604,7 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
});
main.compile(value)?;
let end = main.instrs.len();
- main.instrs.push(Instr::Class(CharacterClass::Nothing));
+ main.instrs.push(Instr::Class(Class::Nothing));
(end, main.instrs, main.submatch_count)
};
neg.finalize_assertion_forks();
diff --git a/src/regex/enfa.rs b/src/regex/enfa.rs
index 8392642..c740915 100644
--- a/src/regex/enfa.rs
+++ b/src/regex/enfa.rs
@@ -3,6 +3,8 @@ use std::{
hash::{DefaultHasher, Hash, Hasher},
};
+use crate::regex::Class;
+
use super::{LookDirection, LookPolarity, Pattern, byte_range::ByteRange};
/// NFA with epsilon transitions
@@ -661,6 +663,9 @@ impl TryFrom<Pattern> for ENFA {
],
has_submatches: false,
},
+ Pattern::CharacterClass(Class::Everything) => {
+ Self::try_from(Pattern::Range(0, 254))?
+ }
Pattern::CharacterClass(_) => {
return Err(EnfaTranslationError::CharacterClassNotSupported);
}
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
index be3026f..2c9f3d1 100644
--- a/src/regex/mod.rs
+++ b/src/regex/mod.rs
@@ -1,5 +1,3 @@
-use crate::parse::Parse;
-
pub mod bc;
mod byte_range;
pub mod dfa;
@@ -28,7 +26,7 @@ pub enum LookPolarity {
}
#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum CharacterClass {
+pub enum Class {
Everything,
Nothing,
Whitespace,
@@ -36,14 +34,14 @@ pub enum CharacterClass {
Alphanumeric,
}
-impl CharacterClass {
+impl Class {
pub fn matches(self, byte: u8) -> bool {
match self {
- CharacterClass::Everything => true,
- CharacterClass::Nothing => false,
- CharacterClass::Whitespace => byte.is_ascii_whitespace(),
- CharacterClass::Alphabetic => byte.is_ascii_alphabetic(),
- CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(),
+ Class::Everything => true,
+ Class::Nothing => false,
+ Class::Whitespace => byte.is_ascii_whitespace(),
+ Class::Alphabetic => byte.is_ascii_alphabetic(),
+ Class::Alphanumeric => byte.is_ascii_alphanumeric(),
}
}
}
@@ -52,7 +50,7 @@ impl CharacterClass {
pub enum Pattern {
Byte(u8),
Range(u8, u8),
- CharacterClass(CharacterClass),
+ CharacterClass(Class),
Alt(Vec<Pattern>),
Concat(Vec<Pattern>),
Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
@@ -156,6 +154,51 @@ impl Pattern {
Submatch(pat) => Submatch(Box::new(pat.reverse())),
}
}
+
+ pub fn simplify(self) -> Self {
+ use Pattern::*;
+
+ match self {
+ Range(a, b) if a > b => CharacterClass(Class::Nothing),
+ Range(a, b) if a == b => Byte(a),
+ Alt(pats) => {
+ let pats: Vec<_> = pats
+ .into_iter()
+ .map(Pattern::simplify)
+ .filter(|p| !matches!(p, CharacterClass(Class::Nothing)))
+ .collect();
+ match pats.len() {
+ 0 => CharacterClass(Class::Nothing),
+ 1 => pats.into_iter().next().unwrap(),
+ _ => Alt(pats)
+ }
+ }
+ Concat(pats) => {
+ let mut out = Vec::new();
+ for pat in pats {
+ let pat = pat.simplify();
+ match pat {
+ Concat(mut pats) => out.append(&mut pats),
+ Nothing => (),
+ CharacterClass(Class::Nothing) => return CharacterClass(Class::Nothing),
+ x => out.push(x),
+ }
+ }
+ match out.len() {
+ 0 => Nothing,
+ 1 => out.into_iter().next().unwrap(),
+ _ => Concat(out),
+ }
+ }
+ Rep(_, 0, Some(0), _) => Nothing,
+ Rep(_, min, Some(max), _) if min > max => CharacterClass(Class::Nothing),
+ Rep(pat, 1, Some(1), _) => pat.simplify(),
+ Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed),
+ Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())),
+ Submatch(pattern) => Submatch(Box::new(pattern.simplify())),
+ CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self,
+ }
+ }
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
@@ -172,6 +215,7 @@ pub enum CompiledPattern {
Exact(simple::Exact),
}
+#[allow(unused)]
impl CompiledPattern {
fn is_dfa(&self) -> bool {
matches!(self, Self::Dfa(_))
@@ -289,56 +333,59 @@ impl Pattern {
}
#[cfg(test)]
-macro_rules! regex_matches {
- ($regex:literal, $match:literal, $true:literal) => {
- assert_eq!(
- <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes())
- .unwrap()
- .try_compile()
- .unwrap()
- .matches($match.as_bytes()),
- $true
- )
- };
-}
+mod tests {
+ use super::*;
+ use crate::parse::Parse;
+
+ macro_rules! regex_matches {
+ ($regex:literal, $match:literal, $true:literal) => {
+ assert_eq!(
+ <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes())
+ .unwrap()
+ .try_compile()
+ .unwrap()
+ .matches($match.as_bytes()),
+ $true
+ )
+ };
+ }
-#[test]
-fn foo_matches_foo() {
- regex_matches!("foo", "foo", true);
-}
+ #[test]
+ fn foo_matches_foo() {
+ regex_matches!("foo", "foo", true);
+ }
-#[test]
-fn dot_star_is_simple() {
- let x = Pattern::parse_from_bytes(b".*")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_simple());
-}
+ #[test]
+ fn dot_star_is_simple() {
+ let x = Pattern::parse_from_bytes(b".*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_simple());
+ }
-#[test]
-fn match_is_bytecode() {
- let x = Pattern::parse_from_bytes(b".*(ele.*phant).*")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_bytecode());
-}
+ #[test]
+ fn match_is_bytecode() {
+ let x = Pattern::parse_from_bytes(b".*(ele.*phant).*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_bytecode());
+ }
-#[test]
-fn simple_word_is_exact() {
- let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_simple());
-}
+ #[test]
+ fn simple_word_is_exact() {
+ let parsed = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux").unwrap();
+ let compiled = parsed.clone().try_compile().unwrap();
+ assert!(compiled.is_simple(), "{parsed:?}");
+ }
-#[test]
-fn no_match_is_dfa() {
- let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_dfa());
+ #[test]
+ fn no_match_is_dfa() {
+ let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_dfa());
+ }
}
diff --git a/src/regex/simple.rs b/src/regex/simple.rs
index 00bc9b4..e75ff14 100644
--- a/src/regex/simple.rs
+++ b/src/regex/simple.rs
@@ -1,4 +1,4 @@
-use crate::regex::CharacterClass;
+use crate::regex::Class;
use super::{Match, Pattern, RegexEngine};
@@ -18,7 +18,7 @@ impl RegexEngine for Anything {
fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
if let Pattern::Rep(pat, 0, None, _) = pat
- && let Pattern::CharacterClass(CharacterClass::Everything) = *pat
+ && let Pattern::CharacterClass(Class::Everything) = *pat
{
Ok(Anything)
} else {
@@ -41,7 +41,7 @@ impl RegexEngine for Nothing {
fn compile(pat: Pattern) -> Result<Self, Self::CompileError> {
match pat {
Pattern::Range(a, b) if a > b => Ok(Nothing),
- Pattern::CharacterClass(CharacterClass::Nothing) => Ok(Nothing),
+ Pattern::CharacterClass(Class::Nothing) => Ok(Nothing),
Pattern::Alt(pats) => {
let all_impossible = pats.into_iter().map(Self::compile).all(|p| p.is_ok());
if all_impossible {