aboutsummaryrefslogtreecommitdiffstats
path: root/src/regex/mod.rs
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-06 12:34:22 +0200
committerJonas Maier <jonas@x77.dev>2026-06-06 12:34:22 +0200
commita2e9e8647daa2622cf55a047c329027fcfc49bf8 (patch)
treee59787ebbd6a71735190d222a55edc351927d8be /src/regex/mod.rs
parent53980774c327675e886179c0a2c140744dcf9b95 (diff)
downloadpish-a2e9e8647daa2622cf55a047c329027fcfc49bf8.tar.gz
some small opt
Diffstat (limited to 'src/regex/mod.rs')
-rw-r--r--src/regex/mod.rs161
1 files changed, 104 insertions, 57 deletions
diff --git a/src/regex/mod.rs b/src/regex/mod.rs
index be3026f..2c9f3d1 100644
--- a/src/regex/mod.rs
+++ b/src/regex/mod.rs
@@ -1,5 +1,3 @@
-use crate::parse::Parse;
-
pub mod bc;
mod byte_range;
pub mod dfa;
@@ -28,7 +26,7 @@ pub enum LookPolarity {
}
#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum CharacterClass {
+pub enum Class {
Everything,
Nothing,
Whitespace,
@@ -36,14 +34,14 @@ pub enum CharacterClass {
Alphanumeric,
}
-impl CharacterClass {
+impl Class {
pub fn matches(self, byte: u8) -> bool {
match self {
- CharacterClass::Everything => true,
- CharacterClass::Nothing => false,
- CharacterClass::Whitespace => byte.is_ascii_whitespace(),
- CharacterClass::Alphabetic => byte.is_ascii_alphabetic(),
- CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(),
+ Class::Everything => true,
+ Class::Nothing => false,
+ Class::Whitespace => byte.is_ascii_whitespace(),
+ Class::Alphabetic => byte.is_ascii_alphabetic(),
+ Class::Alphanumeric => byte.is_ascii_alphanumeric(),
}
}
}
@@ -52,7 +50,7 @@ impl CharacterClass {
pub enum Pattern {
Byte(u8),
Range(u8, u8),
- CharacterClass(CharacterClass),
+ CharacterClass(Class),
Alt(Vec<Pattern>),
Concat(Vec<Pattern>),
Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
@@ -156,6 +154,51 @@ impl Pattern {
Submatch(pat) => Submatch(Box::new(pat.reverse())),
}
}
+
+ pub fn simplify(self) -> Self {
+ use Pattern::*;
+
+ match self {
+ Range(a, b) if a > b => CharacterClass(Class::Nothing),
+ Range(a, b) if a == b => Byte(a),
+ Alt(pats) => {
+ let pats: Vec<_> = pats
+ .into_iter()
+ .map(Pattern::simplify)
+ .filter(|p| !matches!(p, CharacterClass(Class::Nothing)))
+ .collect();
+ match pats.len() {
+ 0 => CharacterClass(Class::Nothing),
+ 1 => pats.into_iter().next().unwrap(),
+ _ => Alt(pats)
+ }
+ }
+ Concat(pats) => {
+ let mut out = Vec::new();
+ for pat in pats {
+ let pat = pat.simplify();
+ match pat {
+ Concat(mut pats) => out.append(&mut pats),
+ Nothing => (),
+ CharacterClass(Class::Nothing) => return CharacterClass(Class::Nothing),
+ x => out.push(x),
+ }
+ }
+ match out.len() {
+ 0 => Nothing,
+ 1 => out.into_iter().next().unwrap(),
+ _ => Concat(out),
+ }
+ }
+ Rep(_, 0, Some(0), _) => Nothing,
+ Rep(_, min, Some(max), _) if min > max => CharacterClass(Class::Nothing),
+ Rep(pat, 1, Some(1), _) => pat.simplify(),
+ Rep(pat, min, max, greed) => Rep(Box::new(pat.simplify()), min, max, greed),
+ Assertion(dir, pol, pattern) => Assertion(dir, pol, Box::new(pattern.simplify())),
+ Submatch(pattern) => Submatch(Box::new(pattern.simplify())),
+ CharacterClass(_) | Range(_, _) | Nothing | Byte(_) => self,
+ }
+ }
}
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
@@ -172,6 +215,7 @@ pub enum CompiledPattern {
Exact(simple::Exact),
}
+#[allow(unused)]
impl CompiledPattern {
fn is_dfa(&self) -> bool {
matches!(self, Self::Dfa(_))
@@ -289,56 +333,59 @@ impl Pattern {
}
#[cfg(test)]
-macro_rules! regex_matches {
- ($regex:literal, $match:literal, $true:literal) => {
- assert_eq!(
- <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes())
- .unwrap()
- .try_compile()
- .unwrap()
- .matches($match.as_bytes()),
- $true
- )
- };
-}
+mod tests {
+ use super::*;
+ use crate::parse::Parse;
+
+ macro_rules! regex_matches {
+ ($regex:literal, $match:literal, $true:literal) => {
+ assert_eq!(
+ <Pattern as crate::parse::Parse>::parse_from_bytes($regex.as_bytes())
+ .unwrap()
+ .try_compile()
+ .unwrap()
+ .matches($match.as_bytes()),
+ $true
+ )
+ };
+ }
-#[test]
-fn foo_matches_foo() {
- regex_matches!("foo", "foo", true);
-}
+ #[test]
+ fn foo_matches_foo() {
+ regex_matches!("foo", "foo", true);
+ }
-#[test]
-fn dot_star_is_simple() {
- let x = Pattern::parse_from_bytes(b".*")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_simple());
-}
+ #[test]
+ fn dot_star_is_simple() {
+ let x = Pattern::parse_from_bytes(b".*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_simple());
+ }
-#[test]
-fn match_is_bytecode() {
- let x = Pattern::parse_from_bytes(b".*(ele.*phant).*")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_bytecode());
-}
+ #[test]
+ fn match_is_bytecode() {
+ let x = Pattern::parse_from_bytes(b".*(ele.*phant).*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_bytecode());
+ }
-#[test]
-fn simple_word_is_exact() {
- let x = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_simple());
-}
+ #[test]
+ fn simple_word_is_exact() {
+ let parsed = Pattern::parse_from_bytes(b"Gnu[ ]plus[ ]Linux").unwrap();
+ let compiled = parsed.clone().try_compile().unwrap();
+ assert!(compiled.is_simple(), "{parsed:?}");
+ }
-#[test]
-fn no_match_is_dfa() {
- let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*")
- .unwrap()
- .try_compile()
- .unwrap();
- assert!(x.is_dfa());
+ #[test]
+ fn no_match_is_dfa() {
+ let x = Pattern::parse_from_bytes(b".*Gnu.*plus.*Linux.*")
+ .unwrap()
+ .try_compile()
+ .unwrap();
+ assert!(x.is_dfa());
+ }
}