about summary refs log tree commit diff stats
path: root/src/parse/regex.rs
diff options
context:
space:
mode:
author Jonas Maier <jonas@x77.dev> 2026-06-06 12:15:52 +0200
committer Jonas Maier <jonas@x77.dev> 2026-06-06 12:15:52 +0200
commit 53980774c327675e886179c0a2c140744dcf9b95 (patch)
tree ca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/parse/regex.rs
parent 75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff)
download pish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz
special cased regex for performance
Diffstat (limited to 'src/parse/regex.rs')
-rw-r--r-- src/parse/regex.rs 214
1 files changed, 214 insertions, 0 deletions
diff --git a/src/parse/regex.rs b/src/parse/regex.rs
new file mode 100644
index 0000000..e5329e8
--- /dev/null
+++ b/src/parse/regex.rs
@@ -0,0 +1,214 @@
+use super::{Cursor, OtherHighlights, Parse, ParseError, Result};
+use crate::regex::{GreedyBehavior, Pattern, LookDirection, LookPolarity};
+
+/// Bytes that the pattern grammar reserves; note that the space character is
+/// part of the set.
+const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
+
+/// Returns `true` when `x` is one of the bytes listed in [`SYMBOLS`].
+fn is_symbol(x: u8) -> bool {
+    SYMBOLS.iter().any(|&symbol| symbol == x)
+}
+
+impl Parse for Pattern {
+    /// Parses a pattern at the cursor by delegating to [`parse0`].
+    ///
+    /// On success an `OtherHighlights::Regex` highlight is recorded, anchored
+    /// at the location where parsing began. On failure the error is returned
+    /// as-is and that highlight is not recorded.
+    fn parse(b: &mut Cursor<'_>) -> super::Result<Self> {
+        let start = b.loc();
+        let pattern = parse0(b)?;
+        b.highlight_from(start, OtherHighlights::Regex);
+        Ok(pattern)
+    }
+}
+
+/// Top-level rule of the pattern grammar.
+///
+/// It simply forwards to [`parse_alt`]; parenthesised groups call back into
+/// this function for their contents.
+fn parse0(cursor: &mut Cursor<'_>) -> Result<Pattern> {
+    parse_alt(cursor)
+}
+
+/// Parses one or more sequences separated by `|`.
+///
+/// Each `|` is highlighted as `OtherHighlights::RegexSymbol`. A single branch
+/// is returned directly; several branches are wrapped in `Pattern::Alt`.
+fn parse_alt(s: &mut Cursor<'_>) -> Result<Pattern> {
+    let mut branches = Vec::new();
+    loop {
+        branches.push(parse_seq(s)?);
+
+        let bar = s.loc();
+        if !(s.has() && s.peek() == b'|') {
+            break;
+        }
+        s.adv();
+        s.highlight_from(bar, OtherHighlights::RegexSymbol);
+    }
+
+    if branches.len() > 1 {
+        return Ok(Pattern::Alt(branches));
+    }
+    // At most one branch is left here: hand it back unwrapped.
+    Ok(branches.pop().unwrap_or(Pattern::Nothing))
+}
+
+/// Parses consecutive repetitions until [`parse_rep`] yields
+/// `Pattern::Nothing`.
+///
+/// No items gives `Pattern::Nothing`, one item is returned directly, and
+/// several items are wrapped in `Pattern::Concat`.
+fn parse_seq(s: &mut Cursor<'_>) -> Result<Pattern> {
+    let mut items = Vec::new();
+    loop {
+        let item = parse_rep(s)?;
+        if item == Pattern::Nothing {
+            break;
+        }
+        items.push(item);
+    }
+
+    if items.len() > 1 {
+        Ok(Pattern::Concat(items))
+    } else {
+        Ok(items.pop().unwrap_or(Pattern::Nothing))
+    }
+}
+
+/// Parses an atom followed by an optional quantifier.
+///
+/// Recognised quantifiers are `*` (0 or more), `+` (1 or more) and `?`
+/// (0 or 1). A `?` directly after the quantifier selects
+/// `GreedyBehavior::NonGreedy`; otherwise the repetition is
+/// `GreedyBehavior::Greedy`.
+///
+/// Returns the bare atom when no quantifier follows, and `Pattern::Nothing`
+/// when [`parse_atom`] produced nothing. Errors from [`parse_atom`] are
+/// propagated.
+fn parse_rep(s: &mut Cursor<'_>) -> Result<Pattern> {
+    let atom = parse_atom(s)?;
+
+    // Nothing to quantify: no atom was read, or the input ends right after it.
+    if atom == Pattern::Nothing || !s.has() {
+        return Ok(atom);
+    }
+
+    let begin = s.loc();
+    let (min_rep, max_rep) = match s.peek() {
+        b'*' => (0, None),
+        b'+' => (1, None),
+        b'?' => (0, Some(1)),
+        _ => return Ok(atom),
+    };
+    s.adv();
+
+    let greed = if s.has() && s.peek() == b'?' {
+        s.adv();
+        GreedyBehavior::NonGreedy
+    } else {
+        GreedyBehavior::Greedy
+    };
+
+    // Record the highlight only after the optional lazy marker has been
+    // consumed, so that the trailing `?` of `*?`, `+?` and `??` is marked as a
+    // symbol together with the quantifier itself.
+    s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+    Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed))
+}
+
+/// Parses a single atom at the cursor.
+///
+/// * `[`: a bracket class, see [`parse_class`].
+/// * `(`: a group or look-around assertion, see [`parse_group`].
+/// * `.`: `Pattern::Range(0, 127)`.
+/// * `\`: an escaped symbol, see [`parse_escape`].
+/// * any other byte for which [`is_symbol`] holds: `Pattern::Nothing`,
+///   without consuming it.
+/// * anything else: that byte as `Pattern::Byte`.
+///
+/// Returns `Pattern::Nothing` when the cursor has no input left.
+fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
+    if !s.has() {
+        return Ok(Pattern::Nothing);
+    }
+
+    match s.peek() {
+        b'[' => parse_class(s),
+        b'(' => parse_group(s),
+        b'\\' => parse_escape(s),
+        b'.' => {
+            let begin = s.loc();
+            s.adv();
+            s.highlight_from(begin, OtherHighlights::RegexSymbol);
+            Ok(Pattern::Range(0, 127))
+        }
+        // A reserved byte that does not start an atom is left in place for the
+        // caller to deal with.
+        x if is_symbol(x) => Ok(Pattern::Nothing),
+        ch => {
+            s.adv();
+            Ok(Pattern::Byte(ch))
+        }
+    }
+}
+
+/// Parses a bracket class such as `[abc]` or `[a-z0-9]`; the cursor must be
+/// positioned on the opening `[`.
+///
+/// Members are single bytes or `lo-hi` ranges and are collected into a
+/// `Pattern::Alt`. A `-` directly before the closing `]` is taken literally.
+///
+/// # Errors
+///
+/// * `ParseError::Eof` when the input ends before the closing `]`.
+/// * `ParseError::Unknown` when the class is empty (`[]`), or when the upper
+///   bound of a range is a reserved byte.
+fn parse_class(s: &mut Cursor<'_>) -> Result<Pattern> {
+    let open = s.loc();
+    s.adv();
+    s.highlight_from(open, OtherHighlights::RegexSymbol);
+
+    let mut ranges = Vec::new();
+    loop {
+        if !s.has() {
+            return Err(ParseError::Eof);
+        }
+
+        let begin = s.loc();
+        let tok = s.adv();
+
+        if tok == b']' {
+            if ranges.is_empty() {
+                // An empty class comes straight from user input, so report it
+                // as a parse error rather than panicking.
+                return Err(ParseError::Unknown(tok));
+            }
+            s.highlight_from(begin, OtherHighlights::RegexSymbol);
+            return Ok(Pattern::Alt(ranges));
+        }
+
+        let begin = s.loc();
+        if !(s.has() && s.peek() == b'-') {
+            ranges.push(Pattern::Byte(tok));
+            continue;
+        }
+        s.adv();
+        s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+        if !s.has() {
+            return Err(ParseError::Eof);
+        }
+        let tok2 = s.peek();
+
+        if tok2 == b']' {
+            // Trailing `-`: both bytes are literals. The `]` is not consumed
+            // here; the next loop iteration closes the class.
+            ranges.push(Pattern::Byte(tok));
+            ranges.push(Pattern::Byte(b'-'));
+        } else if is_symbol(tok2) {
+            return Err(ParseError::Unknown(tok2));
+        } else {
+            s.adv();
+            ranges.push(Pattern::Range(tok, tok2));
+        }
+    }
+}
+
+/// Parses a parenthesised group; the cursor must be positioned on the `(`.
+///
+/// The prefixes `?=`, `?!`, `?<=` and `?<!` turn the group into a
+/// `Pattern::Assertion` with the matching direction and polarity; without a
+/// prefix the result is a `Pattern::Submatch`.
+///
+/// # Errors
+///
+/// * `ParseError::Eof` when the input ends before the closing `)`.
+/// * `ParseError::Expected(')')` when another byte follows the group body.
+/// * Any error produced while parsing the group body.
+fn parse_group(s: &mut Cursor<'_>) -> Result<Pattern> {
+    let begin = s.loc();
+    s.adv();
+
+    let assertion = if s.buf.starts_with(b"?=") {
+        s.advance(2);
+        Some((LookDirection::Ahead, LookPolarity::Positive))
+    } else if s.buf.starts_with(b"?!") {
+        s.advance(2);
+        Some((LookDirection::Ahead, LookPolarity::Negative))
+    } else if s.buf.starts_with(b"?<=") {
+        s.advance(3);
+        Some((LookDirection::Behind, LookPolarity::Positive))
+    } else if s.buf.starts_with(b"?<!") {
+        s.advance(3);
+        Some((LookDirection::Behind, LookPolarity::Negative))
+    } else {
+        None
+    };
+
+    // The opening highlight covers the `(` together with any assertion prefix.
+    s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+    let inner = parse0(s)?;
+    if !s.has() {
+        return Err(ParseError::Eof);
+    }
+    let begin = s.loc();
+    if s.adv() != b')' {
+        return Err(ParseError::Expected(')'));
+    }
+    s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+    Ok(match assertion {
+        Some((dir, pol)) => Pattern::Assertion(dir, pol, Box::new(inner)),
+        None => Pattern::Submatch(Box::new(inner)),
+    })
+}
+
+/// Parses a backslash escape; the cursor must be positioned on the `\`.
+///
+/// Only reserved bytes (see [`is_symbol`]) may be escaped; the result is that
+/// byte as a literal `Pattern::Byte`.
+///
+/// # Errors
+///
+/// * `ParseError::Eof` when nothing follows the backslash.
+/// * `ParseError::Unknown` when the escaped byte is not a reserved byte.
+fn parse_escape(s: &mut Cursor<'_>) -> Result<Pattern> {
+    let begin = s.loc();
+    s.adv();
+    if !s.has() {
+        return Err(ParseError::Eof);
+    }
+
+    let escaped = s.adv();
+    s.highlight_from(begin, OtherHighlights::RegexSymbol);
+
+    if is_symbol(escaped) {
+        Ok(Pattern::Byte(escaped))
+    } else {
+        // TODO interpret \w and others
+        Err(ParseError::Unknown(escaped))
+    }
+}