diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
| commit | 53980774c327675e886179c0a2c140744dcf9b95 (patch) | |
| tree | ca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/parse/regex.rs | |
| parent | 75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff) | |
| download | pish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz | |
special cased regex for performance
Diffstat (limited to 'src/parse/regex.rs')
| -rw-r--r-- | src/parse/regex.rs | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/src/parse/regex.rs b/src/parse/regex.rs new file mode 100644 index 0000000..e5329e8 --- /dev/null +++ b/src/parse/regex.rs @@ -0,0 +1,214 @@ +use super::{Cursor, OtherHighlights, Parse, ParseError, Result}; +use crate::regex::{GreedyBehavior, Pattern, LookDirection, LookPolarity}; + +const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; +fn is_symbol(x: u8) -> bool { + SYMBOLS.contains(&x) +} + +impl Parse for Pattern { + fn parse(b: &mut Cursor<'_>) -> super::Result<Self> { + let begin = b.loc(); + let result = parse0(b); + if result.is_ok() { + b.highlight_from(begin, OtherHighlights::Regex); + } + result + } +} + +fn parse0(s: &mut Cursor<'_>) -> Result<Pattern> { + parse_alt(s) +} + +fn parse_alt(s: &mut Cursor<'_>) -> Result<Pattern> { + let mut seqs = vec![]; + loop { + let seq = parse_seq(s)?; + seqs.push(seq); + let begin = s.loc(); + if s.has() && s.peek() == b'|' { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + } else { + break; + } + } + + Ok(match seqs.len() { + 0 => Pattern::Nothing, + 1 => seqs.into_iter().next().unwrap(), + _ => Pattern::Alt(seqs), + }) +} + +fn parse_seq(s: &mut Cursor<'_>) -> Result<Pattern> { + let mut reps = vec![]; + loop { + let rep = parse_rep(s)?; + if rep != Pattern::Nothing { + reps.push(rep); + } else { + break; + } + } + + Ok(match reps.len() { + 0 => Pattern::Nothing, + 1 => reps.into_iter().next().unwrap(), + _ => Pattern::Concat(reps), + }) +} + +fn parse_rep(s: &mut Cursor<'_>) -> Result<Pattern> { + let atom = parse_atom(s)?; + + if atom == Pattern::Nothing { + return Ok(atom); + } + + if !s.has() { + return Ok(atom); + } + + let begin = s.loc(); + + let rep = match s.peek() { + b'*' => Some((0, None)), + b'+' => Some((1, None)), + b'?' => Some((0, Some(1))), + _ => None, + }; + + if let Some((min_rep, max_rep)) = rep { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + let greed = if s.has() && s.peek() == b'?' { + s.adv(); + GreedyBehavior::NonGreedy + } else { + GreedyBehavior::Greedy + }; + + Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed)) + } else { + Ok(atom) + } +} + +fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { + if !s.has() { + return Ok(Pattern::Nothing); + } + + let begin = s.loc(); + + match s.peek() { + b'[' => { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + let mut ranges = Vec::new(); + loop { + if !s.has() { + return Err(ParseError::Eof); + } + + let begin = s.loc(); + let tok = s.adv(); + + if tok == b']' { + if ranges.is_empty() { + todo!("error handling for empty alternative list"); + } + s.highlight_from(begin, OtherHighlights::RegexSymbol); + return Ok(Pattern::Alt(ranges)); + } + + let begin = s.loc(); + if s.has() && s.peek() == b'-' { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if !s.has() { + return Err(ParseError::Eof); + } + let tok2 = s.peek(); + + if tok2 == b']' { + ranges.push(Pattern::Byte(tok)); + ranges.push(Pattern::Byte(b'-')); + } else if is_symbol(tok2) { + return Err(ParseError::Unknown(tok2)); + } else { + s.adv(); + ranges.push(Pattern::Range(tok, tok2)); + } + } else { + ranges.push(Pattern::Byte(tok)); + } + } + } + b'(' => { + s.adv(); + + let mut assertion = None; + if s.buf.starts_with(b"?=") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?!") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); + } else if s.buf.starts_with(b"?<=") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?<!") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Negative)); + } + + s.highlight_from(begin, OtherHighlights::RegexSymbol); + let inner = parse0(s)?; + if !s.has() { + return Err(ParseError::Eof); + } + let begin = s.loc(); + if s.adv() != b')' { + return Err(ParseError::Expected(')')); + } + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if let Some((dir, pol)) = assertion { + Ok(Pattern::Assertion(dir, pol, Box::new(inner))) + } else { + Ok(Pattern::Submatch(Box::new(inner))) + } + } + b'.' => { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + Ok(Pattern::Range(0, 127)) + } + b'\\' => { + s.adv(); + if s.has() { + let escaped = s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if is_symbol(escaped) { + Ok(Pattern::Byte(escaped)) + } else { + // TODO interpret \w and others + Err(ParseError::Unknown(escaped)) + } + } else { + Err(ParseError::Eof) + } + } + x if is_symbol(x) => Ok(Pattern::Nothing), + ch => { + s.adv(); + Ok(Pattern::Byte(ch)) + } + } +} |
