aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse/regex/mod.rs')
-rw-r--r--src/parse/regex/mod.rs432
1 files changed, 0 insertions, 432 deletions
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
deleted file mode 100644
index 486b0d9..0000000
--- a/src/parse/regex/mod.rs
+++ /dev/null
@@ -1,432 +0,0 @@
-use crate::parse::OtherHighlights;
-
-use super::{Parse, ParseError, Result};
-
-pub mod bc;
-mod byte_range;
-pub mod dfa;
-pub mod enfa;
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum LookDirection {
- Ahead,
- Behind,
-}
-
-impl LookDirection {
- pub fn reverse(self) -> Self {
- match self {
- Self::Ahead => Self::Behind,
- Self::Behind => Self::Ahead,
- }
- }
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum LookPolarity {
- Positive,
- Negative,
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum CharacterClass {
- Everything,
- Nothing,
- Whitespace,
- Alphabetic,
- Alphanumeric,
-}
-
-impl CharacterClass {
- pub fn matches(self, byte: u8) -> bool {
- match self {
- CharacterClass::Everything => true,
- CharacterClass::Nothing => false,
- CharacterClass::Whitespace => byte.is_ascii_whitespace(),
- CharacterClass::Alphabetic => byte.is_ascii_alphabetic(),
- CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(),
- }
- }
-}
-
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum Pattern {
- Byte(u8),
- Range(u8, u8),
- CharacterClass(CharacterClass),
- Alt(Vec<Pattern>),
- Concat(Vec<Pattern>),
- Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
- Assertion(LookDirection, LookPolarity, Box<Pattern>),
- Submatch(Box<Pattern>),
- Nothing,
-}
-
-#[derive(Debug, PartialEq, Eq, Copy, Clone)]
-pub enum ByteConsumption {
- Bounded(usize),
- Unbounded,
-}
-
-impl ByteConsumption {
- pub fn zero() -> Self {
- Self::Bounded(0)
- }
- pub fn one() -> Self {
- Self::Bounded(1)
- }
-}
-
-impl Ord for ByteConsumption {
- fn cmp(&self, other: &Self) -> std::cmp::Ordering {
- match (self, other) {
- (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b),
- (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less,
- (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater,
- (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal,
- }
- }
-}
-impl PartialOrd for ByteConsumption {
- fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl std::ops::Add for ByteConsumption {
- type Output = Self;
-
- fn add(self, rhs: Self) -> Self::Output {
- if let Self::Bounded(a) = self
- && let Self::Bounded(b) = rhs
- {
- Self::Bounded(a + b)
- } else {
- Self::Unbounded
- }
- }
-}
-
-impl std::ops::Mul<usize> for ByteConsumption {
- type Output = Self;
-
- fn mul(self, rhs: usize) -> Self::Output {
- match self {
- Self::Bounded(x) => Self::Bounded(x * rhs),
- Self::Unbounded => Self::Unbounded,
- }
- }
-}
-
-impl std::iter::Sum for ByteConsumption {
- fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
- iter.fold(Self::zero(), |a, b| a + b)
- }
-}
-
-impl Pattern {
- pub fn max_byte_consumption(&self) -> ByteConsumption {
- match self {
- Pattern::Byte(_) => ByteConsumption::one(),
- Pattern::Range(_, _) => ByteConsumption::one(),
- Pattern::CharacterClass(_) => ByteConsumption::one(),
- Pattern::Alt(patterns) => patterns
- .iter()
- .map(Self::max_byte_consumption)
- .max()
- .unwrap_or(ByteConsumption::zero()),
- Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(),
- Pattern::Rep(pattern, _, Some(max_reps), _) => {
- pattern.max_byte_consumption() * (*max_reps as usize)
- }
- Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded,
- Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
- Pattern::Nothing => ByteConsumption::zero(),
- Pattern::Submatch(pat) => pat.max_byte_consumption(),
- }
- }
-
- pub fn reverse(self) -> Self {
- use Pattern::*;
- match self {
- Byte(_) | Nothing | Range(..) | CharacterClass(_) => self,
- Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()),
- Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
- Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy),
- Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
- Submatch(pat) => Submatch(Box::new(pat.reverse()))
- }
- }
-}
-
-impl Parse for Pattern {
- fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> {
- let begin = b.loc();
- let result = parse0(b);
- if result.is_ok() {
- b.highlight_from(begin, OtherHighlights::Regex);
- }
- result
- }
-}
-
-fn parse0(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- parse_alt(s)
-}
-
-fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let mut seqs = vec![];
- loop {
- let seq = parse_seq(s)?;
- seqs.push(seq);
- let begin = s.loc();
- if s.has() && s.peek() == b'|' {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- } else {
- break;
- }
- }
-
- Ok(match seqs.len() {
- 0 => Pattern::Nothing,
- 1 => seqs.into_iter().next().unwrap(),
- _ => Pattern::Alt(seqs),
- })
-}
-
-fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let mut reps = vec![];
- loop {
- let rep = parse_rep(s)?;
- if rep != Pattern::Nothing {
- reps.push(rep);
- } else {
- break;
- }
- }
-
- Ok(match reps.len() {
- 0 => Pattern::Nothing,
- 1 => reps.into_iter().next().unwrap(),
- _ => Pattern::Concat(reps),
- })
-}
-
-fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let atom = parse_atom(s)?;
-
- if atom == Pattern::Nothing {
- return Ok(atom);
- }
-
- if !s.has() {
- return Ok(atom);
- }
-
- let begin = s.loc();
-
- let rep = match s.peek() {
- b'*' => Some((0, None)),
- b'+' => Some((1, None)),
- b'?' => Some((0, Some(1))),
- _ => None,
- };
-
- if let Some((min_rep, max_rep)) = rep {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- let greed = if s.has() && s.peek() == b'?' {
- s.adv();
- GreedyBehavior::NonGreedy
- } else {
- GreedyBehavior::Greedy
- };
-
- Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed))
- } else {
- Ok(atom)
- }
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub enum GreedyBehavior {
- Greedy,
- NonGreedy,
-}
-
-const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
-fn is_symbol(x: u8) -> bool {
- SYMBOLS.contains(&x)
-}
-
-fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- if !s.has() {
- return Ok(Pattern::Nothing);
- }
-
- let begin = s.loc();
-
- match s.peek() {
- b'[' => {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- let mut ranges = Vec::new();
- loop {
- if !s.has() {
- return Err(ParseError::Eof);
- }
-
- let begin = s.loc();
- let tok = s.adv();
-
- if tok == b']' {
- if ranges.is_empty() {
- todo!("error handling for empty alternative list");
- }
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- return Ok(Pattern::Alt(ranges));
- }
-
- let begin = s.loc();
- if s.has() && s.peek() == b'-' {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if !s.has() {
- return Err(ParseError::Eof);
- }
- let tok2 = s.peek();
-
- if tok2 == b']' {
- ranges.push(Pattern::Byte(tok));
- ranges.push(Pattern::Byte(b'-'));
- } else if is_symbol(tok2) {
- return Err(ParseError::Unknown(tok2));
- } else {
- s.adv();
- ranges.push(Pattern::Range(tok, tok2));
- }
- } else {
- ranges.push(Pattern::Byte(tok));
- }
- }
- }
- b'(' => {
- s.adv();
-
- let mut assertion = None;
- if s.buf.starts_with(b"?=") {
- s.advance(2);
- assertion = Some((LookDirection::Ahead, LookPolarity::Positive));
- } else if s.buf.starts_with(b"?!") {
- s.advance(2);
- assertion = Some((LookDirection::Ahead, LookPolarity::Negative));
- } else if s.buf.starts_with(b"?<=") {
- s.advance(3);
- assertion = Some((LookDirection::Behind, LookPolarity::Positive));
- } else if s.buf.starts_with(b"?<!") {
- s.advance(3);
- assertion = Some((LookDirection::Behind, LookPolarity::Negative));
- }
-
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- let inner = parse0(s)?;
- if !s.has() {
- return Err(ParseError::Eof);
- }
- let begin = s.loc();
- if s.adv() != b')' {
- return Err(ParseError::Expected(')'));
- }
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if let Some((dir, pol)) = assertion {
- Ok(Pattern::Assertion(dir, pol, Box::new(inner)))
- } else {
- Ok(Pattern::Submatch(Box::new(inner)))
- }
- }
- b'.' => {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- Ok(Pattern::Range(0, 127))
- }
- b'\\' => {
- s.adv();
- if s.has() {
- let escaped = s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if is_symbol(escaped) {
- Ok(Pattern::Byte(escaped))
- } else {
- // TODO interpret \w and others
- Err(ParseError::Unknown(escaped))
- }
- } else {
- Err(ParseError::Eof)
- }
- }
- x if is_symbol(x) => Ok(Pattern::Nothing),
- ch => {
- s.adv();
- Ok(Pattern::Byte(ch))
- }
- }
-}
-
-pub struct CompiledPattern {
- dfa: dfa::DFA,
-}
-
-impl std::fmt::Debug for CompiledPattern {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- self.dfa.fmt(f)
- }
-}
-
-#[derive(Debug)]
-pub enum CompilationError {
- Enfa(enfa::EnfaTranslationError),
-}
-
-impl From<enfa::EnfaTranslationError> for CompilationError {
- fn from(value: enfa::EnfaTranslationError) -> Self {
- Self::Enfa(value)
- }
-}
-
-impl Pattern {
- pub fn try_compile(self) -> std::result::Result<CompiledPattern, CompilationError> {
- let enfa = enfa::ENFA::try_from(self)?;
- let dfa = dfa::DFA::from(enfa);
- Ok(CompiledPattern { dfa })
- }
-}
-
-impl CompiledPattern {
- pub fn matches(&self, bytes: impl AsRef<[u8]>) -> bool {
- self.dfa.matches(bytes.as_ref())
- }
-}
-
-#[cfg(test)]
-macro_rules! regex_matches {
- ($regex:literal, $match:literal, $true:literal) => {
- assert_eq!(
- Pattern::parse_from_bytes($regex.as_bytes())
- .unwrap()
- .try_compile()
- .unwrap()
- .matches($match.as_bytes()),
- $true
- )
- };
-}
-
-#[test]
-fn foo_matches_foo() {
- regex_matches!("foo", "foo", true);
-}