diff options
| author | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
|---|---|---|
| committer | Jonas Maier <jonas@x77.dev> | 2026-06-06 12:15:52 +0200 |
| commit | 53980774c327675e886179c0a2c140744dcf9b95 (patch) | |
| tree | ca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/parse | |
| parent | 75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff) | |
| download | pish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz | |
special cased regex for performance
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/mod.rs | 4 | ||||
| -rw-r--r-- | src/parse/regex.rs | 214 | ||||
| -rw-r--r-- | src/parse/regex/bc.rs | 746 | ||||
| -rw-r--r-- | src/parse/regex/byte_range.rs | 204 | ||||
| -rw-r--r-- | src/parse/regex/dfa.rs | 346 | ||||
| -rw-r--r-- | src/parse/regex/enfa.rs | 733 | ||||
| -rw-r--r-- | src/parse/regex/mod.rs | 432 |
7 files changed, 216 insertions, 2463 deletions
diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 407cebd..8530aab 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1922,7 +1922,7 @@ impl Parse for Pipes<PreExpansion> { #[derive(Debug, Clone, PartialEq)] pub struct CaseBranch { - pub pattern: regex::Pattern, + pub pattern: crate::regex::Pattern, pub block: Block, } @@ -1953,7 +1953,7 @@ impl Parse for CaseBranch { fn parse(b: &mut Cursor<'_>) -> Result<Self> { b.spaces(); - let pattern = regex::Pattern::parse(b)?; + let pattern = crate::regex::Pattern::parse(b)?; let block = Block::parse(b)?; Ok(Self { pattern, block }) diff --git a/src/parse/regex.rs b/src/parse/regex.rs new file mode 100644 index 0000000..e5329e8 --- /dev/null +++ b/src/parse/regex.rs @@ -0,0 +1,214 @@ +use super::{Cursor, OtherHighlights, Parse, ParseError, Result}; +use crate::regex::{GreedyBehavior, Pattern, LookDirection, LookPolarity}; + +const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; +fn is_symbol(x: u8) -> bool { + SYMBOLS.contains(&x) +} + +impl Parse for Pattern { + fn parse(b: &mut Cursor<'_>) -> super::Result<Self> { + let begin = b.loc(); + let result = parse0(b); + if result.is_ok() { + b.highlight_from(begin, OtherHighlights::Regex); + } + result + } +} + +fn parse0(s: &mut Cursor<'_>) -> Result<Pattern> { + parse_alt(s) +} + +fn parse_alt(s: &mut Cursor<'_>) -> Result<Pattern> { + let mut seqs = vec![]; + loop { + let seq = parse_seq(s)?; + seqs.push(seq); + let begin = s.loc(); + if s.has() && s.peek() == b'|' { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + } else { + break; + } + } + + Ok(match seqs.len() { + 0 => Pattern::Nothing, + 1 => seqs.into_iter().next().unwrap(), + _ => Pattern::Alt(seqs), + }) +} + +fn parse_seq(s: &mut Cursor<'_>) -> Result<Pattern> { + let mut reps = vec![]; + loop { + let rep = parse_rep(s)?; + if rep != Pattern::Nothing { + reps.push(rep); + } else { + break; + } + } + + Ok(match reps.len() { + 0 => Pattern::Nothing, + 1 => reps.into_iter().next().unwrap(), + _ => Pattern::Concat(reps), + }) +} + +fn parse_rep(s: &mut Cursor<'_>) -> Result<Pattern> { + let atom = parse_atom(s)?; + + if atom == Pattern::Nothing { + return Ok(atom); + } + + if !s.has() { + return Ok(atom); + } + + let begin = s.loc(); + + let rep = match s.peek() { + b'*' => Some((0, None)), + b'+' => Some((1, None)), + b'?' => Some((0, Some(1))), + _ => None, + }; + + if let Some((min_rep, max_rep)) = rep { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + let greed = if s.has() && s.peek() == b'?' { + s.adv(); + GreedyBehavior::NonGreedy + } else { + GreedyBehavior::Greedy + }; + + Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed)) + } else { + Ok(atom) + } +} + +fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { + if !s.has() { + return Ok(Pattern::Nothing); + } + + let begin = s.loc(); + + match s.peek() { + b'[' => { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + let mut ranges = Vec::new(); + loop { + if !s.has() { + return Err(ParseError::Eof); + } + + let begin = s.loc(); + let tok = s.adv(); + + if tok == b']' { + if ranges.is_empty() { + todo!("error handling for empty alternative list"); + } + s.highlight_from(begin, OtherHighlights::RegexSymbol); + return Ok(Pattern::Alt(ranges)); + } + + let begin = s.loc(); + if s.has() && s.peek() == b'-' { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if !s.has() { + return Err(ParseError::Eof); + } + let tok2 = s.peek(); + + if tok2 == b']' { + ranges.push(Pattern::Byte(tok)); + ranges.push(Pattern::Byte(b'-')); + } else if is_symbol(tok2) { + return Err(ParseError::Unknown(tok2)); + } else { + s.adv(); + ranges.push(Pattern::Range(tok, tok2)); + } + } else { + ranges.push(Pattern::Byte(tok)); + } + } + } + b'(' => { + s.adv(); + + let mut assertion = None; + if s.buf.starts_with(b"?=") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?!") { + s.advance(2); + assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); + } else if s.buf.starts_with(b"?<=") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Positive)); + } else if s.buf.starts_with(b"?<!") { + s.advance(3); + assertion = Some((LookDirection::Behind, LookPolarity::Negative)); + } + + s.highlight_from(begin, OtherHighlights::RegexSymbol); + let inner = parse0(s)?; + if !s.has() { + return Err(ParseError::Eof); + } + let begin = s.loc(); + if s.adv() != b')' { + return Err(ParseError::Expected(')')); + } + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if let Some((dir, pol)) = assertion { + Ok(Pattern::Assertion(dir, pol, Box::new(inner))) + } else { + Ok(Pattern::Submatch(Box::new(inner))) + } + } + b'.' => { + s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + Ok(Pattern::Range(0, 127)) + } + b'\\' => { + s.adv(); + if s.has() { + let escaped = s.adv(); + s.highlight_from(begin, OtherHighlights::RegexSymbol); + + if is_symbol(escaped) { + Ok(Pattern::Byte(escaped)) + } else { + // TODO interpret \w and others + Err(ParseError::Unknown(escaped)) + } + } else { + Err(ParseError::Eof) + } + } + x if is_symbol(x) => Ok(Pattern::Nothing), + ch => { + s.adv(); + Ok(Pattern::Byte(ch)) + } + } +} diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs deleted file mode 100644 index cb41607..0000000 --- a/src/parse/regex/bc.rs +++ /dev/null @@ -1,746 +0,0 @@ -use std::collections::{HashMap, VecDeque}; - -use crate::{ - bitset::BitSet, - parse::regex::{ - CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange, - }, -}; - -trait Flavor: Clone { - type CustomInstr: Copy + Clone + std::fmt::Debug; - type ThreadData: Clone; - type StepData<'a, 'b> - where - 'b: 'a; - - fn accepts<'a, 'b>( - thread: &mut Thread<Self>, - instr: Self::CustomInstr, - sd: &mut Self::StepData<'a, 'b>, - ) -> bool; - - fn save(x: u32) -> Option<Self::CustomInstr>; -} - -#[derive(Copy, Clone, Debug)] -struct MainFlavor; -impl Flavor for MainFlavor { - type CustomInstr = MainInstr; - type ThreadData = Box<[Option<usize>]>; - type StepData<'a, 'b> - = (usize, &'a BitSet, &'a mut LookaheadVM<'b>) - where - 'b: 'a; - - fn accepts<'a, 'b>( - thread: &mut Thread<Self>, - instr: Self::CustomInstr, - data: &mut Self::StepData<'a, 'b>, - ) -> bool { - match instr { - MainInstr::Save(reg) => { - thread.data[reg as usize] = Some(data.0); - true - } - MainInstr::Join(assertion) => { - let should_match = assertion.pol == LookPolarity::Positive; - let state = assertion.target as usize; - let is_matching = match assertion.dir { - LookDirection::Ahead => data.2.get_state(data.0, state), - LookDirection::Behind => data.1.get(state), - }; - is_matching == should_match - } - } - } - - fn save(x: u32) -> Option<Self::CustomInstr> { - Some(MainInstr::Save(x)) - } -} - -#[derive(Copy, Clone, Debug)] -enum Nothing {} - -#[derive(Copy, Clone, Debug)] -struct AssertionFlavor; -impl Flavor for AssertionFlavor { - type CustomInstr = Nothing; - type ThreadData = (); - type StepData<'a, 'b> - = () - where - 'b: 'a; - - fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool { - match instr {} - } - - fn save(_: u32) -> Option<Self::CustomInstr> { - None - } -} - -type JumpTarget = u32; -type Register = u32; - -#[derive(Copy, Clone, Debug)] -struct Assertion { - target: JumpTarget, - dir: LookDirection, - pol: LookPolarity, -} - -#[derive(Copy, Clone, Debug)] -enum Instr<F: Flavor> { - Class(CharacterClass), - Consume(ByteRange), - Jump(JumpTarget), - Fork(JumpTarget, JumpTarget), - Custom(F::CustomInstr), -} - -#[derive(Copy, Clone, Debug)] -enum MainInstr { - Save(Register), - Join(Assertion), -} - -#[derive(Clone)] -struct Thread<F: Flavor> { - pc: JumpTarget, - data: F::ThreadData, -} - -struct VM<'p, F: Flavor> { - instr: &'p [Instr<F>], - passive_threads: VecDeque<Thread<F>>, - active_threads: VecDeque<Thread<F>>, - hot: BitSet, - warm: BitSet, -} - -impl<'p, F: Flavor> VM<'p, F> { - fn new(instr: &'p [Instr<F>], starting_thread: Thread<F>) -> Self { - Self { - instr, - passive_threads: vec![starting_thread].into(), - active_threads: VecDeque::new(), - hot: BitSet::new(instr.len()), - warm: BitSet::new(instr.len()), - } - } - - fn step_epsilon<'a>(&mut self, sd: &mut F::StepData<'a, 'p>) { - std::mem::swap(&mut self.active_threads, &mut self.passive_threads); - self.hot.set_all(false); - self.warm.set_all(false); - - macro_rules! add_thread { - ($t:expr) => {{ - let t = $t; - let bit = t.pc as usize; - if !self.warm.get(bit) { - self.warm.set(bit, true); - self.active_threads.push_front(t); - } - }}; - } - - while let Some(mut thread) = self.active_threads.pop_front() { - match self.instr[thread.pc as usize] { - Instr::Class(_) | Instr::Consume(_) => { - if !self.hot.get(thread.pc as usize) { - self.hot.set(thread.pc as usize, true); - self.passive_threads.push_back(thread); - } - } - Instr::Jump(j) => { - thread.pc = j; - add_thread!(thread); - } - Instr::Fork(a, b) => { - add_thread!(Thread { - pc: b, - data: thread.data.clone(), - }); - add_thread!(Thread { - pc: a, - data: thread.data.clone(), - }); - } - Instr::Custom(instr) => { - if F::accepts(&mut thread, instr, sd) { - thread.pc += 1; - add_thread!(thread); - } - } - } - } - } - - fn step_consume(&mut self, byte: u8) { - self.hot.set_all(false); - self.passive_threads - .retain_mut(|thread| match self.instr[thread.pc as usize] { - Instr::Class(class) => { - if class.matches(byte) { - thread.pc += 1; - self.hot.set(thread.pc as usize, true); - true - } else { - false - } - } - Instr::Consume(bytes) => { - if bytes.contains(byte) { - thread.pc += 1; - self.hot.set(thread.pc as usize, true); - true - } else { - false - } - } - _ => false, - }); - } -} - -struct LookaheadVM<'a> { - vm: VM<'a, AssertionFlavor>, - data: &'a [u8], - cached: bool, - cache_data: Vec<BitSet>, - loc_offset: usize, -} - -impl<'a> LookaheadVM<'a> { - fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self { - Self { - vm, - data, - cached: false, - cache_data: Vec::new(), - loc_offset: 0, - } - } - - fn get_state(&mut self, loc: usize, state: usize) -> bool { - if !self.cached { - assert!(self.cache_data.is_empty()); - assert_eq!(self.loc_offset, 0); - self.loc_offset = loc; - self.vm.step_epsilon(&mut ()); - self.cache_data.push(self.vm.hot.clone()); - for i in (loc..self.data.len()).rev() { - self.vm.step_consume(self.data[i]); - self.vm.step_epsilon(&mut ()); - self.cache_data.push(self.vm.hot.clone()); - } - self.cache_data.reverse(); - self.cached = true; - } - - assert!( - loc >= self.loc_offset, - "get_state must be called with non-decreasing arguments." - ); - self.cache_data[loc - self.loc_offset].get(state) - } -} - -struct VirtualMachine<'a> { - vm0: VM<'a, AssertionFlavor>, - vm1: VM<'a, MainFlavor>, - vm2: LookaheadVM<'a>, - accepting: &'a BitSet, -} - -impl<'a> VirtualMachine<'a> { - fn step_epsilon_1(&mut self, loc: usize) { - self.vm1 - .step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2)); - } - - fn step_epsilon(&mut self, loc: usize) { - self.vm0.step_epsilon(&mut ()); - self.step_epsilon_1(loc); - } - - fn step_consume(&mut self, byte: u8) { - self.vm0.step_consume(byte); - self.vm1.step_consume(byte); - } - - fn step(&mut self, byte: u8, loc: usize) { - self.step_epsilon(loc); - self.step_consume(byte); - } - - fn extract_match(&self) -> Option<Match> { - self.vm1 - .passive_threads - .iter() - .filter(|t| self.accepting.get(t.pc as usize)) - .map(|t| { - let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect(); - - Match { - submatches: submatches.into(), - } - }) - .next() - } -} - -fn fmt_instructions<F: std::fmt::Debug + Flavor>( - f: &mut std::fmt::Formatter<'_>, - label: &str, - insns: &[Instr<F>], -) -> std::fmt::Result { - writeln!(f, "# {label}")?; - for (idx, ins) in insns.iter().enumerate() { - writeln!(f, "{idx}: {ins:?}")?; - } - Ok(()) -} - -pub struct BytecodeCompiledRegex { - instrs0: Box<[Instr<AssertionFlavor>]>, - instrs1: Box<[Instr<MainFlavor>]>, - instrs2: Box<[Instr<AssertionFlavor>]>, - no_lookbehind: bool, - submatch_count: usize, - accepting: BitSet, -} - -impl std::fmt::Debug for BytecodeCompiledRegex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - fmt_instructions(f, "behind", &self.instrs0)?; - fmt_instructions(f, "ahead", &self.instrs2)?; - fmt_instructions(f, "main", &self.instrs1)?; - writeln!(f, "accepting: {:?}", self.accepting) - } -} - -impl BytecodeCompiledRegex { - pub fn re_match(&self, data: &[u8]) -> Option<Match> { - let vm0 = VM::new(&self.instrs0, Thread { pc: 0, data: () }); - let vm1 = VM::new( - &self.instrs1, - Thread { - pc: 0, - data: vec![None; 2 * self.submatch_count].into(), - }, - ); - let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () }); - let vm2 = LookaheadVM::new(vm2, data); - let mut vm = VirtualMachine { - vm0, - vm1, - vm2, - accepting: &self.accepting, - }; - if self.no_lookbehind { - for (i, ch) in data.iter().cloned().enumerate() { - vm.step_epsilon_1(i); - vm.vm1.step_consume(ch); - } - vm.step_epsilon(data.len()); - } else { - for (i, ch) in data.iter().cloned().enumerate() { - vm.step(ch, i); - } - vm.step_epsilon(data.len()); - } - vm.extract_match() - } - - pub fn matches(&self, data: &[u8]) -> bool { - self.re_match(data).is_some() - } -} - -pub struct Match { - pub submatches: Box<[Option<core::ops::Range<usize>>]>, -} - -type AssertionHandler<'a, F> = - Box<dyn 'a + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>>; - -#[derive(Copy, Clone)] -struct CompiledSnippet { - begin: JumpTarget, - end: JumpTarget, -} - -struct Compiler<'a, F: Flavor> { - instrs: Vec<Instr<F>>, - map: HashMap<Pattern, CompiledSnippet>, - assertion_handler: AssertionHandler<'a, F>, - assertion_fork_base: usize, - submatch_count: usize, -} - -fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> { - let repeat = repeat as JumpTarget; - let exit = exit as JumpTarget; - match greedy { - GreedyBehavior::Greedy => Instr::Fork(repeat, exit), - GreedyBehavior::NonGreedy => Instr::Fork(exit, repeat), - } -} - -impl<'a, F: Flavor> Compiler<'a, F> { - fn new( - assertion_handler: impl 'a - + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>, - ) -> Self { - Self { - instrs: Vec::new(), - map: HashMap::new(), - assertion_handler: Box::new(assertion_handler), - assertion_fork_base: usize::MAX, - submatch_count: 0, - } - } - - fn rep_1_or_more(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult { - let base = self.instrs.len(); - self.compile(pat)?; - let exit = self.instrs.len() + 1; - self.instrs.push(fork(base, exit, greedy)); - Ok(()) - } - - fn rep_0_or_1(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult { - let base = self.instrs.len(); - self.instrs.push(Instr::Jump(u32::MAX)); - self.compile(pat)?; - self.instrs[base] = fork(base + 1, self.instrs.len(), greedy); - Ok(()) - } - - fn rep_any_amt(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult { - let base = self.instrs.len(); - self.instrs.push(Instr::Jump(u32::MAX)); - self.compile(pat)?; - let fork_pos = self.instrs.len(); - let after = fork_pos + 1; - self.instrs.push(fork(base + 1, after, greedy)); - self.instrs[base] = Instr::Jump(fork_pos as JumpTarget); - Ok(()) - } - - fn compile(&mut self, pat: Pattern) -> CompileResult { - match pat { - Pattern::Byte(x) => self.instrs.push(Instr::Consume(ByteRange::new_single(x))), - Pattern::Range(a, b) => self.instrs.push(Instr::Consume(ByteRange::new_range(a, b))), - Pattern::CharacterClass(cc) => { - self.instrs.push(Instr::Class(cc)); - } - Pattern::Alt(patterns) => { - let branch_factor = patterns.len(); - assert!(branch_factor > 0); - - let base = self.instrs.len(); - - // placeholders to later place in forks - for _ in 0..patterns.len() - 1 { - self.instrs.push(Instr::Jump(u32::MAX)); - } - - let mut enter_pats = Vec::new(); - let mut leave_pats = Vec::new(); - for pat in patterns.into_iter() { - enter_pats.push(self.instrs.len()); - self.compile(pat)?; - leave_pats.push(self.instrs.len()); - - // placeholder to place in join - self.instrs.push(Instr::Jump(u32::MAX)); - } - - self.instrs.pop(); // remove last jump - let join_point = self.instrs.len(); - - // link forks - for i in 0..branch_factor - 1 { - let a = enter_pats[i]; - let b = if i == branch_factor - 2 { - enter_pats[i + 1] - } else { - base + i + 1 - }; - self.instrs[base + i] = Instr::Fork(a as JumpTarget, b as JumpTarget); - } - - // link joins - for i in 0..branch_factor - 1 { - self.instrs[leave_pats[i]] = Instr::Jump(join_point as JumpTarget); - } - } - Pattern::Concat(patterns) => { - for pat in patterns.into_iter() { - self.compile(pat)?; - } - } - Pattern::Rep(pat, 0, None, greed) => { - self.rep_any_amt(*pat, greed)?; - } - Pattern::Rep(pat, min, None, greed) => { - let pat = *pat; - for _ in 1..min { - self.compile(pat.clone())?; - } - self.rep_1_or_more(pat, greed)?; - } - Pattern::Rep(pat, min, Some(max), greed) => { - let pat = *pat; - let opt = max - min; - for _ in 0..min { - self.compile(pat.clone())?; - } - for _ in 0..opt { - self.rep_0_or_1(pat.clone(), greed)?; - } - } - Pattern::Assertion(look_direction, look_polarity, pattern) => { - let ins = (self.assertion_handler)(look_direction, look_polarity, *pattern)?; - self.instrs.push(ins); - } - Pattern::Nothing => {} - Pattern::Submatch(pat) => { - let i = self.submatch_count as u32 * 2; - self.submatch_count += 1; - if let Some(ins) = F::save(i) { - self.instrs.push(Instr::Custom(ins)); - } - self.compile(*pat)?; - if let Some(ins) = F::save(i + 1) { - self.instrs.push(Instr::Custom(ins)); - } - } - } - Ok(()) - } - - fn compile_and_memoize(&mut self, pat: Pattern) -> CompileResult<CompiledSnippet> { - if let Some(&jt) = self.map.get(&pat) { - return Ok(jt); - } - let begin = self.instrs.len() as JumpTarget; - self.compile(pat.clone())?; - let end = self.instrs.len() as JumpTarget; - self.instrs.push(Instr::Class(CharacterClass::Nothing)); - let bounds = CompiledSnippet { begin, end }; - self.map.insert(pat, bounds); - Ok(bounds) - } - - fn finalize_assertion_forks(&mut self) { - let fork_targets: Vec<JumpTarget> = self.map.values().map(|v| v.begin).collect(); - let fork_begin = self.instrs.len() as JumpTarget; - match fork_targets.len() { - 0 => { - self.instrs[self.assertion_fork_base] = Instr::Class(CharacterClass::Nothing); - } - 1 => { - self.instrs[self.assertion_fork_base] = Instr::Jump(fork_targets[0]); - } - 2 => { - self.instrs[self.assertion_fork_base] = - Instr::Fork(fork_targets[0], fork_targets[1]); - } - n => { - self.instrs[self.assertion_fork_base] = Instr::Fork(fork_targets[0], fork_begin); - for i in 1..n - 1 { - let fork = if i == n - 2 { - Instr::Fork(fork_targets[i], fork_targets[i + 1]) - } else { - Instr::Fork(fork_targets[i], self.instrs.len() as JumpTarget + 1) - }; - self.instrs.push(fork); - } - } - } - } -} - -fn assertion_compiler() -> Compiler<'static, AssertionFlavor> { - let mut c = Compiler::new(|_, _, _| Err(RegexCompilationError::NestedLookaroundNotSupported)); - c.rep_any_amt( - Pattern::CharacterClass(CharacterClass::Everything), - GreedyBehavior::NonGreedy, - ) - .expect("characterclass should always compile"); - c.assertion_fork_base = c.instrs.len(); - c.instrs.push(Instr::Jump(u32::MAX)); // in the end this gets replaced by a jump to a fork-list for all the assertions - c -} - -#[derive(Clone, Debug)] -pub enum RegexCompilationError { - NestedLookaroundNotSupported, -} - -pub type CompileResult<T = ()> = Result<T, RegexCompilationError>; - -impl TryFrom<Pattern> for BytecodeCompiledRegex { - type Error = RegexCompilationError; - - fn try_from(value: Pattern) -> Result<Self, Self::Error> { - let mut neg = assertion_compiler(); - let mut pos = assertion_compiler(); - let (final_state, instrs, submatch_count) = { - let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| { - let target = match dir { - LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()), - LookDirection::Behind => neg.compile_and_memoize(pat), - }? - .end; - - Ok(Instr::Custom(MainInstr::Join(Assertion { - target, - dir, - pol, - }))) - }); - main.compile(value)?; - let end = main.instrs.len(); - main.instrs.push(Instr::Class(CharacterClass::Nothing)); - (end, main.instrs, main.submatch_count) - }; - neg.finalize_assertion_forks(); - pos.finalize_assertion_forks(); - - let mut accepting = BitSet::new(instrs.len()); - accepting.set(final_state, true); - - Ok(Self { - no_lookbehind: neg.map.is_empty(), - instrs0: neg.instrs.into(), - instrs1: instrs.into(), - instrs2: pos.instrs.into(), - accepting, - submatch_count, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::parse::Parse; - - fn regex(s: &str) -> BytecodeCompiledRegex { - let pat = Pattern::parse_from_bytes(s.as_bytes()).unwrap(); - let compiled = BytecodeCompiledRegex::try_from(pat).unwrap(); - compiled - } - - #[test] - fn print_compiled_vm() { - let compiled = regex("a?b?"); - println!("{compiled:#?}"); - assert_eq!(compiled.matches(b"ab"), true); - assert_eq!(compiled.matches(b"a"), true); - assert_eq!(compiled.matches(b"b"), true); - assert_eq!(compiled.matches(b""), true); - } - - #[test] - fn nongreedy_star() { - let re = regex("(ab*?)bb*"); - assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), - 0..1 - ); - assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..1 - ); - } - - #[test] - fn greedy_star() { - let re = regex("(ab*)bb*"); - assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), - 0..3 - ); - assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..5 - ); - } - - #[test] - fn nongreedy_plus() { - let re = regex("(ab+?)bb*"); - assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..2 - ); - assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..2 - ); - } - - #[test] - fn greedy_plus() { - let re = regex("(ab+)bb*"); - assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), - 0..3 - ); - assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..5 - ); - } - - #[test] - fn nongreedy_qm() { - let re = regex("(ab??)bb*"); - assert_eq!( - re.re_match(b"abbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..1 - ); - assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..1 - ); - } - - #[test] - fn greedy_qm() { - let re = regex("(ab?)bb*"); - assert_eq!( - re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(), - 0..2 - ); - assert_eq!( - re.re_match(b"abbbbb").unwrap().submatches[0] - .clone() - .unwrap(), - 0..2 - ); - } -} diff --git a/src/parse/regex/byte_range.rs b/src/parse/regex/byte_range.rs deleted file mode 100644 index b7642c1..0000000 --- a/src/parse/regex/byte_range.rs +++ /dev/null @@ -1,204 +0,0 @@ -use std::ops::RangeInclusive; - -#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct ByteRange { - /// inclusive - from: u8, - /// inclusive - to: u8, -} - -impl From<RangeInclusive<u8>> for ByteRange { - fn from(value: RangeInclusive<u8>) -> Self { - Self::new_range(*value.start(), *value.end()) - } -} - -impl ByteRange { - pub fn new_range(from: u8, to: u8) -> Self { - assert!(from <= to, "{from} <= {to}"); - Self { from, to } - } - - pub fn new_single(c: u8) -> Self { - Self::new_range(c, c) - } - - pub fn contains(&self, c: u8) -> bool { - self.from <= c && c <= self.to - } - - pub fn overlaps(&self, other: Self) -> bool { - self.from.max(other.from) <= self.to.min(other.to) - } - - pub fn split_to_disjoint(ranges: Vec<ByteRange>) -> Vec<ByteRange> { - if ranges.is_empty() { - return vec![]; - } - - let mut points: Vec<u8> = Vec::new(); - for r in &ranges { - points.push(r.from); - if r.to != u8::MAX { - points.push(r.to + 1); - } - } - - points.sort_unstable(); - points.dedup(); - - let mut out = Vec::new(); - - for window in points.windows(2) { - let start = window[0]; - let end_exclusive = window[1]; - - if start >= end_exclusive { - continue; - } - - let mut active = false; - - for r in &ranges { - if r.from <= start && start <= r.to { - active = true; - break; - } - } - - if active { - out.push(ByteRange { - from: start, - to: end_exclusive - 1, - }); - } - } - - out - } -} - -#[test] -fn byterange_test() { - assert_eq!( - ByteRange::split_to_disjoint(vec![ - ByteRange::new_range(b'a', b'z'), - ByteRange::new_single(b'm') - ]), - vec![ - ByteRange::new_range(b'a', b'l'), - ByteRange::new_single(b'm'), - ByteRange::new_range(b'n', b'z'), - ] - ); -} - -impl std::fmt::Debug for ByteRange { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.from == self.to { - write!(f, "{}", [self.from].escape_ascii()) - } else { - write!( - f, - "{}-{}", - [self.from].escape_ascii(), - [self.to].escape_ascii() - ) - } - } -} - -#[cfg(test)] -mod non_overlapping_tests { - use std::ops::RangeInclusive; - - use super::ByteRange; - - fn middle(r: ByteRange) -> u8 { - let a = r.from as u8; - let b = r.to as u8; - (a + (b - a) / 2) as u8 - } - - fn prev(c: u8) -> u8 { - c - 1 - } - - fn next(c: u8) -> u8 { - c + 1 - } - - fn run(ranges: Vec<RangeInclusive<u8>>) { - let ranges1: Vec<ByteRange> = ranges.into_iter().map(Into::into).collect(); - let ranges2 = ByteRange::split_to_disjoint(ranges1.clone()); - - let r1 = |c| ranges1.iter().any(|cr| cr.contains(c)); - let r2 = |c| ranges2.iter().any(|cr| cr.contains(c)); - - for &range in ranges1.iter() { - assert!(r1(range.from)); - assert!(r1(range.to)); - assert!(r1(middle(range))); - - assert!(r2(range.from)); - assert!(r2(range.to)); - assert!(r2(middle(range))); - - assert_eq!(r1(prev(range.from)), r2(prev(range.from))); - assert_eq!(r1(next(range.from)), r2(next(range.from))); - } - - for i in 0..ranges2.len() { - for j in 0..i { - assert!( - !ranges2[i].overlaps(ranges2[j]), - "{i} and {j} overlap: {:?}, {:?}", - ranges2[i], - ranges2[j] - ); - } - } - } - - #[test] - fn overlap_correct() { - assert!(ByteRange::new_range(b'a', b'g').overlaps(ByteRange::new_single(b'f'))); - assert!(!ByteRange::new_range(b'a', b'g').overlaps(ByteRange::new_single(b'h'))); - } - - #[test] - fn empty() { - run(vec![]); - } - - #[test] - fn singleton() { - run(vec![b'0'..=b'9']); - } - - #[test] - fn contained1() { - run(vec![b'0'..=b'9', b'5'..=b'6']); - } - - #[test] - fn contained2() { - run(vec![b'5'..=b'6', b'0'..=b'9']); - } - - #[test] - fn overlap2() { - run(vec![b'1'..=b'6', b'4'..=b'9']) - } - - #[test] - fn overlap3() { - run(vec![b'a'..=b'f', b'd'..=b'j', b'g'..=b'm']) - } - - #[test] - fn overlap4() { - run(vec![b'a'..=b'f', b'd'..=b'j', b'g'..=b'm', b'k'..=b'q']) - } -} diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs deleted file mode 100644 index 35f726c..0000000 --- a/src/parse/regex/dfa.rs +++ /dev/null @@ -1,346 +0,0 @@ -use core::fmt; -use std::collections::{BinaryHeap, HashMap, HashSet}; - -use super::{ - byte_range::ByteRange, - enfa::{ENFA, MultiState}, -}; - -pub type StateId = usize; - -pub struct State { - trans: HashMap<ByteRange, StateId>, - default_trans: StateId, - accept: bool, -} - -#[allow(clippy::upper_case_acronyms)] -pub struct DFA { - start: StateId, - states: Vec<State>, -} - -impl fmt::Debug for DFA { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "DFA {{")?; - for (i, s) in self.states.iter().enumerate() { - if self.start == i { - write!(f, "-> {i}: ")?; - } else { - write!(f, " {i}: ")?; - } - - for (chr, to) in s.trans.iter() { - write!(f, "{chr:?} to {to}, ")?; - } - - write!(f, "dfl to {}", s.default_trans)?; - if s.accept { - write!(f, ", accept")?; - } - writeln!(f)?; - } - writeln!(f, "}}") - } -} - -impl From<ENFA> for DFA { - fn from(mut nfa: ENFA) -> Self { - nfa.remove_unreachable(); - - let mut multi_states = nfa.all_multi_states(); - multi_states.insert(nfa.void_multi_state()); - let mut len = 0; - let multi_to_dfa: HashMap<MultiState, StateId> = multi_states - .clone() - .into_iter() - .map(|ms| { - len += 1; - (ms, len - 1) - }) - .collect(); - - let void = multi_to_dfa[&nfa.void_multi_state()]; - - let mut states: Vec<State> = (0..len) - .map(|_| State { - trans: HashMap::new(), - default_trans: void, - accept: false, - }) - .collect(); - - for ms in multi_states.iter() { - let i: usize = multi_to_dfa[ms]; - states[i].accept = ms.accept(); - for t in ms.possible_transitions() { - let k = multi_to_dfa[&ms.transition(t)]; - states[i].trans.insert(t, k); - } - } - - let mut this = Self { - start: multi_to_dfa[&nfa.start_multi_state()], - states, - }; - this.minify(); - this - } -} - -impl DFA { - pub fn matches(&self, x: &[u8]) -> bool { - let mut state = self.start; - 'next_byte: for &b in x.iter() { - for (range, &next_state) in self.states[state].trans.iter() { - if range.contains(b) { - state = next_state; - continue 'next_byte; - } - } - state = self.states[state].default_trans; - } - self.states[state].accept - } -} - -mod state_set { - #[derive(Hash, Clone, PartialEq, Eq)] - pub struct StateSet { - set: Vec<usize>, - } - - impl StateSet { - pub fn new(mut set: Vec<usize>) -> Self { - set.sort(); - set.dedup(); - Self { set } - } - - pub fn iter(&self) -> impl Iterator<Item = usize> { - self.set.iter().cloned() - } - - pub fn intersection(&self, other: &Self) -> Self { - let a = &self.set; - let b = &other.set; - - let mut i = 0; - let mut j = 0; - let mut out = Vec::new(); - - while i < a.len() && j < b.len() { - match a[i].cmp(&b[j]) { - std::cmp::Ordering::Less => i += 1, - std::cmp::Ordering::Greater => j += 1, - std::cmp::Ordering::Equal => { - out.push(a[i]); - i += 1; - j += 1; - } - } - } - - Self::new(out) - } - - pub fn difference(&self, other: &Self) -> Self { - let a = &self.set; - let b = &other.set; - - let mut i = 0; - let mut j = 0; - let mut out = Vec::new(); - - while i < a.len() && j < b.len() { - match a[i].cmp(&b[j]) { - std::cmp::Ordering::Less => { - out.push(a[i]); - i += 1; - } - std::cmp::Ordering::Greater => { - j += 1; - } - std::cmp::Ordering::Equal => { - i += 1; - j += 1; - } - } - } - - out.extend_from_slice(&a[i..]); - Self::new(out) - } - - pub fn is_empty(&self) -> bool { - self.set.is_empty() - } - - pub fn len(&self) -> usize { - self.set.len() - } - - pub fn primary_state(&self) -> Option<usize> { - self.set.first().cloned() - } - } - - // custom implementation such that smaller sets come first in a BinaryHeap - impl Ord for StateSet { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match self.set.len().cmp(&other.set.len()) { - std::cmp::Ordering::Equal => {} - other => return other.reverse(), - } - self.set.cmp(&other.set) - } - } - - impl PartialOrd for StateSet { - fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.cmp(other)) - } - } -} - -use state_set::StateSet; - -trait GoesTo { - fn goes_to(&self, to: usize) -> bool; -} -impl GoesTo for (&State, ByteRange) { - fn goes_to(&self, target: usize) -> bool { - let from = self.0; - let ch = self.1; - for (c, &to) in from.trans.iter() { - if c.overlaps(ch) && to == target { - return true; - } - } - from.default_trans == target - } -} - -impl DFA { - fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet { - let states = self - .states - .iter() - .enumerate() - .filter_map(|(i, s)| if f(s) { Some(i) } else { None }) - .collect(); - StateSet::new(states) - } - - /// https://en.wikipedia.org/wiki/DFA_minimization - fn hopcroft_minimization(&mut self) { - let mut partitions: HashSet<StateSet> = HashSet::new(); - partitions.insert(self.states_where(|s| s.accept)); - partitions.insert(self.states_where(|s| !s.accept)); - - let mut ranges: Vec<ByteRange> = self - .states - .iter() - .flat_map(|s| s.trans.iter().map(|t| *t.0)) - .chain([ByteRange::new_range(u8::MIN, u8::MAX)]) - .collect(); - ranges.sort(); - ranges.dedup(); - let ranges = ByteRange::split_to_disjoint(ranges); - - let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect(); - let mut queue_set = partitions.clone(); - - while let Some(a) = queue.pop() { - if !queue_set.contains(&a) { - continue; - } - - for &c in ranges.iter() { - let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a))); - - let mut del_list = HashSet::new(); - let mut add_list = Vec::new(); - for y in partitions.iter() { - let i = x.intersection(y); - let d = y.difference(&x); - - if !i.is_empty() && !d.is_empty() { - del_list.insert(y.clone()); - add_list.push(i.clone()); - add_list.push(d.clone()); - - if queue_set.contains(y) { - queue_set.remove(y); - - queue.push(i.clone()); - queue_set.insert(i); - - queue.push(d.clone()); - queue_set.insert(d); - } else if i.len() < d.len() { - queue.push(i.clone()); - queue_set.insert(i); - } else { - queue.push(d.clone()); - queue_set.insert(d); - } - } - } - - partitions.retain(|i| !del_list.contains(i)); - for x in add_list { - partitions.insert(x); - } - } - } - - let mut replacement = vec![None; self.states.len()]; - for partition in partitions { - if let Some(x) = partition.primary_state() { - for state in partition.iter() { - assert!(replacement[state].is_none()); - replacement[state] = Some(x); - } - } - } - - // replacement indices in original index space - let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect(); - let is_alive = |idx: usize| replacement[idx] == idx; - - // compact index space - let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()]; - let mut next = 0; - for i in 0..self.states.len() { - if is_alive(i) { - compact[i] = next; - next += 1; - } - } - - // remap everything and skip all no-longer-needed states - let remap = |idx: usize| compact[replacement[idx]]; - let mut new_states = Vec::with_capacity(next); - for i in 0..self.states.len() { - if is_alive(i) { - let s = &self.states[i]; - new_states.push(State { - trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(), - default_trans: remap(s.default_trans), - accept: s.accept, - }); - } - } - self.states = new_states; - self.start = remap(self.start); - } - - pub fn minify(&mut self) { - for state in self.states.iter_mut() { - state.trans.retain(|_, to| *to != state.default_trans); - } - - self.hopcroft_minimization(); - } -} diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs deleted file mode 100644 index dd4cf6e..0000000 --- a/src/parse/regex/enfa.rs +++ /dev/null @@ -1,733 +0,0 @@ -use std::{ - collections::HashSet, - hash::{DefaultHasher, Hash, Hasher}, -}; - -use crate::parse::regex::{LookDirection, LookPolarity}; - -use super::Pattern; -use super::byte_range::ByteRange; - -/// NFA with epsilon transitions -#[derive(Clone)] -#[allow(clippy::upper_case_acronyms)] -pub struct ENFA { - pub states: Vec<EState>, -} - -fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> { - let mut result = vec![Vec::new()]; - - for xs in x { - let mut next = Vec::new(); - - for prefix in &result { - for item in &xs { - let mut v = prefix.clone(); - v.push(item.clone()); - next.push(v); - } - } - - result = next; - } - - result -} - -#[cfg(test)] -mod product_tests { - use super::cartesian_product; - - #[test] - fn basic_case() { - let x = vec![vec![1, 2], vec![10, 20]]; - - let out = cartesian_product(x); - - assert_eq!( - out, - vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],] - ); - } - - #[test] - fn single_dimension() { - let x = vec![vec![1, 2, 3]]; - - let out = cartesian_product(x); - - assert_eq!(out, vec![vec![1], vec![2], vec![3],]); - } - - #[test] - fn empty_outer_vector() { - let x: Vec<Vec<i32>> = vec![]; - - let out = cartesian_product(x); - - // One empty combination. - assert_eq!(out, vec![vec![]]); - } - - #[test] - fn empty_inner_vector() { - let x = vec![vec![1, 2], vec![], vec![3, 4]]; - - let out = cartesian_product(x); - - assert!(out.is_empty()); - } - - #[test] - fn output_size_matches_product() { - let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]]; - - let out = cartesian_product(x); - - assert_eq!(out.len(), 3 * 2 * 4); - } - - #[test] - fn every_output_has_correct_length() { - let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']]; - - let out = cartesian_product(x); - - assert!(out.iter().all(|v| v.len() == 3)); - } - - #[test] - fn works_with_strings() { - let x = vec![ - vec!["a".to_string(), "b".to_string()], - vec!["x".to_string()], - ]; - - let out = cartesian_product(x); - - assert_eq!( - out, - vec![ - vec!["a".to_string(), "x".to_string()], - vec!["b".to_string(), "x".to_string()], - ] - ); - } -} - -#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] -struct Thread { - state: StateId, - positives: Vec<Thread>, - negatives: Vec<Thread>, -} - -impl Thread { - fn new_simple(state: StateId) -> Self { - Self { - state, - positives: Vec::new(), - negatives: Vec::new(), - } - } - - fn new( - enfa: &ENFA, - state: StateId, - mut positives: Vec<Thread>, - mut negatives: Vec<Thread>, - ) -> Option<Self> { - positives.sort(); - positives.dedup(); - positives.retain(|t| !t.accept(enfa)); - positives.shrink_to_fit(); - - negatives.sort(); - negatives.dedup(); - negatives.shrink_to_fit(); - - if negatives.iter().any(|t| t.accept(enfa)) { - return None; - } - - Some(Self { - state, - positives, - negatives, - }) - } - - fn accept(&self, enfa: &ENFA) -> bool { - let pos = self.positives.iter().all(|t| t.accept(enfa)); - let neg = self.negatives.iter().all(|t| !t.accept(enfa)); - let this = match enfa.states[self.state].accept { - Acceptance::Accept => true, - Acceptance::Assertion => true, - Acceptance::NotYet => false, - }; - pos && neg && this - } - - fn step_epsilon0( - self, - enfa: &ENFA, - ret: &mut impl FnMut(Thread), - visited: &mut [bool], - new_assertions: Vec<Assertion>, - ) { - if visited[self.state] { - return; - } - visited[self.state] = true; - - for t in enfa.states[self.state].trans.iter() { - if t.consumes.is_some() { - continue; - } - - let mut new_assertions = new_assertions.clone(); - if let Some(assertion) = enfa.states[self.state].assert.as_ref() { - new_assertions.push(assertion.clone()); - } - - Self { - state: t.to, - ..self.clone() - } - .step_epsilon0(enfa, ret, visited, new_assertions); - } - - let Self { - state, - positives, - negatives, - } = self; - let mut p = Vec::new(); - let mut n = Vec::new(); - for assertion in new_assertions { - let threads = Self::new_simple(assertion.to).step_epsilon(enfa); - let vec = match assertion.polarity { - LookPolarity::Positive => &mut p, - LookPolarity::Negative => &mut n, - }; - vec.push(threads); - } - let p = cartesian_product(p); - for mut p in p { - let mut positives = positives.clone(); - let mut negatives = negatives.clone(); - positives.append(&mut p); - negatives.append(&mut n.iter().flatten().cloned().collect()); - if let Some(thread) = Self::new(enfa, state, positives, negatives) { - ret(thread); - } - } - } - - fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> { - let mut vec = Vec::new(); - self.step_epsilon0( - enfa, - &mut |t| vec.push(t), - &mut vec![false; enfa.states.len()], - Vec::new(), - ); - vec - } - - fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) { - let positives = self - .positives - .clone() - .into_iter() - .map(|t| t.step(enfa, input)) - .collect(); - let negatives: Vec<_> = self - .negatives - .into_iter() - .flat_map(|t| t.step(enfa, input)) - .collect(); - let positives = cartesian_product(positives); - let next_states: Vec<StateId> = enfa.states[self.state] - .trans - .iter() - .filter_map(|t| { - if let Some(ch) = t.consumes - && ch.overlaps(input) - { - Some(t.to) - } else { - None - } - }) - .collect(); - - for s in next_states { - for p in positives.clone() { - if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) { - thread.step_epsilon0( - enfa, - ret, - &mut vec![false; enfa.states.len()], - Vec::new(), - ); - } - } - } - } - - fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> { - let mut vec = Vec::new(); - self.step0(enfa, input, &mut |x| vec.push(x)); - vec - } - - fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) { - for t in enfa.states[self.state].trans.iter() { - if let Some(ch) = t.consumes { - f(ch); - } - } - for t in self.positives.iter() { - t.possible_transitions(enfa, f); - } - for t in self.negatives.iter() { - t.possible_transitions(enfa, f); - } - } -} - -#[derive(Clone)] -pub struct MultiState<'a> { - nfa: &'a ENFA, - threads: Vec<Thread>, - accept: bool, - hash: u64, -} - -impl<'a> PartialEq for MultiState<'a> { - fn eq(&self, other: &Self) -> bool { - (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64) - && self.threads == other.threads - && self.accept == other.accept - && self.hash == other.hash - } -} -impl<'a> Eq for MultiState<'a> {} - -impl<'a> MultiState<'a> { - fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self { - threads.sort(); - threads.dedup(); - threads.shrink_to_fit(); - - let accept = threads.iter().any(|t| t.accept(nfa)); - let mut hasher = DefaultHasher::new(); - threads.hash(&mut hasher); - let hash = hasher.finish(); - - Self { - nfa, - threads, - accept, - hash, - } - } - - /// all the chars that will make an interesting transition - pub fn possible_transitions(&self) -> Vec<ByteRange> { - let mut vec = Vec::new(); - for t in self.threads.iter() { - t.possible_transitions(self.nfa, &mut |x| vec.push(x)); - } - vec = ByteRange::split_to_disjoint(vec); - vec.sort(); - vec.dedup(); - vec.shrink_to_fit(); - vec - } - - pub fn transition(&self, ch: ByteRange) -> Self { - let new_states = self - .threads - .iter() - .flat_map(|t| t.clone().step(self.nfa, ch)) - .collect(); - - Self::new(self.nfa, new_states) - } - - pub fn accept(&self) -> bool { - self.accept - } -} - -impl<'a> Hash for MultiState<'a> { - fn hash<H: Hasher>(&self, state: &mut H) { - self.hash.hash(state) - } -} - -macro_rules! set { - () => { - std::collections::HashSet::new() - }; - ( $( $x:expr ),* ) => {{ - let mut set = std::collections::HashSet::new(); - $( - set.insert($x); - )* - set - }}; -} - -impl ENFA { - fn shift(self, amt: usize) -> Vec<EState> { - let mut s = self.states; - - for state in s.iter_mut() { - state.remap(|i| i + amt); - if state.accept == Acceptance::Accept { - state.accept = Acceptance::NotYet; - } - } - - s - } - - pub fn remove_unreachable(&mut self) { - let mut used = vec![false; self.states.len()]; - used[0] = true; - for s in self.states.iter() { - for i in s.reachable_states() { - used[i] = true; - } - } - let mut remap = vec![0; self.states.len()]; - let mut shift = 0; - for i in 0..self.states.len() { - if used[i] { - remap[i] = i - shift; - } else { - shift += 1; - } - } - for i in (0..self.states.len()).rev() { - if !used[i] { - self.states.remove(i); - } - } - for s in self.states.iter_mut() { - s.remap(|i| remap[i]); - } - } -} - -impl ENFA { - fn looping(self) -> Self { - let mut states = vec![EState::start()]; - states.append(&mut self.shift(1)); - let len = states.len(); - states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]); - states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]); - states.push(EState::terminal()); - Self { states } - } - - fn repeat(self, times: usize) -> Self { - let reps = vec![self; times]; - Self::concat(reps) - } - - /// between 0 and x repetitions - fn optx(self, x: usize) -> Self { - let len = self.states.len(); - let mut repped = self.repeat(x); - assert_eq!(repped.states.len(), x * len); - for i in 1..=x { - repped.states[0] - .trans - .insert(Transition::epsilon(i * len - 1)); - } - repped - } - - fn concat(nfas: Vec<Self>) -> Self { - if nfas.is_empty() { - return Self { - states: vec![EState::terminal()], - }; - } - - let mut states: Vec<EState> = Vec::new(); - for nfa in nfas.into_iter() { - let len = states.len(); - let mut ns = nfa.shift(len); - if let Some(n) = states.last_mut() { - n.trans.retain(|t| t.consumes.is_some()); - n.trans.insert(Transition::epsilon(len)); - } - states.append(&mut ns); - } - - let len = states.len(); - states[len - 1].accept = Acceptance::Accept; - - Self { states } - } -} - -impl ENFA { - pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> { - let threads = Thread::new_simple(0).step_epsilon(self); - MultiState::new(self, threads) - } - - pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> { - MultiState::new(self, vec![]) - } - - pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> { - let mut states = set![self.start_multi_state()]; - let mut q = vec![self.start_multi_state()]; - - while let Some(state) = q.pop() { - let chars = state.possible_transitions(); - - for chr in chars { - let new = state.transition(chr); - - if !states.contains(&new) { - states.insert(new.clone()); - q.push(new); - } - } - } - - states - } -} - -impl std::fmt::Debug for ENFA { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "NFA {{")?; - for (i, s) in self.states.iter().enumerate() { - write!(f, " {i}: ")?; - if let Some(a) = s.assert.as_ref() { - match a.polarity { - LookPolarity::Positive => write!(f, "+{} ", a.to)?, - LookPolarity::Negative => write!(f, "-{} ", a.to)?, - } - } - for t in s.trans.iter() { - let k = t.to; - if let Some(c) = t.consumes { - write!(f, "{c:?}=>{k} ")?; - } else { - write!(f, "~>{k} ")?; - } - } - match s.accept { - Acceptance::Accept => write!(f, "accept")?, - Acceptance::Assertion => write!(f, "assert")?, - Acceptance::NotYet => {} - } - writeln!(f)?; - } - write!(f, "}}") - } -} - -pub type StateId = usize; - -#[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub struct Assertion { - to: StateId, - polarity: LookPolarity, -} - -#[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub struct Transition { - to: StateId, - consumes: Option<ByteRange>, -} - -impl Transition { - fn new(consumes: ByteRange, to: StateId) -> Self { - Self { - to, - consumes: Some(consumes), - } - } - - fn epsilon(to: StateId) -> Self { - Self { to, consumes: None } - } - - fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { - self.to = f(self.to); - } - - fn reachable_states(&self) -> impl Iterator<Item = StateId> { - [self.to].into_iter() - } -} - -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum Acceptance { - Accept, - Assertion, - NotYet, -} - -#[derive(Debug, Clone)] -pub struct EState { - pub trans: HashSet<Transition>, - pub assert: Option<Assertion>, - pub accept: Acceptance, -} - -impl EState { - fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) { - self.trans.retain(|t| t.consumes.is_some()); - for transition in trans.into_iter() { - assert!(transition.consumes.is_none()); - self.trans.insert(transition); - } - } - - fn start() -> Self { - Self { - trans: HashSet::new(), - assert: None, - accept: Acceptance::NotYet, - } - } - fn terminal() -> Self { - Self { - trans: HashSet::new(), - assert: None, - accept: Acceptance::Accept, - } - } - - fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) { - self.trans = self - .trans - .iter() - .cloned() - .map(|mut t| { - t.remap(&mut f); - t - }) - .collect(); - if let Some(a) = self.assert.as_mut() { - a.to = f(a.to); - } - } - - fn reachable_states(&self) -> impl Iterator<Item = StateId> { - self.trans - .iter() - .flat_map(|t| t.reachable_states()) - .chain(self.assert.iter().map(|a| a.to)) - } -} - -#[derive(Debug)] -pub enum EnfaTranslationError { - CharacterClassNotSupported, - AssertionsNotSupported, -} - -impl TryFrom<Pattern> for ENFA { - type Error = EnfaTranslationError; - - fn try_from(value: Pattern) -> Result<Self, Self::Error> { - Ok(match value { - Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?, - Pattern::Range(c1, c2) => Self { - states: vec![ - EState { - trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)], - assert: None, - accept: Acceptance::NotYet, - }, - EState::terminal(), - ], - }, - Pattern::CharacterClass(_) => { - return Err(EnfaTranslationError::CharacterClassNotSupported); - } - Pattern::Alt(alts) => { - let nfas: Vec<ENFA> = alts - .into_iter() - .map(Self::try_from) - .collect::<Result<_, _>>()?; - let mut states = vec![EState::start()]; - let mut ends = vec![]; - for nfa in nfas.into_iter() { - let len = states.len(); - states[0].trans.insert(Transition::epsilon(len)); - states.append(&mut (nfa.shift(len))); - ends.push(states.len() - 1); - } - states.push(EState::terminal()); - for end in ends.into_iter() { - let last = states.len() - 1; - states[end].trans.insert(Transition::epsilon(last)); - } - Self { states } - } - Pattern::Concat(seq) => { - let nfas: Vec<Self> = seq - .into_iter() - .map(Self::try_from) - .collect::<Result<_, _>>()?; - Self::concat(nfas) - } - Pattern::Rep(regex, min, None, _) => { - let nfa = ENFA::try_from(*regex)?; - let base = nfa.clone().repeat(min as usize); - let tail = nfa.looping(); - Self::concat(vec![base, tail]) - } - Pattern::Rep(regex, min, Some(max), _) => { - assert!(min < max); - let nfa = Self::try_from(*regex)?; - let base = nfa.clone().repeat(min as usize); - let tail = nfa.optx((max - min) as usize); - Self::concat(vec![base, tail]) - } - Pattern::Nothing => Self { - states: vec![EState::terminal()], - }, - Pattern::Assertion(dir, polarity, pat) => { - if dir == LookDirection::Behind { - return Err(EnfaTranslationError::AssertionsNotSupported); - } - let mut regex = Self::try_from(*pat)?; - for s in regex.states.iter_mut() { - if s.accept == Acceptance::Accept { - s.accept = Acceptance::Assertion; - } - } - let mut regex = regex.shift(1); - let mut states = Vec::with_capacity(regex.len() + 2); - states.push(EState { - trans: set![Transition::epsilon(regex.len() + 1)], - assert: Some(Assertion { to: 1, polarity }), - accept: Acceptance::NotYet, - }); - states.append(&mut regex); - states.push(EState::terminal()); - Self { states } - } - Pattern::Submatch(pat) => { - // NFA does not track submatches, so we are skipping it. - Self::try_from(*pat)? - } - }) - } -} diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs deleted file mode 100644 index 486b0d9..0000000 --- a/src/parse/regex/mod.rs +++ /dev/null @@ -1,432 +0,0 @@ -use crate::parse::OtherHighlights; - -use super::{Parse, ParseError, Result}; - -pub mod bc; -mod byte_range; -pub mod dfa; -pub mod enfa; - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum LookDirection { - Ahead, - Behind, -} - -impl LookDirection { - pub fn reverse(self) -> Self { - match self { - Self::Ahead => Self::Behind, - Self::Behind => Self::Ahead, - } - } -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum LookPolarity { - Positive, - Negative, -} - -#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] -pub enum CharacterClass { - Everything, - Nothing, - Whitespace, - Alphabetic, - Alphanumeric, -} - -impl CharacterClass { - pub fn matches(self, byte: u8) -> bool { - match self { - CharacterClass::Everything => true, - CharacterClass::Nothing => false, - CharacterClass::Whitespace => byte.is_ascii_whitespace(), - CharacterClass::Alphabetic => byte.is_ascii_alphabetic(), - CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(), - } - } -} - -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum Pattern { - Byte(u8), - Range(u8, u8), - CharacterClass(CharacterClass), - Alt(Vec<Pattern>), - Concat(Vec<Pattern>), - Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior), - Assertion(LookDirection, LookPolarity, Box<Pattern>), - Submatch(Box<Pattern>), - Nothing, -} - -#[derive(Debug, PartialEq, Eq, Copy, Clone)] -pub enum ByteConsumption { - Bounded(usize), - Unbounded, -} - -impl ByteConsumption { - pub fn zero() -> Self { - Self::Bounded(0) - } - pub fn one() -> Self { - Self::Bounded(1) - } -} - -impl Ord for ByteConsumption { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match (self, other) { - (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b), - (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less, - (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater, - (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal, - } - } -} -impl PartialOrd for ByteConsumption { - fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.cmp(other)) - } -} - -impl std::ops::Add for ByteConsumption { - type Output = Self; - - fn add(self, rhs: Self) -> Self::Output { - if let Self::Bounded(a) = self - && let Self::Bounded(b) = rhs - { - Self::Bounded(a + b) - } else { - Self::Unbounded - } - } -} - -impl std::ops::Mul<usize> for ByteConsumption { - type Output = Self; - - fn mul(self, rhs: usize) -> Self::Output { - match self { - Self::Bounded(x) => Self::Bounded(x * rhs), - Self::Unbounded => Self::Unbounded, - } - } -} - -impl std::iter::Sum for ByteConsumption { - fn sum<I: Iterator<Item = Self>>(iter: I) -> Self { - iter.fold(Self::zero(), |a, b| a + b) - } -} - -impl Pattern { - pub fn max_byte_consumption(&self) -> ByteConsumption { - match self { - Pattern::Byte(_) => ByteConsumption::one(), - Pattern::Range(_, _) => ByteConsumption::one(), - Pattern::CharacterClass(_) => ByteConsumption::one(), - Pattern::Alt(patterns) => patterns - .iter() - .map(Self::max_byte_consumption) - .max() - .unwrap_or(ByteConsumption::zero()), - Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(), - Pattern::Rep(pattern, _, Some(max_reps), _) => { - pattern.max_byte_consumption() * (*max_reps as usize) - } - Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded, - Pattern::Assertion(_, _, _) => ByteConsumption::zero(), - Pattern::Nothing => ByteConsumption::zero(), - Pattern::Submatch(pat) => pat.max_byte_consumption(), - } - } - - pub fn reverse(self) -> Self { - use Pattern::*; - match self { - Byte(_) | Nothing | Range(..) | CharacterClass(_) => self, - Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()), - Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()), - Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy), - Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())), - Submatch(pat) => Submatch(Box::new(pat.reverse())) - } - } -} - -impl Parse for Pattern { - fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> { - let begin = b.loc(); - let result = parse0(b); - if result.is_ok() { - b.highlight_from(begin, OtherHighlights::Regex); - } - result - } -} - -fn parse0(s: &mut super::Cursor<'_>) -> Result<Pattern> { - parse_alt(s) -} - -fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> { - let mut seqs = vec![]; - loop { - let seq = parse_seq(s)?; - seqs.push(seq); - let begin = s.loc(); - if s.has() && s.peek() == b'|' { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - } else { - break; - } - } - - Ok(match seqs.len() { - 0 => Pattern::Nothing, - 1 => seqs.into_iter().next().unwrap(), - _ => Pattern::Alt(seqs), - }) -} - -fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> { - let mut reps = vec![]; - loop { - let rep = parse_rep(s)?; - if rep != Pattern::Nothing { - reps.push(rep); - } else { - break; - } - } - - Ok(match reps.len() { - 0 => Pattern::Nothing, - 1 => reps.into_iter().next().unwrap(), - _ => Pattern::Concat(reps), - }) -} - -fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> { - let atom = parse_atom(s)?; - - if atom == Pattern::Nothing { - return Ok(atom); - } - - if !s.has() { - return Ok(atom); - } - - let begin = s.loc(); - - let rep = match s.peek() { - b'*' => Some((0, None)), - b'+' => Some((1, None)), - b'?' => Some((0, Some(1))), - _ => None, - }; - - if let Some((min_rep, max_rep)) = rep { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - let greed = if s.has() && s.peek() == b'?' { - s.adv(); - GreedyBehavior::NonGreedy - } else { - GreedyBehavior::Greedy - }; - - Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed)) - } else { - Ok(atom) - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub enum GreedyBehavior { - Greedy, - NonGreedy, -} - -const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ "; -fn is_symbol(x: u8) -> bool { - SYMBOLS.contains(&x) -} - -fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> { - if !s.has() { - return Ok(Pattern::Nothing); - } - - let begin = s.loc(); - - match s.peek() { - b'[' => { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - let mut ranges = Vec::new(); - loop { - if !s.has() { - return Err(ParseError::Eof); - } - - let begin = s.loc(); - let tok = s.adv(); - - if tok == b']' { - if ranges.is_empty() { - todo!("error handling for empty alternative list"); - } - s.highlight_from(begin, OtherHighlights::RegexSymbol); - return Ok(Pattern::Alt(ranges)); - } - - let begin = s.loc(); - if s.has() && s.peek() == b'-' { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if !s.has() { - return Err(ParseError::Eof); - } - let tok2 = s.peek(); - - if tok2 == b']' { - ranges.push(Pattern::Byte(tok)); - ranges.push(Pattern::Byte(b'-')); - } else if is_symbol(tok2) { - return Err(ParseError::Unknown(tok2)); - } else { - s.adv(); - ranges.push(Pattern::Range(tok, tok2)); - } - } else { - ranges.push(Pattern::Byte(tok)); - } - } - } - b'(' => { - s.adv(); - - let mut assertion = None; - if s.buf.starts_with(b"?=") { - s.advance(2); - assertion = Some((LookDirection::Ahead, LookPolarity::Positive)); - } else if s.buf.starts_with(b"?!") { - s.advance(2); - assertion = Some((LookDirection::Ahead, LookPolarity::Negative)); - } else if s.buf.starts_with(b"?<=") { - s.advance(3); - assertion = Some((LookDirection::Behind, LookPolarity::Positive)); - } else if s.buf.starts_with(b"?<!") { - s.advance(3); - assertion = Some((LookDirection::Behind, LookPolarity::Negative)); - } - - s.highlight_from(begin, OtherHighlights::RegexSymbol); - let inner = parse0(s)?; - if !s.has() { - return Err(ParseError::Eof); - } - let begin = s.loc(); - if s.adv() != b')' { - return Err(ParseError::Expected(')')); - } - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if let Some((dir, pol)) = assertion { - Ok(Pattern::Assertion(dir, pol, Box::new(inner))) - } else { - Ok(Pattern::Submatch(Box::new(inner))) - } - } - b'.' => { - s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - Ok(Pattern::Range(0, 127)) - } - b'\\' => { - s.adv(); - if s.has() { - let escaped = s.adv(); - s.highlight_from(begin, OtherHighlights::RegexSymbol); - - if is_symbol(escaped) { - Ok(Pattern::Byte(escaped)) - } else { - // TODO interpret \w and others - Err(ParseError::Unknown(escaped)) - } - } else { - Err(ParseError::Eof) - } - } - x if is_symbol(x) => Ok(Pattern::Nothing), - ch => { - s.adv(); - Ok(Pattern::Byte(ch)) - } - } -} - -pub struct CompiledPattern { - dfa: dfa::DFA, -} - -impl std::fmt::Debug for CompiledPattern { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.dfa.fmt(f) - } -} - -#[derive(Debug)] -pub enum CompilationError { - Enfa(enfa::EnfaTranslationError), -} - -impl From<enfa::EnfaTranslationError> for CompilationError { - fn from(value: enfa::EnfaTranslationError) -> Self { - Self::Enfa(value) - } -} - -impl Pattern { - pub fn try_compile(self) -> std::result::Result<CompiledPattern, CompilationError> { - let enfa = enfa::ENFA::try_from(self)?; - let dfa = dfa::DFA::from(enfa); - Ok(CompiledPattern { dfa }) - } -} - -impl CompiledPattern { - pub fn matches(&self, bytes: impl AsRef<[u8]>) -> bool { - self.dfa.matches(bytes.as_ref()) - } -} - -#[cfg(test)] -macro_rules! regex_matches { - ($regex:literal, $match:literal, $true:literal) => { - assert_eq!( - Pattern::parse_from_bytes($regex.as_bytes()) - .unwrap() - .try_compile() - .unwrap() - .matches($match.as_bytes()), - $true - ) - }; -} - -#[test] -fn foo_matches_foo() { - regex_matches!("foo", "foo", true); -} |
