aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-06 12:15:52 +0200
committerJonas Maier <jonas@x77.dev>2026-06-06 12:15:52 +0200
commit53980774c327675e886179c0a2c140744dcf9b95 (patch)
treeca1fdcc9938fce2c10c51e0a51659c6ba38ac5ba /src/parse/regex
parent75e0c29cf91ddc6299c14a94a038c3e3df3d2805 (diff)
downloadpish-53980774c327675e886179c0a2c140744dcf9b95.tar.gz
special cased regex for performance
Diffstat (limited to 'src/parse/regex')
-rw-r--r--src/parse/regex/bc.rs746
-rw-r--r--src/parse/regex/byte_range.rs204
-rw-r--r--src/parse/regex/dfa.rs346
-rw-r--r--src/parse/regex/enfa.rs733
-rw-r--r--src/parse/regex/mod.rs432
5 files changed, 0 insertions, 2461 deletions
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs
deleted file mode 100644
index cb41607..0000000
--- a/src/parse/regex/bc.rs
+++ /dev/null
@@ -1,746 +0,0 @@
-use std::collections::{HashMap, VecDeque};
-
-use crate::{
- bitset::BitSet,
- parse::regex::{
- CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange,
- },
-};
-
-trait Flavor: Clone {
- type CustomInstr: Copy + Clone + std::fmt::Debug;
- type ThreadData: Clone;
- type StepData<'a, 'b>
- where
- 'b: 'a;
-
- fn accepts<'a, 'b>(
- thread: &mut Thread<Self>,
- instr: Self::CustomInstr,
- sd: &mut Self::StepData<'a, 'b>,
- ) -> bool;
-
- fn save(x: u32) -> Option<Self::CustomInstr>;
-}
-
-#[derive(Copy, Clone, Debug)]
-struct MainFlavor;
-impl Flavor for MainFlavor {
- type CustomInstr = MainInstr;
- type ThreadData = Box<[Option<usize>]>;
- type StepData<'a, 'b>
- = (usize, &'a BitSet, &'a mut LookaheadVM<'b>)
- where
- 'b: 'a;
-
- fn accepts<'a, 'b>(
- thread: &mut Thread<Self>,
- instr: Self::CustomInstr,
- data: &mut Self::StepData<'a, 'b>,
- ) -> bool {
- match instr {
- MainInstr::Save(reg) => {
- thread.data[reg as usize] = Some(data.0);
- true
- }
- MainInstr::Join(assertion) => {
- let should_match = assertion.pol == LookPolarity::Positive;
- let state = assertion.target as usize;
- let is_matching = match assertion.dir {
- LookDirection::Ahead => data.2.get_state(data.0, state),
- LookDirection::Behind => data.1.get(state),
- };
- is_matching == should_match
- }
- }
- }
-
- fn save(x: u32) -> Option<Self::CustomInstr> {
- Some(MainInstr::Save(x))
- }
-}
-
-#[derive(Copy, Clone, Debug)]
-enum Nothing {}
-
-#[derive(Copy, Clone, Debug)]
-struct AssertionFlavor;
-impl Flavor for AssertionFlavor {
- type CustomInstr = Nothing;
- type ThreadData = ();
- type StepData<'a, 'b>
- = ()
- where
- 'b: 'a;
-
- fn accepts(_thread: &mut Thread<Self>, instr: Self::CustomInstr, _sd: &mut ()) -> bool {
- match instr {}
- }
-
- fn save(_: u32) -> Option<Self::CustomInstr> {
- None
- }
-}
-
-type JumpTarget = u32;
-type Register = u32;
-
-#[derive(Copy, Clone, Debug)]
-struct Assertion {
- target: JumpTarget,
- dir: LookDirection,
- pol: LookPolarity,
-}
-
-#[derive(Copy, Clone, Debug)]
-enum Instr<F: Flavor> {
- Class(CharacterClass),
- Consume(ByteRange),
- Jump(JumpTarget),
- Fork(JumpTarget, JumpTarget),
- Custom(F::CustomInstr),
-}
-
-#[derive(Copy, Clone, Debug)]
-enum MainInstr {
- Save(Register),
- Join(Assertion),
-}
-
-#[derive(Clone)]
-struct Thread<F: Flavor> {
- pc: JumpTarget,
- data: F::ThreadData,
-}
-
-struct VM<'p, F: Flavor> {
- instr: &'p [Instr<F>],
- passive_threads: VecDeque<Thread<F>>,
- active_threads: VecDeque<Thread<F>>,
- hot: BitSet,
- warm: BitSet,
-}
-
-impl<'p, F: Flavor> VM<'p, F> {
- fn new(instr: &'p [Instr<F>], starting_thread: Thread<F>) -> Self {
- Self {
- instr,
- passive_threads: vec![starting_thread].into(),
- active_threads: VecDeque::new(),
- hot: BitSet::new(instr.len()),
- warm: BitSet::new(instr.len()),
- }
- }
-
- fn step_epsilon<'a>(&mut self, sd: &mut F::StepData<'a, 'p>) {
- std::mem::swap(&mut self.active_threads, &mut self.passive_threads);
- self.hot.set_all(false);
- self.warm.set_all(false);
-
- macro_rules! add_thread {
- ($t:expr) => {{
- let t = $t;
- let bit = t.pc as usize;
- if !self.warm.get(bit) {
- self.warm.set(bit, true);
- self.active_threads.push_front(t);
- }
- }};
- }
-
- while let Some(mut thread) = self.active_threads.pop_front() {
- match self.instr[thread.pc as usize] {
- Instr::Class(_) | Instr::Consume(_) => {
- if !self.hot.get(thread.pc as usize) {
- self.hot.set(thread.pc as usize, true);
- self.passive_threads.push_back(thread);
- }
- }
- Instr::Jump(j) => {
- thread.pc = j;
- add_thread!(thread);
- }
- Instr::Fork(a, b) => {
- add_thread!(Thread {
- pc: b,
- data: thread.data.clone(),
- });
- add_thread!(Thread {
- pc: a,
- data: thread.data.clone(),
- });
- }
- Instr::Custom(instr) => {
- if F::accepts(&mut thread, instr, sd) {
- thread.pc += 1;
- add_thread!(thread);
- }
- }
- }
- }
- }
-
- fn step_consume(&mut self, byte: u8) {
- self.hot.set_all(false);
- self.passive_threads
- .retain_mut(|thread| match self.instr[thread.pc as usize] {
- Instr::Class(class) => {
- if class.matches(byte) {
- thread.pc += 1;
- self.hot.set(thread.pc as usize, true);
- true
- } else {
- false
- }
- }
- Instr::Consume(bytes) => {
- if bytes.contains(byte) {
- thread.pc += 1;
- self.hot.set(thread.pc as usize, true);
- true
- } else {
- false
- }
- }
- _ => false,
- });
- }
-}
-
-struct LookaheadVM<'a> {
- vm: VM<'a, AssertionFlavor>,
- data: &'a [u8],
- cached: bool,
- cache_data: Vec<BitSet>,
- loc_offset: usize,
-}
-
-impl<'a> LookaheadVM<'a> {
- fn new(vm: VM<'a, AssertionFlavor>, data: &'a [u8]) -> Self {
- Self {
- vm,
- data,
- cached: false,
- cache_data: Vec::new(),
- loc_offset: 0,
- }
- }
-
- fn get_state(&mut self, loc: usize, state: usize) -> bool {
- if !self.cached {
- assert!(self.cache_data.is_empty());
- assert_eq!(self.loc_offset, 0);
- self.loc_offset = loc;
- self.vm.step_epsilon(&mut ());
- self.cache_data.push(self.vm.hot.clone());
- for i in (loc..self.data.len()).rev() {
- self.vm.step_consume(self.data[i]);
- self.vm.step_epsilon(&mut ());
- self.cache_data.push(self.vm.hot.clone());
- }
- self.cache_data.reverse();
- self.cached = true;
- }
-
- assert!(
- loc >= self.loc_offset,
- "get_state must be called with non-decreasing arguments."
- );
- self.cache_data[loc - self.loc_offset].get(state)
- }
-}
-
-struct VirtualMachine<'a> {
- vm0: VM<'a, AssertionFlavor>,
- vm1: VM<'a, MainFlavor>,
- vm2: LookaheadVM<'a>,
- accepting: &'a BitSet,
-}
-
-impl<'a> VirtualMachine<'a> {
- fn step_epsilon_1(&mut self, loc: usize) {
- self.vm1
- .step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2));
- }
-
- fn step_epsilon(&mut self, loc: usize) {
- self.vm0.step_epsilon(&mut ());
- self.step_epsilon_1(loc);
- }
-
- fn step_consume(&mut self, byte: u8) {
- self.vm0.step_consume(byte);
- self.vm1.step_consume(byte);
- }
-
- fn step(&mut self, byte: u8, loc: usize) {
- self.step_epsilon(loc);
- self.step_consume(byte);
- }
-
- fn extract_match(&self) -> Option<Match> {
- self.vm1
- .passive_threads
- .iter()
- .filter(|t| self.accepting.get(t.pc as usize))
- .map(|t| {
- let submatches: Vec<_> = t.data.windows(2).map(|x| Some(x[0]?..x[1]?)).collect();
-
- Match {
- submatches: submatches.into(),
- }
- })
- .next()
- }
-}
-
-fn fmt_instructions<F: std::fmt::Debug + Flavor>(
- f: &mut std::fmt::Formatter<'_>,
- label: &str,
- insns: &[Instr<F>],
-) -> std::fmt::Result {
- writeln!(f, "# {label}")?;
- for (idx, ins) in insns.iter().enumerate() {
- writeln!(f, "{idx}: {ins:?}")?;
- }
- Ok(())
-}
-
-pub struct BytecodeCompiledRegex {
- instrs0: Box<[Instr<AssertionFlavor>]>,
- instrs1: Box<[Instr<MainFlavor>]>,
- instrs2: Box<[Instr<AssertionFlavor>]>,
- no_lookbehind: bool,
- submatch_count: usize,
- accepting: BitSet,
-}
-
-impl std::fmt::Debug for BytecodeCompiledRegex {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- fmt_instructions(f, "behind", &self.instrs0)?;
- fmt_instructions(f, "ahead", &self.instrs2)?;
- fmt_instructions(f, "main", &self.instrs1)?;
- writeln!(f, "accepting: {:?}", self.accepting)
- }
-}
-
-impl BytecodeCompiledRegex {
- pub fn re_match(&self, data: &[u8]) -> Option<Match> {
- let vm0 = VM::new(&self.instrs0, Thread { pc: 0, data: () });
- let vm1 = VM::new(
- &self.instrs1,
- Thread {
- pc: 0,
- data: vec![None; 2 * self.submatch_count].into(),
- },
- );
- let vm2 = VM::new(&self.instrs2, Thread { pc: 0, data: () });
- let vm2 = LookaheadVM::new(vm2, data);
- let mut vm = VirtualMachine {
- vm0,
- vm1,
- vm2,
- accepting: &self.accepting,
- };
- if self.no_lookbehind {
- for (i, ch) in data.iter().cloned().enumerate() {
- vm.step_epsilon_1(i);
- vm.vm1.step_consume(ch);
- }
- vm.step_epsilon(data.len());
- } else {
- for (i, ch) in data.iter().cloned().enumerate() {
- vm.step(ch, i);
- }
- vm.step_epsilon(data.len());
- }
- vm.extract_match()
- }
-
- pub fn matches(&self, data: &[u8]) -> bool {
- self.re_match(data).is_some()
- }
-}
-
-pub struct Match {
- pub submatches: Box<[Option<core::ops::Range<usize>>]>,
-}
-
-type AssertionHandler<'a, F> =
- Box<dyn 'a + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>>;
-
-#[derive(Copy, Clone)]
-struct CompiledSnippet {
- begin: JumpTarget,
- end: JumpTarget,
-}
-
-struct Compiler<'a, F: Flavor> {
- instrs: Vec<Instr<F>>,
- map: HashMap<Pattern, CompiledSnippet>,
- assertion_handler: AssertionHandler<'a, F>,
- assertion_fork_base: usize,
- submatch_count: usize,
-}
-
-fn fork<F: Flavor>(repeat: usize, exit: usize, greedy: GreedyBehavior) -> Instr<F> {
- let repeat = repeat as JumpTarget;
- let exit = exit as JumpTarget;
- match greedy {
- GreedyBehavior::Greedy => Instr::Fork(repeat, exit),
- GreedyBehavior::NonGreedy => Instr::Fork(exit, repeat),
- }
-}
-
-impl<'a, F: Flavor> Compiler<'a, F> {
- fn new(
- assertion_handler: impl 'a
- + FnMut(LookDirection, LookPolarity, Pattern) -> CompileResult<Instr<F>>,
- ) -> Self {
- Self {
- instrs: Vec::new(),
- map: HashMap::new(),
- assertion_handler: Box::new(assertion_handler),
- assertion_fork_base: usize::MAX,
- submatch_count: 0,
- }
- }
-
- fn rep_1_or_more(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult {
- let base = self.instrs.len();
- self.compile(pat)?;
- let exit = self.instrs.len() + 1;
- self.instrs.push(fork(base, exit, greedy));
- Ok(())
- }
-
- fn rep_0_or_1(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult {
- let base = self.instrs.len();
- self.instrs.push(Instr::Jump(u32::MAX));
- self.compile(pat)?;
- self.instrs[base] = fork(base + 1, self.instrs.len(), greedy);
- Ok(())
- }
-
- fn rep_any_amt(&mut self, pat: Pattern, greedy: GreedyBehavior) -> CompileResult {
- let base = self.instrs.len();
- self.instrs.push(Instr::Jump(u32::MAX));
- self.compile(pat)?;
- let fork_pos = self.instrs.len();
- let after = fork_pos + 1;
- self.instrs.push(fork(base + 1, after, greedy));
- self.instrs[base] = Instr::Jump(fork_pos as JumpTarget);
- Ok(())
- }
-
- fn compile(&mut self, pat: Pattern) -> CompileResult {
- match pat {
- Pattern::Byte(x) => self.instrs.push(Instr::Consume(ByteRange::new_single(x))),
- Pattern::Range(a, b) => self.instrs.push(Instr::Consume(ByteRange::new_range(a, b))),
- Pattern::CharacterClass(cc) => {
- self.instrs.push(Instr::Class(cc));
- }
- Pattern::Alt(patterns) => {
- let branch_factor = patterns.len();
- assert!(branch_factor > 0);
-
- let base = self.instrs.len();
-
- // placeholders to later place in forks
- for _ in 0..patterns.len() - 1 {
- self.instrs.push(Instr::Jump(u32::MAX));
- }
-
- let mut enter_pats = Vec::new();
- let mut leave_pats = Vec::new();
- for pat in patterns.into_iter() {
- enter_pats.push(self.instrs.len());
- self.compile(pat)?;
- leave_pats.push(self.instrs.len());
-
- // placeholder to place in join
- self.instrs.push(Instr::Jump(u32::MAX));
- }
-
- self.instrs.pop(); // remove last jump
- let join_point = self.instrs.len();
-
- // link forks
- for i in 0..branch_factor - 1 {
- let a = enter_pats[i];
- let b = if i == branch_factor - 2 {
- enter_pats[i + 1]
- } else {
- base + i + 1
- };
- self.instrs[base + i] = Instr::Fork(a as JumpTarget, b as JumpTarget);
- }
-
- // link joins
- for i in 0..branch_factor - 1 {
- self.instrs[leave_pats[i]] = Instr::Jump(join_point as JumpTarget);
- }
- }
- Pattern::Concat(patterns) => {
- for pat in patterns.into_iter() {
- self.compile(pat)?;
- }
- }
- Pattern::Rep(pat, 0, None, greed) => {
- self.rep_any_amt(*pat, greed)?;
- }
- Pattern::Rep(pat, min, None, greed) => {
- let pat = *pat;
- for _ in 1..min {
- self.compile(pat.clone())?;
- }
- self.rep_1_or_more(pat, greed)?;
- }
- Pattern::Rep(pat, min, Some(max), greed) => {
- let pat = *pat;
- let opt = max - min;
- for _ in 0..min {
- self.compile(pat.clone())?;
- }
- for _ in 0..opt {
- self.rep_0_or_1(pat.clone(), greed)?;
- }
- }
- Pattern::Assertion(look_direction, look_polarity, pattern) => {
- let ins = (self.assertion_handler)(look_direction, look_polarity, *pattern)?;
- self.instrs.push(ins);
- }
- Pattern::Nothing => {}
- Pattern::Submatch(pat) => {
- let i = self.submatch_count as u32 * 2;
- self.submatch_count += 1;
- if let Some(ins) = F::save(i) {
- self.instrs.push(Instr::Custom(ins));
- }
- self.compile(*pat)?;
- if let Some(ins) = F::save(i + 1) {
- self.instrs.push(Instr::Custom(ins));
- }
- }
- }
- Ok(())
- }
-
- fn compile_and_memoize(&mut self, pat: Pattern) -> CompileResult<CompiledSnippet> {
- if let Some(&jt) = self.map.get(&pat) {
- return Ok(jt);
- }
- let begin = self.instrs.len() as JumpTarget;
- self.compile(pat.clone())?;
- let end = self.instrs.len() as JumpTarget;
- self.instrs.push(Instr::Class(CharacterClass::Nothing));
- let bounds = CompiledSnippet { begin, end };
- self.map.insert(pat, bounds);
- Ok(bounds)
- }
-
- fn finalize_assertion_forks(&mut self) {
- let fork_targets: Vec<JumpTarget> = self.map.values().map(|v| v.begin).collect();
- let fork_begin = self.instrs.len() as JumpTarget;
- match fork_targets.len() {
- 0 => {
- self.instrs[self.assertion_fork_base] = Instr::Class(CharacterClass::Nothing);
- }
- 1 => {
- self.instrs[self.assertion_fork_base] = Instr::Jump(fork_targets[0]);
- }
- 2 => {
- self.instrs[self.assertion_fork_base] =
- Instr::Fork(fork_targets[0], fork_targets[1]);
- }
- n => {
- self.instrs[self.assertion_fork_base] = Instr::Fork(fork_targets[0], fork_begin);
- for i in 1..n - 1 {
- let fork = if i == n - 2 {
- Instr::Fork(fork_targets[i], fork_targets[i + 1])
- } else {
- Instr::Fork(fork_targets[i], self.instrs.len() as JumpTarget + 1)
- };
- self.instrs.push(fork);
- }
- }
- }
- }
-}
-
-fn assertion_compiler() -> Compiler<'static, AssertionFlavor> {
- let mut c = Compiler::new(|_, _, _| Err(RegexCompilationError::NestedLookaroundNotSupported));
- c.rep_any_amt(
- Pattern::CharacterClass(CharacterClass::Everything),
- GreedyBehavior::NonGreedy,
- )
- .expect("characterclass should always compile");
- c.assertion_fork_base = c.instrs.len();
- c.instrs.push(Instr::Jump(u32::MAX)); // in the end this gets replaced by a jump to a fork-list for all the assertions
- c
-}
-
-#[derive(Clone, Debug)]
-pub enum RegexCompilationError {
- NestedLookaroundNotSupported,
-}
-
-pub type CompileResult<T = ()> = Result<T, RegexCompilationError>;
-
-impl TryFrom<Pattern> for BytecodeCompiledRegex {
- type Error = RegexCompilationError;
-
- fn try_from(value: Pattern) -> Result<Self, Self::Error> {
- let mut neg = assertion_compiler();
- let mut pos = assertion_compiler();
- let (final_state, instrs, submatch_count) = {
- let mut main: Compiler<MainFlavor> = Compiler::new(|dir, pol, pat| {
- let target = match dir {
- LookDirection::Ahead => pos.compile_and_memoize(pat.reverse()),
- LookDirection::Behind => neg.compile_and_memoize(pat),
- }?
- .end;
-
- Ok(Instr::Custom(MainInstr::Join(Assertion {
- target,
- dir,
- pol,
- })))
- });
- main.compile(value)?;
- let end = main.instrs.len();
- main.instrs.push(Instr::Class(CharacterClass::Nothing));
- (end, main.instrs, main.submatch_count)
- };
- neg.finalize_assertion_forks();
- pos.finalize_assertion_forks();
-
- let mut accepting = BitSet::new(instrs.len());
- accepting.set(final_state, true);
-
- Ok(Self {
- no_lookbehind: neg.map.is_empty(),
- instrs0: neg.instrs.into(),
- instrs1: instrs.into(),
- instrs2: pos.instrs.into(),
- accepting,
- submatch_count,
- })
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use crate::parse::Parse;
-
- fn regex(s: &str) -> BytecodeCompiledRegex {
- let pat = Pattern::parse_from_bytes(s.as_bytes()).unwrap();
- let compiled = BytecodeCompiledRegex::try_from(pat).unwrap();
- compiled
- }
-
- #[test]
- fn print_compiled_vm() {
- let compiled = regex("a?b?");
- println!("{compiled:#?}");
- assert_eq!(compiled.matches(b"ab"), true);
- assert_eq!(compiled.matches(b"a"), true);
- assert_eq!(compiled.matches(b"b"), true);
- assert_eq!(compiled.matches(b""), true);
- }
-
- #[test]
- fn nongreedy_star() {
- let re = regex("(ab*?)bb*");
- assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
- 0..1
- );
- assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..1
- );
- }
-
- #[test]
- fn greedy_star() {
- let re = regex("(ab*)bb*");
- assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
- 0..3
- );
- assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..5
- );
- }
-
- #[test]
- fn nongreedy_plus() {
- let re = regex("(ab+?)bb*");
- assert_eq!(
- re.re_match(b"abbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..2
- );
- assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..2
- );
- }
-
- #[test]
- fn greedy_plus() {
- let re = regex("(ab+)bb*");
- assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
- 0..3
- );
- assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..5
- );
- }
-
- #[test]
- fn nongreedy_qm() {
- let re = regex("(ab??)bb*");
- assert_eq!(
- re.re_match(b"abbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..1
- );
- assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..1
- );
- }
-
- #[test]
- fn greedy_qm() {
- let re = regex("(ab?)bb*");
- assert_eq!(
- re.re_match(b"abbb").unwrap().submatches[0].clone().unwrap(),
- 0..2
- );
- assert_eq!(
- re.re_match(b"abbbbb").unwrap().submatches[0]
- .clone()
- .unwrap(),
- 0..2
- );
- }
-}
diff --git a/src/parse/regex/byte_range.rs b/src/parse/regex/byte_range.rs
deleted file mode 100644
index b7642c1..0000000
--- a/src/parse/regex/byte_range.rs
+++ /dev/null
@@ -1,204 +0,0 @@
-use std::ops::RangeInclusive;
-
-#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
-pub struct ByteRange {
- /// inclusive
- from: u8,
- /// inclusive
- to: u8,
-}
-
-impl From<RangeInclusive<u8>> for ByteRange {
- fn from(value: RangeInclusive<u8>) -> Self {
- Self::new_range(*value.start(), *value.end())
- }
-}
-
-impl ByteRange {
- pub fn new_range(from: u8, to: u8) -> Self {
- assert!(from <= to, "{from} <= {to}");
- Self { from, to }
- }
-
- pub fn new_single(c: u8) -> Self {
- Self::new_range(c, c)
- }
-
- pub fn contains(&self, c: u8) -> bool {
- self.from <= c && c <= self.to
- }
-
- pub fn overlaps(&self, other: Self) -> bool {
- self.from.max(other.from) <= self.to.min(other.to)
- }
-
- pub fn split_to_disjoint(ranges: Vec<ByteRange>) -> Vec<ByteRange> {
- if ranges.is_empty() {
- return vec![];
- }
-
- let mut points: Vec<u8> = Vec::new();
- for r in &ranges {
- points.push(r.from);
- if r.to != u8::MAX {
- points.push(r.to + 1);
- }
- }
-
- points.sort_unstable();
- points.dedup();
-
- let mut out = Vec::new();
-
- for window in points.windows(2) {
- let start = window[0];
- let end_exclusive = window[1];
-
- if start >= end_exclusive {
- continue;
- }
-
- let mut active = false;
-
- for r in &ranges {
- if r.from <= start && start <= r.to {
- active = true;
- break;
- }
- }
-
- if active {
- out.push(ByteRange {
- from: start,
- to: end_exclusive - 1,
- });
- }
- }
-
- out
- }
-}
-
-#[test]
-fn byterange_test() {
- assert_eq!(
- ByteRange::split_to_disjoint(vec![
- ByteRange::new_range(b'a', b'z'),
- ByteRange::new_single(b'm')
- ]),
- vec![
- ByteRange::new_range(b'a', b'l'),
- ByteRange::new_single(b'm'),
- ByteRange::new_range(b'n', b'z'),
- ]
- );
-}
-
-impl std::fmt::Debug for ByteRange {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- if self.from == self.to {
- write!(f, "{}", [self.from].escape_ascii())
- } else {
- write!(
- f,
- "{}-{}",
- [self.from].escape_ascii(),
- [self.to].escape_ascii()
- )
- }
- }
-}
-
-#[cfg(test)]
-mod non_overlapping_tests {
- use std::ops::RangeInclusive;
-
- use super::ByteRange;
-
- fn middle(r: ByteRange) -> u8 {
- let a = r.from as u8;
- let b = r.to as u8;
- (a + (b - a) / 2) as u8
- }
-
- fn prev(c: u8) -> u8 {
- c - 1
- }
-
- fn next(c: u8) -> u8 {
- c + 1
- }
-
- fn run(ranges: Vec<RangeInclusive<u8>>) {
- let ranges1: Vec<ByteRange> = ranges.into_iter().map(Into::into).collect();
- let ranges2 = ByteRange::split_to_disjoint(ranges1.clone());
-
- let r1 = |c| ranges1.iter().any(|cr| cr.contains(c));
- let r2 = |c| ranges2.iter().any(|cr| cr.contains(c));
-
- for &range in ranges1.iter() {
- assert!(r1(range.from));
- assert!(r1(range.to));
- assert!(r1(middle(range)));
-
- assert!(r2(range.from));
- assert!(r2(range.to));
- assert!(r2(middle(range)));
-
- assert_eq!(r1(prev(range.from)), r2(prev(range.from)));
- assert_eq!(r1(next(range.from)), r2(next(range.from)));
- }
-
- for i in 0..ranges2.len() {
- for j in 0..i {
- assert!(
- !ranges2[i].overlaps(ranges2[j]),
- "{i} and {j} overlap: {:?}, {:?}",
- ranges2[i],
- ranges2[j]
- );
- }
- }
- }
-
- #[test]
- fn overlap_correct() {
- assert!(ByteRange::new_range(b'a', b'g').overlaps(ByteRange::new_single(b'f')));
- assert!(!ByteRange::new_range(b'a', b'g').overlaps(ByteRange::new_single(b'h')));
- }
-
- #[test]
- fn empty() {
- run(vec![]);
- }
-
- #[test]
- fn singleton() {
- run(vec![b'0'..=b'9']);
- }
-
- #[test]
- fn contained1() {
- run(vec![b'0'..=b'9', b'5'..=b'6']);
- }
-
- #[test]
- fn contained2() {
- run(vec![b'5'..=b'6', b'0'..=b'9']);
- }
-
- #[test]
- fn overlap2() {
- run(vec![b'1'..=b'6', b'4'..=b'9'])
- }
-
- #[test]
- fn overlap3() {
- run(vec![b'a'..=b'f', b'd'..=b'j', b'g'..=b'm'])
- }
-
- #[test]
- fn overlap4() {
- run(vec![b'a'..=b'f', b'd'..=b'j', b'g'..=b'm', b'k'..=b'q'])
- }
-}
diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs
deleted file mode 100644
index 35f726c..0000000
--- a/src/parse/regex/dfa.rs
+++ /dev/null
@@ -1,346 +0,0 @@
-use core::fmt;
-use std::collections::{BinaryHeap, HashMap, HashSet};
-
-use super::{
- byte_range::ByteRange,
- enfa::{ENFA, MultiState},
-};
-
-pub type StateId = usize;
-
-pub struct State {
- trans: HashMap<ByteRange, StateId>,
- default_trans: StateId,
- accept: bool,
-}
-
-#[allow(clippy::upper_case_acronyms)]
-pub struct DFA {
- start: StateId,
- states: Vec<State>,
-}
-
-impl fmt::Debug for DFA {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- writeln!(f, "DFA {{")?;
- for (i, s) in self.states.iter().enumerate() {
- if self.start == i {
- write!(f, "-> {i}: ")?;
- } else {
- write!(f, " {i}: ")?;
- }
-
- for (chr, to) in s.trans.iter() {
- write!(f, "{chr:?} to {to}, ")?;
- }
-
- write!(f, "dfl to {}", s.default_trans)?;
- if s.accept {
- write!(f, ", accept")?;
- }
- writeln!(f)?;
- }
- writeln!(f, "}}")
- }
-}
-
-impl From<ENFA> for DFA {
- fn from(mut nfa: ENFA) -> Self {
- nfa.remove_unreachable();
-
- let mut multi_states = nfa.all_multi_states();
- multi_states.insert(nfa.void_multi_state());
- let mut len = 0;
- let multi_to_dfa: HashMap<MultiState, StateId> = multi_states
- .clone()
- .into_iter()
- .map(|ms| {
- len += 1;
- (ms, len - 1)
- })
- .collect();
-
- let void = multi_to_dfa[&nfa.void_multi_state()];
-
- let mut states: Vec<State> = (0..len)
- .map(|_| State {
- trans: HashMap::new(),
- default_trans: void,
- accept: false,
- })
- .collect();
-
- for ms in multi_states.iter() {
- let i: usize = multi_to_dfa[ms];
- states[i].accept = ms.accept();
- for t in ms.possible_transitions() {
- let k = multi_to_dfa[&ms.transition(t)];
- states[i].trans.insert(t, k);
- }
- }
-
- let mut this = Self {
- start: multi_to_dfa[&nfa.start_multi_state()],
- states,
- };
- this.minify();
- this
- }
-}
-
-impl DFA {
- pub fn matches(&self, x: &[u8]) -> bool {
- let mut state = self.start;
- 'next_byte: for &b in x.iter() {
- for (range, &next_state) in self.states[state].trans.iter() {
- if range.contains(b) {
- state = next_state;
- continue 'next_byte;
- }
- }
- state = self.states[state].default_trans;
- }
- self.states[state].accept
- }
-}
-
-mod state_set {
- #[derive(Hash, Clone, PartialEq, Eq)]
- pub struct StateSet {
- set: Vec<usize>,
- }
-
- impl StateSet {
- pub fn new(mut set: Vec<usize>) -> Self {
- set.sort();
- set.dedup();
- Self { set }
- }
-
- pub fn iter(&self) -> impl Iterator<Item = usize> {
- self.set.iter().cloned()
- }
-
- pub fn intersection(&self, other: &Self) -> Self {
- let a = &self.set;
- let b = &other.set;
-
- let mut i = 0;
- let mut j = 0;
- let mut out = Vec::new();
-
- while i < a.len() && j < b.len() {
- match a[i].cmp(&b[j]) {
- std::cmp::Ordering::Less => i += 1,
- std::cmp::Ordering::Greater => j += 1,
- std::cmp::Ordering::Equal => {
- out.push(a[i]);
- i += 1;
- j += 1;
- }
- }
- }
-
- Self::new(out)
- }
-
- pub fn difference(&self, other: &Self) -> Self {
- let a = &self.set;
- let b = &other.set;
-
- let mut i = 0;
- let mut j = 0;
- let mut out = Vec::new();
-
- while i < a.len() && j < b.len() {
- match a[i].cmp(&b[j]) {
- std::cmp::Ordering::Less => {
- out.push(a[i]);
- i += 1;
- }
- std::cmp::Ordering::Greater => {
- j += 1;
- }
- std::cmp::Ordering::Equal => {
- i += 1;
- j += 1;
- }
- }
- }
-
- out.extend_from_slice(&a[i..]);
- Self::new(out)
- }
-
- pub fn is_empty(&self) -> bool {
- self.set.is_empty()
- }
-
- pub fn len(&self) -> usize {
- self.set.len()
- }
-
- pub fn primary_state(&self) -> Option<usize> {
- self.set.first().cloned()
- }
- }
-
- // custom implementation such that smaller sets come first in a BinaryHeap
- impl Ord for StateSet {
- fn cmp(&self, other: &Self) -> std::cmp::Ordering {
- match self.set.len().cmp(&other.set.len()) {
- std::cmp::Ordering::Equal => {}
- other => return other.reverse(),
- }
- self.set.cmp(&other.set)
- }
- }
-
- impl PartialOrd for StateSet {
- fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
- Some(self.cmp(other))
- }
- }
-}
-
-use state_set::StateSet;
-
-trait GoesTo {
- fn goes_to(&self, to: usize) -> bool;
-}
-impl GoesTo for (&State, ByteRange) {
- fn goes_to(&self, target: usize) -> bool {
- let from = self.0;
- let ch = self.1;
- for (c, &to) in from.trans.iter() {
- if c.overlaps(ch) && to == target {
- return true;
- }
- }
- from.default_trans == target
- }
-}
-
-impl DFA {
- fn states_where(&self, mut f: impl FnMut(&State) -> bool) -> StateSet {
- let states = self
- .states
- .iter()
- .enumerate()
- .filter_map(|(i, s)| if f(s) { Some(i) } else { None })
- .collect();
- StateSet::new(states)
- }
-
- /// https://en.wikipedia.org/wiki/DFA_minimization
- fn hopcroft_minimization(&mut self) {
- let mut partitions: HashSet<StateSet> = HashSet::new();
- partitions.insert(self.states_where(|s| s.accept));
- partitions.insert(self.states_where(|s| !s.accept));
-
- let mut ranges: Vec<ByteRange> = self
- .states
- .iter()
- .flat_map(|s| s.trans.iter().map(|t| *t.0))
- .chain([ByteRange::new_range(u8::MIN, u8::MAX)])
- .collect();
- ranges.sort();
- ranges.dedup();
- let ranges = ByteRange::split_to_disjoint(ranges);
-
- let mut queue: BinaryHeap<StateSet> = partitions.iter().cloned().collect();
- let mut queue_set = partitions.clone();
-
- while let Some(a) = queue.pop() {
- if !queue_set.contains(&a) {
- continue;
- }
-
- for &c in ranges.iter() {
- let x = self.states_where(|s| a.iter().any(|a| (s, c).goes_to(a)));
-
- let mut del_list = HashSet::new();
- let mut add_list = Vec::new();
- for y in partitions.iter() {
- let i = x.intersection(y);
- let d = y.difference(&x);
-
- if !i.is_empty() && !d.is_empty() {
- del_list.insert(y.clone());
- add_list.push(i.clone());
- add_list.push(d.clone());
-
- if queue_set.contains(y) {
- queue_set.remove(y);
-
- queue.push(i.clone());
- queue_set.insert(i);
-
- queue.push(d.clone());
- queue_set.insert(d);
- } else if i.len() < d.len() {
- queue.push(i.clone());
- queue_set.insert(i);
- } else {
- queue.push(d.clone());
- queue_set.insert(d);
- }
- }
- }
-
- partitions.retain(|i| !del_list.contains(i));
- for x in add_list {
- partitions.insert(x);
- }
- }
- }
-
- let mut replacement = vec![None; self.states.len()];
- for partition in partitions {
- if let Some(x) = partition.primary_state() {
- for state in partition.iter() {
- assert!(replacement[state].is_none());
- replacement[state] = Some(x);
- }
- }
- }
-
- // replacement indices in original index space
- let replacement: Vec<usize> = replacement.into_iter().map(|x| x.unwrap()).collect();
- let is_alive = |idx: usize| replacement[idx] == idx;
-
- // compact index space
- let mut compact: Vec<usize> = vec![usize::MAX; self.states.len()];
- let mut next = 0;
- for i in 0..self.states.len() {
- if is_alive(i) {
- compact[i] = next;
- next += 1;
- }
- }
-
- // remap everything and skip all no-longer-needed states
- let remap = |idx: usize| compact[replacement[idx]];
- let mut new_states = Vec::with_capacity(next);
- for i in 0..self.states.len() {
- if is_alive(i) {
- let s = &self.states[i];
- new_states.push(State {
- trans: s.trans.iter().map(|(&ch, &to)| (ch, remap(to))).collect(),
- default_trans: remap(s.default_trans),
- accept: s.accept,
- });
- }
- }
- self.states = new_states;
- self.start = remap(self.start);
- }
-
- pub fn minify(&mut self) {
- for state in self.states.iter_mut() {
- state.trans.retain(|_, to| *to != state.default_trans);
- }
-
- self.hopcroft_minimization();
- }
-}
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs
deleted file mode 100644
index dd4cf6e..0000000
--- a/src/parse/regex/enfa.rs
+++ /dev/null
@@ -1,733 +0,0 @@
-use std::{
- collections::HashSet,
- hash::{DefaultHasher, Hash, Hasher},
-};
-
-use crate::parse::regex::{LookDirection, LookPolarity};
-
-use super::Pattern;
-use super::byte_range::ByteRange;
-
-/// NFA with epsilon transitions
-#[derive(Clone)]
-#[allow(clippy::upper_case_acronyms)]
-pub struct ENFA {
- pub states: Vec<EState>,
-}
-
-fn cartesian_product<T: Clone>(x: Vec<Vec<T>>) -> Vec<Vec<T>> {
- let mut result = vec![Vec::new()];
-
- for xs in x {
- let mut next = Vec::new();
-
- for prefix in &result {
- for item in &xs {
- let mut v = prefix.clone();
- v.push(item.clone());
- next.push(v);
- }
- }
-
- result = next;
- }
-
- result
-}
-
-#[cfg(test)]
-mod product_tests {
- use super::cartesian_product;
-
- #[test]
- fn basic_case() {
- let x = vec![vec![1, 2], vec![10, 20]];
-
- let out = cartesian_product(x);
-
- assert_eq!(
- out,
- vec![vec![1, 10], vec![1, 20], vec![2, 10], vec![2, 20],]
- );
- }
-
- #[test]
- fn single_dimension() {
- let x = vec![vec![1, 2, 3]];
-
- let out = cartesian_product(x);
-
- assert_eq!(out, vec![vec![1], vec![2], vec![3],]);
- }
-
- #[test]
- fn empty_outer_vector() {
- let x: Vec<Vec<i32>> = vec![];
-
- let out = cartesian_product(x);
-
- // One empty combination.
- assert_eq!(out, vec![vec![]]);
- }
-
- #[test]
- fn empty_inner_vector() {
- let x = vec![vec![1, 2], vec![], vec![3, 4]];
-
- let out = cartesian_product(x);
-
- assert!(out.is_empty());
- }
-
- #[test]
- fn output_size_matches_product() {
- let x = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
-
- let out = cartesian_product(x);
-
- assert_eq!(out.len(), 3 * 2 * 4);
- }
-
- #[test]
- fn every_output_has_correct_length() {
- let x = vec![vec!['a', 'b'], vec!['x', 'y', 'z'], vec!['0']];
-
- let out = cartesian_product(x);
-
- assert!(out.iter().all(|v| v.len() == 3));
- }
-
- #[test]
- fn works_with_strings() {
- let x = vec![
- vec!["a".to_string(), "b".to_string()],
- vec!["x".to_string()],
- ];
-
- let out = cartesian_product(x);
-
- assert_eq!(
- out,
- vec![
- vec!["a".to_string(), "x".to_string()],
- vec!["b".to_string(), "x".to_string()],
- ]
- );
- }
-}
-
-#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
-struct Thread {
- state: StateId,
- positives: Vec<Thread>,
- negatives: Vec<Thread>,
-}
-
-impl Thread {
- fn new_simple(state: StateId) -> Self {
- Self {
- state,
- positives: Vec::new(),
- negatives: Vec::new(),
- }
- }
-
- fn new(
- enfa: &ENFA,
- state: StateId,
- mut positives: Vec<Thread>,
- mut negatives: Vec<Thread>,
- ) -> Option<Self> {
- positives.sort();
- positives.dedup();
- positives.retain(|t| !t.accept(enfa));
- positives.shrink_to_fit();
-
- negatives.sort();
- negatives.dedup();
- negatives.shrink_to_fit();
-
- if negatives.iter().any(|t| t.accept(enfa)) {
- return None;
- }
-
- Some(Self {
- state,
- positives,
- negatives,
- })
- }
-
- fn accept(&self, enfa: &ENFA) -> bool {
- let pos = self.positives.iter().all(|t| t.accept(enfa));
- let neg = self.negatives.iter().all(|t| !t.accept(enfa));
- let this = match enfa.states[self.state].accept {
- Acceptance::Accept => true,
- Acceptance::Assertion => true,
- Acceptance::NotYet => false,
- };
- pos && neg && this
- }
-
- fn step_epsilon0(
- self,
- enfa: &ENFA,
- ret: &mut impl FnMut(Thread),
- visited: &mut [bool],
- new_assertions: Vec<Assertion>,
- ) {
- if visited[self.state] {
- return;
- }
- visited[self.state] = true;
-
- for t in enfa.states[self.state].trans.iter() {
- if t.consumes.is_some() {
- continue;
- }
-
- let mut new_assertions = new_assertions.clone();
- if let Some(assertion) = enfa.states[self.state].assert.as_ref() {
- new_assertions.push(assertion.clone());
- }
-
- Self {
- state: t.to,
- ..self.clone()
- }
- .step_epsilon0(enfa, ret, visited, new_assertions);
- }
-
- let Self {
- state,
- positives,
- negatives,
- } = self;
- let mut p = Vec::new();
- let mut n = Vec::new();
- for assertion in new_assertions {
- let threads = Self::new_simple(assertion.to).step_epsilon(enfa);
- let vec = match assertion.polarity {
- LookPolarity::Positive => &mut p,
- LookPolarity::Negative => &mut n,
- };
- vec.push(threads);
- }
- let p = cartesian_product(p);
- for mut p in p {
- let mut positives = positives.clone();
- let mut negatives = negatives.clone();
- positives.append(&mut p);
- negatives.append(&mut n.iter().flatten().cloned().collect());
- if let Some(thread) = Self::new(enfa, state, positives, negatives) {
- ret(thread);
- }
- }
- }
-
- fn step_epsilon(self, enfa: &ENFA) -> Vec<Self> {
- let mut vec = Vec::new();
- self.step_epsilon0(
- enfa,
- &mut |t| vec.push(t),
- &mut vec![false; enfa.states.len()],
- Vec::new(),
- );
- vec
- }
-
- fn step0(self, enfa: &ENFA, input: ByteRange, ret: &mut impl FnMut(Thread)) {
- let positives = self
- .positives
- .clone()
- .into_iter()
- .map(|t| t.step(enfa, input))
- .collect();
- let negatives: Vec<_> = self
- .negatives
- .into_iter()
- .flat_map(|t| t.step(enfa, input))
- .collect();
- let positives = cartesian_product(positives);
- let next_states: Vec<StateId> = enfa.states[self.state]
- .trans
- .iter()
- .filter_map(|t| {
- if let Some(ch) = t.consumes
- && ch.overlaps(input)
- {
- Some(t.to)
- } else {
- None
- }
- })
- .collect();
-
- for s in next_states {
- for p in positives.clone() {
- if let Some(thread) = Self::new(enfa, s, p.clone(), negatives.clone()) {
- thread.step_epsilon0(
- enfa,
- ret,
- &mut vec![false; enfa.states.len()],
- Vec::new(),
- );
- }
- }
- }
- }
-
- fn step(self, enfa: &ENFA, input: ByteRange) -> Vec<Self> {
- let mut vec = Vec::new();
- self.step0(enfa, input, &mut |x| vec.push(x));
- vec
- }
-
- fn possible_transitions(&self, enfa: &ENFA, f: &mut impl FnMut(ByteRange)) {
- for t in enfa.states[self.state].trans.iter() {
- if let Some(ch) = t.consumes {
- f(ch);
- }
- }
- for t in self.positives.iter() {
- t.possible_transitions(enfa, f);
- }
- for t in self.negatives.iter() {
- t.possible_transitions(enfa, f);
- }
- }
-}
-
-#[derive(Clone)]
-pub struct MultiState<'a> {
- nfa: &'a ENFA,
- threads: Vec<Thread>,
- accept: bool,
- hash: u64,
-}
-
-impl<'a> PartialEq for MultiState<'a> {
- fn eq(&self, other: &Self) -> bool {
- (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64)
- && self.threads == other.threads
- && self.accept == other.accept
- && self.hash == other.hash
- }
-}
-impl<'a> Eq for MultiState<'a> {}
-
-impl<'a> MultiState<'a> {
- fn new(nfa: &'a ENFA, mut threads: Vec<Thread>) -> Self {
- threads.sort();
- threads.dedup();
- threads.shrink_to_fit();
-
- let accept = threads.iter().any(|t| t.accept(nfa));
- let mut hasher = DefaultHasher::new();
- threads.hash(&mut hasher);
- let hash = hasher.finish();
-
- Self {
- nfa,
- threads,
- accept,
- hash,
- }
- }
-
- /// all the chars that will make an interesting transition
- pub fn possible_transitions(&self) -> Vec<ByteRange> {
- let mut vec = Vec::new();
- for t in self.threads.iter() {
- t.possible_transitions(self.nfa, &mut |x| vec.push(x));
- }
- vec = ByteRange::split_to_disjoint(vec);
- vec.sort();
- vec.dedup();
- vec.shrink_to_fit();
- vec
- }
-
- pub fn transition(&self, ch: ByteRange) -> Self {
- let new_states = self
- .threads
- .iter()
- .flat_map(|t| t.clone().step(self.nfa, ch))
- .collect();
-
- Self::new(self.nfa, new_states)
- }
-
- pub fn accept(&self) -> bool {
- self.accept
- }
-}
-
-impl<'a> Hash for MultiState<'a> {
- fn hash<H: Hasher>(&self, state: &mut H) {
- self.hash.hash(state)
- }
-}
-
-macro_rules! set {
- () => {
- std::collections::HashSet::new()
- };
- ( $( $x:expr ),* ) => {{
- let mut set = std::collections::HashSet::new();
- $(
- set.insert($x);
- )*
- set
- }};
-}
-
-impl ENFA {
- fn shift(self, amt: usize) -> Vec<EState> {
- let mut s = self.states;
-
- for state in s.iter_mut() {
- state.remap(|i| i + amt);
- if state.accept == Acceptance::Accept {
- state.accept = Acceptance::NotYet;
- }
- }
-
- s
- }
-
- pub fn remove_unreachable(&mut self) {
- let mut used = vec![false; self.states.len()];
- used[0] = true;
- for s in self.states.iter() {
- for i in s.reachable_states() {
- used[i] = true;
- }
- }
- let mut remap = vec![0; self.states.len()];
- let mut shift = 0;
- for i in 0..self.states.len() {
- if used[i] {
- remap[i] = i - shift;
- } else {
- shift += 1;
- }
- }
- for i in (0..self.states.len()).rev() {
- if !used[i] {
- self.states.remove(i);
- }
- }
- for s in self.states.iter_mut() {
- s.remap(|i| remap[i]);
- }
- }
-}
-
-impl ENFA {
- fn looping(self) -> Self {
- let mut states = vec![EState::start()];
- states.append(&mut self.shift(1));
- let len = states.len();
- states[0].set_epsilon_transitions([Transition::epsilon(1), Transition::epsilon(len)]);
- states[len - 1].set_epsilon_transitions([Transition::epsilon(0), Transition::epsilon(len)]);
- states.push(EState::terminal());
- Self { states }
- }
-
- fn repeat(self, times: usize) -> Self {
- let reps = vec![self; times];
- Self::concat(reps)
- }
-
- /// between 0 and x repetitions
- fn optx(self, x: usize) -> Self {
- let len = self.states.len();
- let mut repped = self.repeat(x);
- assert_eq!(repped.states.len(), x * len);
- for i in 1..=x {
- repped.states[0]
- .trans
- .insert(Transition::epsilon(i * len - 1));
- }
- repped
- }
-
- fn concat(nfas: Vec<Self>) -> Self {
- if nfas.is_empty() {
- return Self {
- states: vec![EState::terminal()],
- };
- }
-
- let mut states: Vec<EState> = Vec::new();
- for nfa in nfas.into_iter() {
- let len = states.len();
- let mut ns = nfa.shift(len);
- if let Some(n) = states.last_mut() {
- n.trans.retain(|t| t.consumes.is_some());
- n.trans.insert(Transition::epsilon(len));
- }
- states.append(&mut ns);
- }
-
- let len = states.len();
- states[len - 1].accept = Acceptance::Accept;
-
- Self { states }
- }
-}
-
-impl ENFA {
- pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> {
- let threads = Thread::new_simple(0).step_epsilon(self);
- MultiState::new(self, threads)
- }
-
- pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> {
- MultiState::new(self, vec![])
- }
-
- pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> {
- let mut states = set![self.start_multi_state()];
- let mut q = vec![self.start_multi_state()];
-
- while let Some(state) = q.pop() {
- let chars = state.possible_transitions();
-
- for chr in chars {
- let new = state.transition(chr);
-
- if !states.contains(&new) {
- states.insert(new.clone());
- q.push(new);
- }
- }
- }
-
- states
- }
-}
-
-impl std::fmt::Debug for ENFA {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- writeln!(f, "NFA {{")?;
- for (i, s) in self.states.iter().enumerate() {
- write!(f, " {i}: ")?;
- if let Some(a) = s.assert.as_ref() {
- match a.polarity {
- LookPolarity::Positive => write!(f, "+{} ", a.to)?,
- LookPolarity::Negative => write!(f, "-{} ", a.to)?,
- }
- }
- for t in s.trans.iter() {
- let k = t.to;
- if let Some(c) = t.consumes {
- write!(f, "{c:?}=>{k} ")?;
- } else {
- write!(f, "~>{k} ")?;
- }
- }
- match s.accept {
- Acceptance::Accept => write!(f, "accept")?,
- Acceptance::Assertion => write!(f, "assert")?,
- Acceptance::NotYet => {}
- }
- writeln!(f)?;
- }
- write!(f, "}}")
- }
-}
-
-pub type StateId = usize;
-
-#[derive(Debug, Clone, Hash, PartialEq, Eq)]
-pub struct Assertion {
- to: StateId,
- polarity: LookPolarity,
-}
-
-#[derive(Debug, Clone, Hash, PartialEq, Eq)]
-pub struct Transition {
- to: StateId,
- consumes: Option<ByteRange>,
-}
-
-impl Transition {
- fn new(consumes: ByteRange, to: StateId) -> Self {
- Self {
- to,
- consumes: Some(consumes),
- }
- }
-
- fn epsilon(to: StateId) -> Self {
- Self { to, consumes: None }
- }
-
- fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
- self.to = f(self.to);
- }
-
- fn reachable_states(&self) -> impl Iterator<Item = StateId> {
- [self.to].into_iter()
- }
-}
-
-#[derive(Debug, Copy, Clone, PartialEq)]
-pub enum Acceptance {
- Accept,
- Assertion,
- NotYet,
-}
-
-#[derive(Debug, Clone)]
-pub struct EState {
- pub trans: HashSet<Transition>,
- pub assert: Option<Assertion>,
- pub accept: Acceptance,
-}
-
-impl EState {
- fn set_epsilon_transitions(&mut self, trans: impl IntoIterator<Item = Transition>) {
- self.trans.retain(|t| t.consumes.is_some());
- for transition in trans.into_iter() {
- assert!(transition.consumes.is_none());
- self.trans.insert(transition);
- }
- }
-
- fn start() -> Self {
- Self {
- trans: HashSet::new(),
- assert: None,
- accept: Acceptance::NotYet,
- }
- }
- fn terminal() -> Self {
- Self {
- trans: HashSet::new(),
- assert: None,
- accept: Acceptance::Accept,
- }
- }
-
- fn remap(&mut self, mut f: impl FnMut(StateId) -> StateId) {
- self.trans = self
- .trans
- .iter()
- .cloned()
- .map(|mut t| {
- t.remap(&mut f);
- t
- })
- .collect();
- if let Some(a) = self.assert.as_mut() {
- a.to = f(a.to);
- }
- }
-
- fn reachable_states(&self) -> impl Iterator<Item = StateId> {
- self.trans
- .iter()
- .flat_map(|t| t.reachable_states())
- .chain(self.assert.iter().map(|a| a.to))
- }
-}
-
-#[derive(Debug)]
-pub enum EnfaTranslationError {
- CharacterClassNotSupported,
- AssertionsNotSupported,
-}
-
-impl TryFrom<Pattern> for ENFA {
- type Error = EnfaTranslationError;
-
- fn try_from(value: Pattern) -> Result<Self, Self::Error> {
- Ok(match value {
- Pattern::Byte(c) => Self::try_from(Pattern::Range(c, c))?,
- Pattern::Range(c1, c2) => Self {
- states: vec![
- EState {
- trans: set![Transition::new(ByteRange::new_range(c1, c2), 1)],
- assert: None,
- accept: Acceptance::NotYet,
- },
- EState::terminal(),
- ],
- },
- Pattern::CharacterClass(_) => {
- return Err(EnfaTranslationError::CharacterClassNotSupported);
- }
- Pattern::Alt(alts) => {
- let nfas: Vec<ENFA> = alts
- .into_iter()
- .map(Self::try_from)
- .collect::<Result<_, _>>()?;
- let mut states = vec![EState::start()];
- let mut ends = vec![];
- for nfa in nfas.into_iter() {
- let len = states.len();
- states[0].trans.insert(Transition::epsilon(len));
- states.append(&mut (nfa.shift(len)));
- ends.push(states.len() - 1);
- }
- states.push(EState::terminal());
- for end in ends.into_iter() {
- let last = states.len() - 1;
- states[end].trans.insert(Transition::epsilon(last));
- }
- Self { states }
- }
- Pattern::Concat(seq) => {
- let nfas: Vec<Self> = seq
- .into_iter()
- .map(Self::try_from)
- .collect::<Result<_, _>>()?;
- Self::concat(nfas)
- }
- Pattern::Rep(regex, min, None, _) => {
- let nfa = ENFA::try_from(*regex)?;
- let base = nfa.clone().repeat(min as usize);
- let tail = nfa.looping();
- Self::concat(vec![base, tail])
- }
- Pattern::Rep(regex, min, Some(max), _) => {
- assert!(min < max);
- let nfa = Self::try_from(*regex)?;
- let base = nfa.clone().repeat(min as usize);
- let tail = nfa.optx((max - min) as usize);
- Self::concat(vec![base, tail])
- }
- Pattern::Nothing => Self {
- states: vec![EState::terminal()],
- },
- Pattern::Assertion(dir, polarity, pat) => {
- if dir == LookDirection::Behind {
- return Err(EnfaTranslationError::AssertionsNotSupported);
- }
- let mut regex = Self::try_from(*pat)?;
- for s in regex.states.iter_mut() {
- if s.accept == Acceptance::Accept {
- s.accept = Acceptance::Assertion;
- }
- }
- let mut regex = regex.shift(1);
- let mut states = Vec::with_capacity(regex.len() + 2);
- states.push(EState {
- trans: set![Transition::epsilon(regex.len() + 1)],
- assert: Some(Assertion { to: 1, polarity }),
- accept: Acceptance::NotYet,
- });
- states.append(&mut regex);
- states.push(EState::terminal());
- Self { states }
- }
- Pattern::Submatch(pat) => {
- // NFA does not track submatches, so we are skipping it.
- Self::try_from(*pat)?
- }
- })
- }
-}
diff --git a/src/parse/regex/mod.rs b/src/parse/regex/mod.rs
deleted file mode 100644
index 486b0d9..0000000
--- a/src/parse/regex/mod.rs
+++ /dev/null
@@ -1,432 +0,0 @@
-use crate::parse::OtherHighlights;
-
-use super::{Parse, ParseError, Result};
-
-pub mod bc;
-mod byte_range;
-pub mod dfa;
-pub mod enfa;
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum LookDirection {
- Ahead,
- Behind,
-}
-
-impl LookDirection {
- pub fn reverse(self) -> Self {
- match self {
- Self::Ahead => Self::Behind,
- Self::Behind => Self::Ahead,
- }
- }
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum LookPolarity {
- Positive,
- Negative,
-}
-
-#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
-pub enum CharacterClass {
- Everything,
- Nothing,
- Whitespace,
- Alphabetic,
- Alphanumeric,
-}
-
-impl CharacterClass {
- pub fn matches(self, byte: u8) -> bool {
- match self {
- CharacterClass::Everything => true,
- CharacterClass::Nothing => false,
- CharacterClass::Whitespace => byte.is_ascii_whitespace(),
- CharacterClass::Alphabetic => byte.is_ascii_alphabetic(),
- CharacterClass::Alphanumeric => byte.is_ascii_alphanumeric(),
- }
- }
-}
-
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum Pattern {
- Byte(u8),
- Range(u8, u8),
- CharacterClass(CharacterClass),
- Alt(Vec<Pattern>),
- Concat(Vec<Pattern>),
- Rep(Box<Pattern>, u32, Option<u32>, GreedyBehavior),
- Assertion(LookDirection, LookPolarity, Box<Pattern>),
- Submatch(Box<Pattern>),
- Nothing,
-}
-
-#[derive(Debug, PartialEq, Eq, Copy, Clone)]
-pub enum ByteConsumption {
- Bounded(usize),
- Unbounded,
-}
-
-impl ByteConsumption {
- pub fn zero() -> Self {
- Self::Bounded(0)
- }
- pub fn one() -> Self {
- Self::Bounded(1)
- }
-}
-
-impl Ord for ByteConsumption {
- fn cmp(&self, other: &Self) -> std::cmp::Ordering {
- match (self, other) {
- (Self::Bounded(a), Self::Bounded(b)) => a.cmp(b),
- (Self::Bounded(_), Self::Unbounded) => std::cmp::Ordering::Less,
- (Self::Unbounded, Self::Bounded(_)) => std::cmp::Ordering::Greater,
- (Self::Unbounded, Self::Unbounded) => std::cmp::Ordering::Equal,
- }
- }
-}
-impl PartialOrd for ByteConsumption {
- fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl std::ops::Add for ByteConsumption {
- type Output = Self;
-
- fn add(self, rhs: Self) -> Self::Output {
- if let Self::Bounded(a) = self
- && let Self::Bounded(b) = rhs
- {
- Self::Bounded(a + b)
- } else {
- Self::Unbounded
- }
- }
-}
-
-impl std::ops::Mul<usize> for ByteConsumption {
- type Output = Self;
-
- fn mul(self, rhs: usize) -> Self::Output {
- match self {
- Self::Bounded(x) => Self::Bounded(x * rhs),
- Self::Unbounded => Self::Unbounded,
- }
- }
-}
-
-impl std::iter::Sum for ByteConsumption {
- fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
- iter.fold(Self::zero(), |a, b| a + b)
- }
-}
-
-impl Pattern {
- pub fn max_byte_consumption(&self) -> ByteConsumption {
- match self {
- Pattern::Byte(_) => ByteConsumption::one(),
- Pattern::Range(_, _) => ByteConsumption::one(),
- Pattern::CharacterClass(_) => ByteConsumption::one(),
- Pattern::Alt(patterns) => patterns
- .iter()
- .map(Self::max_byte_consumption)
- .max()
- .unwrap_or(ByteConsumption::zero()),
- Pattern::Concat(patterns) => patterns.iter().map(Self::max_byte_consumption).sum(),
- Pattern::Rep(pattern, _, Some(max_reps), _) => {
- pattern.max_byte_consumption() * (*max_reps as usize)
- }
- Pattern::Rep(_, _, None, _) => ByteConsumption::Unbounded,
- Pattern::Assertion(_, _, _) => ByteConsumption::zero(),
- Pattern::Nothing => ByteConsumption::zero(),
- Pattern::Submatch(pat) => pat.max_byte_consumption(),
- }
- }
-
- pub fn reverse(self) -> Self {
- use Pattern::*;
- match self {
- Byte(_) | Nothing | Range(..) | CharacterClass(_) => self,
- Alt(patterns) => Alt(patterns.into_iter().map(Self::reverse).collect()),
- Concat(patterns) => Concat(patterns.into_iter().map(Self::reverse).rev().collect()),
- Rep(pattern, min, max, greedy) => Rep(Box::new(pattern.reverse()), min, max, greedy),
- Assertion(dir, pol, pat) => Assertion(dir.reverse(), pol, Box::new(pat.reverse())),
- Submatch(pat) => Submatch(Box::new(pat.reverse()))
- }
- }
-}
-
-impl Parse for Pattern {
- fn parse(b: &mut super::Cursor<'_>) -> super::Result<Self> {
- let begin = b.loc();
- let result = parse0(b);
- if result.is_ok() {
- b.highlight_from(begin, OtherHighlights::Regex);
- }
- result
- }
-}
-
-fn parse0(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- parse_alt(s)
-}
-
-fn parse_alt(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let mut seqs = vec![];
- loop {
- let seq = parse_seq(s)?;
- seqs.push(seq);
- let begin = s.loc();
- if s.has() && s.peek() == b'|' {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- } else {
- break;
- }
- }
-
- Ok(match seqs.len() {
- 0 => Pattern::Nothing,
- 1 => seqs.into_iter().next().unwrap(),
- _ => Pattern::Alt(seqs),
- })
-}
-
-fn parse_seq(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let mut reps = vec![];
- loop {
- let rep = parse_rep(s)?;
- if rep != Pattern::Nothing {
- reps.push(rep);
- } else {
- break;
- }
- }
-
- Ok(match reps.len() {
- 0 => Pattern::Nothing,
- 1 => reps.into_iter().next().unwrap(),
- _ => Pattern::Concat(reps),
- })
-}
-
-fn parse_rep(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- let atom = parse_atom(s)?;
-
- if atom == Pattern::Nothing {
- return Ok(atom);
- }
-
- if !s.has() {
- return Ok(atom);
- }
-
- let begin = s.loc();
-
- let rep = match s.peek() {
- b'*' => Some((0, None)),
- b'+' => Some((1, None)),
- b'?' => Some((0, Some(1))),
- _ => None,
- };
-
- if let Some((min_rep, max_rep)) = rep {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- let greed = if s.has() && s.peek() == b'?' {
- s.adv();
- GreedyBehavior::NonGreedy
- } else {
- GreedyBehavior::Greedy
- };
-
- Ok(Pattern::Rep(Box::new(atom), min_rep, max_rep, greed))
- } else {
- Ok(atom)
- }
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub enum GreedyBehavior {
- Greedy,
- NonGreedy,
-}
-
-const SYMBOLS: &[u8] = b"{}[]()*+-?|.\\ ";
-fn is_symbol(x: u8) -> bool {
- SYMBOLS.contains(&x)
-}
-
-fn parse_atom(s: &mut super::Cursor<'_>) -> Result<Pattern> {
- if !s.has() {
- return Ok(Pattern::Nothing);
- }
-
- let begin = s.loc();
-
- match s.peek() {
- b'[' => {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- let mut ranges = Vec::new();
- loop {
- if !s.has() {
- return Err(ParseError::Eof);
- }
-
- let begin = s.loc();
- let tok = s.adv();
-
- if tok == b']' {
- if ranges.is_empty() {
- todo!("error handling for empty alternative list");
- }
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- return Ok(Pattern::Alt(ranges));
- }
-
- let begin = s.loc();
- if s.has() && s.peek() == b'-' {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if !s.has() {
- return Err(ParseError::Eof);
- }
- let tok2 = s.peek();
-
- if tok2 == b']' {
- ranges.push(Pattern::Byte(tok));
- ranges.push(Pattern::Byte(b'-'));
- } else if is_symbol(tok2) {
- return Err(ParseError::Unknown(tok2));
- } else {
- s.adv();
- ranges.push(Pattern::Range(tok, tok2));
- }
- } else {
- ranges.push(Pattern::Byte(tok));
- }
- }
- }
- b'(' => {
- s.adv();
-
- let mut assertion = None;
- if s.buf.starts_with(b"?=") {
- s.advance(2);
- assertion = Some((LookDirection::Ahead, LookPolarity::Positive));
- } else if s.buf.starts_with(b"?!") {
- s.advance(2);
- assertion = Some((LookDirection::Ahead, LookPolarity::Negative));
- } else if s.buf.starts_with(b"?<=") {
- s.advance(3);
- assertion = Some((LookDirection::Behind, LookPolarity::Positive));
- } else if s.buf.starts_with(b"?<!") {
- s.advance(3);
- assertion = Some((LookDirection::Behind, LookPolarity::Negative));
- }
-
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- let inner = parse0(s)?;
- if !s.has() {
- return Err(ParseError::Eof);
- }
- let begin = s.loc();
- if s.adv() != b')' {
- return Err(ParseError::Expected(')'));
- }
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if let Some((dir, pol)) = assertion {
- Ok(Pattern::Assertion(dir, pol, Box::new(inner)))
- } else {
- Ok(Pattern::Submatch(Box::new(inner)))
- }
- }
- b'.' => {
- s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
- Ok(Pattern::Range(0, 127))
- }
- b'\\' => {
- s.adv();
- if s.has() {
- let escaped = s.adv();
- s.highlight_from(begin, OtherHighlights::RegexSymbol);
-
- if is_symbol(escaped) {
- Ok(Pattern::Byte(escaped))
- } else {
- // TODO interpret \w and others
- Err(ParseError::Unknown(escaped))
- }
- } else {
- Err(ParseError::Eof)
- }
- }
- x if is_symbol(x) => Ok(Pattern::Nothing),
- ch => {
- s.adv();
- Ok(Pattern::Byte(ch))
- }
- }
-}
-
-pub struct CompiledPattern {
- dfa: dfa::DFA,
-}
-
-impl std::fmt::Debug for CompiledPattern {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- self.dfa.fmt(f)
- }
-}
-
-#[derive(Debug)]
-pub enum CompilationError {
- Enfa(enfa::EnfaTranslationError),
-}
-
-impl From<enfa::EnfaTranslationError> for CompilationError {
- fn from(value: enfa::EnfaTranslationError) -> Self {
- Self::Enfa(value)
- }
-}
-
-impl Pattern {
- pub fn try_compile(self) -> std::result::Result<CompiledPattern, CompilationError> {
- let enfa = enfa::ENFA::try_from(self)?;
- let dfa = dfa::DFA::from(enfa);
- Ok(CompiledPattern { dfa })
- }
-}
-
-impl CompiledPattern {
- pub fn matches(&self, bytes: impl AsRef<[u8]>) -> bool {
- self.dfa.matches(bytes.as_ref())
- }
-}
-
-#[cfg(test)]
-macro_rules! regex_matches {
- ($regex:literal, $match:literal, $true:literal) => {
- assert_eq!(
- Pattern::parse_from_bytes($regex.as_bytes())
- .unwrap()
- .try_compile()
- .unwrap()
- .matches($match.as_bytes()),
- $true
- )
- };
-}
-
-#[test]
-fn foo_matches_foo() {
- regex_matches!("foo", "foo", true);
-}