aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJonas Maier <jonas@x77.dev>2026-06-05 21:55:42 +0200
committerJonas Maier <jonas@x77.dev>2026-06-05 21:55:42 +0200
commit959e8f2ea9c0b7f52b3ab98244cb110af179b82c (patch)
treebbb05d1c9bffd21f7157ea74f089f7b953b5d027
parent05f3f381d0066b2e6116470f4e6251ae191aaefe (diff)
downloadpish-959e8f2ea9c0b7f52b3ab98244cb110af179b82c.tar.gz
fix regex
-rw-r--r--src/bitset.rs13
-rw-r--r--src/parse/regex/bc.rs92
2 files changed, 89 insertions, 16 deletions
diff --git a/src/bitset.rs b/src/bitset.rs
index eae2596..90dc2c5 100644
--- a/src/bitset.rs
+++ b/src/bitset.rs
@@ -41,3 +41,16 @@ impl BitSet {
}
}
}
+
+impl std::fmt::Debug for BitSet {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ for i in 0..self.size {
+ if self.get(i) {
+ write!(f, "1")?;
+ } else {
+ write!(f, "0")?;
+ }
+ }
+ Ok(())
+ }
+}
diff --git a/src/parse/regex/bc.rs b/src/parse/regex/bc.rs
index 75a6d5d..c0eff8b 100644
--- a/src/parse/regex/bc.rs
+++ b/src/parse/regex/bc.rs
@@ -2,13 +2,17 @@ use std::collections::{HashMap, VecDeque};
use crate::{
bitset::BitSet,
- parse::regex::{
- CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern, byte_range::ByteRange,
+ parse::{
+ Parse,
+ regex::{
+ CharacterClass, GreedyBehavior, LookDirection, LookPolarity, Pattern,
+ byte_range::ByteRange,
+ },
},
};
trait Flavor: Clone {
- type CustomInstr: Copy + Clone;
+ type CustomInstr: Copy + Clone + std::fmt::Debug;
type ThreadData: Clone;
type StepData<'a, 'b>
where
@@ -21,7 +25,7 @@ trait Flavor: Clone {
) -> bool;
}
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug)]
struct MainFlavor;
impl Flavor for MainFlavor {
type CustomInstr = MainInstr;
@@ -54,10 +58,10 @@ impl Flavor for MainFlavor {
}
}
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug)]
enum Nothing {}
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug)]
struct AssertionFlavor;
impl Flavor for AssertionFlavor {
type CustomInstr = Nothing;
@@ -75,14 +79,14 @@ impl Flavor for AssertionFlavor {
type JumpTarget = u32;
type Register = u32;
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug)]
struct Assertion {
target: JumpTarget,
dir: LookDirection,
pol: LookPolarity,
}
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug)]
enum Instr<F: Flavor> {
Class(CharacterClass),
Consume(ByteRange),
@@ -91,7 +95,7 @@ enum Instr<F: Flavor> {
Custom(F::CustomInstr),
}
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, Debug)]
enum MainInstr {
Save(Register),
Join(Assertion),
@@ -122,6 +126,18 @@ impl<'p, F: Flavor> VM<'p, F> {
let mut threads: VecDeque<_> = self.threads.drain(..).collect();
self.hot.set_all(false);
+ let mut warm = self.hot.clone();
+ macro_rules! add_thread {
+ ($t:expr) => {{
+ let t = $t;
+ let bit = t.pc as usize;
+ if !warm.get(bit) {
+ warm.set(bit, true);
+ threads.push_back(t);
+ }
+ }};
+ }
+
while let Some(mut thread) = threads.pop_front() {
match self.instr[thread.pc as usize] {
Instr::Class(_) | Instr::Consume(_) => {
@@ -132,14 +148,14 @@ impl<'p, F: Flavor> VM<'p, F> {
}
Instr::Jump(j) => {
thread.pc = j;
- threads.push_back(thread);
+ add_thread!(thread);
}
Instr::Fork(a, b) => {
- threads.push_back(Thread {
+ add_thread!(Thread {
pc: a,
data: thread.data.clone(),
});
- threads.push_back(Thread {
+ add_thread!(Thread {
pc: b,
data: thread.data,
});
@@ -147,7 +163,7 @@ impl<'p, F: Flavor> VM<'p, F> {
Instr::Custom(instr) => {
if F::accepts(&mut thread, instr, sd) {
thread.pc += 1;
- threads.push_back(thread);
+ add_thread!(thread);
}
}
}
@@ -205,9 +221,11 @@ impl<'a> LookaheadVM<'a> {
assert!(self.cache_data.is_empty());
assert_eq!(self.loc_offset, 0);
self.loc_offset = loc;
+ self.vm.step_epsilon(&mut ());
+ self.cache_data.push(self.vm.hot.clone());
for i in (loc..self.data.len()).rev() {
- self.vm.step_epsilon(&mut ());
self.vm.step_consume(self.data[i]);
+ self.vm.step_epsilon(&mut ());
self.cache_data.push(self.vm.hot.clone());
}
self.cache_data.reverse();
@@ -230,14 +248,22 @@ struct VirtualMachine<'a> {
}
impl<'a> VirtualMachine<'a> {
- fn step(&mut self, byte: u8, loc: usize) {
+ fn step_epsilon(&mut self, loc: usize) {
self.vm0.step_epsilon(&mut ());
self.vm1
.step_epsilon(&mut (loc, &self.vm0.hot, &mut self.vm2));
+ }
+
+ fn step_consume(&mut self, byte: u8) {
self.vm0.step_consume(byte);
self.vm1.step_consume(byte);
}
+ fn step(&mut self, byte: u8, loc: usize) {
+ self.step_epsilon(loc);
+ self.step_consume(byte);
+ }
+
fn extract_match(&self) -> Option<Match> {
self.vm1
.threads
@@ -249,6 +275,19 @@ impl<'a> VirtualMachine<'a> {
.next()
}
}
+
+fn fmt_instructions<F: std::fmt::Debug + Flavor>(
+ f: &mut std::fmt::Formatter<'_>,
+ label: &str,
+ insns: &[Instr<F>],
+) -> std::fmt::Result {
+ writeln!(f, "# {label}")?;
+ for (idx, ins) in insns.iter().enumerate() {
+ writeln!(f, "{idx}: {ins:?}")?;
+ }
+ Ok(())
+}
+
pub struct BytecodeCompiledRegex {
instrs0: Box<[Instr<AssertionFlavor>]>,
instrs1: Box<[Instr<MainFlavor>]>,
@@ -256,6 +295,15 @@ pub struct BytecodeCompiledRegex {
accepting: BitSet,
}
+impl std::fmt::Debug for BytecodeCompiledRegex {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ fmt_instructions(f, "behind", &self.instrs0)?;
+ fmt_instructions(f, "ahead", &self.instrs2)?;
+ fmt_instructions(f, "main", &self.instrs1)?;
+ writeln!(f, "accepting: {:?}", self.accepting)
+ }
+}
+
impl BytecodeCompiledRegex {
pub fn re_match(&self, data: &[u8]) -> Option<Match> {
let vm0 = VM::new(&self.instrs0, Thread { pc: 0, data: () });
@@ -277,6 +325,7 @@ impl BytecodeCompiledRegex {
for (i, ch) in data.iter().cloned().enumerate() {
vm.step(ch, i);
}
+ vm.step_epsilon(data.len());
vm.extract_match()
}
@@ -349,7 +398,7 @@ impl<'a, F: Flavor> Compiler<'a, F> {
self.compile(pat)?;
let fork_pos = self.instrs.len();
let after = fork_pos + 1;
- self.instrs.push(fork(base, after, greedy));
+ self.instrs.push(fork(base + 1, after, greedy));
self.instrs[base] = Instr::Jump(fork_pos as JumpTarget);
Ok(())
}
@@ -536,3 +585,14 @@ impl TryFrom<Pattern> for BytecodeCompiledRegex {
})
}
}
+
+#[test]
+fn print_compiled_vm() {
+ let pat = Pattern::parse_from_bytes(b"a?b?").unwrap();
+ let compiled = BytecodeCompiledRegex::try_from(pat).unwrap();
+ println!("{compiled:#?}");
+ assert_eq!(compiled.matches(b"ab"), true);
+ assert_eq!(compiled.matches(b"a"), true);
+ assert_eq!(compiled.matches(b"b"), true);
+ assert_eq!(compiled.matches(b""), true);
+}