aboutsummaryrefslogtreecommitdiffstats
path: root/src/parse/regex/enfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse/regex/enfa.rs')
-rw-r--r--src/parse/regex/enfa.rs383
1 files changed, 383 insertions, 0 deletions
diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs
new file mode 100644
index 0000000..71998c9
--- /dev/null
+++ b/src/parse/regex/enfa.rs
@@ -0,0 +1,383 @@
+use std::{
+ collections::HashSet,
+ hash::{DefaultHasher, Hash, Hasher},
+};
+
+use super::Pattern;
+use super::byte_range::ByteRange;
+
+/// NFA with epsilon transitions
+#[derive(Clone)]
+pub struct ENFA {
+ pub states: Vec<EState>,
+}
+
+#[derive(Clone)]
+pub struct MultiState<'a> {
+ nfa: &'a ENFA,
+ states: Vec<StateId>,
+ accept: bool,
+ hash: u64,
+}
+
+impl<'a> PartialEq for MultiState<'a> {
+ fn eq(&self, other: &Self) -> bool {
+ (self.nfa as *const ENFA as u64) == (other.nfa as *const ENFA as u64)
+ && self.states == other.states
+ && self.accept == other.accept
+ && self.hash == other.hash
+ }
+}
+impl<'a> Eq for MultiState<'a> {}
+
+impl<'a> MultiState<'a> {
+ pub fn new(nfa: &'a ENFA, mut states: Vec<StateId>) -> Self {
+ states.sort();
+ states.dedup();
+ states.shrink_to_fit();
+
+ let accept = states.iter().any(|&x| nfa.states[x].accept);
+ let mut hasher = DefaultHasher::new();
+ states.hash(&mut hasher);
+ let hash = hasher.finish();
+
+ Self {
+ nfa,
+ states,
+ accept,
+ hash,
+ }
+ }
+
+ /// all the chars that will make an interesting transition
+ pub fn possible_transitions(&self) -> Vec<ByteRange> {
+ let mut vec: Vec<_> = self
+ .states
+ .iter()
+ .flat_map(|&i| self.nfa.states[i].trans.iter().map(|x| x.0))
+ .collect();
+ vec = ByteRange::non_overlapping(vec);
+ vec.sort();
+ vec.dedup();
+ vec.shrink_to_fit();
+ vec
+ }
+
+ pub fn transition(&self, ch: ByteRange) -> Self {
+ let new_states = self
+ .states
+ .iter()
+ .flat_map(|&s| {
+ self.nfa.states[s]
+ .trans
+ .iter()
+ .filter_map(|&(c, k)| if c.overlaps(ch) { Some(k) } else { None })
+ })
+ .collect();
+
+ Self::new(self.nfa, new_states)
+ }
+
+ pub fn accept(&self) -> bool {
+ self.accept
+ }
+}
+
+impl<'a> Hash for MultiState<'a> {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.hash.hash(state)
+ }
+}
+
+macro_rules! set {
+ () => {
+ std::collections::HashSet::new()
+ };
+ ( $( $x:expr ),* ) => {{
+ let mut set = std::collections::HashSet::new();
+ $(
+ set.insert($x);
+ )*
+ set
+ }};
+}
+
+impl ENFA {
+ fn shift(self, amt: usize) -> Vec<EState> {
+ let mut s = self.states;
+
+ for state in s.iter_mut() {
+ let trans = state.trans.iter().map(|(c, id)| (*c, id + amt)).collect();
+ let epsilon_trans = state.epsilon_trans.iter().map(|e| e + amt).collect();
+
+ *state = EState {
+ trans,
+ epsilon_trans,
+ accept: false,
+ }
+ }
+
+ s
+ }
+
+ fn epsilon_dfs(&self, i: StateId, visited: &mut [bool]) {
+ if visited[i] {
+ return;
+ }
+ visited[i] = true;
+ for &k in self.states[i].epsilon_trans.iter() {
+ self.epsilon_dfs(k, visited);
+ }
+ }
+
+ fn resolve_epsilon(&mut self) {
+ // state X --> { state Y, Z, W which get inlined into X }
+ let includes: Vec<Vec<StateId>> = (0..self.states.len())
+ .map(|i| {
+ let mut reach = vec![false; self.states.len()];
+ self.epsilon_dfs(i, &mut reach);
+ reach
+ .into_iter()
+ .enumerate()
+ .filter_map(|(x, r)| if r { Some(x) } else { None })
+ .collect()
+ })
+ .collect();
+
+ // inlining
+ for i in 0..self.states.len() {
+ for &k in includes[i].iter() {
+ let new = self.states[k].trans.clone();
+ self.states[i].trans.extend(new);
+ self.states[i].epsilon_trans.clear();
+ if self.states[k].accept {
+ self.states[i].accept = true;
+ }
+ }
+ }
+ }
+
+ fn remove_unreachable(&mut self) {
+ let mut used = vec![false; self.states.len()];
+ used[0] = true;
+ for s in self.states.iter() {
+ for &i in s.epsilon_trans.iter() {
+ used[i] = true;
+ }
+ for &(_, i) in s.trans.iter() {
+ used[i] = true;
+ }
+ }
+ let mut remap = vec![0; self.states.len()];
+ let mut shift = 0;
+ for i in 0..self.states.len() {
+ if used[i] {
+ remap[i] = i - shift;
+ } else {
+ shift += 1;
+ }
+ }
+ for i in (0..self.states.len()).rev() {
+ if !used[i] {
+ self.states.remove(i);
+ }
+ }
+ for s in self.states.iter_mut() {
+ s.epsilon_trans = s
+ .epsilon_trans
+ .clone()
+ .into_iter()
+ .map(|i| remap[i])
+ .collect();
+ s.trans = s
+ .trans
+ .clone()
+ .into_iter()
+ .map(|(c, i)| (c, remap[i]))
+ .collect();
+ }
+ }
+
+ pub fn simplify(&mut self) {
+ self.resolve_epsilon();
+ self.remove_unreachable();
+ }
+
+ pub fn start_multi_state<'a>(&'a self) -> MultiState<'a> {
+ MultiState::new(self, vec![0])
+ }
+
+ pub fn void_multi_state<'a>(&'a self) -> MultiState<'a> {
+ MultiState::new(self, vec![])
+ }
+
+ pub fn all_multi_states<'a>(&'a self) -> HashSet<MultiState<'a>> {
+ let mut states = set![self.start_multi_state()];
+ let mut q = vec![self.start_multi_state()];
+
+ while let Some(state) = q.pop() {
+ let chars = state.possible_transitions();
+
+ for chr in chars {
+ let new = state.transition(chr);
+
+ if !states.contains(&new) {
+ states.insert(new.clone());
+ q.push(new);
+ }
+ }
+ }
+
+ states
+ }
+
+ fn looping(self) -> Self {
+ let mut states = vec![EState::start()];
+ states.append(&mut self.shift(1));
+ let len = states.len();
+ states[0].epsilon_trans = set![1, len];
+ states[len - 1].epsilon_trans = set![0, len];
+ states.push(EState::terminal());
+ Self { states }
+ }
+
+ fn repeat(self, times: usize) -> Self {
+ let reps = vec![self; times];
+ Self::concat(reps)
+ }
+
+ /// between 0 and x repetitions
+ fn optx(self, x: usize) -> Self {
+ let len = self.states.len();
+ let mut repped = self.repeat(x);
+ assert_eq!(repped.states.len(), x * len);
+ for i in 1..=x {
+ repped.states[0].epsilon_trans.insert(i * len - 1);
+ }
+ repped
+ }
+
+ fn concat(nfas: Vec<Self>) -> Self {
+ if nfas.is_empty() {
+ return Self {
+ states: vec![EState::terminal()],
+ };
+ }
+
+ let mut states: Vec<EState> = Vec::new();
+ for nfa in nfas.into_iter() {
+ let len = states.len();
+ let mut ns = nfa.shift(len);
+ if let Some(n) = states.last_mut() {
+ n.epsilon_trans = set![len];
+ }
+ states.append(&mut ns);
+ }
+
+ let len = states.len();
+ states[len - 1].accept = true;
+
+ Self { states }
+ }
+}
+
+impl std::fmt::Debug for ENFA {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ writeln!(f, "NFA {{")?;
+ for (i, s) in self.states.iter().enumerate() {
+ write!(f, " {i}: ")?;
+ for k in s.epsilon_trans.iter() {
+ write!(f, "~>{k} ")?;
+ }
+ for (c, k) in s.trans.iter() {
+ write!(f, "{c:?}=>{k} ")?;
+ }
+ if s.accept {
+ write!(f, "accept")?;
+ }
+ writeln!(f)?;
+ }
+ write!(f, "}}")
+ }
+}
+
+pub type StateId = usize;
+
+#[derive(Debug, Clone)]
+pub struct EState {
+ pub trans: HashSet<(ByteRange, StateId)>,
+ pub epsilon_trans: HashSet<StateId>,
+ pub accept: bool,
+}
+
+impl EState {
+ fn start() -> Self {
+ Self {
+ trans: HashSet::new(),
+ epsilon_trans: HashSet::new(),
+ accept: false,
+ }
+ }
+ fn terminal() -> Self {
+ Self {
+ trans: HashSet::new(),
+ epsilon_trans: HashSet::new(),
+ accept: true,
+ }
+ }
+}
+
+impl From<Pattern> for ENFA {
+ fn from(value: Pattern) -> Self {
+ match value {
+ Pattern::Byte(c) => Self::from(Pattern::Range(c, c)),
+ Pattern::Range(c1, c2) => Self {
+ states: vec![
+ EState {
+ trans: set![(ByteRange::new_range(c1, c2), 1)],
+ epsilon_trans: set![],
+ accept: false,
+ },
+ EState::terminal(),
+ ],
+ },
+ Pattern::Alt(alts) => {
+ let nfas: Vec<ENFA> = alts.into_iter().map(ENFA::from).collect();
+ let mut states = vec![EState::start()];
+ let mut ends = vec![];
+ for nfa in nfas.into_iter() {
+ let len = states.len();
+ states[0].epsilon_trans.insert(len);
+ states.append(&mut (nfa.shift(len)));
+ ends.push(states.len() - 1);
+ }
+ states.push(EState::terminal());
+ for end in ends.into_iter() {
+ let last = states.len() - 1;
+ states[end].epsilon_trans.insert(last);
+ }
+ Self { states }
+ }
+ Pattern::Concat(seq) => {
+ let nfas: Vec<Self> = seq.into_iter().map(ENFA::from).collect();
+ Self::concat(nfas)
+ }
+ Pattern::Rep(regex, min, None) => {
+ let nfa = ENFA::from(*regex);
+ let base = nfa.clone().repeat(min as usize);
+ let tail = nfa.looping();
+ Self::concat(vec![base, tail])
+ }
+ Pattern::Rep(regex, min, Some(max)) => {
+ assert!(min < max);
+ let nfa = Self::from(*regex);
+ let base = nfa.clone().repeat(min as usize);
+ let tail = nfa.optx((max - min) as usize);
+ Self::concat(vec![base, tail])
+ }
+ Pattern::Nothing => Self {
+ states: vec![EState::terminal()],
+ },
+ }
+ }
+}