diff options
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/regex/byte_range.rs | 84 | ||||
| -rw-r--r-- | src/parse/regex/dfa.rs | 14 | ||||
| -rw-r--r-- | src/parse/regex/enfa.rs | 2 |
3 files changed, 61 insertions, 39 deletions
diff --git a/src/parse/regex/byte_range.rs b/src/parse/regex/byte_range.rs index 86b2c2b..0151f0c 100644 --- a/src/parse/regex/byte_range.rs +++ b/src/parse/regex/byte_range.rs @@ -16,7 +16,7 @@ impl From<RangeInclusive<u8>> for ByteRange { impl ByteRange { pub fn new_range(from: u8, to: u8) -> Self { - assert!(from <= to); + assert!(from <= to, "{from} <= {to}"); Self { from, to } } @@ -33,42 +33,68 @@ impl ByteRange { self.from.max(other.from) <= self.to.min(other.to) } - pub fn non_overlapping(sets: Vec<ByteRange>) -> Vec<ByteRange> { - let begins = sets.iter().map(|cs| (cs.from, 1)); - let ends = sets.iter().map(|cs| (cs.to, 2)); - let mut edges: Vec<_> = begins.chain(ends).collect(); - edges.sort(); - edges.iter_mut().for_each(|c| { - if c.1 == 2 { - c.1 = -1; + pub fn split_to_disjoint(ranges: Vec<ByteRange>) -> Vec<ByteRange> { + if ranges.is_empty() { + return vec![]; + } + + let mut points: Vec<u8> = Vec::new(); + for r in &ranges { + points.push(r.from); + if r.to != u8::MAX { + points.push(r.to + 1); } - }); - - let mut last = None; - let mut depth = 0; + } + + points.sort_unstable(); + points.dedup(); + let mut out = Vec::new(); - - for (mut loc, delta) in edges { - if let Some(last) = last - && last <= loc - { - out.push(ByteRange::new_range(last, loc)); - loc += 1; + + for window in points.windows(2) { + let start = window[0]; + let end_exclusive = window[1]; + + if start >= end_exclusive { + continue; } - - depth += delta; - - if depth > 0 { - last = Some(loc); - } else { - last = None; + + let mut active = false; + + for r in &ranges { + if r.from <= start && start <= r.to { + active = true; + break; + } + } + + if active { + out.push(ByteRange { + from: start, + to: end_exclusive - 1, + }); } } - + out } } +#[test] +fn byterange_test() { + assert_eq!( + ByteRange::split_to_disjoint(vec![ + ByteRange::new_range(b'a', b'z'), + ByteRange::new_single(b'm') + ]), + vec![ + ByteRange::new_range(b'a', b'l'), + ByteRange::new_single(b'm'), + ByteRange::new_range(b'n', b'z'), + ] + ); +} + impl std::fmt::Debug for ByteRange { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.from == self.to { @@ -106,7 +132,7 @@ mod non_overlapping_tests { fn run(ranges: Vec<RangeInclusive<u8>>) { let ranges1: Vec<ByteRange> = ranges.into_iter().map(Into::into).collect(); - let ranges2 = ByteRange::non_overlapping(ranges1.clone()); + let ranges2 = ByteRange::split_to_disjoint(ranges1.clone()); let r1 = |c| ranges1.iter().any(|cr| cr.contains(c)); let r2 = |c| ranges2.iter().any(|cr| cr.contains(c)); diff --git a/src/parse/regex/dfa.rs b/src/parse/regex/dfa.rs index 78888a2..243176d 100644 --- a/src/parse/regex/dfa.rs +++ b/src/parse/regex/dfa.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use super::{ byte_range::ByteRange, - enfa::{ENFA, Epsilon, MultiState, Resolved}, + enfa::{ENFA, MultiState}, }; pub type StateId = usize; @@ -60,10 +60,12 @@ impl DFA { } } -impl From<ENFA<Resolved>> for DFA { - fn from(mut nfa: ENFA<Resolved>) -> Self { +impl From<ENFA> for DFA { + fn from(mut nfa: ENFA) -> Self { nfa.remove_unreachable(); + println!("{nfa:?}"); + let mut multi_states = nfa.all_multi_states(); multi_states.insert(nfa.void_multi_state()); let mut len = 0; @@ -101,9 +103,3 @@ impl From<ENFA<Resolved>> for DFA { } } } - -impl From<ENFA<Epsilon>> for DFA { - fn from(value: ENFA<Epsilon>) -> Self { - Self::from(value.resolve_epsilon()) - } -} diff --git a/src/parse/regex/enfa.rs b/src/parse/regex/enfa.rs index 3cac2dd..fea2976 100644 --- a/src/parse/regex/enfa.rs +++ b/src/parse/regex/enfa.rs @@ -336,7 +336,7 @@ impl<'a> MultiState<'a> { for t in self.threads.iter() { t.possible_transitions(self.nfa, &mut |x| vec.push(x)); } - vec = ByteRange::non_overlapping(vec); + vec = ByteRange::split_to_disjoint(vec); vec.sort(); vec.dedup(); vec.shrink_to_fit(); |
