blob: 6fc40ce42d88c4bc018e2f9dd0727ec2623eb46f [file] [log] [blame]
use std::fmt;
use std::iter::Peekable;
use std::sync::atomic::{AtomicU32, Ordering};
use super::{Byte, Reference, Region, Tree, Type, Uninhabited};
use crate::{Map, Set};
#[derive(PartialEq)]
#[cfg_attr(test, derive(Clone))]
pub(crate) struct Dfa<R, T>
where
R: Region,
T: Type,
{
pub(crate) transitions: Map<State, Transitions<R, T>>,
pub(crate) start: State,
pub(crate) accept: State,
}
#[derive(PartialEq, Clone, Debug)]
pub(crate) struct Transitions<R, T>
where
R: Region,
T: Type,
{
byte_transitions: EdgeSet<State>,
ref_transitions: Map<Reference<R, T>, State>,
}
impl<R, T> Default for Transitions<R, T>
where
R: Region,
T: Type,
{
fn default() -> Self {
Self { byte_transitions: EdgeSet::empty(), ref_transitions: Map::default() }
}
}
/// The states in a [`Dfa`] represent byte offsets.
#[derive(Hash, Eq, PartialEq, PartialOrd, Ord, Copy, Clone)]
pub(crate) struct State(pub(crate) u32);
impl State {
pub(crate) fn new() -> Self {
static COUNTER: AtomicU32 = AtomicU32::new(0);
Self(COUNTER.fetch_add(1, Ordering::SeqCst))
}
}
impl fmt::Debug for State {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "S_{}", self.0)
}
}
impl<R, T> Dfa<R, T>
where
R: Region,
T: Type,
{
#[cfg(test)]
pub(crate) fn bool() -> Self {
Self::from_transitions(|accept| Transitions {
byte_transitions: EdgeSet::new(Byte::new(0x00..=0x01), accept),
ref_transitions: Map::default(),
})
}
pub(crate) fn unit() -> Self {
let transitions: Map<State, Transitions<R, T>> = Map::default();
let start = State::new();
let accept = start;
Self { transitions, start, accept }
}
pub(crate) fn from_byte(byte: Byte) -> Self {
Self::from_transitions(|accept| Transitions {
byte_transitions: EdgeSet::new(byte, accept),
ref_transitions: Map::default(),
})
}
pub(crate) fn from_ref(r: Reference<R, T>) -> Self {
Self::from_transitions(|accept| Transitions {
byte_transitions: EdgeSet::empty(),
ref_transitions: [(r, accept)].into_iter().collect(),
})
}
fn from_transitions(f: impl FnOnce(State) -> Transitions<R, T>) -> Self {
let start = State::new();
let accept = State::new();
Self { transitions: [(start, f(accept))].into_iter().collect(), start, accept }
}
pub(crate) fn from_tree(tree: Tree<!, R, T>) -> Result<Self, Uninhabited> {
Ok(match tree {
Tree::Byte(b) => Self::from_byte(b),
Tree::Ref(r) => Self::from_ref(r),
Tree::Alt(alts) => {
// Convert and filter the inhabited alternatives.
let mut alts = alts.into_iter().map(Self::from_tree).filter_map(Result::ok);
// If there are no alternatives, return `Uninhabited`.
let dfa = alts.next().ok_or(Uninhabited)?;
// Combine the remaining alternatives with `dfa`.
alts.fold(dfa, |dfa, alt| dfa.union(alt, State::new))
}
Tree::Seq(elts) => {
let mut dfa = Self::unit();
for elt in elts.into_iter().map(Self::from_tree) {
dfa = dfa.concat(elt?);
}
dfa
}
})
}
/// Concatenate two `Dfa`s.
pub(crate) fn concat(self, other: Self) -> Self {
if self.start == self.accept {
return other;
} else if other.start == other.accept {
return self;
}
let start = self.start;
let accept = other.accept;
let mut transitions: Map<State, Transitions<R, T>> = self.transitions;
for (source, transition) in other.transitions {
let fix_state = |state| if state == other.start { self.accept } else { state };
let byte_transitions = transition.byte_transitions.map_states(&fix_state);
let ref_transitions = transition
.ref_transitions
.into_iter()
.map(|(r, state)| (r, fix_state(state)))
.collect();
let old = transitions
.insert(fix_state(source), Transitions { byte_transitions, ref_transitions });
assert!(old.is_none());
}
Self { transitions, start, accept }
}
/// Compute the union of two `Dfa`s.
pub(crate) fn union(self, other: Self, mut new_state: impl FnMut() -> State) -> Self {
// We implement `union` by lazily initializing a set of states
// corresponding to the product of states in `self` and `other`, and
// then add transitions between these states that correspond to where
// they exist between `self` and `other`.
let a = self;
let b = other;
let accept = new_state();
let mut mapping: Map<(Option<State>, Option<State>), State> = Map::default();
let mut mapped = |(a_state, b_state)| {
if Some(a.accept) == a_state || Some(b.accept) == b_state {
// If either `a_state` or `b_state` are accepting, map to a
// common `accept` state.
accept
} else {
*mapping.entry((a_state, b_state)).or_insert_with(&mut new_state)
}
};
let start = mapped((Some(a.start), Some(b.start)));
let mut transitions: Map<State, Transitions<R, T>> = Map::default();
let empty_transitions = Transitions::default();
struct WorkQueue {
queue: Vec<(Option<State>, Option<State>)>,
// Track all entries ever enqueued to avoid duplicating work. This
// gives us a guarantee that a given (a_state, b_state) pair will
// only ever be visited once.
enqueued: Set<(Option<State>, Option<State>)>,
}
impl WorkQueue {
fn enqueue(&mut self, a_state: Option<State>, b_state: Option<State>) {
if self.enqueued.insert((a_state, b_state)) {
self.queue.push((a_state, b_state));
}
}
}
let mut queue = WorkQueue { queue: Vec::new(), enqueued: Set::default() };
queue.enqueue(Some(a.start), Some(b.start));
while let Some((a_src, b_src)) = queue.queue.pop() {
let src = mapped((a_src, b_src));
if src == accept {
// While it's possible to have a DFA whose accept state has
// out-edges, these do not affect the semantics of the DFA, and
// so there's no point in processing them. Continuing here also
// has the advantage of guaranteeing that we only ever process a
// given node in the output DFA once. In particular, with the
// exception of the accept state, we ensure that we only push a
// given node to the `queue` once. This allows the following
// code to assume that we're processing a node we've never
// processed before, which means we never need to merge two edge
// sets - we only ever need to construct a new edge set from
// whole cloth.
continue;
}
let a_transitions =
a_src.and_then(|a_src| a.transitions.get(&a_src)).unwrap_or(&empty_transitions);
let b_transitions =
b_src.and_then(|b_src| b.transitions.get(&b_src)).unwrap_or(&empty_transitions);
let byte_transitions = a_transitions.byte_transitions.union(
&b_transitions.byte_transitions,
|a_dst, b_dst| {
assert!(a_dst.is_some() || b_dst.is_some());
queue.enqueue(a_dst, b_dst);
mapped((a_dst, b_dst))
},
);
let ref_transitions =
a_transitions.ref_transitions.keys().chain(b_transitions.ref_transitions.keys());
let ref_transitions = ref_transitions
.map(|ref_transition| {
let a_dst = a_transitions.ref_transitions.get(ref_transition).copied();
let b_dst = b_transitions.ref_transitions.get(ref_transition).copied();
assert!(a_dst.is_some() || b_dst.is_some());
queue.enqueue(a_dst, b_dst);
(*ref_transition, mapped((a_dst, b_dst)))
})
.collect();
let old = transitions.insert(src, Transitions { byte_transitions, ref_transitions });
// See `if src == accept { ... }` above. The comment there explains
// why this assert is valid.
assert_eq!(old, None);
}
Self { transitions, start, accept }
}
pub(crate) fn get_uninit_edge_dst(&self, state: State) -> Option<State> {
let transitions = self.transitions.get(&state)?;
transitions.byte_transitions.get_uninit_edge_dst()
}
pub(crate) fn bytes_from(&self, start: State) -> impl Iterator<Item = (Byte, State)> {
self.transitions
.get(&start)
.into_iter()
.flat_map(|transitions| transitions.byte_transitions.iter())
}
pub(crate) fn refs_from(&self, start: State) -> impl Iterator<Item = (Reference<R, T>, State)> {
self.transitions
.get(&start)
.into_iter()
.flat_map(|transitions| transitions.ref_transitions.iter())
.map(|(r, s)| (*r, *s))
}
#[cfg(test)]
pub(crate) fn from_edges<B: Clone + Into<Byte>>(
start: u32,
accept: u32,
edges: &[(u32, B, u32)],
) -> Self {
let start = State(start);
let accept = State(accept);
let mut transitions: Map<State, Vec<(Byte, State)>> = Map::default();
for &(src, ref edge, dst) in edges.iter() {
transitions.entry(State(src)).or_default().push((edge.clone().into(), State(dst)));
}
let transitions = transitions
.into_iter()
.map(|(src, edges)| {
(
src,
Transitions {
byte_transitions: EdgeSet::from_edges(edges),
ref_transitions: Map::default(),
},
)
})
.collect();
Self { start, accept, transitions }
}
}
/// Serialize the DFA using the Graphviz DOT format.
impl<R, T> fmt::Debug for Dfa<R, T>
where
R: Region,
T: Type,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "digraph {{")?;
writeln!(f, " {:?} [shape = doublecircle]", self.start)?;
writeln!(f, " {:?} [shape = doublecircle]", self.accept)?;
for (src, transitions) in self.transitions.iter() {
for (t, dst) in transitions.byte_transitions.iter() {
writeln!(f, " {src:?} -> {dst:?} [label=\"{t:?}\"]")?;
}
for (t, dst) in transitions.ref_transitions.iter() {
writeln!(f, " {src:?} -> {dst:?} [label=\"{t:?}\"]")?;
}
}
writeln!(f, "}}")
}
}
use edge_set::EdgeSet;
mod edge_set {
use smallvec::SmallVec;
use super::*;
/// The set of outbound byte edges associated with a DFA node.
#[derive(Eq, PartialEq, Clone, Debug)]
pub(super) struct EdgeSet<S = State> {
// A sequence of byte edges with contiguous byte values and a common
// destination is stored as a single run.
//
// Runs are non-empty, non-overlapping, and stored in ascending order.
runs: SmallVec<[(Byte, S); 1]>,
}
impl<S> EdgeSet<S> {
pub(crate) fn new(range: Byte, dst: S) -> Self {
let mut this = Self { runs: SmallVec::new() };
if !range.is_empty() {
this.runs.push((range, dst));
}
this
}
pub(crate) fn empty() -> Self {
Self { runs: SmallVec::new() }
}
#[cfg(test)]
pub(crate) fn from_edges(mut edges: Vec<(Byte, S)>) -> Self
where
S: Ord,
{
edges.sort();
Self { runs: edges.into() }
}
pub(crate) fn iter(&self) -> impl Iterator<Item = (Byte, S)>
where
S: Copy,
{
self.runs.iter().copied()
}
pub(crate) fn get_uninit_edge_dst(&self) -> Option<S>
where
S: Copy,
{
// Uninit is ordered last.
let &(range, dst) = self.runs.last()?;
if range.contains_uninit() { Some(dst) } else { None }
}
pub(crate) fn map_states<SS>(self, mut f: impl FnMut(S) -> SS) -> EdgeSet<SS> {
EdgeSet {
// NOTE: It appears as through `<Vec<_> as
// IntoIterator>::IntoIter` and `std::iter::Map` both implement
// `TrustedLen`, which in turn means that this `.collect()`
// allocates the correct number of elements once up-front [1].
//
// [1] https://doc.rust-lang.org/1.85.0/src/alloc/vec/spec_from_iter_nested.rs.html#47
runs: self.runs.into_iter().map(|(b, s)| (b, f(s))).collect(),
}
}
/// Unions two edge sets together.
///
/// If `u = a.union(b)`, then for each byte value, `u` will have an edge
/// with that byte value and with the destination `join(Some(_), None)`,
/// `join(None, Some(_))`, or `join(Some(_), Some(_))` depending on whether `a`,
/// `b`, or both have an edge with that byte value.
///
/// If neither `a` nor `b` have an edge with a particular byte value,
/// then no edge with that value will be present in `u`.
pub(crate) fn union(
&self,
other: &Self,
mut join: impl FnMut(Option<S>, Option<S>) -> S,
) -> EdgeSet<S>
where
S: Copy + Eq,
{
let mut runs: SmallVec<[(Byte, S); 1]> = SmallVec::new();
let xs = self.runs.iter().copied();
let ys = other.runs.iter().copied();
for (range, (x, y)) in union(xs, ys) {
let state = join(x, y);
match runs.last_mut() {
// Merge contiguous runs with a common destination.
Some(&mut (ref mut last_range, ref mut last_state))
if last_range.end == range.start && *last_state == state =>
{
last_range.end = range.end
}
_ => runs.push((range, state)),
}
}
EdgeSet { runs }
}
}
}
/// Merges two sorted sequences into one sorted sequence.
pub(crate) fn union<S: Copy, X: Iterator<Item = (Byte, S)>, Y: Iterator<Item = (Byte, S)>>(
xs: X,
ys: Y,
) -> UnionIter<X, Y> {
UnionIter { xs: xs.peekable(), ys: ys.peekable() }
}
pub(crate) struct UnionIter<X: Iterator, Y: Iterator> {
xs: Peekable<X>,
ys: Peekable<Y>,
}
// FIXME(jswrenn) we'd likely benefit from specializing try_fold here.
impl<S: Copy, X: Iterator<Item = (Byte, S)>, Y: Iterator<Item = (Byte, S)>> Iterator
for UnionIter<X, Y>
{
type Item = (Byte, (Option<S>, Option<S>));
fn next(&mut self) -> Option<Self::Item> {
use std::cmp::{self, Ordering};
let ret;
match (self.xs.peek_mut(), self.ys.peek_mut()) {
(None, None) => {
ret = None;
}
(Some(x), None) => {
ret = Some((x.0, (Some(x.1), None)));
self.xs.next();
}
(None, Some(y)) => {
ret = Some((y.0, (None, Some(y.1))));
self.ys.next();
}
(Some(x), Some(y)) => {
let start;
let end;
let dst;
match x.0.start.cmp(&y.0.start) {
Ordering::Less => {
start = x.0.start;
end = cmp::min(x.0.end, y.0.start);
dst = (Some(x.1), None);
}
Ordering::Greater => {
start = y.0.start;
end = cmp::min(x.0.start, y.0.end);
dst = (None, Some(y.1));
}
Ordering::Equal => {
start = x.0.start;
end = cmp::min(x.0.end, y.0.end);
dst = (Some(x.1), Some(y.1));
}
}
ret = Some((Byte { start, end }, dst));
if start == x.0.start {
x.0.start = end;
}
if start == y.0.start {
y.0.start = end;
}
if x.0.is_empty() {
self.xs.next();
}
if y.0.is_empty() {
self.ys.next();
}
}
}
ret
}
}