blob: 197fd2f5cfeea9408f8e513faf92689c34aa9c85 [file] [log] [blame] [edit]
//! Support for loading the grammar.
use diagnostics::{Diagnostics, warn_or_err};
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use walkdir::WalkDir;
mod parser;
#[derive(Debug, Default)]
pub struct Grammar {
pub productions: HashMap<String, Production>,
/// The order that the production names were discovered.
pub name_order: Vec<String>,
}
#[derive(Debug)]
pub struct Production {
pub name: String,
/// Comments and breaks that precede the production name.
pub comments: Vec<Expression>,
/// Category is from the markdown lang string, and defines how it is
/// grouped and organized on the summary page.
pub category: String,
pub expression: Expression,
/// The path to the chapter where this is defined, relative to the book's
/// `src` directory.
pub path: PathBuf,
pub is_root: bool,
}
#[derive(Clone, Debug)]
pub struct Expression {
pub kind: ExpressionKind,
/// Suffix is the `_foo_` part that is shown as a subscript.
pub suffix: Option<String>,
/// A footnote is a markdown footnote link.
pub footnote: Option<String>,
}
#[derive(Clone, Debug)]
pub enum ExpressionKind {
/// `( A B C )`
Grouped(Box<Expression>),
/// `A | B | C`
Alt(Vec<Expression>),
/// `A B C`
Sequence(Vec<Expression>),
/// `A?`
Optional(Box<Expression>),
/// `A*`
Repeat(Box<Expression>),
/// `A*?`
RepeatNonGreedy(Box<Expression>),
/// `A+`
RepeatPlus(Box<Expression>),
/// `A+?`
RepeatPlusNonGreedy(Box<Expression>),
/// `A{2..4}`
RepeatRange(Box<Expression>, Option<u32>, Option<u32>),
/// `NonTerminal`
Nt(String),
/// `` `string` ``
Terminal(String),
/// `<english description>`
Prose(String),
/// An LF followed by the given number of spaces.
///
/// Used by the renderer to help format and structure the grammar.
Break(usize),
/// `// Single line comment.`
Comment(String),
/// ``[`A`-`Z` `_` LF]``
Charset(Vec<Characters>),
/// ``~[` ` LF]``
NegExpression(Box<Expression>),
/// `U+0060`
Unicode(String),
}
#[derive(Clone, Debug)]
pub enum Characters {
/// `LF`
Named(String),
/// `` `_` ``
Terminal(String),
/// `` `A`-`Z` ``
Range(char, char),
}
impl Grammar {
fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
for p in self.productions.values() {
p.expression.visit_nt(callback);
}
}
}
impl Expression {
pub fn new_kind(kind: ExpressionKind) -> Self {
Self {
kind,
suffix: None,
footnote: None,
}
}
fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
match &self.kind {
ExpressionKind::Grouped(e)
| ExpressionKind::Optional(e)
| ExpressionKind::Repeat(e)
| ExpressionKind::RepeatNonGreedy(e)
| ExpressionKind::RepeatPlus(e)
| ExpressionKind::RepeatPlusNonGreedy(e)
| ExpressionKind::RepeatRange(e, _, _)
| ExpressionKind::NegExpression(e) => {
e.visit_nt(callback);
}
ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => {
for e in es {
e.visit_nt(callback);
}
}
ExpressionKind::Nt(nt) => {
callback(&nt);
}
ExpressionKind::Terminal(_)
| ExpressionKind::Prose(_)
| ExpressionKind::Break(_)
| ExpressionKind::Comment(_)
| ExpressionKind::Unicode(_) => {}
ExpressionKind::Charset(set) => {
for ch in set {
match ch {
Characters::Named(s) => callback(s),
Characters::Terminal(_) | Characters::Range(_, _) => {}
}
}
}
}
}
pub fn is_break(&self) -> bool {
matches!(self.kind, ExpressionKind::Break(_))
}
}
pub static GRAMMAR_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?ms)^```grammar,([^\n]+)\n(.*?)^```").unwrap());
/// Loads the [`Grammar`] from the book.
pub fn load_grammar(diag: &mut Diagnostics) -> Grammar {
let mut grammar = Grammar::default();
let base = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../src");
for entry in WalkDir::new(&base) {
let entry = entry.unwrap();
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) != Some("md") {
continue;
}
let content = std::fs::read_to_string(path).unwrap();
let relative_path = pathdiff::diff_paths(path, &base).expect("one path must be absolute");
for cap in GRAMMAR_RE.captures_iter(&content) {
let category = &cap[1];
let input = &cap[2];
if let Err(e) = parser::parse_grammar(input, &mut grammar, category, &relative_path) {
warn_or_err!(diag, "failed to parse grammar in {path:?}: {e}");
}
}
}
check_undefined_nt(&grammar, diag);
check_unexpected_roots(&grammar, diag);
grammar
}
/// Checks for nonterminals that are used but not defined.
fn check_undefined_nt(grammar: &Grammar, diag: &mut Diagnostics) {
grammar.visit_nt(&mut |nt| {
if !grammar.productions.contains_key(nt) {
warn_or_err!(diag, "non-terminal `{nt}` is used but not defined");
}
});
}
/// This checks that all the grammar roots are what we expect.
///
/// This is intended to help catch any unexpected misspellings, orphaned
/// productions, or general mistakes.
fn check_unexpected_roots(grammar: &Grammar, diag: &mut Diagnostics) {
// `set` starts with every production name.
let mut set: HashSet<_> = grammar.name_order.iter().map(|s| s.as_str()).collect();
fn remove(set: &mut HashSet<&str>, grammar: &Grammar, prod: &Production, root_name: &str) {
prod.expression.visit_nt(&mut |nt| {
// Leave the root name in the set if we find it recursively.
if nt == root_name {
return;
}
if !set.remove(nt) {
return;
}
if let Some(nt_prod) = grammar.productions.get(nt) {
remove(set, grammar, nt_prod, root_name);
}
});
}
// Walk the productions starting from the root nodes, and remove every
// non-terminal from `set`. What's left must be the set of roots.
grammar
.productions
.values()
.filter(|prod| prod.is_root)
.for_each(|root| {
remove(&mut set, grammar, root, &root.name);
});
let expected: HashSet<_> = grammar
.productions
.values()
.filter_map(|p| p.is_root.then(|| p.name.as_str()))
.collect();
if set != expected {
let new: Vec<_> = set.difference(&expected).collect();
let removed: Vec<_> = expected.difference(&set).collect();
if !new.is_empty() {
warn_or_err!(
diag,
"New grammar production detected that is not used in any root-accessible\n\
production. If this is expected, mark the production with\n\
`@root`. If not, make sure it is spelled correctly and used in\n\
another root-accessible production.\n\
\n\
The new names are: {new:?}\n"
);
} else if !removed.is_empty() {
warn_or_err!(
diag,
"Old grammar production root seems to have been removed\n\
(it is used in some other production that is root-accessible).\n\
If this is expected, remove `@root` from the production.\n\
\n\
The removed names are: {removed:?}\n"
);
} else {
unreachable!("unexpected");
}
}
}