tools/grammar/src/lib.rs - rust-lang/reference - Git at Google

 //! Support for loading the grammar.

 use diagnostics::{Diagnostics, warn_or_err};
 use regex::Regex;
 use std::collections::{HashMap, HashSet};
 use std::path::{Path, PathBuf};
 use std::sync::LazyLock;
 use walkdir::WalkDir;

 mod parser;

 #[derive(Debug, Default)]
 pub struct Grammar {
     pub productions: HashMap<String, Production>,
     /// The order that the production names were discovered.
     pub name_order: Vec<String>,
 }

 #[derive(Debug)]
 pub struct Production {
     pub name: String,
     /// Comments and breaks that precede the production name.
     pub comments: Vec<Expression>,
     /// Category is from the markdown lang string, and defines how it is
     /// grouped and organized on the summary page.
     pub category: String,
     pub expression: Expression,
     /// The path to the chapter where this is defined, relative to the book's
     /// `src` directory.
     pub path: PathBuf,
     pub is_root: bool,
 }

 #[derive(Clone, Debug)]
 pub struct Expression {
     pub kind: ExpressionKind,
     /// Suffix is the `_foo_` part that is shown as a subscript.
     pub suffix: Option<String>,
     /// A footnote is a markdown footnote link.
     pub footnote: Option<String>,
 }

 #[derive(Clone, Debug)]
 pub enum ExpressionKind {
     /// `( A B C )`
     Grouped(Box<Expression>),
     /// `A | B | C`
     Alt(Vec<Expression>),
     /// `A B C`
     Sequence(Vec<Expression>),
     /// `A?`
     Optional(Box<Expression>),
     /// `A*`
     Repeat(Box<Expression>),
     /// `A*?`
     RepeatNonGreedy(Box<Expression>),
     /// `A+`
     RepeatPlus(Box<Expression>),
     /// `A+?`
     RepeatPlusNonGreedy(Box<Expression>),
     /// `A{2..4}`
     RepeatRange(Box<Expression>, Option<u32>, Option<u32>),
     /// `NonTerminal`
     Nt(String),
     /// `` `string` ``
     Terminal(String),
     /// `<english description>`
     Prose(String),
     /// An LF followed by the given number of spaces.
     ///
     /// Used by the renderer to help format and structure the grammar.
     Break(usize),
     /// `// Single line comment.`
     Comment(String),
     /// ``[`A`-`Z` `_` LF]``
     Charset(Vec<Characters>),
     /// ``~[` ` LF]``
     NegExpression(Box<Expression>),
     /// `U+0060`
     Unicode(String),
 }

 #[derive(Clone, Debug)]
 pub enum Characters {
     /// `LF`
     Named(String),
     /// `` `_` ``
     Terminal(String),
     /// `` `A`-`Z` ``
     Range(char, char),
 }

 impl Grammar {
     fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
         for p in self.productions.values() {
             p.expression.visit_nt(callback);
         }
     }
 }

 impl Expression {
     pub fn new_kind(kind: ExpressionKind) -> Self {
         Self {
             kind,
             suffix: None,
             footnote: None,
         }
     }

     fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
         match &self.kind {
             ExpressionKind::Grouped(e)
             | ExpressionKind::Optional(e)
             | ExpressionKind::Repeat(e)
             | ExpressionKind::RepeatNonGreedy(e)
             | ExpressionKind::RepeatPlus(e)
             | ExpressionKind::RepeatPlusNonGreedy(e)
             | ExpressionKind::RepeatRange(e, _, _)
             | ExpressionKind::NegExpression(e) => {
                 e.visit_nt(callback);
             }
             ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => {
                 for e in es {
                     e.visit_nt(callback);
                 }
             }
             ExpressionKind::Nt(nt) => {
                 callback(&nt);
             }
             ExpressionKind::Terminal(_)
             | ExpressionKind::Prose(_)
             | ExpressionKind::Break(_)
             | ExpressionKind::Comment(_)
             | ExpressionKind::Unicode(_) => {}
             ExpressionKind::Charset(set) => {
                 for ch in set {
                     match ch {
                         Characters::Named(s) => callback(s),
                         Characters::Terminal(_) | Characters::Range(_, _) => {}
                     }
                 }
             }
         }
     }

     pub fn is_break(&self) -> bool {
         matches!(self.kind, ExpressionKind::Break(_))
     }
 }

 pub static GRAMMAR_RE: LazyLock<Regex> =
     LazyLock::new(|| Regex::new(r"(?ms)^```grammar,([^\n]+)\n(.*?)^```").unwrap());

 /// Loads the [`Grammar`] from the book.
 pub fn load_grammar(diag: &mut Diagnostics) -> Grammar {
     let mut grammar = Grammar::default();
     let base = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../src");
     for entry in WalkDir::new(&base) {
         let entry = entry.unwrap();
         let path = entry.path();
         if path.extension().and_then(|s| s.to_str()) != Some("md") {
             continue;
         }
         let content = std::fs::read_to_string(path).unwrap();
         let relative_path = pathdiff::diff_paths(path, &base).expect("one path must be absolute");
         for cap in GRAMMAR_RE.captures_iter(&content) {
             let category = &cap[1];
             let input = &cap[2];
             if let Err(e) = parser::parse_grammar(input, &mut grammar, category, &relative_path) {
                 warn_or_err!(diag, "failed to parse grammar in {path:?}: {e}");
             }
         }
     }

     check_undefined_nt(&grammar, diag);
     check_unexpected_roots(&grammar, diag);
     grammar
 }

 /// Checks for nonterminals that are used but not defined.
 fn check_undefined_nt(grammar: &Grammar, diag: &mut Diagnostics) {
     grammar.visit_nt(&mut |nt| {
         if !grammar.productions.contains_key(nt) {
             warn_or_err!(diag, "non-terminal `{nt}` is used but not defined");
         }
     });
 }

 /// This checks that all the grammar roots are what we expect.
 ///
 /// This is intended to help catch any unexpected misspellings, orphaned
 /// productions, or general mistakes.
 fn check_unexpected_roots(grammar: &Grammar, diag: &mut Diagnostics) {
     // `set` starts with every production name.
     let mut set: HashSet<_> = grammar.name_order.iter().map(|s| s.as_str()).collect();
     fn remove(set: &mut HashSet<&str>, grammar: &Grammar, prod: &Production, root_name: &str) {
         prod.expression.visit_nt(&mut |nt| {
             // Leave the root name in the set if we find it recursively.
             if nt == root_name {
                 return;
             }
             if !set.remove(nt) {
                 return;
             }
             if let Some(nt_prod) = grammar.productions.get(nt) {
                 remove(set, grammar, nt_prod, root_name);
             }
         });
     }
     // Walk the productions starting from the root nodes, and remove every
     // non-terminal from `set`. What's left must be the set of roots.
     grammar
         .productions
         .values()
         .filter(|prod| prod.is_root)
         .for_each(|root| {
             remove(&mut set, grammar, root, &root.name);
         });
     let expected: HashSet<_> = grammar
         .productions
         .values()
         .filter_map(|p| p.is_root.then(|| p.name.as_str()))
         .collect();
     if set != expected {
         let new: Vec<_> = set.difference(&expected).collect();
         let removed: Vec<_> = expected.difference(&set).collect();
         if !new.is_empty() {
             warn_or_err!(
                 diag,
                 "New grammar production detected that is not used in any root-accessible\n\
                  production. If this is expected, mark the production with\n\
                  `@root`. If not, make sure it is spelled correctly and used in\n\
                  another root-accessible production.\n\
                  \n\
                  The new names are: {new:?}\n"
             );
         } else if !removed.is_empty() {
             warn_or_err!(
                 diag,
                 "Old grammar production root seems to have been removed\n\
                  (it is used in some other production that is root-accessible).\n\
                  If this is expected, remove `@root` from the production.\n\
                  \n\
                  The removed names are: {removed:?}\n"
             );
         } else {
             unreachable!("unexpected");
         }
     }
 }
	//! Support for loading the grammar.

	use diagnostics::{Diagnostics, warn_or_err};
	use regex::Regex;
	use std::collections::{HashMap, HashSet};
	use std::path::{Path, PathBuf};
	use std::sync::LazyLock;
	use walkdir::WalkDir;

	mod parser;

	#[derive(Debug, Default)]
	pub struct Grammar {
	pub productions: HashMap<String, Production>,
	/// The order that the production names were discovered.
	pub name_order: Vec<String>,
	}

	#[derive(Debug)]
	pub struct Production {
	pub name: String,
	/// Comments and breaks that precede the production name.
	pub comments: Vec<Expression>,
	/// Category is from the markdown lang string, and defines how it is
	/// grouped and organized on the summary page.
	pub category: String,
	pub expression: Expression,
	/// The path to the chapter where this is defined, relative to the book's
	/// `src` directory.
	pub path: PathBuf,
	pub is_root: bool,
	}

	#[derive(Clone, Debug)]
	pub struct Expression {
	pub kind: ExpressionKind,
	/// Suffix is the `_foo_` part that is shown as a subscript.
	pub suffix: Option<String>,
	/// A footnote is a markdown footnote link.
	pub footnote: Option<String>,
	}

	#[derive(Clone, Debug)]
	pub enum ExpressionKind {
	/// `( A B C )`
	Grouped(Box<Expression>),
	/// `A \| B \| C`
	Alt(Vec<Expression>),
	/// `A B C`
	Sequence(Vec<Expression>),
	/// `A?`
	Optional(Box<Expression>),
	/// `A*`
	Repeat(Box<Expression>),
	/// `A*?`
	RepeatNonGreedy(Box<Expression>),
	/// `A+`
	RepeatPlus(Box<Expression>),
	/// `A+?`
	RepeatPlusNonGreedy(Box<Expression>),
	/// `A{2..4}`
	RepeatRange(Box<Expression>, Option<u32>, Option<u32>),
	/// `NonTerminal`
	Nt(String),
	/// `` `string` ``
	Terminal(String),
	/// `<english description>`
	Prose(String),
	/// An LF followed by the given number of spaces.
	///
	/// Used by the renderer to help format and structure the grammar.
	Break(usize),
	/// `// Single line comment.`
	Comment(String),
	/// ``[`A`-`Z` `_` LF]``
	Charset(Vec<Characters>),
	/// ``~[` ` LF]``
	NegExpression(Box<Expression>),
	/// `U+0060`
	Unicode(String),
	}

	#[derive(Clone, Debug)]
	pub enum Characters {
	/// `LF`
	Named(String),
	/// `` `_` ``
	Terminal(String),
	/// `` `A`-`Z` ``
	Range(char, char),
	}

	impl Grammar {
	fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
	for p in self.productions.values() {
	p.expression.visit_nt(callback);
	}
	}
	}

	impl Expression {
	pub fn new_kind(kind: ExpressionKind) -> Self {
	Self {
	kind,
	suffix: None,
	footnote: None,
	}
	}

	fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
	match &self.kind {
	ExpressionKind::Grouped(e)
	\| ExpressionKind::Optional(e)
	\| ExpressionKind::Repeat(e)
	\| ExpressionKind::RepeatNonGreedy(e)
	\| ExpressionKind::RepeatPlus(e)
	\| ExpressionKind::RepeatPlusNonGreedy(e)
	\| ExpressionKind::RepeatRange(e, _, _)
	\| ExpressionKind::NegExpression(e) => {
	e.visit_nt(callback);
	}
	ExpressionKind::Alt(es) \| ExpressionKind::Sequence(es) => {
	for e in es {
	e.visit_nt(callback);
	}
	}
	ExpressionKind::Nt(nt) => {
	callback(&nt);
	}
	ExpressionKind::Terminal(_)
	\| ExpressionKind::Prose(_)
	\| ExpressionKind::Break(_)
	\| ExpressionKind::Comment(_)
	\| ExpressionKind::Unicode(_) => {}
	ExpressionKind::Charset(set) => {
	for ch in set {
	match ch {
	Characters::Named(s) => callback(s),
	Characters::Terminal(_) \| Characters::Range(_, _) => {}
	}
	}
	}
	}
	}

	pub fn is_break(&self) -> bool {
	matches!(self.kind, ExpressionKind::Break(_))
	}
	}

	pub static GRAMMAR_RE: LazyLock<Regex> =
	LazyLock::new(\|\| Regex::new(r"(?ms)^```grammar,([^\n]+)\n(.*?)^```").unwrap());

	/// Loads the [`Grammar`] from the book.
	pub fn load_grammar(diag: &mut Diagnostics) -> Grammar {
	let mut grammar = Grammar::default();
	let base = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../src");
	for entry in WalkDir::new(&base) {
	let entry = entry.unwrap();
	let path = entry.path();
	if path.extension().and_then(\|s\| s.to_str()) != Some("md") {
	continue;
	}
	let content = std::fs::read_to_string(path).unwrap();
	let relative_path = pathdiff::diff_paths(path, &base).expect("one path must be absolute");
	for cap in GRAMMAR_RE.captures_iter(&content) {
	let category = &cap[1];
	let input = &cap[2];
	if let Err(e) = parser::parse_grammar(input, &mut grammar, category, &relative_path) {
	warn_or_err!(diag, "failed to parse grammar in {path:?}: {e}");
	}
	}
	}

	check_undefined_nt(&grammar, diag);
	check_unexpected_roots(&grammar, diag);
	grammar
	}

	/// Checks for nonterminals that are used but not defined.
	fn check_undefined_nt(grammar: &Grammar, diag: &mut Diagnostics) {
	grammar.visit_nt(&mut \|nt\| {
	if !grammar.productions.contains_key(nt) {
	warn_or_err!(diag, "non-terminal `{nt}` is used but not defined");
	}
	});
	}

	/// This checks that all the grammar roots are what we expect.
	///
	/// This is intended to help catch any unexpected misspellings, orphaned
	/// productions, or general mistakes.
	fn check_unexpected_roots(grammar: &Grammar, diag: &mut Diagnostics) {
	// `set` starts with every production name.
	let mut set: HashSet<_> = grammar.name_order.iter().map(\|s\| s.as_str()).collect();
	fn remove(set: &mut HashSet<&str>, grammar: &Grammar, prod: &Production, root_name: &str) {
	prod.expression.visit_nt(&mut \|nt\| {
	// Leave the root name in the set if we find it recursively.
	if nt == root_name {
	return;
	}
	if !set.remove(nt) {
	return;
	}
	if let Some(nt_prod) = grammar.productions.get(nt) {
	remove(set, grammar, nt_prod, root_name);
	}
	});
	}
	// Walk the productions starting from the root nodes, and remove every
	// non-terminal from `set`. What's left must be the set of roots.
	grammar
	.productions
	.values()
	.filter(\|prod\| prod.is_root)
	.for_each(\|root\| {
	remove(&mut set, grammar, root, &root.name);
	});
	let expected: HashSet<_> = grammar
	.productions
	.values()
	.filter_map(\|p\| p.is_root.then(\|\| p.name.as_str()))
	.collect();
	if set != expected {
	let new: Vec<_> = set.difference(&expected).collect();
	let removed: Vec<_> = expected.difference(&set).collect();
	if !new.is_empty() {
	warn_or_err!(
	diag,
	"New grammar production detected that is not used in any root-accessible\n\
	production. If this is expected, mark the production with\n\
	`@root`. If not, make sure it is spelled correctly and used in\n\
	another root-accessible production.\n\
	\n\
	The new names are: {new:?}\n"
	);
	} else if !removed.is_empty() {
	warn_or_err!(
	diag,
	"Old grammar production root seems to have been removed\n\
	(it is used in some other production that is root-accessible).\n\
	If this is expected, remove `@root` from the production.\n\
	\n\
	The removed names are: {removed:?}\n"
	);
	} else {
	unreachable!("unexpected");
	}
	}
	}