tools/grammar/src/parser.rs - rust-lang/reference - Git at Google

 //! A parser of the ENBF-like grammar.

 use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit};
 use std::fmt;
 use std::fmt::Display;
 use std::path::Path;

 struct Parser<'a> {
     input: &'a str,
     index: usize,
 }

 pub struct Error {
     message: String,
     line: String,
     lineno: usize,
     col: usize,
 }

 impl Display for Error {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
         let lineno = format!("{}", self.lineno);
         let space = " ".repeat(lineno.len() + 1);
         let col = " ".repeat(self.col);
         let line = &self.line;
         let message = &self.message;
         write!(f, "\n{space}|\n{lineno} | {line}\n{space}|{col}^ {message}")
     }
 }

 macro_rules! bail {
     ($parser:expr, $($arg:tt)*) => {{
         let mut msg = String::new();
         fmt::write(&mut msg, format_args!($($arg)*)).unwrap();
         return Err($parser.error(msg));
     }};
 }

 type Result<T> = std::result::Result<T, Error>;

 /// Whether a character can start a grammar rule name.
 ///
 /// This includes ASCII alphabetic characters, underscores, and
 /// non-ASCII Unicode symbols such as `⊥` (bottom) and `⊤` (top).
 /// ASCII symbols are excluded because characters such as `+`, `|`,
 /// `~`, and `^` are grammar syntax.
 fn is_name_start(ch: char) -> bool {
     ch.is_alphabetic() || ch == '_' || !ch.is_ascii()
 }

 /// Whether a character can continue a grammar rule name.
 ///
 /// Accepts alphanumeric characters, underscores, and non-ASCII
 /// characters.
 fn is_name_continue(ch: char) -> bool {
     ch.is_alphanumeric() || ch == '_' || !ch.is_ascii()
 }

 pub fn parse_grammar(
     input: &str,
     grammar: &mut Grammar,
     category: &str,
     path: &Path,
 ) -> Result<()> {
     let mut parser = Parser { input, index: 0 };
     loop {
         let p = parser.parse_production(category, path)?;
         grammar.name_order.push(p.name.clone());
         if let Some(dupe) = grammar.productions.insert(p.name.clone(), p) {
             bail!(parser, "duplicate production {} in grammar", dupe.name);
         }
         parser.take_while(&|ch| ch == '\n');
         if parser.eof() {
             break;
         }
     }
     Ok(())
 }

 impl Parser<'_> {
     fn take_while(&mut self, f: &dyn Fn(char) -> bool) -> &str {
         let mut upper = 0;
         let i = self.index;
         let mut ci = self.input[i..].chars();
         while let Some(ch) = ci.next() {
             if !f(ch) {
                 break;
             }
             upper += ch.len_utf8();
         }
         self.index += upper;
         &self.input[i..i + upper]
     }

     /// Returns whether or not the given string is next, and advances the head if it is.
     fn take_str(&mut self, s: &str) -> bool {
         if self.input[self.index..].starts_with(s) {
             self.index += s.len();
             true
         } else {
             false
         }
     }

     /// Returns the next byte, or None if eof.
     fn peek(&mut self) -> Option<u8> {
         if self.index >= self.input.len() {
             None
         } else {
             Some(self.input.as_bytes()[self.index])
         }
     }

     fn eof(&mut self) -> bool {
         self.index >= self.input.len()
     }

     /// Expects the next input to be the given string, and advances the head.
     fn expect(&mut self, s: &str, err: &str) -> Result<()> {
         if !self.input[self.index..].starts_with(s) {
             bail!(self, "{err}");
         };
         self.index += s.len();
         Ok(())
     }

     fn error(&mut self, message: String) -> Error {
         let (line, lineno, col) = translate_position(self.input, self.index);
         Error {
             message,
             line: line.to_string(),
             lineno,
             col,
         }
     }

     /// Advances zero or more spaces.
     fn space0(&mut self) -> &str {
         self.take_while(&|ch| ch == ' ')
     }

     fn parse_production(&mut self, category: &str, path: &Path) -> Result<Production> {
         let mut comments = Vec::new();
         while let Ok(comment) = self.parse_comment() {
             self.expect("\n", "expected newline")?;
             comments.push(Expression::new_kind(comment));
             comments.push(Expression::new_kind(ExpressionKind::Break(0)));
         }
         let is_root = self.parse_is_root();
         self.space0();
         let name = self
             .parse_name()
             .ok_or_else(|| self.error("expected production name".to_string()))?;
         self.expect(" ->", "expected -> arrow")?;
         let Some(expression) = self.parse_expression()? else {
             bail!(self, "expected an expression");
         };
         Ok(Production {
             name,
             comments,
             category: category.to_string(),
             expression,
             path: path.to_owned(),
             is_root,
         })
     }

     fn parse_is_root(&mut self) -> bool {
         self.take_str("@root")
     }

     fn parse_name(&mut self) -> Option<String> {
         let first = self.input[self.index..].chars().next()?;
         if !is_name_start(first) {
             return None;
         }
         Some(self.take_while(&|c| is_name_continue(c)).to_string())
     }

     fn parse_expression(&mut self) -> Result<Option<Expression>> {
         let mut es = Vec::new();
         loop {
             let Some(e) = self.parse_seq()? else { break };
             es.push(e);
             _ = self.space0();
             if !self.take_str("|") {
                 break;
             }
         }
         match es.len() {
             0 => Ok(None),
             1 => Ok(Some(es.pop().unwrap())),
             _ => Ok(Some(Expression::new_kind(ExpressionKind::Alt(es)))),
         }
     }

     fn parse_seq(&mut self) -> Result<Option<Expression>> {
         let mut es = Vec::new();
         loop {
             self.space0();
             if self.peek() == Some(b'^') {
                 let cut = self.parse_cut()?;
                 es.push(cut);
                 break;
             }
             let Some(e) = self.parse_expr1()? else {
                 break;
             };
             es.push(e);
         }
         match es.len() {
             0 => Ok(None),
             1 => Ok(Some(es.pop().unwrap())),
             _ => Ok(Some(Expression {
                 kind: ExpressionKind::Sequence(es),
                 suffix: None,
                 footnote: None,
             })),
         }
     }

     /// Parse cut (`^`) operator.
     fn parse_cut(&mut self) -> Result<Expression> {
         self.expect("^", "expected `^`")?;
         let Some(rhs) = self.parse_seq()? else {
             bail!(self, "expected expression after cut operator");
         };
         Ok(Expression {
             kind: ExpressionKind::Cut(Box::new(rhs)),
             suffix: None,
             footnote: None,
         })
     }

     fn parse_expr1(&mut self) -> Result<Option<Expression>> {
         let Some(next) = self.peek() else {
             return Ok(None);
         };

         let kind = if self.take_str("U+") {
             ExpressionKind::Unicode(self.parse_unicode()?)
         } else if self.input[self.index..]
             .chars()
             .next()
             .map(|ch| is_name_start(ch))
             .unwrap_or(false)
         {
             self.parse_nonterminal()
                 .expect("first char already checked")
         } else if self.take_str("\n") {
             if self.eof() || self.take_str("\n") {
                 return Ok(None);
             }
             let space = self.take_while(&|ch| ch == ' ');
             if space.len() == 0 {
                 bail!(self, "expected indentation on next line");
             }
             ExpressionKind::Break(space.len())
         } else if next == b'/' {
             self.parse_comment()?
         } else if next == b'`' {
             self.parse_terminal()?
         } else if next == b'[' {
             self.parse_charset()?
         } else if next == b'<' {
             self.parse_prose()?
         } else if next == b'(' {
             self.parse_grouped()?
         } else if next == b'~' {
             self.parse_neg_expression()?
         } else if next == b'!' {
             self.parse_negative_lookahead()?
         } else {
             return Ok(None);
         };
         let kind = match self.peek() {
             Some(b'?') => self.parse_optional(kind)?,
             Some(b'*') => self.parse_repeat(kind)?,
             Some(b'+') => self.parse_repeat_plus(kind)?,
             Some(b'{') => self.parse_repeat_range(kind)?,
             _ => kind,
         };
         let suffix = self.parse_suffix()?;
         let footnote = self.parse_footnote()?;

         Ok(Some(Expression {
             kind,
             suffix,
             footnote,
         }))
     }

     fn parse_nonterminal(&mut self) -> Option<ExpressionKind> {
         let nt = self.parse_name()?;
         Some(ExpressionKind::Nt(nt))
     }

     /// Parse terminal within backticks.
     fn parse_terminal(&mut self) -> Result<ExpressionKind> {
         Ok(ExpressionKind::Terminal(self.parse_terminal_str()?))
     }

     /// Parse string within backticks.
     fn parse_terminal_str(&mut self) -> Result<String> {
         self.expect("`", "expected opening backtick")?;
         let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string();
         if term.is_empty() {
             bail!(self, "expected terminal");
         }
         self.expect("`", "expected closing backtick")?;
         Ok(term)
     }

     /// Parse e.g. `// Single line comment.`.
     fn parse_comment(&mut self) -> Result<ExpressionKind> {
         self.expect("//", "expected `//`")?;
         let text = self.take_while(&|x| x != '\n').to_string();
         Ok(ExpressionKind::Comment(text))
     }

     fn parse_charset(&mut self) -> Result<ExpressionKind> {
         self.expect("[", "expected opening [")?;
         let mut characters = Vec::new();
         loop {
             self.space0();
             let Some(ch) = self.parse_characters()? else {
                 break;
             };
             characters.push(ch);
         }
         if characters.is_empty() {
             bail!(self, "expected at least one character in character group");
         }
         self.space0();
         self.expect("]", "expected closing ]")?;
         Ok(ExpressionKind::Charset(characters))
     }

     /// Parse an element of a character class, e.g.
     /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
     fn parse_characters(&mut self) -> Result<Option<Characters>> {
         if let Some(a) = self.parse_character()? {
             if self.take_str("-") {
                 let Some(b) = self.parse_character()? else {
                     bail!(self, "expected character in range");
                 };
                 Ok(Some(Characters::Range(a, b)))
             } else {
                 //~^ Parse terminal in backticks.
                 let t = match a {
                     Character::Char(ch) => ch.to_string(),
                     Character::Unicode(_) => bail!(self, "unicode not supported"),
                 };
                 Ok(Some(Characters::Terminal(t)))
             }
         } else if let Some(name) = self.parse_name() {
             //~^ Parse nonterminal identifier.
             Ok(Some(Characters::Named(name)))
         } else {
             Ok(None)
         }
     }

     fn parse_character(&mut self) -> Result<Option<Character>> {
         if let Some(b'`') = self.peek() {
             let recov = self.index;
             let term = self.parse_terminal_str()?;
             if term.len() > 1 {
                 self.index = recov + 1;
                 bail!(self, "invalid start terminal in range");
             }
             let ch = term.chars().next().unwrap();
             Ok(Some(Character::Char(ch)))
         } else if self.take_str("U+") {
             Ok(Some(Character::Unicode(self.parse_unicode()?)))
         } else {
             Ok(None)
         }
     }

     /// Parse e.g. `<prose text>`.
     fn parse_prose(&mut self) -> Result<ExpressionKind> {
         self.expect("<", "expected opening `<`")?;
         let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string();
         if text.is_empty() {
             bail!(self, "expected prose text");
         }
         self.expect(">", "expected closing `>`")?;
         Ok(ExpressionKind::Prose(text))
     }

     fn parse_grouped(&mut self) -> Result<ExpressionKind> {
         self.expect("(", "expected opening `(`")?;
         self.space0();
         let Some(e) = self.parse_expression()? else {
             bail!(self, "expected expression in parenthesized group");
         };
         self.space0();
         self.expect(")", "expected closing `)`")?;
         Ok(ExpressionKind::Grouped(Box::new(e)))
     }

     fn parse_neg_expression(&mut self) -> Result<ExpressionKind> {
         self.expect("~", "expected ~")?;
         let Some(next) = self.peek() else {
             bail!(self, "expected expression after ~");
         };
         let kind = match next {
             b'[' => self.parse_charset()?,
             b'`' => self.parse_terminal()?,
             _ => self.parse_nonterminal().ok_or_else(|| {
                 self.error("expected a charset, terminal, or name after ~ negation".to_string())
             })?,
         };
         Ok(ExpressionKind::NegExpression(box_kind(kind)))
     }

     fn parse_negative_lookahead(&mut self) -> Result<ExpressionKind> {
         self.expect("!", "expected !")?;
         self.space0();
         let Some(e) = self.parse_expr1()? else {
             bail!(self, "expected expression after !");
         };
         Ok(ExpressionKind::NegativeLookahead(Box::new(e)))
     }

     /// Parse e.g. `F00F` after `U+`.
     fn parse_unicode(&mut self) -> Result<(char, String)> {
         let mut xs = Vec::with_capacity(6);
         let mut push_next = || {
             match self.peek() {
                 Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
                     xs.push(x);
                     self.index += 1;
                 }
                 _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"),
             }
             Ok(())
         };
         for _ in 0..4 {
             push_next()?;
         }
         for _ in 0..2 {
             if push_next().is_err() {
                 break;
             }
         }
         let s = String::from_utf8(xs).unwrap();
         let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap();
         Ok((ch, s))
     }

     /// Parse `?` after expression.
     fn parse_optional(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
         self.expect("?", "expected `?`")?;
         Ok(ExpressionKind::Optional(box_kind(kind)))
     }

     /// Parse `*` after expression.
     fn parse_repeat(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
         self.expect("*", "expected `*`")?;
         Ok(ExpressionKind::Repeat(box_kind(kind)))
     }

     /// Parse `+` after expression.
     fn parse_repeat_plus(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
         self.expect("+", "expected `+`")?;
         Ok(ExpressionKind::RepeatPlus(box_kind(kind)))
     }

     /// Parse `{a..b}` | `{a..=b}` | `{name:a..=b}` | `{name}` after expression.
     //
     // `name:` before the range is a named binding. `{name}` refers to that binding.
     fn parse_repeat_range(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
         self.expect("{", "expected `{`")?;
         let start = self.index;
         let name = match (self.parse_name(), self.peek()) {
             (Some(name), Some(b':')) => {
                 self.index += 1;
                 Some(name)
             }
             (Some(name), Some(b'}')) => {
                 self.index += 1;
                 return Ok(ExpressionKind::RepeatRangeNamed(box_kind(kind), name));
             }
             _ => {
                 self.index = start;
                 None
             }
         };
         let min = self.take_while(&|x| x.is_ascii_digit());
         let Ok(min) = (!min.is_empty()).then(|| min.parse::<u32>()).transpose() else {
             bail!(self, "malformed range start");
         };
         self.expect("..", "expected `..` or `..=`")?;
         let limit = if self.take_str("=") {
             RangeLimit::Closed
         } else {
             RangeLimit::HalfOpen
         };
         let max = self.take_while(&|x| x.is_ascii_digit());
         let Ok(max) = (!max.is_empty()).then(|| max.parse::<u32>()).transpose() else {
             bail!(self, "malformed range end");
         };
         match (min, max, limit) {
             (Some(min), Some(max), _) if max < min => {
                 bail!(self, "range {min}{limit}{max} is malformed")
             }
             (Some(min), Some(max), RangeLimit::HalfOpen) if max <= min => {
                 bail!(self, "half-open range maximum must be greater than minimum")
             }
             (None, Some(0), RangeLimit::HalfOpen) => {
                 bail!(self, "half-open range `..0` is empty")
             }
             (_, None, RangeLimit::Closed) => bail!(self, "closed range must have an upper bound"),
             _ => {}
         }
         self.expect("}", "expected `}`")?;
         Ok(ExpressionKind::RepeatRange {
             expr: box_kind(kind),
             name,
             min,
             max,
             limit,
         })
     }

     fn parse_suffix(&mut self) -> Result<Option<String>> {
         if !self.take_str(" _") {
             return Ok(None);
         }
         let mut in_backtick = false;
         let start = self.index;
         loop {
             let Some(next) = self.peek() else {
                 bail!(self, "failed to find end of _ suffixed text");
             };
             self.index += 1;
             match next {
                 b'\n' => bail!(self, "failed to find end of _ suffixed text"),
                 b'`' => in_backtick = !in_backtick,
                 b'_' if !in_backtick => {
                     if self
                         .peek()
                         .map(|b| matches!(b, b'\n' | b' '))
                         .unwrap_or(true)
                     {
                         break;
                     }
                 }
                 _ => {}
             }
         }
         Ok(Some(self.input[start..self.index - 1].to_string()))
     }

     /// Parse footnote reference, e.g. `[^id]`.
     fn parse_footnote(&mut self) -> Result<Option<String>> {
         if !self.take_str("[^") {
             return Ok(None);
         }
         let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string();
         if id.is_empty() {
             bail!(self, "expected footnote id");
         }
         self.expect("]", "expected closing `]`")?;
         Ok(Some(id))
     }
 }

 fn box_kind(kind: ExpressionKind) -> Box<Expression> {
     Box::new(Expression {
         kind,
         suffix: None,
         footnote: None,
     })
 }

 /// Helper to translate a byte index to a `(line, line_no, col_no)` (1-based).
 fn translate_position(input: &str, index: usize) -> (&str, usize, usize) {
     if input.is_empty() {
         return ("", 0, 0);
     }
     let index = index.min(input.len());

     let mut line_start = 0;
     let mut line_number = 0;
     for line in input.lines() {
         let line_end = line_start + line.len();
         if index >= line_start && index <= line_end {
             let column_number = index - line_start + 1;
             return (line, line_number + 1, column_number);
         }
         line_start = line_end + 1;
         line_number += 1;
     }
     ("", line_number + 1, 0)
 }

 #[cfg(test)]
 mod tests {
     use crate::parser::{parse_grammar, translate_position};
     use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit};
     use std::path::Path;

     #[test]
     fn test_translate() {
         assert_eq!(translate_position("", 0), ("", 0, 0));
         assert_eq!(translate_position("test", 0), ("test", 1, 1));
         assert_eq!(translate_position("test", 3), ("test", 1, 4));
         assert_eq!(translate_position("test", 4), ("test", 1, 5));
         assert_eq!(translate_position("test\ntest2", 4), ("test", 1, 5));
         assert_eq!(translate_position("test\ntest2", 5), ("test2", 2, 1));
         assert_eq!(translate_position("test\ntest2\n", 11), ("", 3, 0));
     }

     fn parse(input: &str) -> Result<Grammar, String> {
         let mut grammar = Grammar::default();
         parse_grammar(input, &mut grammar, "test", Path::new("test.md"))
             .map_err(|e| e.to_string())?;
         Ok(grammar)
     }

     #[test]
     fn test_cut() {
         let input = "Rule -> A ^ B | C";
         let grammar = parse(input).unwrap();
         grammar.productions.get("Rule").unwrap();
     }

     #[test]
     fn test_cut_captures() {
         let input = "Rule -> A ^ B C | D";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         // The top-level expression is an alternation: (A ^ B C) | D.
         let ExpressionKind::Alt(alts) = &rule.expression.kind else {
             panic!("expected Alt, got {:?}", rule.expression.kind);
         };
         assert_eq!(alts.len(), 2);
         // First alternative is a sequence: A, Cut(Sequence(B, C)).
         let ExpressionKind::Sequence(seq) = &alts[0].kind else {
             panic!("expected Sequence, got {:?}", alts[0].kind);
         };
         assert_eq!(seq.len(), 2);
         assert!(matches!(&seq[0].kind, ExpressionKind::Nt(n) if n == "A"));
         // The cut captures the rest of the sequence (B and C).
         let ExpressionKind::Cut(cut_inner) = &seq[1].kind else {
             panic!("expected Cut, got {:?}", seq[1].kind);
         };
         let ExpressionKind::Sequence(cut_seq) = &cut_inner.kind else {
             panic!("expected Sequence inside Cut, got {:?}", cut_inner.kind);
         };
         assert_eq!(cut_seq.len(), 2);
         assert!(matches!(&cut_seq[0].kind, ExpressionKind::Nt(n) if n == "B"));
         assert!(matches!(&cut_seq[1].kind, ExpressionKind::Nt(n) if n == "C"));
         // Second alternative is just D.
         assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "D"));
     }

     #[test]
     fn test_cut_fail_trailing() {
         let input = "Rule -> A ^";
         let err = parse(input).unwrap_err();
         assert!(err.contains("expected expression after cut operator"));
     }

     /// Extract the `RepeatRange` fields from a single-production
     /// grammar whose rule body is a repeat-range expression.
     fn repeat_range(input: &str) -> (Option<u32>, Option<u32>, RangeLimit) {
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("A").unwrap();
         let ExpressionKind::RepeatRange {
             min, max, limit, ..
         } = rule.expression.kind
         else {
             panic!("expected RepeatRange, got {:?}", rule.expression.kind);
         };
         (min, max, limit)
     }

     // -- Valid ranges -----------------------------------------------

     #[test]
     fn test_range_half_open() {
         let (min, max, limit) = repeat_range("A -> x{2..5}");
         assert_eq!(min, Some(2));
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn test_range_half_open_no_min() {
         let (min, max, limit) = repeat_range("A -> x{..5}");
         assert_eq!(min, None);
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn test_range_half_open_no_max() {
         let (min, max, limit) = repeat_range("A -> x{2..}");
         assert_eq!(min, Some(2));
         assert_eq!(max, None);
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn test_range_half_open_unbounded() {
         let (min, max, limit) = repeat_range("A -> x{..}");
         assert_eq!(min, None);
         assert_eq!(max, None);
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn test_range_closed() {
         let (min, max, limit) = repeat_range("A -> x{2..=5}");
         assert_eq!(min, Some(2));
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::Closed));
     }

     #[test]
     fn test_range_closed_no_min() {
         let (min, max, limit) = repeat_range("A -> x{..=5}");
         assert_eq!(min, None);
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::Closed));
     }

     // -- Invalid ranges ---------------------------------------------

     #[test]
     fn test_range_err_max_less_than_min() {
         let err = parse("A -> x{3..2}").unwrap_err();
         assert!(
             err.contains("malformed"),
             "expected malformed error, got: {err}"
         );
     }

     #[test]
     fn test_range_err_empty_exclusive_equal() {
         let err = parse("A -> x{2..2}").unwrap_err();
         assert!(
             err.contains("half-open range maximum must be greater"),
             "expected empty-exclusive error, got: {err}"
         );
     }

     #[test]
     fn test_range_err_empty_exclusive_zero() {
         let err = parse("A -> x{0..0}").unwrap_err();
         assert!(
             err.contains("half-open range maximum must be greater"),
             "expected empty-exclusive error, got: {err}"
         );
     }

     #[test]
     fn test_range_err_closed_no_upper() {
         let err = parse("A -> x{..=}").unwrap_err();
         assert!(
             err.contains("closed range must have an upper bound"),
             "expected closed-needs-upper error, got: {err}"
         );
     }

     #[test]
     fn test_range_err_closed_no_upper_with_min() {
         let err = parse("A -> x{2..=}").unwrap_err();
         assert!(
             err.contains("closed range must have an upper bound"),
             "expected closed-needs-upper error, got: {err}"
         );
     }

     #[test]
     fn test_range_err_half_open_zero_max() {
         let err = parse("A -> x{..0}").unwrap_err();
         assert!(
             err.contains("half-open range `..0` is empty"),
             "expected half-open-zero error, got: {err}"
         );
     }

     // -- Valid edge cases -------------------------------------------

     #[test]
     fn test_range_closed_exact() {
         // `x{2..=2}` means exactly 2 -- not empty.
         let (min, max, limit) = repeat_range("A -> x{2..=2}");
         assert_eq!(min, Some(2));
         assert_eq!(max, Some(2));
         assert!(matches!(limit, RangeLimit::Closed));
     }

     #[test]
     fn test_range_half_open_zero_to_one() {
         // `x{0..1}` means exactly 0 repetitions (the half-open
         // range contains only 0).
         let (min, max, limit) = repeat_range("A -> x{0..1}");
         assert_eq!(min, Some(0));
         assert_eq!(max, Some(1));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     // --- Negative lookahead tests ---

     #[test]
     fn lookahead_simple_nonterminal() {
         let input = "Rule -> !Foo";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else {
             panic!("expected NegativeLookahead, got {:?}", rule.expression.kind);
         };
         assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "Foo"));
     }

     #[test]
     fn lookahead_terminal() {
         let input = "Rule -> !`'` Foo";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
             panic!("expected Sequence, got {:?}", rule.expression.kind);
         };
         assert_eq!(seq.len(), 2);
         let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
             panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
         };
         assert!(matches!(&inner.kind, ExpressionKind::Terminal(t) if t == "'"));
         assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "Foo"));
     }

     #[test]
     fn lookahead_charset() {
         let input = "Rule -> ![`e` `E`] SUFFIX";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
             panic!("expected Sequence, got {:?}", rule.expression.kind);
         };
         assert_eq!(seq.len(), 2);
         let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
             panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
         };
         let ExpressionKind::Charset(chars) = &inner.kind else {
             panic!("expected Charset inside lookahead, got {:?}", inner.kind);
         };
         assert_eq!(chars.len(), 2);
         assert!(matches!(&chars[0], Characters::Terminal(t) if t == "e"));
         assert!(matches!(&chars[1], Characters::Terminal(t) if t == "E"));
     }

     #[test]
     fn lookahead_grouped() {
         let input = "Rule -> !(`.` | `_` | XID_Start)";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else {
             panic!("expected NegativeLookahead, got {:?}", rule.expression.kind);
         };
         let ExpressionKind::Grouped(grouped) = &inner.kind else {
             panic!("expected Grouped inside lookahead, got {:?}", inner.kind);
         };
         let ExpressionKind::Alt(alts) = &grouped.kind else {
             panic!("expected Alt inside Grouped, got {:?}", grouped.kind);
         };
         assert_eq!(alts.len(), 3);
         assert!(matches!(&alts[0].kind, ExpressionKind::Terminal(t) if t == "."));
         assert!(matches!(&alts[1].kind, ExpressionKind::Terminal(t) if t == "_"));
         assert!(matches!(&alts[2].kind, ExpressionKind::Nt(n) if n == "XID_Start"));
     }

     #[test]
     fn lookahead_in_sequence_middle() {
         let input = "Rule -> A !B C";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
             panic!("expected Sequence, got {:?}", rule.expression.kind);
         };
         assert_eq!(seq.len(), 3);
         assert!(matches!(&seq[0].kind, ExpressionKind::Nt(n) if n == "A"));
         let ExpressionKind::NegativeLookahead(inner) = &seq[1].kind else {
             panic!("expected NegativeLookahead, got {:?}", seq[1].kind);
         };
         assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "B"));
         assert!(matches!(&seq[2].kind, ExpressionKind::Nt(n) if n == "C"));
     }

     #[test]
     fn lookahead_in_repetition() {
         let input = "Rule -> (!A B)*";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Repeat(rep) = &rule.expression.kind else {
             panic!("expected Repeat, got {:?}", rule.expression.kind);
         };
         let ExpressionKind::Grouped(grouped) = &rep.kind else {
             panic!("expected Grouped inside Repeat, got {:?}", rep.kind);
         };
         let ExpressionKind::Sequence(seq) = &grouped.kind else {
             panic!("expected Sequence inside Grouped, got {:?}", grouped.kind);
         };
         assert_eq!(seq.len(), 2);
         assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_)));
         assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B"));
     }

     #[test]
     fn lookahead_in_alternation() {
         let input = "Rule -> !A B | C";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Alt(alts) = &rule.expression.kind else {
             panic!("expected Alt, got {:?}", rule.expression.kind);
         };
         assert_eq!(alts.len(), 2);
         let ExpressionKind::Sequence(seq) = &alts[0].kind else {
             panic!("expected Sequence, got {:?}", alts[0].kind);
         };
         assert_eq!(seq.len(), 2);
         assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_)));
         assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B"));
         assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "C"));
     }

     #[test]
     fn lookahead_fail_trailing() {
         let input = "Rule -> !";
         let err = parse(input).unwrap_err();
         assert!(err.contains("expected expression after !"));
     }

     // --- Unicode tests ---

     #[test]
     fn unicode_4_digit() {
         let input = "Rule -> U+0009";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else {
             panic!("expected Unicode, got {:?}", rule.expression.kind);
         };
         assert_eq!(*ch, '\t');
         assert_eq!(s, "0009");
     }

     #[test]
     fn unicode_5_digit() {
         let input = "Rule -> U+E0000";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else {
             panic!("expected Unicode, got {:?}", rule.expression.kind);
         };
         assert_eq!(*ch, '\u{E0000}');
         assert_eq!(s, "E0000");
     }

     #[test]
     fn unicode_6_digit() {
         let input = "Rule -> U+10FFFF";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else {
             panic!("expected Unicode, got {:?}", rule.expression.kind);
         };
         assert_eq!(*ch, '\u{10FFFF}');
         assert_eq!(s, "10FFFF");
     }

     #[test]
     fn unicode_in_alternation() {
         let input = "Rule -> U+0009 | U+000A";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Alt(alts) = &rule.expression.kind else {
             panic!("expected Alt, got {:?}", rule.expression.kind);
         };
         assert_eq!(alts.len(), 2);
         assert!(matches!(
             &alts[0].kind,
             ExpressionKind::Unicode((ch, _)) if *ch == '\t'
         ));
         assert!(matches!(
             &alts[1].kind,
             ExpressionKind::Unicode((ch, _)) if *ch == '\n'
         ));
     }

     // --- Character / charset range tests ---

     #[test]
     fn charset_unicode_range() {
         let input = "Rule -> [U+0000-U+007F]";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Charset(chars) = &rule.expression.kind else {
             panic!("expected Charset, got {:?}", rule.expression.kind);
         };
         assert_eq!(chars.len(), 1);
         let Characters::Range(a, b) = &chars[0] else {
             panic!("expected Range, got {:?}", chars[0]);
         };
         assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0'));
         assert!(matches!(
             b,
             Character::Unicode((ch, _)) if *ch == '\u{7F}'
         ));
     }

     #[test]
     fn charset_char_range() {
         let input = "Rule -> [`a`-`z`]";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Charset(chars) = &rule.expression.kind else {
             panic!("expected Charset, got {:?}", rule.expression.kind);
         };
         assert_eq!(chars.len(), 1);
         let Characters::Range(a, b) = &chars[0] else {
             panic!("expected Range, got {:?}", chars[0]);
         };
         assert!(matches!(a, Character::Char(ch) if *ch == 'a'));
         assert!(matches!(b, Character::Char(ch) if *ch == 'z'));
     }

     #[test]
     fn charset_mixed_range() {
         let input = "Rule -> [`a`-U+007A]";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Charset(chars) = &rule.expression.kind else {
             panic!("expected Charset, got {:?}", rule.expression.kind);
         };
         assert_eq!(chars.len(), 1);
         let Characters::Range(a, b) = &chars[0] else {
             panic!("expected Range, got {:?}", chars[0]);
         };
         assert!(matches!(a, Character::Char(ch) if *ch == 'a'));
         assert!(matches!(
             b,
             Character::Unicode((ch, _)) if *ch == 'z'
         ));
     }

     #[test]
     fn charset_multiple_unicode_ranges() {
         let input = "Rule -> [U+0000-U+D7FF U+E000-U+10FFFF]";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Charset(chars) = &rule.expression.kind else {
             panic!("expected Charset, got {:?}", rule.expression.kind);
         };
         assert_eq!(chars.len(), 2);
         let Characters::Range(a1, b1) = &chars[0] else {
             panic!("expected Range, got {:?}", chars[0]);
         };
         assert!(matches!(a1, Character::Unicode((ch, _)) if *ch == '\0'));
         assert!(matches!(b1, Character::Unicode((ch, _)) if *ch == '\u{D7FF}'));
         let Characters::Range(a2, b2) = &chars[1] else {
             panic!("expected Range, got {:?}", chars[1]);
         };
         assert!(matches!(a2, Character::Unicode((ch, _)) if *ch == '\u{E000}'));
         assert!(matches!(b2, Character::Unicode((ch, _)) if *ch == '\u{10FFFF}'));
     }

     #[test]
     fn charset_terminals_and_named() {
         let input = "Rule -> [`a` `b` Foo]";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Charset(chars) = &rule.expression.kind else {
             panic!("expected Charset, got {:?}", rule.expression.kind);
         };
         assert_eq!(chars.len(), 3);
         assert!(matches!(&chars[0], Characters::Terminal(t) if t == "a"));
         assert!(matches!(&chars[1], Characters::Terminal(t) if t == "b"));
         assert!(matches!(&chars[2], Characters::Named(n) if n == "Foo"));
     }

     // --- Negative lookahead combined with charset ---

     #[test]
     fn lookahead_charset_with_named_and_terminals() {
         // Pattern from tokens.md: ![`'` `\` LF CR TAB] ASCII
         let input = "Rule -> ![`x` `y` LF] Foo";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
             panic!("expected Sequence, got {:?}", rule.expression.kind);
         };
         assert_eq!(seq.len(), 2);
         let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
             panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
         };
         let ExpressionKind::Charset(chars) = &inner.kind else {
             panic!("expected Charset, got {:?}", inner.kind);
         };
         assert_eq!(chars.len(), 3);
         assert!(matches!(&chars[0], Characters::Terminal(t) if t == "x"));
         assert!(matches!(&chars[1], Characters::Terminal(t) if t == "y"));
         assert!(matches!(&chars[2], Characters::Named(n) if n == "LF"));
     }

     // --- Negative lookahead combined with Unicode ---

     #[test]
     fn lookahead_charset_with_unicode_range() {
         let input = "Rule -> ![U+0000-U+007F] Foo";
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("Rule").unwrap();
         let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
             panic!("expected Sequence, got {:?}", rule.expression.kind);
         };
         let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
             panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
         };
         let ExpressionKind::Charset(chars) = &inner.kind else {
             panic!("expected Charset, got {:?}", inner.kind);
         };
         assert_eq!(chars.len(), 1);
         let Characters::Range(a, b) = &chars[0] else {
             panic!("expected Range, got {:?}", chars[0]);
         };
         assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0'));
         assert!(matches!(
             b,
             Character::Unicode((ch, _)) if *ch == '\u{7F}'
         ));
     }

     // --- `parse_name` digit rejection tests ---

     #[test]
     fn parse_name_rejects_leading_digits() {
         // `{123}` should not parse as a named reference.  The
         // digits don't form a valid name and there is no `..`
         // range operator, so the parser should reject this.
         let err = parse("A -> x{123}").unwrap_err();
         assert!(
             err.contains("expected `..`"),
             "expected range-syntax error for {{123}}, got: {err}"
         );
     }

     #[test]
     fn parse_name_allows_letter_then_digit() {
         // `n1` is a valid name (starts with a letter).
         let grammar = parse("A -> x{n1:2..5}").unwrap();
         let rule = grammar.productions.get("A").unwrap();
         let ExpressionKind::RepeatRange {
             name,
             min,
             max,
             limit,
             ..
         } = &rule.expression.kind
         else {
             panic!("expected RepeatRange, got {:?}", rule.expression.kind);
         };
         assert_eq!(name.as_deref(), Some("n1"));
         assert_eq!(*min, Some(2));
         assert_eq!(*max, Some(5));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn parse_name_allows_underscore_start() {
         // `_n` is a valid name (starts with underscore).
         let grammar = parse("A -> x{_n:2..5}").unwrap();
         let rule = grammar.productions.get("A").unwrap();
         let ExpressionKind::RepeatRange {
             name,
             min,
             max,
             limit,
             ..
         } = &rule.expression.kind
         else {
             panic!("expected RepeatRange, got {:?}", rule.expression.kind);
         };
         assert_eq!(name.as_deref(), Some("_n"));
         assert_eq!(*min, Some(2));
         assert_eq!(*max, Some(5));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     // --- Named repeat range tests ---

     /// Extract full `RepeatRange` fields including the name.
     fn named_repeat_range(input: &str) -> (Option<String>, Option<u32>, Option<u32>, RangeLimit) {
         let grammar = parse(input).unwrap();
         let rule = grammar.productions.get("A").unwrap();
         let ExpressionKind::RepeatRange {
             name,
             min,
             max,
             limit,
             ..
         } = &rule.expression.kind
         else {
             panic!("expected RepeatRange, got {:?}", rule.expression.kind);
         };
         (name.clone(), *min, *max, *limit)
     }

     #[test]
     fn named_range_closed() {
         let (name, min, max, limit) = named_repeat_range("A -> x{n:1..=255}");
         assert_eq!(name.as_deref(), Some("n"));
         assert_eq!(min, Some(1));
         assert_eq!(max, Some(255));
         assert!(matches!(limit, RangeLimit::Closed));
     }

     #[test]
     fn named_range_half_open() {
         let (name, min, max, limit) = named_repeat_range("A -> x{n:2..5}");
         assert_eq!(name.as_deref(), Some("n"));
         assert_eq!(min, Some(2));
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn named_range_omitted_min() {
         let (name, min, max, limit) = named_repeat_range("A -> x{n:..=5}");
         assert_eq!(name.as_deref(), Some("n"));
         assert_eq!(min, None);
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::Closed));
     }

     #[test]
     fn named_range_omitted_max() {
         let (name, min, max, limit) = named_repeat_range("A -> x{n:2..}");
         assert_eq!(name.as_deref(), Some("n"));
         assert_eq!(min, Some(2));
         assert_eq!(max, None);
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn named_reference() {
         // `{n}` without a colon or range produces a
         // RepeatRangeNamed variant.
         let grammar = parse("A -> x{n}").unwrap();
         let rule = grammar.productions.get("A").unwrap();
         let ExpressionKind::RepeatRangeNamed(_, name) = &rule.expression.kind else {
             panic!("expected RepeatRangeNamed, got {:?}", rule.expression.kind);
         };
         assert_eq!(name, "n");
     }

     #[test]
     fn named_binding_and_reference_in_sequence() {
         // A production with a named binding and a named reference.
         let grammar = parse("A -> x{n:1..=255} y{n}").unwrap();
         let rule = grammar.productions.get("A").unwrap();
         let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
             panic!("expected Sequence, got {:?}", rule.expression.kind);
         };
         assert_eq!(seq.len(), 2);

         // First element: x{n:1..=255}
         let ExpressionKind::RepeatRange {
             name,
             min,
             max,
             limit,
             ..
         } = &seq[0].kind
         else {
             panic!("expected RepeatRange, got {:?}", seq[0].kind);
         };
         assert_eq!(name.as_deref(), Some("n"));
         assert_eq!(*min, Some(1));
         assert_eq!(*max, Some(255));
         assert!(matches!(limit, RangeLimit::Closed));

         // Second element: y{n}
         let ExpressionKind::RepeatRangeNamed(_, ref_name) = &seq[1].kind else {
             panic!("expected RepeatRangeNamed, got {:?}", seq[1].kind);
         };
         assert_eq!(ref_name, "n");
     }

     #[test]
     fn named_range_backtrack_to_plain_range() {
         // When parse_name() succeeds but the next byte is
         // neither `:` nor `}`, the parser backtracks and
         // falls through to plain range parsing.  `{2..5}` is
         // such a case after the parse_name fix (digits are
         // rejected), but let's test a scenario where a name is
         // parsed and then backtracked.
         //
         // There is no single-character token after a name that
         // triggers backtrack in valid grammar (the match arms
         // cover `:` and `}`), but the fallback resets the index
         // and tries plain range parsing.  We verify that
         // `{2..5}` parses correctly as a plain range even
         // though it starts with a digit.
         let (min, max, limit) = repeat_range("A -> x{2..5}");
         assert_eq!(min, Some(2));
         assert_eq!(max, Some(5));
         assert!(matches!(limit, RangeLimit::HalfOpen));
     }

     #[test]
     fn named_range_err_colon_missing_dots() {
         // `{n:}` -- name followed by colon, then no `..`.
         let err = parse("A -> x{n:}").unwrap_err();
         assert!(
             err.contains("expected `..`"),
             "expected `..` error for {{n:}}, got: {err}"
         );
     }

     #[test]
     fn named_range_err_empty_braces() {
         // `{}` -- empty braces contain no name and no range.
         let err = parse("A -> x{}").unwrap_err();
         assert!(
             err.contains("expected `..`"),
             "expected `..` error for {{}}, got: {err}"
         );
     }
 }