blob: a5a91a5cf0e4b45452ff2e5f862c6bdcf01aec26 [file] [edit]
//! A parser of the ENBF-like grammar.
use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit};
use std::fmt;
use std::fmt::Display;
use std::path::Path;
struct Parser<'a> {
input: &'a str,
index: usize,
}
pub struct Error {
message: String,
line: String,
lineno: usize,
col: usize,
}
impl Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
let lineno = format!("{}", self.lineno);
let space = " ".repeat(lineno.len() + 1);
let col = " ".repeat(self.col);
let line = &self.line;
let message = &self.message;
write!(f, "\n{space}|\n{lineno} | {line}\n{space}|{col}^ {message}")
}
}
macro_rules! bail {
($parser:expr, $($arg:tt)*) => {{
let mut msg = String::new();
fmt::write(&mut msg, format_args!($($arg)*)).unwrap();
return Err($parser.error(msg));
}};
}
type Result<T> = std::result::Result<T, Error>;
/// Whether a character can start a grammar rule name.
///
/// This includes ASCII alphabetic characters, underscores, and
/// non-ASCII Unicode symbols such as `⊥` (bottom) and `⊤` (top).
/// ASCII symbols are excluded because characters such as `+`, `|`,
/// `~`, and `^` are grammar syntax.
fn is_name_start(ch: char) -> bool {
ch.is_alphabetic() || ch == '_' || !ch.is_ascii()
}
/// Whether a character can continue a grammar rule name.
///
/// Accepts alphanumeric characters, underscores, and non-ASCII
/// characters.
fn is_name_continue(ch: char) -> bool {
ch.is_alphanumeric() || ch == '_' || !ch.is_ascii()
}
pub fn parse_grammar(
input: &str,
grammar: &mut Grammar,
category: &str,
path: &Path,
) -> Result<()> {
let mut parser = Parser { input, index: 0 };
loop {
let p = parser.parse_production(category, path)?;
grammar.name_order.push(p.name.clone());
if let Some(dupe) = grammar.productions.insert(p.name.clone(), p) {
bail!(parser, "duplicate production {} in grammar", dupe.name);
}
parser.take_while(&|ch| ch == '\n');
if parser.eof() {
break;
}
}
Ok(())
}
impl Parser<'_> {
fn take_while(&mut self, f: &dyn Fn(char) -> bool) -> &str {
let mut upper = 0;
let i = self.index;
let mut ci = self.input[i..].chars();
while let Some(ch) = ci.next() {
if !f(ch) {
break;
}
upper += ch.len_utf8();
}
self.index += upper;
&self.input[i..i + upper]
}
/// Returns whether or not the given string is next, and advances the head if it is.
fn take_str(&mut self, s: &str) -> bool {
if self.input[self.index..].starts_with(s) {
self.index += s.len();
true
} else {
false
}
}
/// Returns the next byte, or None if eof.
fn peek(&mut self) -> Option<u8> {
if self.index >= self.input.len() {
None
} else {
Some(self.input.as_bytes()[self.index])
}
}
fn eof(&mut self) -> bool {
self.index >= self.input.len()
}
/// Expects the next input to be the given string, and advances the head.
fn expect(&mut self, s: &str, err: &str) -> Result<()> {
if !self.input[self.index..].starts_with(s) {
bail!(self, "{err}");
};
self.index += s.len();
Ok(())
}
fn error(&mut self, message: String) -> Error {
let (line, lineno, col) = translate_position(self.input, self.index);
Error {
message,
line: line.to_string(),
lineno,
col,
}
}
/// Advances zero or more spaces.
fn space0(&mut self) -> &str {
self.take_while(&|ch| ch == ' ')
}
fn parse_production(&mut self, category: &str, path: &Path) -> Result<Production> {
let mut comments = Vec::new();
while let Ok(comment) = self.parse_comment() {
self.expect("\n", "expected newline")?;
comments.push(Expression::new_kind(comment));
comments.push(Expression::new_kind(ExpressionKind::Break(0)));
}
let is_root = self.parse_is_root();
self.space0();
let name = self
.parse_name()
.ok_or_else(|| self.error("expected production name".to_string()))?;
self.expect(" ->", "expected -> arrow")?;
let Some(expression) = self.parse_expression()? else {
bail!(self, "expected an expression");
};
Ok(Production {
name,
comments,
category: category.to_string(),
expression,
path: path.to_owned(),
is_root,
})
}
fn parse_is_root(&mut self) -> bool {
self.take_str("@root")
}
fn parse_name(&mut self) -> Option<String> {
let first = self.input[self.index..].chars().next()?;
if !is_name_start(first) {
return None;
}
Some(self.take_while(&|c| is_name_continue(c)).to_string())
}
fn parse_expression(&mut self) -> Result<Option<Expression>> {
let mut es = Vec::new();
loop {
let Some(e) = self.parse_seq()? else { break };
es.push(e);
_ = self.space0();
if !self.take_str("|") {
break;
}
}
match es.len() {
0 => Ok(None),
1 => Ok(Some(es.pop().unwrap())),
_ => Ok(Some(Expression::new_kind(ExpressionKind::Alt(es)))),
}
}
fn parse_seq(&mut self) -> Result<Option<Expression>> {
let mut es = Vec::new();
loop {
self.space0();
if self.peek() == Some(b'^') {
let cut = self.parse_cut()?;
es.push(cut);
break;
}
let Some(e) = self.parse_expr1()? else {
break;
};
es.push(e);
}
match es.len() {
0 => Ok(None),
1 => Ok(Some(es.pop().unwrap())),
_ => Ok(Some(Expression {
kind: ExpressionKind::Sequence(es),
suffix: None,
footnote: None,
})),
}
}
/// Parse cut (`^`) operator.
fn parse_cut(&mut self) -> Result<Expression> {
self.expect("^", "expected `^`")?;
let Some(rhs) = self.parse_seq()? else {
bail!(self, "expected expression after cut operator");
};
Ok(Expression {
kind: ExpressionKind::Cut(Box::new(rhs)),
suffix: None,
footnote: None,
})
}
fn parse_expr1(&mut self) -> Result<Option<Expression>> {
let Some(next) = self.peek() else {
return Ok(None);
};
let kind = if self.take_str("U+") {
ExpressionKind::Unicode(self.parse_unicode()?)
} else if self.input[self.index..]
.chars()
.next()
.map(|ch| is_name_start(ch))
.unwrap_or(false)
{
self.parse_nonterminal()
.expect("first char already checked")
} else if self.take_str("\n") {
if self.eof() || self.take_str("\n") {
return Ok(None);
}
let space = self.take_while(&|ch| ch == ' ');
if space.len() == 0 {
bail!(self, "expected indentation on next line");
}
ExpressionKind::Break(space.len())
} else if next == b'/' {
self.parse_comment()?
} else if next == b'`' {
self.parse_terminal()?
} else if next == b'[' {
self.parse_charset()?
} else if next == b'<' {
self.parse_prose()?
} else if next == b'(' {
self.parse_grouped()?
} else if next == b'~' {
self.parse_neg_expression()?
} else if next == b'!' {
self.parse_negative_lookahead()?
} else {
return Ok(None);
};
let kind = match self.peek() {
Some(b'?') => self.parse_optional(kind)?,
Some(b'*') => self.parse_repeat(kind)?,
Some(b'+') => self.parse_repeat_plus(kind)?,
Some(b'{') => self.parse_repeat_range(kind)?,
_ => kind,
};
let suffix = self.parse_suffix()?;
let footnote = self.parse_footnote()?;
Ok(Some(Expression {
kind,
suffix,
footnote,
}))
}
fn parse_nonterminal(&mut self) -> Option<ExpressionKind> {
let nt = self.parse_name()?;
Some(ExpressionKind::Nt(nt))
}
/// Parse terminal within backticks.
fn parse_terminal(&mut self) -> Result<ExpressionKind> {
Ok(ExpressionKind::Terminal(self.parse_terminal_str()?))
}
/// Parse string within backticks.
fn parse_terminal_str(&mut self) -> Result<String> {
self.expect("`", "expected opening backtick")?;
let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string();
if term.is_empty() {
bail!(self, "expected terminal");
}
self.expect("`", "expected closing backtick")?;
Ok(term)
}
/// Parse e.g. `// Single line comment.`.
fn parse_comment(&mut self) -> Result<ExpressionKind> {
self.expect("//", "expected `//`")?;
let text = self.take_while(&|x| x != '\n').to_string();
Ok(ExpressionKind::Comment(text))
}
fn parse_charset(&mut self) -> Result<ExpressionKind> {
self.expect("[", "expected opening [")?;
let mut characters = Vec::new();
loop {
self.space0();
let Some(ch) = self.parse_characters()? else {
break;
};
characters.push(ch);
}
if characters.is_empty() {
bail!(self, "expected at least one character in character group");
}
self.space0();
self.expect("]", "expected closing ]")?;
Ok(ExpressionKind::Charset(characters))
}
/// Parse an element of a character class, e.g.
/// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
fn parse_characters(&mut self) -> Result<Option<Characters>> {
if let Some(a) = self.parse_character()? {
if self.take_str("-") {
let Some(b) = self.parse_character()? else {
bail!(self, "expected character in range");
};
Ok(Some(Characters::Range(a, b)))
} else {
//~^ Parse terminal in backticks.
let t = match a {
Character::Char(ch) => ch.to_string(),
Character::Unicode(_) => bail!(self, "unicode not supported"),
};
Ok(Some(Characters::Terminal(t)))
}
} else if let Some(name) = self.parse_name() {
//~^ Parse nonterminal identifier.
Ok(Some(Characters::Named(name)))
} else {
Ok(None)
}
}
fn parse_character(&mut self) -> Result<Option<Character>> {
if let Some(b'`') = self.peek() {
let recov = self.index;
let term = self.parse_terminal_str()?;
if term.len() > 1 {
self.index = recov + 1;
bail!(self, "invalid start terminal in range");
}
let ch = term.chars().next().unwrap();
Ok(Some(Character::Char(ch)))
} else if self.take_str("U+") {
Ok(Some(Character::Unicode(self.parse_unicode()?)))
} else {
Ok(None)
}
}
/// Parse e.g. `<prose text>`.
fn parse_prose(&mut self) -> Result<ExpressionKind> {
self.expect("<", "expected opening `<`")?;
let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string();
if text.is_empty() {
bail!(self, "expected prose text");
}
self.expect(">", "expected closing `>`")?;
Ok(ExpressionKind::Prose(text))
}
fn parse_grouped(&mut self) -> Result<ExpressionKind> {
self.expect("(", "expected opening `(`")?;
self.space0();
let Some(e) = self.parse_expression()? else {
bail!(self, "expected expression in parenthesized group");
};
self.space0();
self.expect(")", "expected closing `)`")?;
Ok(ExpressionKind::Grouped(Box::new(e)))
}
fn parse_neg_expression(&mut self) -> Result<ExpressionKind> {
self.expect("~", "expected ~")?;
let Some(next) = self.peek() else {
bail!(self, "expected expression after ~");
};
let kind = match next {
b'[' => self.parse_charset()?,
b'`' => self.parse_terminal()?,
_ => self.parse_nonterminal().ok_or_else(|| {
self.error("expected a charset, terminal, or name after ~ negation".to_string())
})?,
};
Ok(ExpressionKind::NegExpression(box_kind(kind)))
}
fn parse_negative_lookahead(&mut self) -> Result<ExpressionKind> {
self.expect("!", "expected !")?;
self.space0();
let Some(e) = self.parse_expr1()? else {
bail!(self, "expected expression after !");
};
Ok(ExpressionKind::NegativeLookahead(Box::new(e)))
}
/// Parse e.g. `F00F` after `U+`.
fn parse_unicode(&mut self) -> Result<(char, String)> {
let mut xs = Vec::with_capacity(6);
let mut push_next = || {
match self.peek() {
Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
xs.push(x);
self.index += 1;
}
_ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"),
}
Ok(())
};
for _ in 0..4 {
push_next()?;
}
for _ in 0..2 {
if push_next().is_err() {
break;
}
}
let s = String::from_utf8(xs).unwrap();
let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap();
Ok((ch, s))
}
/// Parse `?` after expression.
fn parse_optional(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
self.expect("?", "expected `?`")?;
Ok(ExpressionKind::Optional(box_kind(kind)))
}
/// Parse `*` after expression.
fn parse_repeat(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
self.expect("*", "expected `*`")?;
Ok(ExpressionKind::Repeat(box_kind(kind)))
}
/// Parse `+` after expression.
fn parse_repeat_plus(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
self.expect("+", "expected `+`")?;
Ok(ExpressionKind::RepeatPlus(box_kind(kind)))
}
/// Parse `{a..b}` | `{a..=b}` | `{name:a..=b}` | `{name}` after expression.
//
// `name:` before the range is a named binding. `{name}` refers to that binding.
fn parse_repeat_range(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
self.expect("{", "expected `{`")?;
let start = self.index;
let name = match (self.parse_name(), self.peek()) {
(Some(name), Some(b':')) => {
self.index += 1;
Some(name)
}
(Some(name), Some(b'}')) => {
self.index += 1;
return Ok(ExpressionKind::RepeatRangeNamed(box_kind(kind), name));
}
_ => {
self.index = start;
None
}
};
let min = self.take_while(&|x| x.is_ascii_digit());
let Ok(min) = (!min.is_empty()).then(|| min.parse::<u32>()).transpose() else {
bail!(self, "malformed range start");
};
self.expect("..", "expected `..` or `..=`")?;
let limit = if self.take_str("=") {
RangeLimit::Closed
} else {
RangeLimit::HalfOpen
};
let max = self.take_while(&|x| x.is_ascii_digit());
let Ok(max) = (!max.is_empty()).then(|| max.parse::<u32>()).transpose() else {
bail!(self, "malformed range end");
};
match (min, max, limit) {
(Some(min), Some(max), _) if max < min => {
bail!(self, "range {min}{limit}{max} is malformed")
}
(Some(min), Some(max), RangeLimit::HalfOpen) if max <= min => {
bail!(self, "half-open range maximum must be greater than minimum")
}
(None, Some(0), RangeLimit::HalfOpen) => {
bail!(self, "half-open range `..0` is empty")
}
(_, None, RangeLimit::Closed) => bail!(self, "closed range must have an upper bound"),
_ => {}
}
self.expect("}", "expected `}`")?;
Ok(ExpressionKind::RepeatRange {
expr: box_kind(kind),
name,
min,
max,
limit,
})
}
fn parse_suffix(&mut self) -> Result<Option<String>> {
if !self.take_str(" _") {
return Ok(None);
}
let mut in_backtick = false;
let start = self.index;
loop {
let Some(next) = self.peek() else {
bail!(self, "failed to find end of _ suffixed text");
};
self.index += 1;
match next {
b'\n' => bail!(self, "failed to find end of _ suffixed text"),
b'`' => in_backtick = !in_backtick,
b'_' if !in_backtick => {
if self
.peek()
.map(|b| matches!(b, b'\n' | b' '))
.unwrap_or(true)
{
break;
}
}
_ => {}
}
}
Ok(Some(self.input[start..self.index - 1].to_string()))
}
/// Parse footnote reference, e.g. `[^id]`.
fn parse_footnote(&mut self) -> Result<Option<String>> {
if !self.take_str("[^") {
return Ok(None);
}
let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string();
if id.is_empty() {
bail!(self, "expected footnote id");
}
self.expect("]", "expected closing `]`")?;
Ok(Some(id))
}
}
fn box_kind(kind: ExpressionKind) -> Box<Expression> {
Box::new(Expression {
kind,
suffix: None,
footnote: None,
})
}
/// Helper to translate a byte index to a `(line, line_no, col_no)` (1-based).
fn translate_position(input: &str, index: usize) -> (&str, usize, usize) {
if input.is_empty() {
return ("", 0, 0);
}
let index = index.min(input.len());
let mut line_start = 0;
let mut line_number = 0;
for line in input.lines() {
let line_end = line_start + line.len();
if index >= line_start && index <= line_end {
let column_number = index - line_start + 1;
return (line, line_number + 1, column_number);
}
line_start = line_end + 1;
line_number += 1;
}
("", line_number + 1, 0)
}
#[cfg(test)]
mod tests {
use crate::parser::{parse_grammar, translate_position};
use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit};
use std::path::Path;
#[test]
fn test_translate() {
assert_eq!(translate_position("", 0), ("", 0, 0));
assert_eq!(translate_position("test", 0), ("test", 1, 1));
assert_eq!(translate_position("test", 3), ("test", 1, 4));
assert_eq!(translate_position("test", 4), ("test", 1, 5));
assert_eq!(translate_position("test\ntest2", 4), ("test", 1, 5));
assert_eq!(translate_position("test\ntest2", 5), ("test2", 2, 1));
assert_eq!(translate_position("test\ntest2\n", 11), ("", 3, 0));
}
fn parse(input: &str) -> Result<Grammar, String> {
let mut grammar = Grammar::default();
parse_grammar(input, &mut grammar, "test", Path::new("test.md"))
.map_err(|e| e.to_string())?;
Ok(grammar)
}
#[test]
fn test_cut() {
let input = "Rule -> A ^ B | C";
let grammar = parse(input).unwrap();
grammar.productions.get("Rule").unwrap();
}
#[test]
fn test_cut_captures() {
let input = "Rule -> A ^ B C | D";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
// The top-level expression is an alternation: (A ^ B C) | D.
let ExpressionKind::Alt(alts) = &rule.expression.kind else {
panic!("expected Alt, got {:?}", rule.expression.kind);
};
assert_eq!(alts.len(), 2);
// First alternative is a sequence: A, Cut(Sequence(B, C)).
let ExpressionKind::Sequence(seq) = &alts[0].kind else {
panic!("expected Sequence, got {:?}", alts[0].kind);
};
assert_eq!(seq.len(), 2);
assert!(matches!(&seq[0].kind, ExpressionKind::Nt(n) if n == "A"));
// The cut captures the rest of the sequence (B and C).
let ExpressionKind::Cut(cut_inner) = &seq[1].kind else {
panic!("expected Cut, got {:?}", seq[1].kind);
};
let ExpressionKind::Sequence(cut_seq) = &cut_inner.kind else {
panic!("expected Sequence inside Cut, got {:?}", cut_inner.kind);
};
assert_eq!(cut_seq.len(), 2);
assert!(matches!(&cut_seq[0].kind, ExpressionKind::Nt(n) if n == "B"));
assert!(matches!(&cut_seq[1].kind, ExpressionKind::Nt(n) if n == "C"));
// Second alternative is just D.
assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "D"));
}
#[test]
fn test_cut_fail_trailing() {
let input = "Rule -> A ^";
let err = parse(input).unwrap_err();
assert!(err.contains("expected expression after cut operator"));
}
/// Extract the `RepeatRange` fields from a single-production
/// grammar whose rule body is a repeat-range expression.
fn repeat_range(input: &str) -> (Option<u32>, Option<u32>, RangeLimit) {
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("A").unwrap();
let ExpressionKind::RepeatRange {
min, max, limit, ..
} = rule.expression.kind
else {
panic!("expected RepeatRange, got {:?}", rule.expression.kind);
};
(min, max, limit)
}
// -- Valid ranges -----------------------------------------------
#[test]
fn test_range_half_open() {
let (min, max, limit) = repeat_range("A -> x{2..5}");
assert_eq!(min, Some(2));
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn test_range_half_open_no_min() {
let (min, max, limit) = repeat_range("A -> x{..5}");
assert_eq!(min, None);
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn test_range_half_open_no_max() {
let (min, max, limit) = repeat_range("A -> x{2..}");
assert_eq!(min, Some(2));
assert_eq!(max, None);
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn test_range_half_open_unbounded() {
let (min, max, limit) = repeat_range("A -> x{..}");
assert_eq!(min, None);
assert_eq!(max, None);
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn test_range_closed() {
let (min, max, limit) = repeat_range("A -> x{2..=5}");
assert_eq!(min, Some(2));
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::Closed));
}
#[test]
fn test_range_closed_no_min() {
let (min, max, limit) = repeat_range("A -> x{..=5}");
assert_eq!(min, None);
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::Closed));
}
// -- Invalid ranges ---------------------------------------------
#[test]
fn test_range_err_max_less_than_min() {
let err = parse("A -> x{3..2}").unwrap_err();
assert!(
err.contains("malformed"),
"expected malformed error, got: {err}"
);
}
#[test]
fn test_range_err_empty_exclusive_equal() {
let err = parse("A -> x{2..2}").unwrap_err();
assert!(
err.contains("half-open range maximum must be greater"),
"expected empty-exclusive error, got: {err}"
);
}
#[test]
fn test_range_err_empty_exclusive_zero() {
let err = parse("A -> x{0..0}").unwrap_err();
assert!(
err.contains("half-open range maximum must be greater"),
"expected empty-exclusive error, got: {err}"
);
}
#[test]
fn test_range_err_closed_no_upper() {
let err = parse("A -> x{..=}").unwrap_err();
assert!(
err.contains("closed range must have an upper bound"),
"expected closed-needs-upper error, got: {err}"
);
}
#[test]
fn test_range_err_closed_no_upper_with_min() {
let err = parse("A -> x{2..=}").unwrap_err();
assert!(
err.contains("closed range must have an upper bound"),
"expected closed-needs-upper error, got: {err}"
);
}
#[test]
fn test_range_err_half_open_zero_max() {
let err = parse("A -> x{..0}").unwrap_err();
assert!(
err.contains("half-open range `..0` is empty"),
"expected half-open-zero error, got: {err}"
);
}
// -- Valid edge cases -------------------------------------------
#[test]
fn test_range_closed_exact() {
// `x{2..=2}` means exactly 2 -- not empty.
let (min, max, limit) = repeat_range("A -> x{2..=2}");
assert_eq!(min, Some(2));
assert_eq!(max, Some(2));
assert!(matches!(limit, RangeLimit::Closed));
}
#[test]
fn test_range_half_open_zero_to_one() {
// `x{0..1}` means exactly 0 repetitions (the half-open
// range contains only 0).
let (min, max, limit) = repeat_range("A -> x{0..1}");
assert_eq!(min, Some(0));
assert_eq!(max, Some(1));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
// --- Negative lookahead tests ---
#[test]
fn lookahead_simple_nonterminal() {
let input = "Rule -> !Foo";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else {
panic!("expected NegativeLookahead, got {:?}", rule.expression.kind);
};
assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "Foo"));
}
#[test]
fn lookahead_terminal() {
let input = "Rule -> !`'` Foo";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
panic!("expected Sequence, got {:?}", rule.expression.kind);
};
assert_eq!(seq.len(), 2);
let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
};
assert!(matches!(&inner.kind, ExpressionKind::Terminal(t) if t == "'"));
assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "Foo"));
}
#[test]
fn lookahead_charset() {
let input = "Rule -> ![`e` `E`] SUFFIX";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
panic!("expected Sequence, got {:?}", rule.expression.kind);
};
assert_eq!(seq.len(), 2);
let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
};
let ExpressionKind::Charset(chars) = &inner.kind else {
panic!("expected Charset inside lookahead, got {:?}", inner.kind);
};
assert_eq!(chars.len(), 2);
assert!(matches!(&chars[0], Characters::Terminal(t) if t == "e"));
assert!(matches!(&chars[1], Characters::Terminal(t) if t == "E"));
}
#[test]
fn lookahead_grouped() {
let input = "Rule -> !(`.` | `_` | XID_Start)";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else {
panic!("expected NegativeLookahead, got {:?}", rule.expression.kind);
};
let ExpressionKind::Grouped(grouped) = &inner.kind else {
panic!("expected Grouped inside lookahead, got {:?}", inner.kind);
};
let ExpressionKind::Alt(alts) = &grouped.kind else {
panic!("expected Alt inside Grouped, got {:?}", grouped.kind);
};
assert_eq!(alts.len(), 3);
assert!(matches!(&alts[0].kind, ExpressionKind::Terminal(t) if t == "."));
assert!(matches!(&alts[1].kind, ExpressionKind::Terminal(t) if t == "_"));
assert!(matches!(&alts[2].kind, ExpressionKind::Nt(n) if n == "XID_Start"));
}
#[test]
fn lookahead_in_sequence_middle() {
let input = "Rule -> A !B C";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
panic!("expected Sequence, got {:?}", rule.expression.kind);
};
assert_eq!(seq.len(), 3);
assert!(matches!(&seq[0].kind, ExpressionKind::Nt(n) if n == "A"));
let ExpressionKind::NegativeLookahead(inner) = &seq[1].kind else {
panic!("expected NegativeLookahead, got {:?}", seq[1].kind);
};
assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "B"));
assert!(matches!(&seq[2].kind, ExpressionKind::Nt(n) if n == "C"));
}
#[test]
fn lookahead_in_repetition() {
let input = "Rule -> (!A B)*";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Repeat(rep) = &rule.expression.kind else {
panic!("expected Repeat, got {:?}", rule.expression.kind);
};
let ExpressionKind::Grouped(grouped) = &rep.kind else {
panic!("expected Grouped inside Repeat, got {:?}", rep.kind);
};
let ExpressionKind::Sequence(seq) = &grouped.kind else {
panic!("expected Sequence inside Grouped, got {:?}", grouped.kind);
};
assert_eq!(seq.len(), 2);
assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_)));
assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B"));
}
#[test]
fn lookahead_in_alternation() {
let input = "Rule -> !A B | C";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Alt(alts) = &rule.expression.kind else {
panic!("expected Alt, got {:?}", rule.expression.kind);
};
assert_eq!(alts.len(), 2);
let ExpressionKind::Sequence(seq) = &alts[0].kind else {
panic!("expected Sequence, got {:?}", alts[0].kind);
};
assert_eq!(seq.len(), 2);
assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_)));
assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B"));
assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "C"));
}
#[test]
fn lookahead_fail_trailing() {
let input = "Rule -> !";
let err = parse(input).unwrap_err();
assert!(err.contains("expected expression after !"));
}
// --- Unicode tests ---
#[test]
fn unicode_4_digit() {
let input = "Rule -> U+0009";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else {
panic!("expected Unicode, got {:?}", rule.expression.kind);
};
assert_eq!(*ch, '\t');
assert_eq!(s, "0009");
}
#[test]
fn unicode_5_digit() {
let input = "Rule -> U+E0000";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else {
panic!("expected Unicode, got {:?}", rule.expression.kind);
};
assert_eq!(*ch, '\u{E0000}');
assert_eq!(s, "E0000");
}
#[test]
fn unicode_6_digit() {
let input = "Rule -> U+10FFFF";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else {
panic!("expected Unicode, got {:?}", rule.expression.kind);
};
assert_eq!(*ch, '\u{10FFFF}');
assert_eq!(s, "10FFFF");
}
#[test]
fn unicode_in_alternation() {
let input = "Rule -> U+0009 | U+000A";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Alt(alts) = &rule.expression.kind else {
panic!("expected Alt, got {:?}", rule.expression.kind);
};
assert_eq!(alts.len(), 2);
assert!(matches!(
&alts[0].kind,
ExpressionKind::Unicode((ch, _)) if *ch == '\t'
));
assert!(matches!(
&alts[1].kind,
ExpressionKind::Unicode((ch, _)) if *ch == '\n'
));
}
// --- Character / charset range tests ---
#[test]
fn charset_unicode_range() {
let input = "Rule -> [U+0000-U+007F]";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Charset(chars) = &rule.expression.kind else {
panic!("expected Charset, got {:?}", rule.expression.kind);
};
assert_eq!(chars.len(), 1);
let Characters::Range(a, b) = &chars[0] else {
panic!("expected Range, got {:?}", chars[0]);
};
assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0'));
assert!(matches!(
b,
Character::Unicode((ch, _)) if *ch == '\u{7F}'
));
}
#[test]
fn charset_char_range() {
let input = "Rule -> [`a`-`z`]";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Charset(chars) = &rule.expression.kind else {
panic!("expected Charset, got {:?}", rule.expression.kind);
};
assert_eq!(chars.len(), 1);
let Characters::Range(a, b) = &chars[0] else {
panic!("expected Range, got {:?}", chars[0]);
};
assert!(matches!(a, Character::Char(ch) if *ch == 'a'));
assert!(matches!(b, Character::Char(ch) if *ch == 'z'));
}
#[test]
fn charset_mixed_range() {
let input = "Rule -> [`a`-U+007A]";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Charset(chars) = &rule.expression.kind else {
panic!("expected Charset, got {:?}", rule.expression.kind);
};
assert_eq!(chars.len(), 1);
let Characters::Range(a, b) = &chars[0] else {
panic!("expected Range, got {:?}", chars[0]);
};
assert!(matches!(a, Character::Char(ch) if *ch == 'a'));
assert!(matches!(
b,
Character::Unicode((ch, _)) if *ch == 'z'
));
}
#[test]
fn charset_multiple_unicode_ranges() {
let input = "Rule -> [U+0000-U+D7FF U+E000-U+10FFFF]";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Charset(chars) = &rule.expression.kind else {
panic!("expected Charset, got {:?}", rule.expression.kind);
};
assert_eq!(chars.len(), 2);
let Characters::Range(a1, b1) = &chars[0] else {
panic!("expected Range, got {:?}", chars[0]);
};
assert!(matches!(a1, Character::Unicode((ch, _)) if *ch == '\0'));
assert!(matches!(b1, Character::Unicode((ch, _)) if *ch == '\u{D7FF}'));
let Characters::Range(a2, b2) = &chars[1] else {
panic!("expected Range, got {:?}", chars[1]);
};
assert!(matches!(a2, Character::Unicode((ch, _)) if *ch == '\u{E000}'));
assert!(matches!(b2, Character::Unicode((ch, _)) if *ch == '\u{10FFFF}'));
}
#[test]
fn charset_terminals_and_named() {
let input = "Rule -> [`a` `b` Foo]";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Charset(chars) = &rule.expression.kind else {
panic!("expected Charset, got {:?}", rule.expression.kind);
};
assert_eq!(chars.len(), 3);
assert!(matches!(&chars[0], Characters::Terminal(t) if t == "a"));
assert!(matches!(&chars[1], Characters::Terminal(t) if t == "b"));
assert!(matches!(&chars[2], Characters::Named(n) if n == "Foo"));
}
// --- Negative lookahead combined with charset ---
#[test]
fn lookahead_charset_with_named_and_terminals() {
// Pattern from tokens.md: ![`'` `\` LF CR TAB] ASCII
let input = "Rule -> ![`x` `y` LF] Foo";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
panic!("expected Sequence, got {:?}", rule.expression.kind);
};
assert_eq!(seq.len(), 2);
let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
};
let ExpressionKind::Charset(chars) = &inner.kind else {
panic!("expected Charset, got {:?}", inner.kind);
};
assert_eq!(chars.len(), 3);
assert!(matches!(&chars[0], Characters::Terminal(t) if t == "x"));
assert!(matches!(&chars[1], Characters::Terminal(t) if t == "y"));
assert!(matches!(&chars[2], Characters::Named(n) if n == "LF"));
}
// --- Negative lookahead combined with Unicode ---
#[test]
fn lookahead_charset_with_unicode_range() {
let input = "Rule -> ![U+0000-U+007F] Foo";
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("Rule").unwrap();
let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
panic!("expected Sequence, got {:?}", rule.expression.kind);
};
let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else {
panic!("expected NegativeLookahead, got {:?}", seq[0].kind);
};
let ExpressionKind::Charset(chars) = &inner.kind else {
panic!("expected Charset, got {:?}", inner.kind);
};
assert_eq!(chars.len(), 1);
let Characters::Range(a, b) = &chars[0] else {
panic!("expected Range, got {:?}", chars[0]);
};
assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0'));
assert!(matches!(
b,
Character::Unicode((ch, _)) if *ch == '\u{7F}'
));
}
// --- `parse_name` digit rejection tests ---
#[test]
fn parse_name_rejects_leading_digits() {
// `{123}` should not parse as a named reference. The
// digits don't form a valid name and there is no `..`
// range operator, so the parser should reject this.
let err = parse("A -> x{123}").unwrap_err();
assert!(
err.contains("expected `..`"),
"expected range-syntax error for {{123}}, got: {err}"
);
}
#[test]
fn parse_name_allows_letter_then_digit() {
// `n1` is a valid name (starts with a letter).
let grammar = parse("A -> x{n1:2..5}").unwrap();
let rule = grammar.productions.get("A").unwrap();
let ExpressionKind::RepeatRange {
name,
min,
max,
limit,
..
} = &rule.expression.kind
else {
panic!("expected RepeatRange, got {:?}", rule.expression.kind);
};
assert_eq!(name.as_deref(), Some("n1"));
assert_eq!(*min, Some(2));
assert_eq!(*max, Some(5));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn parse_name_allows_underscore_start() {
// `_n` is a valid name (starts with underscore).
let grammar = parse("A -> x{_n:2..5}").unwrap();
let rule = grammar.productions.get("A").unwrap();
let ExpressionKind::RepeatRange {
name,
min,
max,
limit,
..
} = &rule.expression.kind
else {
panic!("expected RepeatRange, got {:?}", rule.expression.kind);
};
assert_eq!(name.as_deref(), Some("_n"));
assert_eq!(*min, Some(2));
assert_eq!(*max, Some(5));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
// --- Named repeat range tests ---
/// Extract full `RepeatRange` fields including the name.
fn named_repeat_range(input: &str) -> (Option<String>, Option<u32>, Option<u32>, RangeLimit) {
let grammar = parse(input).unwrap();
let rule = grammar.productions.get("A").unwrap();
let ExpressionKind::RepeatRange {
name,
min,
max,
limit,
..
} = &rule.expression.kind
else {
panic!("expected RepeatRange, got {:?}", rule.expression.kind);
};
(name.clone(), *min, *max, *limit)
}
#[test]
fn named_range_closed() {
let (name, min, max, limit) = named_repeat_range("A -> x{n:1..=255}");
assert_eq!(name.as_deref(), Some("n"));
assert_eq!(min, Some(1));
assert_eq!(max, Some(255));
assert!(matches!(limit, RangeLimit::Closed));
}
#[test]
fn named_range_half_open() {
let (name, min, max, limit) = named_repeat_range("A -> x{n:2..5}");
assert_eq!(name.as_deref(), Some("n"));
assert_eq!(min, Some(2));
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn named_range_omitted_min() {
let (name, min, max, limit) = named_repeat_range("A -> x{n:..=5}");
assert_eq!(name.as_deref(), Some("n"));
assert_eq!(min, None);
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::Closed));
}
#[test]
fn named_range_omitted_max() {
let (name, min, max, limit) = named_repeat_range("A -> x{n:2..}");
assert_eq!(name.as_deref(), Some("n"));
assert_eq!(min, Some(2));
assert_eq!(max, None);
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn named_reference() {
// `{n}` without a colon or range produces a
// RepeatRangeNamed variant.
let grammar = parse("A -> x{n}").unwrap();
let rule = grammar.productions.get("A").unwrap();
let ExpressionKind::RepeatRangeNamed(_, name) = &rule.expression.kind else {
panic!("expected RepeatRangeNamed, got {:?}", rule.expression.kind);
};
assert_eq!(name, "n");
}
#[test]
fn named_binding_and_reference_in_sequence() {
// A production with a named binding and a named reference.
let grammar = parse("A -> x{n:1..=255} y{n}").unwrap();
let rule = grammar.productions.get("A").unwrap();
let ExpressionKind::Sequence(seq) = &rule.expression.kind else {
panic!("expected Sequence, got {:?}", rule.expression.kind);
};
assert_eq!(seq.len(), 2);
// First element: x{n:1..=255}
let ExpressionKind::RepeatRange {
name,
min,
max,
limit,
..
} = &seq[0].kind
else {
panic!("expected RepeatRange, got {:?}", seq[0].kind);
};
assert_eq!(name.as_deref(), Some("n"));
assert_eq!(*min, Some(1));
assert_eq!(*max, Some(255));
assert!(matches!(limit, RangeLimit::Closed));
// Second element: y{n}
let ExpressionKind::RepeatRangeNamed(_, ref_name) = &seq[1].kind else {
panic!("expected RepeatRangeNamed, got {:?}", seq[1].kind);
};
assert_eq!(ref_name, "n");
}
#[test]
fn named_range_backtrack_to_plain_range() {
// When parse_name() succeeds but the next byte is
// neither `:` nor `}`, the parser backtracks and
// falls through to plain range parsing. `{2..5}` is
// such a case after the parse_name fix (digits are
// rejected), but let's test a scenario where a name is
// parsed and then backtracked.
//
// There is no single-character token after a name that
// triggers backtrack in valid grammar (the match arms
// cover `:` and `}`), but the fallback resets the index
// and tries plain range parsing. We verify that
// `{2..5}` parses correctly as a plain range even
// though it starts with a digit.
let (min, max, limit) = repeat_range("A -> x{2..5}");
assert_eq!(min, Some(2));
assert_eq!(max, Some(5));
assert!(matches!(limit, RangeLimit::HalfOpen));
}
#[test]
fn named_range_err_colon_missing_dots() {
// `{n:}` -- name followed by colon, then no `..`.
let err = parse("A -> x{n:}").unwrap_err();
assert!(
err.contains("expected `..`"),
"expected `..` error for {{n:}}, got: {err}"
);
}
#[test]
fn named_range_err_empty_braces() {
// `{}` -- empty braces contain no name and no range.
let err = parse("A -> x{}").unwrap_err();
assert!(
err.contains("expected `..`"),
"expected `..` error for {{}}, got: {err}"
);
}
}