Parse grammar without regexes

We'd been parsing the grammar with a combination of recursive descent and regular expression matchers. This combination has its merits, and it's done tastefully here, but it seems maybe more straightforward to do the parsing entirely with recursive descent. Among other things, doing it this way allows us to provide more precise error reporting on malformed inputs. The cost, in terms of lines of code, of doing this is rather modest, and the result seems at least as clear -- there's some mental cost to code switching between the two worlds. So let's make the switch and parse the grammar without regular expressions. We verified that the rendered output of the Reference is byte-identical before and after this change.

commit: 1a7304b8a42a37366946de00f449b1b6a041ebee [log] [tgz]
author: Travis Cross <tc@traviscross.com> Thu May 15 15:00:56 2025 +0000
committer: Travis Cross <tc@traviscross.com> Fri May 16 07:27:34 2025 +0000
tree: 361dc8b08d9fa1f78a0bd70d02d425a4a468fa2b
parent: c703c8d13456d4a6fdc83c9b39b370af3df12d00 [diff]
diff --git a/mdbook-spec/src/grammar/parser.rs b/mdbook-spec/src/grammar/parser.rs
index 7a92f47..631fb7a 100644
--- a/mdbook-spec/src/grammar/parser.rs
+++ b/mdbook-spec/src/grammar/parser.rs

@@ -1,11 +1,9 @@
 //! A parser of the ENBF-like grammar.
 
 use super::{Characters, Expression, ExpressionKind, Grammar, Production};
-use regex::{Captures, Regex};
 use std::fmt;
 use std::fmt::Display;
 use std::path::Path;
-use std::sync::LazyLock;
 
 struct Parser<'a> {
     input: &'a str,
@@ -76,18 +74,6 @@
         &self.input[i..i + upper]
     }
 
-    /// If the input matches the given regex, it is returned and the head is moved forward.
-    ///
-    /// Note that regexes must start with `^`.
-    fn take_re(&mut self, re: &Regex) -> Option<Captures<'_>> {
-        if let Some(cap) = re.captures(&self.input[self.index..]) {
-            self.index += cap[0].len();
-            Some(cap)
-        } else {
-            None
-        }
-    }
-
     /// Returns whether or not the given string is next, and advances the head if it is.
     fn take_str(&mut self, s: &str) -> bool {
         if self.input[self.index..].starts_with(s) {
@@ -168,13 +154,12 @@
     }
 
     fn parse_expression(&mut self) -> Result<Option<Expression>> {
-        static ALT_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^ *\| *").unwrap());
-
         let mut es = Vec::new();
         loop {
             let Some(e) = self.parse_seq()? else { break };
             es.push(e);
-            if self.take_re(&ALT_RE).is_none() {
+            _ = self.space0();
+            if !self.take_str("|") {
                 break;
             }
         }
@@ -268,13 +253,20 @@
         Some(ExpressionKind::Nt(nt))
     }
 
+    /// Parse terminal within backticks.
     fn parse_terminal(&mut self) -> Result<ExpressionKind> {
-        static TERMINAL_RE: LazyLock<Regex> =
-            LazyLock::new(|| Regex::new(r"^`([^`\n]+)`").unwrap());
-        match self.take_re(&TERMINAL_RE) {
-            Some(cap) => Ok(ExpressionKind::Terminal(cap[1].to_string())),
-            None => bail!(self, "unterminated terminal, expected closing backtick"),
+        Ok(ExpressionKind::Terminal(self.parse_terminal_str()?))
+    }
+
+    /// Parse string within backticks.
+    fn parse_terminal_str(&mut self) -> Result<String> {
+        self.expect("`", "expected opening backtick")?;
+        let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string();
+        if term.is_empty() {
+            bail!(self, "expected terminal");
         }
+        self.expect("`", "expected closing backtick")?;
+        Ok(term)
     }
 
     fn parse_charset(&mut self) -> Result<ExpressionKind> {
@@ -282,7 +274,7 @@
         let mut characters = Vec::new();
         loop {
             self.space0();
-            let Some(ch) = self.parse_characters() else {
+            let Some(ch) = self.parse_characters()? else {
                 break;
             };
             characters.push(ch);
@@ -295,27 +287,48 @@
         Ok(ExpressionKind::Charset(characters))
     }
 
-    fn parse_characters(&mut self) -> Option<Characters> {
-        static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^`(.)`-`(.)`").unwrap());
-        static TERMINAL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new("^`([^`\n]+)`").unwrap());
-        if let Some(cap) = self.take_re(&RANGE_RE) {
-            let a = cap[1].chars().next().unwrap();
-            let b = cap[2].chars().next().unwrap();
-            Some(Characters::Range(a, b))
-        } else if let Some(cap) = self.take_re(&TERMINAL_RE) {
-            Some(Characters::Terminal(cap[1].to_string()))
+    /// Parse an element of a character class, e.g.
+    /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
+    fn parse_characters(&mut self) -> Result<Option<Characters>> {
+        if let Some(b'`') = self.peek() {
+            let recov = self.index;
+            let a = self.parse_terminal_str()?;
+            if self.take_str("-") {
+                //~^ Parse `` `a`-`b` `` character range.
+                if a.len() > 1 {
+                    self.index = recov + 1;
+                    bail!(self, "invalid start terminal in range");
+                }
+                let recov = self.index;
+                let b = self.parse_terminal_str()?;
+                if b.len() > 1 {
+                    self.index = recov + 1;
+                    bail!(self, "invalid end terminal in range");
+                }
+                let a = a.chars().next().unwrap();
+                let b = b.chars().next().unwrap();
+                Ok(Some(Characters::Range(a, b)))
+            } else {
+                //~^ Parse terminal in backticks.
+                Ok(Some(Characters::Terminal(a)))
+            }
+        } else if let Some(name) = self.parse_name() {
+            //~^ Parse nonterminal identifier.
+            Ok(Some(Characters::Named(name)))
         } else {
-            let name = self.parse_name()?;
-            Some(Characters::Named(name))
+            Ok(None)
         }
     }
 
+    /// Parse e.g. `<prose text>`.
     fn parse_prose(&mut self) -> Result<ExpressionKind> {
-        static PROSE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^<([^>\n]+)>").unwrap());
-        match self.take_re(&PROSE_RE) {
-            Some(cap) => Ok(ExpressionKind::Prose(cap[1].to_string())),
-            None => bail!(self, "unterminated prose, expected closing `>`"),
+        self.expect("<", "expected opening `<`")?;
+        let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string();
+        if text.is_empty() {
+            bail!(self, "expected prose text");
         }
+        self.expect(">", "expected closing `>`")?;
+        Ok(ExpressionKind::Prose(text))
     }
 
     fn parse_grouped(&mut self) -> Result<ExpressionKind> {
@@ -344,13 +357,19 @@
         Ok(ExpressionKind::NegExpression(box_kind(kind)))
     }
 
+    /// Parse e.g. `F00F` after `U+`.
     fn parse_unicode(&mut self) -> Result<ExpressionKind> {
-        static UNICODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z0-9]{4}").unwrap());
-
-        match self.take_re(&UNICODE_RE) {
-            Some(s) => Ok(ExpressionKind::Unicode(s[0].to_string())),
-            None => bail!(self, "expected 4 hexadecimal uppercase digits after U+"),
+        let mut xs = Vec::with_capacity(4);
+        for _ in 0..4 {
+            match self.peek() {
+                Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
+                    xs.push(x);
+                    self.index += 1;
+                }
+                _ => bail!(self, "expected 4 uppercase hexidecimal digits after `U+`"),
+            }
         }
+        Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap()))
     }
 
     /// Parse `?` after expression.
@@ -428,16 +447,17 @@
         Ok(Some(self.input[start..self.index - 1].to_string()))
     }
 
+    /// Parse footnote reference, e.g. `[^id]`.
     fn parse_footnote(&mut self) -> Result<Option<String>> {
-        static FOOTNOTE_RE: LazyLock<Regex> =
-            LazyLock::new(|| Regex::new(r"^([^\]\n]+)]").unwrap());
         if !self.take_str("[^") {
             return Ok(None);
         }
-        match self.take_re(&FOOTNOTE_RE) {
-            Some(cap) => Ok(Some(cap[1].to_string())),
-            None => bail!(self, "unterminated footnote, expected closing `]`"),
+        let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string();
+        if id.is_empty() {
+            bail!(self, "expected footnote id");
         }
+        self.expect("]", "expected closing `]`")?;
+        Ok(Some(id))
     }
 }
commit	1a7304b8a42a37366946de00f449b1b6a041ebee	[log] [tgz]
author	Travis Cross <tc@traviscross.com>	Thu May 15 15:00:56 2025 +0000
committer	Travis Cross <tc@traviscross.com>	Fri May 16 07:27:34 2025 +0000
tree	361dc8b08d9fa1f78a0bd70d02d425a4a468fa2b
parent	c703c8d13456d4a6fdc83c9b39b370af3df12d00 [diff]