Parse optionals and repeats without regexes
Rather than parsing optionals and repeats fully in the recursive
descent style, we were using regular expressions to do part of the
matching and parsing. That's fine for what it is, but as we think
about extending the grammar language surrounding repeats further, it
might be more straightforward for this to be parsed in the more usual
way. So let's do that. Doing this also results in better and more
targeted errors when parsing malformed syntax.
We had been supporting a space between an expression and the optional
and repeat sigils `?`, `*`, and `+` (but not between an expression and
the `{a..b}` ranged repeat syntax). In making this change, we drop
this support and adjust the affected productions. We were only using
this in a handful of places, and the clarity of the productions seem
the same or better by removing these spaces.
We verified that, setting aside the removal of these spaces, the
rendered output of the Reference is byte identical before and after
this change.
diff --git a/mdbook-spec/src/grammar/parser.rs b/mdbook-spec/src/grammar/parser.rs
index 60ca740..7a92f47 100644
--- a/mdbook-spec/src/grammar/parser.rs
+++ b/mdbook-spec/src/grammar/parser.rs
@@ -214,7 +214,7 @@
return Ok(None);
};
- let mut kind = if self.take_str("U+") {
+ let kind = if self.take_str("U+") {
self.parse_unicode()?
} else if self.input[self.index..]
.chars()
@@ -246,30 +246,13 @@
} else {
return Ok(None);
};
-
- static REPEAT_RE: LazyLock<Regex> =
- LazyLock::new(|| Regex::new(r"^ ?(\*\?|\+\?|\?|\*|\+)").unwrap());
- static RANGE_RE: LazyLock<Regex> =
- LazyLock::new(|| Regex::new(r"^\{([0-9]+)?\.\.([0-9]+)?\}").unwrap());
- if let Some(cap) = self.take_re(&REPEAT_RE) {
- kind = match &cap[1] {
- "?" => ExpressionKind::Optional(box_kind(kind)),
- "*" => ExpressionKind::Repeat(box_kind(kind)),
- "*?" => ExpressionKind::RepeatNonGreedy(box_kind(kind)),
- "+" => ExpressionKind::RepeatPlus(box_kind(kind)),
- "+?" => ExpressionKind::RepeatPlusNonGreedy(box_kind(kind)),
- s => panic!("unexpected `{s}`"),
- };
- } else if let Some(cap) = self.take_re(&RANGE_RE) {
- let a = cap.get(1).map(|m| m.as_str().parse::<u32>().unwrap());
- let b = cap.get(2).map(|m| m.as_str().parse::<u32>().unwrap());
- match (a, b) {
- (Some(a), Some(b)) if b < a => bail!(self, "range {a}..{b} is malformed"),
- _ => {}
- }
- kind = ExpressionKind::RepeatRange(box_kind(kind), a, b);
- }
-
+ let kind = match self.peek() {
+ Some(b'?') => self.parse_optional(kind)?,
+ Some(b'*') => self.parse_repeat(kind)?,
+ Some(b'+') => self.parse_repeat_plus(kind)?,
+ Some(b'{') => self.parse_repeat_range(kind)?,
+ _ => kind,
+ };
let suffix = self.parse_suffix()?;
let footnote = self.parse_footnote()?;
@@ -370,6 +353,52 @@
}
}
+ /// Parse `?` after expression.
+ fn parse_optional(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
+ self.expect("?", "expected `?`")?;
+ Ok(ExpressionKind::Optional(box_kind(kind)))
+ }
+
+ /// Parse `*` | `*?` after expression.
+ fn parse_repeat(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
+ self.expect("*", "expected `*`")?;
+ Ok(if self.take_str("?") {
+ ExpressionKind::RepeatNonGreedy(box_kind(kind))
+ } else {
+ ExpressionKind::Repeat(box_kind(kind))
+ })
+ }
+
+ /// Parse `+` | `+?` after expression.
+ fn parse_repeat_plus(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
+ self.expect("+", "expected `+`")?;
+ Ok(if self.take_str("?") {
+ ExpressionKind::RepeatPlusNonGreedy(box_kind(kind))
+ } else {
+ ExpressionKind::RepeatPlus(box_kind(kind))
+ })
+ }
+
+ /// Parse `{a..}` | `{..b}` | `{a..b}` after expression.
+ fn parse_repeat_range(&mut self, kind: ExpressionKind) -> Result<ExpressionKind> {
+ self.expect("{", "expected `{`")?;
+ let a = self.take_while(&|x| x.is_ascii_digit());
+ let Ok(a) = (!a.is_empty()).then(|| a.parse::<u32>()).transpose() else {
+ bail!(self, "malformed range start");
+ };
+ self.expect("..", "expected `..`")?;
+ let b = self.take_while(&|x| x.is_ascii_digit());
+ let Ok(b) = (!b.is_empty()).then(|| b.parse::<u32>()).transpose() else {
+ bail!(self, "malformed range end");
+ };
+ match (a, b) {
+ (Some(a), Some(b)) if b < a => bail!(self, "range {a}..{b} is malformed"),
+ _ => {}
+ }
+ self.expect("}", "expected `}`")?;
+ Ok(ExpressionKind::RepeatRange(box_kind(kind), a, b))
+ }
+
fn parse_suffix(&mut self) -> Result<Option<String>> {
if !self.take_str(" _") {
return Ok(None);
diff --git a/src/items/generics.md b/src/items/generics.md
index 3349158..3a2e98a 100644
--- a/src/items/generics.md
+++ b/src/items/generics.md
@@ -232,7 +232,7 @@
r[items.generics.where.syntax]
```grammar,items
-WhereClause -> `where` ( WhereClauseItem `,` )* WhereClauseItem ?
+WhereClause -> `where` ( WhereClauseItem `,` )* WhereClauseItem?
WhereClauseItem ->
LifetimeWhereClauseItem
diff --git a/src/patterns.md b/src/patterns.md
index 2d9ddb5..e3a5c18 100644
--- a/src/patterns.md
+++ b/src/patterns.md
@@ -181,7 +181,7 @@
r[patterns.ident.syntax]
```grammar,patterns
-IdentifierPattern -> `ref`? `mut`? IDENTIFIER (`@` PatternNoTopAlt ) ?
+IdentifierPattern -> `ref`? `mut`? IDENTIFIER ( `@` PatternNoTopAlt )?
```
r[patterns.ident.intro]
@@ -704,7 +704,7 @@
```grammar,patterns
StructPattern ->
PathInExpression `{`
- StructPatternElements ?
+ StructPatternElements?
`}`
StructPatternElements ->