| //===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "clang-pseudo/grammar/Grammar.h" |
| #include "clang/Basic/TokenKinds.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/Support/FormatVariadic.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include <optional> |
| |
| namespace clang { |
| namespace pseudo { |
| |
| Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence) |
| : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) { |
| assert(Sequence.size() <= Rule::MaxElements); |
| llvm::copy(Sequence, this->Sequence); |
| } |
| |
| Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) { |
| Underscore = *findNonterminal("_"); |
| } |
| |
| llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const { |
| assert(isNonterminal(SID)); |
| const auto &R = T->Nonterminals[SID].RuleRange; |
| assert(R.End <= T->Rules.size()); |
| return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start); |
| } |
| |
| const Rule &Grammar::lookupRule(RuleID RID) const { |
| assert(RID < T->Rules.size()); |
| return T->Rules[RID]; |
| } |
| |
| llvm::StringRef Grammar::symbolName(SymbolID SID) const { |
| if (isToken(SID)) |
| return T->Terminals[symbolToToken(SID)]; |
| return T->Nonterminals[SID].Name; |
| } |
| |
| std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const { |
| auto It = llvm::partition_point( |
| T->Nonterminals, |
| [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; }); |
| if (It != T->Nonterminals.end() && It->Name == Name) |
| return It - T->Nonterminals.begin(); |
| return std::nullopt; |
| } |
| |
| std::string Grammar::dumpRule(RuleID RID) const { |
| std::string Result; |
| llvm::raw_string_ostream OS(Result); |
| const Rule &R = T->Rules[RID]; |
| OS << symbolName(R.Target) << " :="; |
| for (unsigned I = 0; I < R.Size; ++I) { |
| OS << " " << symbolName(R.Sequence[I]); |
| if (R.RecoveryIndex == I) |
| OS << " [recover=" << T->AttributeValues[R.Recovery] << "]"; |
| } |
| if (R.Guarded) |
| OS << " [guard]"; |
| return Result; |
| } |
| |
| std::string Grammar::dumpRules(SymbolID SID) const { |
| assert(isNonterminal(SID)); |
| std::string Result; |
| const auto &Range = T->Nonterminals[SID].RuleRange; |
| for (RuleID RID = Range.Start; RID < Range.End; ++RID) |
| Result.append(dumpRule(RID)).push_back('\n'); |
| return Result; |
| } |
| |
| std::string Grammar::dump() const { |
| std::string Result; |
| llvm::raw_string_ostream OS(Result); |
| OS << "Nonterminals:\n"; |
| for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) |
| OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID)); |
| OS << "Rules:\n"; |
| for (RuleID RID = 0; RID < T->Rules.size(); ++RID) |
| OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID)); |
| return OS.str(); |
| } |
| |
| std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) { |
| std::vector<llvm::DenseSet<SymbolID>> FirstSets( |
| G.table().Nonterminals.size()); |
| auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) { |
| assert(isNonterminal(Target)); |
| if (isToken(First)) |
| return FirstSets[Target].insert(First).second; |
| bool Changed = false; |
| for (SymbolID SID : FirstSets[First]) |
| Changed |= FirstSets[Target].insert(SID).second; |
| return Changed; |
| }; |
| |
| // A rule S := T ... implies elements in FIRST(S): |
| // - if T is a terminal, FIRST(S) contains T |
| // - if T is a nonterminal, FIRST(S) contains FIRST(T) |
| // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may |
| // end up being incomplete. |
| // We iterate until we hit a fixed point. |
| // (This isn't particularly efficient, but table building isn't on the |
| // critical path). |
| bool Changed = true; |
| while (Changed) { |
| Changed = false; |
| for (const auto &R : G.table().Rules) |
| // We only need to consider the first element because symbols are |
| // non-nullable. |
| Changed |= ExpandFirstSet(R.Target, R.seq().front()); |
| } |
| return FirstSets; |
| } |
| |
| std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) { |
| auto FirstSets = firstSets(G); |
| std::vector<llvm::DenseSet<SymbolID>> FollowSets( |
| G.table().Nonterminals.size()); |
| // Expand the follow set of a nonterminal symbol Y by adding all from the |
| // given symbol set. |
| auto ExpandFollowSet = [&FollowSets](SymbolID Y, |
| const llvm::DenseSet<SymbolID> &ToAdd) { |
| assert(isNonterminal(Y)); |
| bool Changed = false; |
| for (SymbolID F : ToAdd) |
| Changed |= FollowSets[Y].insert(F).second; |
| return Changed; |
| }; |
| // Follow sets is computed based on the following 3 rules, the computation |
| // is completed at a fixed point where there is no more new symbols can be |
| // added to any of the follow sets. |
| // |
| // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the |
| // augmented grammar, in our case it is '_'. |
| FollowSets[G.underscore()].insert(tokenSymbol(tok::eof)); |
| bool Changed = true; |
| while (Changed) { |
| Changed = false; |
| for (const auto &R : G.table().Rules) { |
| // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to |
| // FOLLOW(Y). |
| for (size_t I = 0; I + 1 < R.seq().size(); ++I) { |
| if (isToken(R.seq()[I])) |
| continue; |
| // We only need to consider the next symbol because symbols are |
| // non-nullable. |
| SymbolID Next = R.seq()[I + 1]; |
| if (isToken(Next)) |
| // First set for a terminal is itself. |
| Changed |= ExpandFollowSet(R.seq()[I], {Next}); |
| else |
| Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]); |
| } |
| // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to |
| // FOLLOW(Z). |
| SymbolID Z = R.seq().back(); |
| if (isNonterminal(Z)) |
| Changed |= ExpandFollowSet(Z, FollowSets[R.Target]); |
| } |
| } |
| return FollowSets; |
| } |
| |
| static llvm::ArrayRef<std::string> getTerminalNames() { |
| static const auto &TerminalNames = []() { |
| auto TerminalNames = new std::string[NumTerminals]; |
| #define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling; |
| #define KEYWORD(Keyword, Condition) \ |
| TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper(); |
| #define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper(); |
| #include "clang/Basic/TokenKinds.def" |
| return llvm::ArrayRef(TerminalNames, NumTerminals); |
| }(); |
| return TerminalNames; |
| } |
| GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} |
| |
| } // namespace pseudo |
| } // namespace clang |