| //===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'), | 
 | // we consider the possible partial match states: | 
 | // | 
 | //     u n i q u e _ p t r | 
 | //   +--------------------- | 
 | //   |A . . . . . . . . . . | 
 | //  u| | 
 | //   |. . . . . . . . . . . | 
 | //  _| | 
 | //   |. . . . . . . O . . . | 
 | //  p| | 
 | //   |. . . . . . . . . . B | 
 | // | 
 | // Each dot represents some prefix of the pattern being matched against some | 
 | // prefix of the word. | 
 | //   - A is the initial state: '' matched against '' | 
 | //   - O is an intermediate state: 'u_' matched against 'unique_' | 
 | //   - B is the target state: 'u_p' matched against 'unique_ptr' | 
 | // | 
 | // We aim to find the best path from A->B. | 
 | //  - Moving right (consuming a word character) | 
 | //    Always legal: not all word characters must match. | 
 | //  - Moving diagonally (consuming both a word and pattern character) | 
 | //    Legal if the characters match. | 
 | //  - Moving down (consuming a pattern character) is never legal. | 
 | //    Never legal: all pattern characters must match something. | 
 | // Characters are matched case-insensitively. | 
 | // The first pattern character may only match the start of a word segment. | 
 | // | 
 | // The scoring is based on heuristics: | 
 | //  - when matching a character, apply a bonus or penalty depending on the | 
 | //    match quality (does case match, do word segments align, etc) | 
 | //  - when skipping a character, apply a penalty if it hurts the match | 
 | //    (it starts a word segment, or splits the matched region, etc) | 
 | // | 
 | // These heuristics require the ability to "look backward" one character, to | 
 | // see whether it was matched or not. Therefore the dynamic-programming matrix | 
 | // has an extra dimension (last character matched). | 
 | // Each entry also has an additional flag indicating whether the last-but-one | 
 | // character matched, which is needed to trace back through the scoring table | 
 | // and reconstruct the match. | 
 | // | 
 | // We treat strings as byte-sequences, so only ASCII has first-class support. | 
 | // | 
 | // This algorithm was inspired by VS code's client-side filtering, and aims | 
 | // to be mostly-compatible. | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "FuzzyMatch.h" | 
 | #include "llvm/ADT/Optional.h" | 
 | #include "llvm/Support/Format.h" | 
 |  | 
 | namespace clang { | 
 | namespace clangd { | 
 |  | 
 | constexpr int FuzzyMatcher::MaxPat; | 
 | constexpr int FuzzyMatcher::MaxWord; | 
 |  | 
 | static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; } | 
 | // A "negative infinity" score that won't overflow. | 
 | // We use this to mark unreachable states and forbidden solutions. | 
 | // Score field is 15 bits wide, min value is -2^14, we use half of that. | 
 | static constexpr int AwfulScore = -(1 << 13); | 
 | static bool isAwful(int S) { return S < AwfulScore / 2; } | 
 | static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score. | 
 |  | 
 | FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern) | 
 |     : PatN(std::min<int>(MaxPat, Pattern.size())), | 
 |       ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) { | 
 |   std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat); | 
 |   for (int I = 0; I < PatN; ++I) | 
 |     LowPat[I] = lower(Pat[I]); | 
 |   Scores[0][0][Miss] = {0, Miss}; | 
 |   Scores[0][0][Match] = {AwfulScore, Miss}; | 
 |   for (int P = 0; P <= PatN; ++P) | 
 |     for (int W = 0; W < P; ++W) | 
 |       for (Action A : {Miss, Match}) | 
 |         Scores[P][W][A] = {AwfulScore, Miss}; | 
 |   PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN), | 
 |                               llvm::makeMutableArrayRef(PatRole, PatN)); | 
 | } | 
 |  | 
 | llvm::Optional<float> FuzzyMatcher::match(llvm::StringRef Word) { | 
 |   if (!(WordContainsPattern = init(Word))) | 
 |     return llvm::None; | 
 |   if (!PatN) | 
 |     return 1; | 
 |   buildGraph(); | 
 |   auto Best = std::max(Scores[PatN][WordN][Miss].Score, | 
 |                        Scores[PatN][WordN][Match].Score); | 
 |   if (isAwful(Best)) | 
 |     return llvm::None; | 
 |   float Score = | 
 |       ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best)); | 
 |   // If the pattern is as long as the word, we have an exact string match, | 
 |   // since every pattern character must match something. | 
 |   if (WordN == PatN) | 
 |     Score *= 2; // May not be perfect 2 if case differs in a significant way. | 
 |   return Score; | 
 | } | 
 |  | 
 | // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte. | 
 | // The top 6 bits of the char select the byte, the bottom 2 select the offset. | 
 | // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower. | 
 | constexpr static uint8_t CharTypes[] = { | 
 |     0x00, 0x00, 0x00, 0x00, // Control characters | 
 |     0x00, 0x00, 0x00, 0x00, // Control characters | 
 |     0xff, 0xff, 0xff, 0xff, // Punctuation | 
 |     0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation. | 
 |     0xab, 0xaa, 0xaa, 0xaa, // @ and A-O | 
 |     0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation. | 
 |     0x57, 0x55, 0x55, 0x55, // ` and a-o | 
 |     0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL. | 
 |     0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower. | 
 |     0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8). | 
 |     0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, | 
 |     0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, | 
 | }; | 
 |  | 
 | // The Role can be determined from the Type of a character and its neighbors: | 
 | // | 
 | //   Example  | Chars | Type | Role | 
 | //   ---------+--------------+----- | 
 | //   F(o)oBar | Foo   | Ull  | Tail | 
 | //   Foo(B)ar | oBa   | lUl  | Head | 
 | //   (f)oo    | ^fo   | Ell  | Head | 
 | //   H(T)TP   | HTT   | UUU  | Tail | 
 | // | 
 | // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role. | 
 | // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset. | 
 | // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head. | 
 | constexpr static uint8_t CharRoles[] = { | 
 |     // clang-format off | 
 |     //         Curr= Empty Lower Upper Separ | 
 |     /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head | 
 |     /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail | 
 |     /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail | 
 |     /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start | 
 |     // clang-format on | 
 | }; | 
 |  | 
 | template <typename T> static T packedLookup(const uint8_t *Data, int I) { | 
 |   return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3); | 
 | } | 
 | CharTypeSet calculateRoles(llvm::StringRef Text, | 
 |                            llvm::MutableArrayRef<CharRole> Roles) { | 
 |   assert(Text.size() == Roles.size()); | 
 |   if (Text.size() == 0) | 
 |     return 0; | 
 |   CharType Type = packedLookup<CharType>(CharTypes, Text[0]); | 
 |   CharTypeSet TypeSet = 1 << Type; | 
 |   // Types holds a sliding window of (Prev, Curr, Next) types. | 
 |   // Initial value is (Empty, Empty, type of Text[0]). | 
 |   int Types = Type; | 
 |   // Rotate slides in the type of the next character. | 
 |   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; | 
 |   for (unsigned I = 0; I < Text.size() - 1; ++I) { | 
 |     // For each character, rotate in the next, and look up the role. | 
 |     Type = packedLookup<CharType>(CharTypes, Text[I + 1]); | 
 |     TypeSet |= 1 << Type; | 
 |     Rotate(Type); | 
 |     Roles[I] = packedLookup<CharRole>(CharRoles, Types); | 
 |   } | 
 |   // For the last character, the "next character" is Empty. | 
 |   Rotate(Empty); | 
 |   Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types); | 
 |   return TypeSet; | 
 | } | 
 |  | 
 | // Sets up the data structures matching Word. | 
 | // Returns false if we can cheaply determine that no match is possible. | 
 | bool FuzzyMatcher::init(llvm::StringRef NewWord) { | 
 |   WordN = std::min<int>(MaxWord, NewWord.size()); | 
 |   if (PatN > WordN) | 
 |     return false; | 
 |   std::copy(NewWord.begin(), NewWord.begin() + WordN, Word); | 
 |   if (PatN == 0) | 
 |     return true; | 
 |   for (int I = 0; I < WordN; ++I) | 
 |     LowWord[I] = lower(Word[I]); | 
 |  | 
 |   // Cheap subsequence check. | 
 |   for (int W = 0, P = 0; P != PatN; ++W) { | 
 |     if (W == WordN) | 
 |       return false; | 
 |     if (LowWord[W] == LowPat[P]) | 
 |       ++P; | 
 |   } | 
 |  | 
 |   // FIXME: some words are hard to tokenize algorithmically. | 
 |   // e.g. vsprintf is V S Print F, and should match [pri] but not [int]. | 
 |   // We could add a tokenization dictionary for common stdlib names. | 
 |   WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN), | 
 |                                llvm::makeMutableArrayRef(WordRole, WordN)); | 
 |   return true; | 
 | } | 
 |  | 
 | // The forwards pass finds the mappings of Pattern onto Word. | 
 | // Score = best score achieved matching Word[..W] against Pat[..P]. | 
 | // Unlike other tables, indices range from 0 to N *inclusive* | 
 | // Matched = whether we chose to match Word[W] with Pat[P] or not. | 
 | // | 
 | // Points are mostly assigned to matched characters, with 1 being a good score | 
 | // and 3 being a great one. So we treat the score range as [0, 3 * PatN]. | 
 | // This range is not strict: we can apply larger bonuses/penalties, or penalize | 
 | // non-matched characters. | 
 | void FuzzyMatcher::buildGraph() { | 
 |   for (int W = 0; W < WordN; ++W) { | 
 |     Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss), | 
 |                               Miss}; | 
 |     Scores[0][W + 1][Match] = {AwfulScore, Miss}; | 
 |   } | 
 |   for (int P = 0; P < PatN; ++P) { | 
 |     for (int W = P; W < WordN; ++W) { | 
 |       auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W]; | 
 |  | 
 |       auto MatchMissScore = PreMiss[Match].Score; | 
 |       auto MissMissScore = PreMiss[Miss].Score; | 
 |       if (P < PatN - 1) { // Skipping trailing characters is always free. | 
 |         MatchMissScore -= skipPenalty(W, Match); | 
 |         MissMissScore -= skipPenalty(W, Miss); | 
 |       } | 
 |       Score[Miss] = (MatchMissScore > MissMissScore) | 
 |                         ? ScoreInfo{MatchMissScore, Match} | 
 |                         : ScoreInfo{MissMissScore, Miss}; | 
 |  | 
 |       auto &PreMatch = Scores[P][W]; | 
 |       auto MatchMatchScore = | 
 |           allowMatch(P, W, Match) | 
 |               ? PreMatch[Match].Score + matchBonus(P, W, Match) | 
 |               : AwfulScore; | 
 |       auto MissMatchScore = allowMatch(P, W, Miss) | 
 |                                 ? PreMatch[Miss].Score + matchBonus(P, W, Miss) | 
 |                                 : AwfulScore; | 
 |       Score[Match] = (MatchMatchScore > MissMatchScore) | 
 |                          ? ScoreInfo{MatchMatchScore, Match} | 
 |                          : ScoreInfo{MissMatchScore, Miss}; | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const { | 
 |   if (LowPat[P] != LowWord[W]) | 
 |     return false; | 
 |   // We require a "strong" match: | 
 |   // - for the first pattern character.  [foo] !~ "barefoot" | 
 |   // - after a gap.                      [pat] !~ "patnther" | 
 |   if (Last == Miss) { | 
 |     // We're banning matches outright, so conservatively accept some other cases | 
 |     // where our segmentation might be wrong: | 
 |     //  - allow matching B in ABCDef (but not in NDEBUG) | 
 |     //  - we'd like to accept print in sprintf, but too many false positives | 
 |     if (WordRole[W] == Tail && | 
 |         (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower))) | 
 |       return false; | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | int FuzzyMatcher::skipPenalty(int W, Action Last) const { | 
 |   if (W == 0) // Skipping the first character. | 
 |     return 3; | 
 |   if (WordRole[W] == Head) // Skipping a segment. | 
 |     return 1; // We want to keep this lower than a consecutive match bonus. | 
 |   // Instead of penalizing non-consecutive matches, we give a bonus to a | 
 |   // consecutive match in matchBonus. This produces a better score distribution | 
 |   // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'. | 
 |   return 0; | 
 | } | 
 |  | 
 | int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { | 
 |   assert(LowPat[P] == LowWord[W]); | 
 |   int S = 1; | 
 |   bool IsPatSingleCase = | 
 |       (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper); | 
 |   // Bonus: case matches, or a Head in the pattern aligns with one in the word. | 
 |   // Single-case patterns lack segmentation signals and we assume any character | 
 |   // can be a head of a segment. | 
 |   if (Pat[P] == Word[W] || | 
 |       (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head))) | 
 |     ++S; | 
 |   // Bonus: a consecutive match. First character match also gets a bonus to | 
 |   // ensure prefix final match score normalizes to 1.0. | 
 |   if (W == 0 || Last == Match) | 
 |     S += 2; | 
 |   // Penalty: matching inside a segment (and previous char wasn't matched). | 
 |   if (WordRole[W] == Tail && P && Last == Miss) | 
 |     S -= 3; | 
 |   // Penalty: a Head in the pattern matches in the middle of a word segment. | 
 |   if (PatRole[P] == Head && WordRole[W] == Tail) | 
 |     --S; | 
 |   // Penalty: matching the first pattern character in the middle of a segment. | 
 |   if (P == 0 && WordRole[W] == Tail) | 
 |     S -= 4; | 
 |   assert(S <= PerfectBonus); | 
 |   return S; | 
 | } | 
 |  | 
 | llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { | 
 |   llvm::SmallString<256> Result; | 
 |   OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against [" | 
 |      << llvm::StringRef(Pat, PatN) << "] ===\n"; | 
 |   if (PatN == 0) { | 
 |     OS << "Pattern is empty: perfect match.\n"; | 
 |     return Result = llvm::StringRef(Word, WordN); | 
 |   } | 
 |   if (WordN == 0) { | 
 |     OS << "Word is empty: no match.\n"; | 
 |     return Result; | 
 |   } | 
 |   if (!WordContainsPattern) { | 
 |     OS << "Substring check failed.\n"; | 
 |     return Result; | 
 |   } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score, | 
 |                               Scores[PatN][WordN][Miss].Score))) { | 
 |     OS << "Substring check passed, but all matches are forbidden\n"; | 
 |   } | 
 |   if (!(PatTypeSet & 1 << Upper)) | 
 |     OS << "Lowercase query, so scoring ignores case\n"; | 
 |  | 
 |   // Traverse Matched table backwards to reconstruct the Pattern/Word mapping. | 
 |   // The Score table has cumulative scores, subtracting along this path gives | 
 |   // us the per-letter scores. | 
 |   Action Last = | 
 |       (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score) | 
 |           ? Match | 
 |           : Miss; | 
 |   int S[MaxWord]; | 
 |   Action A[MaxWord]; | 
 |   for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) { | 
 |     A[W] = Last; | 
 |     const auto &Cell = Scores[P + 1][W + 1][Last]; | 
 |     if (Last == Match) | 
 |       --P; | 
 |     const auto &Prev = Scores[P + 1][W][Cell.Prev]; | 
 |     S[W] = Cell.Score - Prev.Score; | 
 |     Last = Cell.Prev; | 
 |   } | 
 |   for (int I = 0; I < WordN; ++I) { | 
 |     if (A[I] == Match && (I == 0 || A[I - 1] == Miss)) | 
 |       Result.push_back('['); | 
 |     if (A[I] == Miss && I > 0 && A[I - 1] == Match) | 
 |       Result.push_back(']'); | 
 |     Result.push_back(Word[I]); | 
 |   } | 
 |   if (A[WordN - 1] == Match) | 
 |     Result.push_back(']'); | 
 |  | 
 |   for (char C : llvm::StringRef(Word, WordN)) | 
 |     OS << " " << C << " "; | 
 |   OS << "\n"; | 
 |   for (int I = 0, J = 0; I < WordN; I++) | 
 |     OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " "; | 
 |   OS << "\n"; | 
 |   for (int I = 0; I < WordN; I++) | 
 |     OS << llvm::format("%2d ", S[I]); | 
 |   OS << "\n"; | 
 |  | 
 |   OS << "\nSegmentation:"; | 
 |   OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n "; | 
 |   for (int I = 0; I < WordN; ++I) | 
 |     OS << "?-+ "[static_cast<int>(WordRole[I])]; | 
 |   OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n "; | 
 |   for (int I = 0; I < PatN; ++I) | 
 |     OS << "?-+ "[static_cast<int>(PatRole[I])]; | 
 |   OS << "\n"; | 
 |  | 
 |   OS << "\nScoring table (last-Miss, last-Match):\n"; | 
 |   OS << " |    "; | 
 |   for (char C : llvm::StringRef(Word, WordN)) | 
 |     OS << "  " << C << " "; | 
 |   OS << "\n"; | 
 |   OS << "-+----" << std::string(WordN * 4, '-') << "\n"; | 
 |   for (int I = 0; I <= PatN; ++I) { | 
 |     for (Action A : {Miss, Match}) { | 
 |       OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|"; | 
 |       for (int J = 0; J <= WordN; ++J) { | 
 |         if (!isAwful(Scores[I][J][A].Score)) | 
 |           OS << llvm::format("%3d%c", Scores[I][J][A].Score, | 
 |                              Scores[I][J][A].Prev == Match ? '*' : ' '); | 
 |         else | 
 |           OS << "    "; | 
 |       } | 
 |       OS << "\n"; | 
 |     } | 
 |   } | 
 |  | 
 |   return Result; | 
 | } | 
 |  | 
 | } // namespace clangd | 
 | } // namespace clang |