|  | //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "Trigram.h" | 
|  | #include "FuzzyMatch.h" | 
|  | #include "index/dex/Token.h" | 
|  | #include "llvm/ADT/ArrayRef.h" | 
|  | #include "llvm/ADT/DenseSet.h" | 
|  | #include "llvm/ADT/STLExtras.h" | 
|  | #include "llvm/ADT/SmallVector.h" | 
|  | #include "llvm/ADT/StringExtras.h" | 
|  | #include "llvm/ADT/StringRef.h" | 
|  | #include <cctype> | 
|  | #include <limits> | 
|  | #include <queue> | 
|  | #include <string> | 
|  | #include <utility> | 
|  |  | 
|  | namespace clang { | 
|  | namespace clangd { | 
|  | namespace dex { | 
|  |  | 
|  | // Produce trigrams (including duplicates) and pass them to Out(). | 
|  | template <typename Func> | 
|  | static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { | 
|  | assert(!Identifier.empty()); | 
|  | // Apply fuzzy matching text segmentation. | 
|  | llvm::SmallVector<CharRole> Roles(Identifier.size()); | 
|  | calculateRoles(Identifier, | 
|  | llvm::MutableArrayRef(Roles.data(), Identifier.size())); | 
|  |  | 
|  | std::string LowercaseIdentifier = Identifier.lower(); | 
|  |  | 
|  | // For each character, store indices of the characters to which fuzzy matching | 
|  | // algorithm can jump. There are 2 possible variants: | 
|  | // | 
|  | // * Next Tail - next character from the same segment | 
|  | // * Next Head - front character of the next segment | 
|  | // | 
|  | // Next stores tuples of three indices in the presented order, if a variant is | 
|  | // not available then 0 is stored. | 
|  | llvm::SmallVector<std::array<unsigned, 2>, 12> Next( | 
|  | LowercaseIdentifier.size()); | 
|  | unsigned NextTail = 0, NextHead = 0; | 
|  | for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { | 
|  | Next[I] = {{NextTail, NextHead}}; | 
|  | NextTail = Roles[I] == Tail ? I : 0; | 
|  | if (Roles[I] == Head) { | 
|  | NextHead = I; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Iterate through valid sequences of three characters Fuzzy Matcher can | 
|  | // process. | 
|  | for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) { | 
|  | // Skip delimiters. | 
|  | if (Roles[I] != Head && Roles[I] != Tail) | 
|  | continue; | 
|  | for (unsigned J : Next[I]) { | 
|  | if (J == 0) | 
|  | continue; | 
|  | for (unsigned K : Next[J]) { | 
|  | if (K == 0) | 
|  | continue; | 
|  | Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J], | 
|  | LowercaseIdentifier[K])); | 
|  | } | 
|  | } | 
|  | } | 
|  | // Short queries semantics are different. When the user dosn't type enough | 
|  | // symbols to form trigrams, we still want to serve meaningful results. To | 
|  | // achieve that, we form incomplete trigrams (bi- and unigrams) for the | 
|  | // identifiers and also generate short trigrams on the query side from what | 
|  | // is available. We allow a small number of short trigram types in order to | 
|  | // prevent excessive memory usage and increase the quality of the search. | 
|  | // Only the first few symbols are allowed to be used in incomplete trigrams. | 
|  | // | 
|  | // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams: | 
|  | // "_", "_a", "a", "ab", "ad", "d", "de", "dg" | 
|  | for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) { | 
|  | // The first symbol might be a separator, so the loop condition should be | 
|  | // stopping as soon as there is no next head or we have seen two heads. | 
|  | if (Roles[Position] == Head) | 
|  | ++HeadsSeen; | 
|  | Out(Trigram(LowercaseIdentifier[Position])); | 
|  | for (unsigned I : Next[Position]) | 
|  | if (I != 0) | 
|  | Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I])); | 
|  | Position = Next[Position][1]; | 
|  | if (Position == 0) | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | void generateIdentifierTrigrams(llvm::StringRef Identifier, | 
|  | std::vector<Trigram> &Result) { | 
|  | // Empirically, scanning for duplicates is faster with fewer trigrams, and | 
|  | // a hashtable is faster with more. This is a hot path, so dispatch based on | 
|  | // expected number of trigrams. Longer identifiers produce more trigrams. | 
|  | // The magic number was tuned by running IndexBenchmark.DexBuild. | 
|  | constexpr unsigned ManyTrigramsIdentifierThreshold = 14; | 
|  | Result.clear(); | 
|  | if (Identifier.empty()) | 
|  | return; | 
|  |  | 
|  | if (Identifier.size() < ManyTrigramsIdentifierThreshold) { | 
|  | identifierTrigrams(Identifier, [&](Trigram T) { | 
|  | if (!llvm::is_contained(Result, T)) | 
|  | Result.push_back(T); | 
|  | }); | 
|  | } else { | 
|  | identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); }); | 
|  | llvm::sort(Result); | 
|  | Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); | 
|  | } | 
|  | } | 
|  |  | 
|  | std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) { | 
|  | if (Query.empty()) | 
|  | return {}; | 
|  |  | 
|  | // Apply fuzzy matching text segmentation. | 
|  | llvm::SmallVector<CharRole> Roles(Query.size()); | 
|  | calculateRoles(Query, llvm::MutableArrayRef(Roles.data(), Query.size())); | 
|  |  | 
|  | std::string LowercaseQuery = Query.lower(); | 
|  |  | 
|  | llvm::DenseSet<Token> UniqueTrigrams; | 
|  | std::string Chars; | 
|  | for (unsigned I = 0; I < LowercaseQuery.size(); ++I) { | 
|  | if (Roles[I] != Head && Roles[I] != Tail) | 
|  | continue; // Skip delimiters. | 
|  | Chars.push_back(LowercaseQuery[I]); | 
|  | if (Chars.size() > 3) | 
|  | Chars.erase(Chars.begin()); | 
|  | if (Chars.size() == 3) | 
|  | UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); | 
|  | } | 
|  |  | 
|  | // For queries with very few letters, generateIdentifierTrigrams emulates | 
|  | // outputs of this function to match the semantics. | 
|  | if (UniqueTrigrams.empty()) { | 
|  | // If bigram can't be formed out of letters/numbers, we prepend separator. | 
|  | std::string Result(1, LowercaseQuery.front()); | 
|  | for (unsigned I = 1; I < LowercaseQuery.size(); ++I) | 
|  | if (Roles[I] == Head || Roles[I] == Tail) | 
|  | Result += LowercaseQuery[I]; | 
|  | UniqueTrigrams.insert( | 
|  | Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2))); | 
|  | } | 
|  |  | 
|  | return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; | 
|  | } | 
|  |  | 
|  | } // namespace dex | 
|  | } // namespace clangd | 
|  | } // namespace clang |