| //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===// | 
 | // | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp | 
 | // using UnicodeData.txt and NameAliases.txt available at | 
 | // https://unicode.org/Public/15.0.0/ucd/ | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "llvm/ADT/STLExtras.h" | 
 | #include "llvm/ADT/StringExtras.h" | 
 | #include "llvm/ADT/StringRef.h" | 
 | #include <algorithm> | 
 | #include <array> | 
 | #include <deque> | 
 | #include <fstream> | 
 | #include <memory> | 
 | #include <optional> | 
 | #include <set> | 
 | #include <string> | 
 | #include <unordered_map> | 
 | #include <utility> | 
 | #include <vector> | 
 |  | 
 | static const llvm::StringRef Letters = | 
 |     " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; | 
 |  | 
 | // Collect names UnicodeData.txt and AliasNames.txt | 
 | // There may be multiple names per code points. | 
 | static std::unordered_multimap<char32_t, std::string> | 
 | loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) { | 
 |   std::unordered_multimap<char32_t, std::string> CollectedCharacters; | 
 |   auto FromFile = [&](const std::string &File, bool IsAliasFile = false) { | 
 |     std::ifstream InputFile(File); | 
 |     for (std::string Line; getline(InputFile, Line);) { | 
 |       if (Line.empty() || !isxdigit(Line[0])) | 
 |         continue; | 
 |       auto FirstSemiPos = Line.find(';'); | 
 |       if (FirstSemiPos == std::string::npos) | 
 |         continue; | 
 |       auto SecondSemiPos = Line.find(';', FirstSemiPos + 1); | 
 |       if (SecondSemiPos == std::string::npos) | 
 |         continue; | 
 |       unsigned long long CodePoint; | 
 |       if (llvm::getAsUnsignedInteger( | 
 |               llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) { | 
 |         continue; | 
 |       } | 
 |  | 
 |       std::string Name = | 
 |           Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1); | 
 |  | 
 |       if (!Name.empty() && Name[0] == '<') { | 
 |         // Ignore ranges of characters, as their name is either absent or | 
 |         // generated. | 
 |         continue; | 
 |       } | 
 |  | 
 |       // Some aliases are ignored for compatibility with C++ | 
 |       if (IsAliasFile) { | 
 |         std::string Kind = Line.substr(SecondSemiPos + 1); | 
 |         if (Kind != "control" && Kind != "correction" && Kind != "alternate") | 
 |           continue; | 
 |       } | 
 |  | 
 |       auto InsertUnique = [&](char32_t CP, std::string Name) { | 
 |         auto It = CollectedCharacters.find(CP); | 
 |         while (It != std::end(CollectedCharacters) && It->first == CP) { | 
 |           if (It->second == Name) | 
 |             return; | 
 |           ++It; | 
 |         } | 
 |         CollectedCharacters.insert({CP, std::move(Name)}); | 
 |       }; | 
 |       InsertUnique(CodePoint, std::move(Name)); | 
 |     } | 
 |   }; | 
 |  | 
 |   FromFile(NamesFile); | 
 |   FromFile(AliasesFile, true); | 
 |   return CollectedCharacters; | 
 | } | 
 |  | 
 | class Trie { | 
 |   struct Node; | 
 |  | 
 | public: | 
 |   // When inserting named codepoint | 
 |   // We create a node per character in the name. | 
 |   // SPARKLE becomes S <- P <- A <- R <- K <- L <- E | 
 |   // Once all  characters are inserted, the tree is compacted | 
 |   void insert(llvm::StringRef Name, char32_t Codepoint) { | 
 |     Node *N = Root.get(); | 
 |     for (auto Ch : Name) { | 
 |       std::string Label(1, Ch); | 
 |       auto It = llvm::find_if(N->Children, | 
 |                               [&](const auto &C) { return C->Name == Label; }); | 
 |       if (It == N->Children.end()) { | 
 |         It = N->Children.insert(It, std::make_unique<Node>(Label, N)); | 
 |       } | 
 |       N = It->get(); | 
 |     } | 
 |     N->Value = Codepoint; | 
 |   } | 
 |  | 
 |   void compact() { compact(Root.get()); } | 
 |  | 
 |   // This creates 2 arrays of bytes from the tree: | 
 |   // A serialized dictionary of node labels, | 
 |   // And the nodes themselves. | 
 |   // The name of each label is found by indexing into the dictionary. | 
 |   // The longest names are inserted first into the dictionary, | 
 |   // in the hope it will contain shorter labels as substring, | 
 |   // thereby reducing duplication. | 
 |   // We could theorically be more clever by trying to minimizing the size | 
 |   // of the dictionary. | 
 |   std::pair<std::string, std::vector<uint8_t>> serialize() { | 
 |     std::set<std::string> Names = this->getNameFragments(); | 
 |     std::vector<std::string> Sorted(Names.begin(), Names.end()); | 
 |     llvm::sort(Sorted, [](const auto &a, const auto &b) { | 
 |       return a.size() > b.size(); | 
 |     }); | 
 |     std::string Dict(Letters.begin(), Letters.end()); | 
 |     Dict.reserve(50000); | 
 |     for (const std::string &Name : Sorted) { | 
 |       if (Name.size() <= 1) | 
 |         continue; | 
 |       if (Dict.find(Name) != std::string::npos) | 
 |         continue; | 
 |       Dict += Name; | 
 |     } | 
 |  | 
 |     if (Dict.size() >= std::numeric_limits<uint16_t>::max()) { | 
 |       fprintf(stderr, "Dictionary too big  to be serialized"); | 
 |       exit(1); | 
 |     } | 
 |  | 
 |     auto Bytes = dumpIndex(Dict); | 
 |     return {Dict, Bytes}; | 
 |   } | 
 |  | 
 |   std::set<std::string> getNameFragments() { | 
 |     std::set<std::string> Keys; | 
 |     collectKeys(Root.get(), Keys); | 
 |     return Keys; | 
 |   } | 
 |  | 
 |   // Maps a valid char in an Unicode character name | 
 |   // To a 6 bits index. | 
 |   static uint8_t letter(char C) { | 
 |     auto Pos = Letters.find(C); | 
 |     assert(Pos != std::string::npos && | 
 |            "Invalid letter in Unicode character name"); | 
 |     return Pos; | 
 |   } | 
 |  | 
 |   // clang-format off | 
 |   // +================+============+======================+=============+========+===+==============+===============+ | 
 |   // | 0          | 1             | 2-7 (6)              | 8-23        | 24-44  |    | 46           | 47            | | 
 |   // +================+============+======================+=============+========+===+==============+===============+ | 
 |   // | Has Value |  Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Sibling  | Has Children  | | 
 |   // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+ | 
 |   // clang-format on | 
 |  | 
 |   std::vector<uint8_t> dumpIndex(const std::string &Dict) { | 
 |     struct ChildrenOffset { | 
 |       Node *FirstChild; | 
 |       std::size_t Offset; | 
 |       bool HasValue; | 
 |     }; | 
 |  | 
 |     // Keep track of the start of each node | 
 |     // position in the serialized data. | 
 |     std::unordered_map<Node *, int32_t> Offsets; | 
 |  | 
 |     // Keep track of where to write the index | 
 |     // of the first children | 
 |     std::vector<ChildrenOffset> ChildrenOffsets; | 
 |     std::unordered_map<Node *, bool> SiblingTracker; | 
 |     std::deque<Node *> AllNodes; | 
 |     std::vector<uint8_t> Bytes; | 
 |     Bytes.reserve(250'000); | 
 |     // This leading byte is used by the reading code to detect the root node. | 
 |     Bytes.push_back(0); | 
 |  | 
 |     auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) { | 
 |       for (std::size_t Index = 0; Index < Children.size(); Index++) { | 
 |         const std::unique_ptr<Node> &Child = Children[Index]; | 
 |         AllNodes.push_back(Child.get()); | 
 |         if (Index != Children.size() - 1) | 
 |           SiblingTracker[Child.get()] = true; | 
 |       } | 
 |     }; | 
 |     CollectChildren(Root->Children); | 
 |  | 
 |     while (!AllNodes.empty()) { | 
 |       const std::size_t Offset = Bytes.size(); | 
 |       Node *const N = AllNodes.front(); | 
 |       AllNodes.pop_front(); | 
 |  | 
 |       assert(!N->Name.empty()); | 
 |       Offsets[N] = Offset; | 
 |  | 
 |       uint8_t FirstByte = (!!N->Value) ? 0x80 : 0; | 
 |       // Single letter node are indexed in 6 bits | 
 |       if (N->Name.size() == 1) { | 
 |         FirstByte |= letter(N->Name[0]); | 
 |         Bytes.push_back(FirstByte); | 
 |       } else { | 
 |         // Otherwise we use a 16 bits index | 
 |         FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40; | 
 |         Bytes.push_back(FirstByte); | 
 |         auto PosInDict = Dict.find(N->Name); | 
 |         assert(PosInDict != std::string::npos); | 
 |         uint8_t Low = PosInDict; | 
 |         uint8_t High = ((PosInDict >> 8) & 0xFF); | 
 |         Bytes.push_back(High); | 
 |         Bytes.push_back(Low); | 
 |       } | 
 |  | 
 |       const bool HasSibling = SiblingTracker.count(N) != 0; | 
 |       const bool HasChildren = N->Children.size() != 0; | 
 |  | 
 |       if (!!N->Value) { | 
 |         uint32_t Value = (*(N->Value) << 3); | 
 |         uint8_t H = ((Value >> 16) & 0xFF); | 
 |         uint8_t M = ((Value >> 8) & 0xFF); | 
 |         uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) | | 
 |                     uint8_t(HasChildren ? 0x02 : 0); | 
 |  | 
 |         Bytes.push_back(H); | 
 |         Bytes.push_back(M); | 
 |         Bytes.push_back(L); | 
 |  | 
 |         if (HasChildren) { | 
 |           ChildrenOffsets.push_back( | 
 |               ChildrenOffset{N->Children[0].get(), Bytes.size(), true}); | 
 |           // index of the first children | 
 |           Bytes.push_back(0x00); | 
 |           Bytes.push_back(0x00); | 
 |           Bytes.push_back(0x00); | 
 |         } | 
 |       } else { | 
 |         // When there is no value (that's most intermediate nodes) | 
 |         // Dispense of the 3 values bytes, and only store | 
 |         // 1 byte to track whether the node has sibling and children | 
 |         // + 2 bytes for the index of the first children if necessary. | 
 |         // That index also uses bytes 0-6 of the previous byte. | 
 |         uint8_t Byte = | 
 |             uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0); | 
 |         Bytes.push_back(Byte); | 
 |         if (HasChildren) { | 
 |           ChildrenOffsets.emplace_back( | 
 |               ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false}); | 
 |           Bytes.push_back(0x00); | 
 |           Bytes.push_back(0x00); | 
 |         } | 
 |       } | 
 |       CollectChildren(N->Children); | 
 |     } | 
 |  | 
 |     // Once all the nodes are in the inndex | 
 |     // Fill the bytes we left to indicate the position | 
 |     // of the children | 
 |     for (const ChildrenOffset &Parent : ChildrenOffsets) { | 
 |       const auto It = Offsets.find(Parent.FirstChild); | 
 |       assert(It != Offsets.end()); | 
 |       std::size_t Pos = It->second; | 
 |       if (Parent.HasValue) { | 
 |         Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF); | 
 |       } else { | 
 |         Bytes[Parent.Offset] = | 
 |             Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF); | 
 |       } | 
 |       Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF); | 
 |       Bytes[Parent.Offset + 2] = Pos & 0xFF; | 
 |     } | 
 |  | 
 |     // Add some padding so that the deserialization code | 
 |     // doesn't try to read past the enf of the array. | 
 |     Bytes.push_back(0); | 
 |     Bytes.push_back(0); | 
 |     Bytes.push_back(0); | 
 |     Bytes.push_back(0); | 
 |     Bytes.push_back(0); | 
 |     Bytes.push_back(0); | 
 |  | 
 |     return Bytes; | 
 |   } | 
 |  | 
 | private: | 
 |   void collectKeys(Node *N, std::set<std::string> &Keys) { | 
 |     Keys.insert(N->Name); | 
 |     for (const std::unique_ptr<Node> &Child : N->Children) { | 
 |       collectKeys(Child.get(), Keys); | 
 |     } | 
 |   } | 
 |  | 
 |   // Merge sequences of 1-character nodes | 
 |   // This greatly reduce the total number of nodes, | 
 |   // and therefore the size of the index. | 
 |   // When the tree gets serialized, we only have 5 bytes to store the | 
 |   // size of a name. Overlong names (>32 characters) are therefore | 
 |   // kep into separate nodes | 
 |   void compact(Node *N) { | 
 |     for (auto &&Child : N->Children) { | 
 |       compact(Child.get()); | 
 |     } | 
 |     if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value && | 
 |         (N->Parent->Name.size() + N->Name.size() <= 32)) { | 
 |       N->Parent->Value = N->Value; | 
 |       N->Parent->Name += N->Name; | 
 |       N->Parent->Children = std::move(N->Children); | 
 |       for (std::unique_ptr<Node> &c : N->Parent->Children) { | 
 |         c->Parent = N->Parent; | 
 |       } | 
 |     } | 
 |   } | 
 |   struct Node { | 
 |     Node(std::string Name, Node *Parent = nullptr) | 
 |         : Name(Name), Parent(Parent) {} | 
 |  | 
 |     std::vector<std::unique_ptr<Node>> Children; | 
 |     std::string Name; | 
 |     Node *Parent = nullptr; | 
 |     std::optional<char32_t> Value; | 
 |   }; | 
 |  | 
 |   std::unique_ptr<Node> Root = std::make_unique<Node>(""); | 
 | }; | 
 |  | 
 | extern const char *UnicodeLicense; | 
 |  | 
 | int main(int argc, char **argv) { | 
 |   printf("Unicode name -> codepoint mapping generator\n" | 
 |          "Usage: %s UnicodeData.txt NameAliases.txt output\n\n", | 
 |          argv[0]); | 
 |   printf("NameAliases.txt can be found at " | 
 |          "https://unicode.org/Public/15.0.0/ucd/NameAliases.txt\n" | 
 |          "UnicodeData.txt can be found at " | 
 |          "https://unicode.org/Public/15.0.0/ucd/UnicodeData.txt\n\n"); | 
 |  | 
 |   if (argc != 4) | 
 |     return EXIT_FAILURE; | 
 |  | 
 |   FILE *Out = fopen(argv[3], "w"); | 
 |   if (!Out) { | 
 |     printf("Error creating output file.\n"); | 
 |     return EXIT_FAILURE; | 
 |   } | 
 |  | 
 |   Trie T; | 
 |   uint32_t NameCount = 0; | 
 |   std::size_t LongestName = 0; | 
 |   auto Entries = loadDataFiles(argv[1], argv[2]); | 
 |   for (const std::pair<const char32_t, std::string> &Entry : Entries) { | 
 |     char32_t Codepoint = Entry.first; | 
 |     const std::string &Name = Entry.second; | 
 |     // Ignore names which are not valid. | 
 |     if (Name.empty() || | 
 |         !llvm::all_of(Name, [](char C) { return Letters.contains(C); })) { | 
 |       continue; | 
 |     } | 
 |     printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str()); | 
 |     T.insert(Name, Codepoint); | 
 |     LongestName = | 
 |         std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum))); | 
 |     NameCount++; | 
 |   } | 
 |   T.compact(); | 
 |  | 
 |   std::pair<std::string, std::vector<uint8_t>> Data = T.serialize(); | 
 |   const std::string &Dict = Data.first; | 
 |   const std::vector<uint8_t> &Tree = Data.second; | 
 |  | 
 |   fprintf(Out, R"( | 
 | //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===// | 
 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
 | // See https://llvm.org/LICENSE.txt for license information. | 
 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | // This file implements mapping the name of a unicode code point to its value. | 
 | // | 
 | // This file was generated using %s. | 
 | // Do not edit manually. | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | %s | 
 |  | 
 |  | 
 |  | 
 | #include "llvm/Support/Compiler.h" | 
 | #include <cstddef> | 
 | #include <cstdint> | 
 | )", | 
 |           argv[0], UnicodeLicense); | 
 |  | 
 |   fprintf(Out, | 
 |           "namespace llvm { namespace sys { namespace unicode { \n" | 
 |           "extern const char *UnicodeNameToCodepointDict;\n" | 
 |           "extern const uint8_t *UnicodeNameToCodepointIndex;\n" | 
 |           "extern const std::size_t UnicodeNameToCodepointIndexSize;\n" | 
 |           "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n"); | 
 |  | 
 |   fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n", | 
 |           Dict.c_str()); | 
 |  | 
 |   fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n", | 
 |           Tree.size() + 1); | 
 |  | 
 |   for (auto Byte : Tree) { | 
 |     fprintf(Out, "0x%02x,", Byte); | 
 |   } | 
 |  | 
 |   fprintf(Out, "0};"); | 
 |   fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = " | 
 |                "UnicodeNameToCodepointIndex_; \n"); | 
 |   fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n", | 
 |           Tree.size() + 1); | 
 |   fprintf(Out, | 
 |           "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n", | 
 |           LongestName); | 
 |   fprintf(Out, "\n}}}\n"); | 
 |   fclose(Out); | 
 |   printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n", | 
 |          argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0); | 
 | } | 
 |  | 
 | const char *UnicodeLicense = R"( | 
 | /* | 
 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE | 
 |  | 
 | See Terms of Use <https://www.unicode.org/copyright.html> | 
 | for definitions of Unicode Inc.’s Data Files and Software. | 
 |  | 
 | NOTICE TO USER: Carefully read the following legal agreement. | 
 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S | 
 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), | 
 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | 
 | TERMS AND CONDITIONS OF THIS AGREEMENT. | 
 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE | 
 | THE DATA FILES OR SOFTWARE. | 
 |  | 
 | COPYRIGHT AND PERMISSION NOTICE | 
 |  | 
 | Copyright © 1991-2022 Unicode, Inc. All rights reserved. | 
 | Distributed under the Terms of Use in https://www.unicode.org/copyright.html. | 
 |  | 
 | Permission is hereby granted, free of charge, to any person obtaining | 
 | a copy of the Unicode data files and any associated documentation | 
 | (the "Data Files") or Unicode software and any associated documentation | 
 | (the "Software") to deal in the Data Files or Software | 
 | without restriction, including without limitation the rights to use, | 
 | copy, modify, merge, publish, distribute, and/or sell copies of | 
 | the Data Files or Software, and to permit persons to whom the Data Files | 
 | or Software are furnished to do so, provided that either | 
 | (a) this copyright and permission notice appear with all copies | 
 | of the Data Files or Software, or | 
 | (b) this copyright and permission notice appear in associated | 
 | Documentation. | 
 |  | 
 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF | 
 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE | 
 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | 
 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. | 
 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS | 
 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL | 
 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, | 
 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER | 
 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | 
 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. | 
 |  | 
 | Except as contained in this notice, the name of a copyright holder | 
 | shall not be used in advertising or otherwise to promote the sale, | 
 | use or other dealings in these Data Files or Software without prior | 
 | written authorization of the copyright holder. | 
 | */ | 
 | )"; |