//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
// using UnicodeData.txt and NameAliases.txt available at
// https://unicode.org/Public/14.0.0/ucd/
//===----------------------------------------------------------------------===//

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <deque>
#include <fstream>
#include <limits>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

29c92056d0SCorentin Jabot static const llvm::StringRef Letters =
30c92056d0SCorentin Jabot " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31c92056d0SCorentin Jabot
32c92056d0SCorentin Jabot // Collect names UnicodeData.txt and AliasNames.txt
33c92056d0SCorentin Jabot // There may be multiple names per code points.
34c92056d0SCorentin Jabot static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string & NamesFile,const std::string & AliasesFile)35c92056d0SCorentin Jabot loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36c92056d0SCorentin Jabot std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37c92056d0SCorentin Jabot auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38c92056d0SCorentin Jabot std::ifstream InputFile(File);
39c92056d0SCorentin Jabot for (std::string Line; getline(InputFile, Line);) {
40c92056d0SCorentin Jabot if (Line.empty() || !isxdigit(Line[0]))
41c92056d0SCorentin Jabot continue;
42c92056d0SCorentin Jabot auto FirstSemiPos = Line.find(';');
43c92056d0SCorentin Jabot if (FirstSemiPos == std::string::npos)
44c92056d0SCorentin Jabot continue;
45c92056d0SCorentin Jabot auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
46c92056d0SCorentin Jabot if (FirstSemiPos == std::string::npos)
47c92056d0SCorentin Jabot continue;
48c92056d0SCorentin Jabot unsigned long long CodePoint;
49c92056d0SCorentin Jabot if (llvm::getAsUnsignedInteger(
50c92056d0SCorentin Jabot llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
51c92056d0SCorentin Jabot continue;
52c92056d0SCorentin Jabot }
53c92056d0SCorentin Jabot
54c92056d0SCorentin Jabot std::string Name =
55c92056d0SCorentin Jabot Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
56c92056d0SCorentin Jabot
57c92056d0SCorentin Jabot if (!Name.empty() && Name[0] == '<') {
58c92056d0SCorentin Jabot // Ignore ranges of characters, as their name is either absent or
59c92056d0SCorentin Jabot // generated.
60c92056d0SCorentin Jabot continue;
61c92056d0SCorentin Jabot }
62c92056d0SCorentin Jabot
63c92056d0SCorentin Jabot // Some aliases are ignored for compatibility with C++
64c92056d0SCorentin Jabot if (IsAliasFile) {
65c92056d0SCorentin Jabot std::string Kind = Line.substr(SecondSemiPos + 1);
66c92056d0SCorentin Jabot if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67c92056d0SCorentin Jabot continue;
68c92056d0SCorentin Jabot }
69c92056d0SCorentin Jabot
70c92056d0SCorentin Jabot auto InsertUnique = [&](char32_t CP, std::string Name) {
71c92056d0SCorentin Jabot auto It = CollectedCharacters.find(CP);
72c92056d0SCorentin Jabot while (It != std::end(CollectedCharacters) && It->first == CP) {
73c92056d0SCorentin Jabot if (It->second == Name)
74c92056d0SCorentin Jabot return;
75c92056d0SCorentin Jabot ++It;
76c92056d0SCorentin Jabot }
77c92056d0SCorentin Jabot CollectedCharacters.insert({CP, std::move(Name)});
78c92056d0SCorentin Jabot };
79c92056d0SCorentin Jabot InsertUnique(CodePoint, std::move(Name));
80c92056d0SCorentin Jabot }
81c92056d0SCorentin Jabot };
82c92056d0SCorentin Jabot
83c92056d0SCorentin Jabot FromFile(NamesFile);
84c92056d0SCorentin Jabot FromFile(AliasesFile, true);
85c92056d0SCorentin Jabot return CollectedCharacters;
86c92056d0SCorentin Jabot }
87c92056d0SCorentin Jabot
88c92056d0SCorentin Jabot class Trie {
89c92056d0SCorentin Jabot struct Node;
90c92056d0SCorentin Jabot
91c92056d0SCorentin Jabot public:
92c92056d0SCorentin Jabot // When inserting named codepoint
93c92056d0SCorentin Jabot // We create a node per character in the name.
94c92056d0SCorentin Jabot // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95c92056d0SCorentin Jabot // Once all characters are inserted, the tree is compacted
insert(llvm::StringRef Name,char32_t Codepoint)96c92056d0SCorentin Jabot void insert(llvm::StringRef Name, char32_t Codepoint) {
97c92056d0SCorentin Jabot Node *N = Root.get();
98c92056d0SCorentin Jabot for (auto Ch : Name) {
99c92056d0SCorentin Jabot std::string Label(1, Ch);
100c92056d0SCorentin Jabot auto It = std::find_if(N->Children.begin(), N->Children.end(),
101c92056d0SCorentin Jabot [&](const auto &C) { return C->Name == Label; });
102c92056d0SCorentin Jabot if (It == N->Children.end()) {
103c92056d0SCorentin Jabot It = N->Children.insert(It, std::make_unique<Node>(Label, N));
104c92056d0SCorentin Jabot }
105c92056d0SCorentin Jabot N = It->get();
106c92056d0SCorentin Jabot }
107c92056d0SCorentin Jabot N->Value = Codepoint;
108c92056d0SCorentin Jabot }
109c92056d0SCorentin Jabot
compact()110c92056d0SCorentin Jabot void compact() { compact(Root.get()); }
111c92056d0SCorentin Jabot
112c92056d0SCorentin Jabot // This creates 2 arrays of bytes from the tree:
113c92056d0SCorentin Jabot // A serialized dictionary of node labels,
114c92056d0SCorentin Jabot // And the nodes themselves.
115c92056d0SCorentin Jabot // The name of each label is found by indexing into the dictionary.
116c92056d0SCorentin Jabot // The longest names are inserted first into the dictionary,
117c92056d0SCorentin Jabot // in the hope it will contain shorter labels as substring,
118c92056d0SCorentin Jabot // thereby reducing duplication.
119c92056d0SCorentin Jabot // We could theorically be more clever by trying to minimizing the size
120c92056d0SCorentin Jabot // of the dictionary.
serialize()121c92056d0SCorentin Jabot std::pair<std::string, std::vector<uint8_t>> serialize() {
122c92056d0SCorentin Jabot std::set<std::string> Names = this->getNameFragments();
123c92056d0SCorentin Jabot std::vector<std::string> Sorted(Names.begin(), Names.end());
124*aba43035SDmitri Gribenko llvm::sort(Sorted, [](const auto &a, const auto &b) {
125*aba43035SDmitri Gribenko return a.size() > b.size();
126*aba43035SDmitri Gribenko });
127c92056d0SCorentin Jabot std::string Dict(Letters.begin(), Letters.end());
128c92056d0SCorentin Jabot Dict.reserve(50000);
129c92056d0SCorentin Jabot for (const std::string &Name : Sorted) {
130c92056d0SCorentin Jabot if (Name.size() <= 1)
131c92056d0SCorentin Jabot continue;
132c92056d0SCorentin Jabot if (Dict.find(Name) != std::string::npos)
133c92056d0SCorentin Jabot continue;
134c92056d0SCorentin Jabot Dict += Name;
135c92056d0SCorentin Jabot }
136c92056d0SCorentin Jabot
137c92056d0SCorentin Jabot if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
138c92056d0SCorentin Jabot fprintf(stderr, "Dictionary too big to be serialized");
139c92056d0SCorentin Jabot exit(1);
140c92056d0SCorentin Jabot }
141c92056d0SCorentin Jabot
142c92056d0SCorentin Jabot auto Bytes = dumpIndex(Dict);
143c92056d0SCorentin Jabot return {Dict, Bytes};
144c92056d0SCorentin Jabot }
145c92056d0SCorentin Jabot
getNameFragments()146c92056d0SCorentin Jabot std::set<std::string> getNameFragments() {
147c92056d0SCorentin Jabot std::set<std::string> Keys;
148c92056d0SCorentin Jabot collectKeys(Root.get(), Keys);
149c92056d0SCorentin Jabot return Keys;
150c92056d0SCorentin Jabot }
151c92056d0SCorentin Jabot
152c92056d0SCorentin Jabot // Maps a valid char in an Unicode character name
153c92056d0SCorentin Jabot // To a 6 bits index.
letter(char C)154c92056d0SCorentin Jabot static uint8_t letter(char C) {
155c92056d0SCorentin Jabot auto Pos = Letters.find(C);
156c92056d0SCorentin Jabot assert(Pos != std::string::npos &&
157c92056d0SCorentin Jabot "Invalid letter in Unicode character name");
158c92056d0SCorentin Jabot return Pos;
159c92056d0SCorentin Jabot }
160c92056d0SCorentin Jabot
161c92056d0SCorentin Jabot // clang-format off
162c92056d0SCorentin Jabot // +================+============+======================+=============+========+===+==============+===============+
163c92056d0SCorentin Jabot // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
164c92056d0SCorentin Jabot // +================+============+======================+=============+========+===+==============+===============+
165c92056d0SCorentin Jabot // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
166c92056d0SCorentin Jabot // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
167c92056d0SCorentin Jabot // clang-format on
168c92056d0SCorentin Jabot
dumpIndex(const std::string & Dict)169c92056d0SCorentin Jabot std::vector<uint8_t> dumpIndex(const std::string &Dict) {
170c92056d0SCorentin Jabot struct ChildrenOffset {
171c92056d0SCorentin Jabot Node *FirstChild;
172c92056d0SCorentin Jabot std::size_t Offset;
173c92056d0SCorentin Jabot bool HasValue;
174c92056d0SCorentin Jabot };
175c92056d0SCorentin Jabot
176c92056d0SCorentin Jabot // Keep track of the start of each node
177c92056d0SCorentin Jabot // position in the serialized data.
178c92056d0SCorentin Jabot std::unordered_map<Node *, int32_t> Offsets;
179c92056d0SCorentin Jabot
180c92056d0SCorentin Jabot // Keep track of where to write the index
181c92056d0SCorentin Jabot // of the first children
182c92056d0SCorentin Jabot std::vector<ChildrenOffset> ChildrenOffsets;
183c92056d0SCorentin Jabot std::unordered_map<Node *, bool> SiblingTracker;
184c92056d0SCorentin Jabot std::deque<Node *> AllNodes;
185c92056d0SCorentin Jabot std::vector<uint8_t> Bytes;
186c92056d0SCorentin Jabot Bytes.reserve(250'000);
187c92056d0SCorentin Jabot // This leading byte is used by the reading code to detect the root node.
188c92056d0SCorentin Jabot Bytes.push_back(0);
189c92056d0SCorentin Jabot
190c92056d0SCorentin Jabot auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
191c92056d0SCorentin Jabot for (std::size_t Index = 0; Index < Children.size(); Index++) {
192c92056d0SCorentin Jabot const std::unique_ptr<Node> &Child = Children[Index];
193c92056d0SCorentin Jabot AllNodes.push_back(Child.get());
194c92056d0SCorentin Jabot if (Index != Children.size() - 1)
195c92056d0SCorentin Jabot SiblingTracker[Child.get()] = true;
196c92056d0SCorentin Jabot }
197c92056d0SCorentin Jabot };
198c92056d0SCorentin Jabot CollectChildren(Root->Children);
199c92056d0SCorentin Jabot
200c92056d0SCorentin Jabot while (!AllNodes.empty()) {
201c92056d0SCorentin Jabot const std::size_t Offset = Bytes.size();
202c92056d0SCorentin Jabot Node *const N = AllNodes.front();
203c92056d0SCorentin Jabot AllNodes.pop_front();
204c92056d0SCorentin Jabot
205c92056d0SCorentin Jabot assert(!N->Name.empty());
206c92056d0SCorentin Jabot Offsets[N] = Offset;
207c92056d0SCorentin Jabot
208c92056d0SCorentin Jabot uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
209c92056d0SCorentin Jabot // Single letter node are indexed in 6 bits
210c92056d0SCorentin Jabot if (N->Name.size() == 1) {
211c92056d0SCorentin Jabot FirstByte |= letter(N->Name[0]);
212c92056d0SCorentin Jabot Bytes.push_back(FirstByte);
213c92056d0SCorentin Jabot } else {
214c92056d0SCorentin Jabot // Otherwise we use a 16 bits index
215c92056d0SCorentin Jabot FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
216c92056d0SCorentin Jabot Bytes.push_back(FirstByte);
217c92056d0SCorentin Jabot auto PosInDict = Dict.find(N->Name);
218c92056d0SCorentin Jabot assert(PosInDict != std::string::npos);
219c92056d0SCorentin Jabot uint8_t Low = PosInDict;
220c92056d0SCorentin Jabot uint8_t High = ((PosInDict >> 8) & 0xFF);
221c92056d0SCorentin Jabot Bytes.push_back(High);
222c92056d0SCorentin Jabot Bytes.push_back(Low);
223c92056d0SCorentin Jabot }
224c92056d0SCorentin Jabot
225c92056d0SCorentin Jabot const bool HasSibling = SiblingTracker.count(N) != 0;
226c92056d0SCorentin Jabot const bool HasChildren = N->Children.size() != 0;
227c92056d0SCorentin Jabot
228c92056d0SCorentin Jabot if (!!N->Value) {
229c92056d0SCorentin Jabot uint32_t Value = (*(N->Value) << 3);
230c92056d0SCorentin Jabot uint8_t H = ((Value >> 16) & 0xFF);
231c92056d0SCorentin Jabot uint8_t M = ((Value >> 8) & 0xFF);
232c92056d0SCorentin Jabot uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
233c92056d0SCorentin Jabot uint8_t(HasChildren ? 0x02 : 0);
234c92056d0SCorentin Jabot
235c92056d0SCorentin Jabot Bytes.push_back(H);
236c92056d0SCorentin Jabot Bytes.push_back(M);
237c92056d0SCorentin Jabot Bytes.push_back(L);
238c92056d0SCorentin Jabot
239c92056d0SCorentin Jabot if (HasChildren) {
240c92056d0SCorentin Jabot ChildrenOffsets.push_back(
241c92056d0SCorentin Jabot ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
242c92056d0SCorentin Jabot // index of the first children
243c92056d0SCorentin Jabot Bytes.push_back(0x00);
244c92056d0SCorentin Jabot Bytes.push_back(0x00);
245c92056d0SCorentin Jabot Bytes.push_back(0x00);
246c92056d0SCorentin Jabot }
247c92056d0SCorentin Jabot } else {
248c92056d0SCorentin Jabot // When there is no value (that's most intermediate nodes)
249c92056d0SCorentin Jabot // Dispense of the 3 values bytes, and only store
250c92056d0SCorentin Jabot // 1 byte to track whether the node has sibling and chidren
251c92056d0SCorentin Jabot // + 2 bytes for the index of the first children if necessary.
252c92056d0SCorentin Jabot // That index also uses bytes 0-6 of the previous byte.
253c92056d0SCorentin Jabot uint8_t Byte =
254c92056d0SCorentin Jabot uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
255c92056d0SCorentin Jabot Bytes.push_back(Byte);
256c92056d0SCorentin Jabot if (HasChildren) {
257c92056d0SCorentin Jabot ChildrenOffsets.emplace_back(
258c92056d0SCorentin Jabot ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
259c92056d0SCorentin Jabot Bytes.push_back(0x00);
260c92056d0SCorentin Jabot Bytes.push_back(0x00);
261c92056d0SCorentin Jabot }
262c92056d0SCorentin Jabot }
263c92056d0SCorentin Jabot CollectChildren(N->Children);
264c92056d0SCorentin Jabot }
265c92056d0SCorentin Jabot
266c92056d0SCorentin Jabot // Once all the nodes are in the inndex
267c92056d0SCorentin Jabot // Fill the bytes we left to indicate the position
268c92056d0SCorentin Jabot // of the children
269c92056d0SCorentin Jabot for (const ChildrenOffset &Parent : ChildrenOffsets) {
270c92056d0SCorentin Jabot const auto It = Offsets.find(Parent.FirstChild);
271c92056d0SCorentin Jabot assert(It != Offsets.end());
272c92056d0SCorentin Jabot std::size_t Pos = It->second;
273c92056d0SCorentin Jabot if (Parent.HasValue) {
274c92056d0SCorentin Jabot Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
275c92056d0SCorentin Jabot } else {
276c92056d0SCorentin Jabot Bytes[Parent.Offset] =
277c92056d0SCorentin Jabot Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
278c92056d0SCorentin Jabot }
279c92056d0SCorentin Jabot Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
280c92056d0SCorentin Jabot Bytes[Parent.Offset + 2] = Pos & 0xFF;
281c92056d0SCorentin Jabot }
282c92056d0SCorentin Jabot
283c92056d0SCorentin Jabot // Add some padding so that the deserialization code
284c92056d0SCorentin Jabot // doesn't try to read past the enf of the array.
285c92056d0SCorentin Jabot Bytes.push_back(0);
286c92056d0SCorentin Jabot Bytes.push_back(0);
287c92056d0SCorentin Jabot Bytes.push_back(0);
288c92056d0SCorentin Jabot Bytes.push_back(0);
289c92056d0SCorentin Jabot Bytes.push_back(0);
290c92056d0SCorentin Jabot Bytes.push_back(0);
291c92056d0SCorentin Jabot
292c92056d0SCorentin Jabot return Bytes;
293c92056d0SCorentin Jabot }
294c92056d0SCorentin Jabot
295c92056d0SCorentin Jabot private:
collectKeys(Node * N,std::set<std::string> & Keys)296c92056d0SCorentin Jabot void collectKeys(Node *N, std::set<std::string> &Keys) {
297c92056d0SCorentin Jabot Keys.insert(N->Name);
298c92056d0SCorentin Jabot for (const std::unique_ptr<Node> &Child : N->Children) {
299c92056d0SCorentin Jabot collectKeys(Child.get(), Keys);
300c92056d0SCorentin Jabot }
301c92056d0SCorentin Jabot }
302c92056d0SCorentin Jabot
303c92056d0SCorentin Jabot // Merge sequences of 1-character nodes
304c92056d0SCorentin Jabot // This greatly reduce the total number of nodes,
305c92056d0SCorentin Jabot // and therefore the size of the index.
306c92056d0SCorentin Jabot // When the tree gets serialized, we only have 5 bytes to store the
307c92056d0SCorentin Jabot // size of a name. Overlong names (>32 characters) are therefore
308c92056d0SCorentin Jabot // kep into separate nodes
compact(Node * N)309c92056d0SCorentin Jabot void compact(Node *N) {
310c92056d0SCorentin Jabot for (auto &&Child : N->Children) {
311c92056d0SCorentin Jabot compact(Child.get());
312c92056d0SCorentin Jabot }
313c92056d0SCorentin Jabot if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
314c92056d0SCorentin Jabot (N->Parent->Name.size() + N->Name.size() <= 32)) {
315c92056d0SCorentin Jabot N->Parent->Value = N->Value;
316c92056d0SCorentin Jabot N->Parent->Name += N->Name;
317c92056d0SCorentin Jabot N->Parent->Children = std::move(N->Children);
318c92056d0SCorentin Jabot for (std::unique_ptr<Node> &c : N->Parent->Children) {
319c92056d0SCorentin Jabot c->Parent = N->Parent;
320c92056d0SCorentin Jabot }
321c92056d0SCorentin Jabot }
322c92056d0SCorentin Jabot }
323c92056d0SCorentin Jabot struct Node {
NodeTrie::Node324c92056d0SCorentin Jabot Node(std::string Name, Node *Parent = nullptr)
325c92056d0SCorentin Jabot : Name(Name), Parent(Parent) {}
326c92056d0SCorentin Jabot
327c92056d0SCorentin Jabot std::vector<std::unique_ptr<Node>> Children;
328c92056d0SCorentin Jabot std::string Name;
329c92056d0SCorentin Jabot Node *Parent = nullptr;
330c92056d0SCorentin Jabot llvm::Optional<char32_t> Value;
331c92056d0SCorentin Jabot };
332c92056d0SCorentin Jabot
333c92056d0SCorentin Jabot std::unique_ptr<Node> Root = std::make_unique<Node>("");
334c92056d0SCorentin Jabot };
335c92056d0SCorentin Jabot
336c92056d0SCorentin Jabot extern const char *UnicodeLicense;
337c92056d0SCorentin Jabot
main(int argc,char ** argv)338c92056d0SCorentin Jabot int main(int argc, char **argv) {
339c92056d0SCorentin Jabot printf("Unicode name -> codepoint mapping generator\n"
340c92056d0SCorentin Jabot "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
341c92056d0SCorentin Jabot argv[0]);
342c92056d0SCorentin Jabot printf("NameAliases.txt can be found at "
343c92056d0SCorentin Jabot "https://unicode.org/Public/14.0.0/ucd/NameAliases.txt\n"
344c92056d0SCorentin Jabot "UnicodeData.txt can be found at "
345c92056d0SCorentin Jabot "https://unicode.org/Public/14.0.0/ucd/UnicodeData.txt\n\n");
346c92056d0SCorentin Jabot
347c92056d0SCorentin Jabot if (argc != 4)
348c92056d0SCorentin Jabot return EXIT_FAILURE;
349c92056d0SCorentin Jabot
350c92056d0SCorentin Jabot FILE *Out = fopen(argv[3], "w");
351c92056d0SCorentin Jabot if (!Out) {
352c92056d0SCorentin Jabot printf("Error creating output file.\n");
353c92056d0SCorentin Jabot return EXIT_FAILURE;
354c92056d0SCorentin Jabot }
355c92056d0SCorentin Jabot
356c92056d0SCorentin Jabot Trie T;
357c92056d0SCorentin Jabot uint32_t NameCount = 0;
358c92056d0SCorentin Jabot std::size_t LongestName = 0;
359c92056d0SCorentin Jabot auto Entries = loadDataFiles(argv[1], argv[2]);
360c92056d0SCorentin Jabot for (const std::pair<const char32_t, std::string> &Entry : Entries) {
361c92056d0SCorentin Jabot char32_t Codepoint = Entry.first;
362c92056d0SCorentin Jabot const std::string &Name = Entry.second;
363c92056d0SCorentin Jabot // Ignore names which are not valid.
364c92056d0SCorentin Jabot if (Name.empty() || !std::all_of(Name.begin(), Name.end(), [](char C) {
365c92056d0SCorentin Jabot return llvm::is_contained(Letters, C);
366c92056d0SCorentin Jabot })) {
367c92056d0SCorentin Jabot continue;
368c92056d0SCorentin Jabot }
36992d31a7cSAaron Ballman printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
370c92056d0SCorentin Jabot T.insert(Name, Codepoint);
371c92056d0SCorentin Jabot LongestName =
372380a1b20SKazu Hirata std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
373c92056d0SCorentin Jabot NameCount++;
374c92056d0SCorentin Jabot }
375c92056d0SCorentin Jabot T.compact();
376c92056d0SCorentin Jabot
377c92056d0SCorentin Jabot std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
378c92056d0SCorentin Jabot const std::string &Dict = Data.first;
379c92056d0SCorentin Jabot const std::vector<uint8_t> &Tree = Data.second;
380c92056d0SCorentin Jabot
381c92056d0SCorentin Jabot fprintf(Out, R"(
382c92056d0SCorentin Jabot //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
383c92056d0SCorentin Jabot // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
384c92056d0SCorentin Jabot // See https://llvm.org/LICENSE.txt for license information.
385c92056d0SCorentin Jabot // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
386c92056d0SCorentin Jabot //
387c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
388c92056d0SCorentin Jabot //
389c92056d0SCorentin Jabot // This file implements mapping the name of a unicode code point to its value.
390c92056d0SCorentin Jabot //
391c92056d0SCorentin Jabot // This file was generated using %s.
392c92056d0SCorentin Jabot // Do not edit manually.
393c92056d0SCorentin Jabot //
394c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
395c92056d0SCorentin Jabot %s
396c92056d0SCorentin Jabot
397c92056d0SCorentin Jabot
398c92056d0SCorentin Jabot
399c92056d0SCorentin Jabot #include "llvm/Support/Compiler.h"
400c92056d0SCorentin Jabot #include <cstddef>
401c92056d0SCorentin Jabot #include <cstdint>
402c92056d0SCorentin Jabot )",
403c92056d0SCorentin Jabot argv[0], UnicodeLicense);
404c92056d0SCorentin Jabot
405c92056d0SCorentin Jabot fprintf(Out,
406c92056d0SCorentin Jabot "namespace llvm { namespace sys { namespace unicode { \n"
407c92056d0SCorentin Jabot "extern const char *UnicodeNameToCodepointDict;\n"
408c92056d0SCorentin Jabot "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
409c92056d0SCorentin Jabot "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
410c92056d0SCorentin Jabot "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
411c92056d0SCorentin Jabot
412c92056d0SCorentin Jabot fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
413c92056d0SCorentin Jabot Dict.c_str());
414c92056d0SCorentin Jabot
41592d31a7cSAaron Ballman fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
416c92056d0SCorentin Jabot Tree.size() + 1);
417c92056d0SCorentin Jabot
418c92056d0SCorentin Jabot for (auto Byte : Tree) {
419c92056d0SCorentin Jabot fprintf(Out, "0x%02x,", Byte);
420c92056d0SCorentin Jabot }
421c92056d0SCorentin Jabot
422c92056d0SCorentin Jabot fprintf(Out, "0};");
423c92056d0SCorentin Jabot fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
424c92056d0SCorentin Jabot "UnicodeNameToCodepointIndex_; \n");
42592d31a7cSAaron Ballman fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
426c92056d0SCorentin Jabot Tree.size() + 1);
427c92056d0SCorentin Jabot fprintf(Out,
42892d31a7cSAaron Ballman "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
429c92056d0SCorentin Jabot LongestName);
430c92056d0SCorentin Jabot fprintf(Out, "\n}}}\n");
431c92056d0SCorentin Jabot fclose(Out);
432c92056d0SCorentin Jabot printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
433c92056d0SCorentin Jabot argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
434c92056d0SCorentin Jabot }
435c92056d0SCorentin Jabot
// License text of the Unicode data files. main() emits it verbatim, as a C
// block comment, near the top of the generated source file.
const char *UnicodeLicense = R"(
/*
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE

See Terms of Use <https://www.unicode.org/copyright.html>
for definitions of Unicode Inc.’s Data Files and Software.

NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2022 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.

Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
*/
)";