1c92056d0SCorentin Jabot //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2c92056d0SCorentin Jabot //-*- C++ -*-===//
3c92056d0SCorentin Jabot //
4c92056d0SCorentin Jabot // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5c92056d0SCorentin Jabot // See https://llvm.org/LICENSE.txt for license information.
6c92056d0SCorentin Jabot // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7c92056d0SCorentin Jabot //
8c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
9c92056d0SCorentin Jabot //
10c92056d0SCorentin Jabot // This file implements functions to map the name or alias of a unicode
11c92056d0SCorentin Jabot // character to its codepoint.
12c92056d0SCorentin Jabot //
13c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
14c92056d0SCorentin Jabot 
15c92056d0SCorentin Jabot #include "llvm/ADT/STLExtras.h"
16c92056d0SCorentin Jabot #include "llvm/ADT/StringExtras.h"
17c92056d0SCorentin Jabot #include "llvm/ADT/StringRef.h"
18c92056d0SCorentin Jabot #include "llvm/Support/Unicode.h"
19c92056d0SCorentin Jabot 
20c92056d0SCorentin Jabot namespace llvm {
21c92056d0SCorentin Jabot namespace sys {
22c92056d0SCorentin Jabot namespace unicode {
23c92056d0SCorentin Jabot 
24c92056d0SCorentin Jabot extern const char *UnicodeNameToCodepointDict;
25c92056d0SCorentin Jabot extern const uint8_t *UnicodeNameToCodepointIndex;
26c92056d0SCorentin Jabot extern const std::size_t UnicodeNameToCodepointIndexSize;
27c92056d0SCorentin Jabot extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28c92056d0SCorentin Jabot 
29c92056d0SCorentin Jabot using BufferType = SmallString<64>;
30c92056d0SCorentin Jabot 
31c92056d0SCorentin Jabot struct Node {
32c92056d0SCorentin Jabot   bool IsRoot = false;
33c92056d0SCorentin Jabot   char32_t Value = 0xFFFFFFFF;
34c92056d0SCorentin Jabot   uint32_t ChildrenOffset = 0;
35c92056d0SCorentin Jabot   bool HasSibling = false;
36c92056d0SCorentin Jabot   uint32_t Size = 0;
37c92056d0SCorentin Jabot   StringRef Name;
38c92056d0SCorentin Jabot   const Node *Parent = nullptr;
39c92056d0SCorentin Jabot 
isValidllvm::sys::unicode::Node40c92056d0SCorentin Jabot   constexpr bool isValid() const {
41c92056d0SCorentin Jabot     return !Name.empty() || Value == 0xFFFFFFFF;
42c92056d0SCorentin Jabot   }
hasChildrenllvm::sys::unicode::Node43c92056d0SCorentin Jabot   constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44c92056d0SCorentin Jabot 
fullNamellvm::sys::unicode::Node45c92056d0SCorentin Jabot   std::string fullName() const {
46c92056d0SCorentin Jabot     std::string S;
47c92056d0SCorentin Jabot     // Reserve enough space for most unicode code points.
48c92056d0SCorentin Jabot     // The chosen value represent the 99th percentile of name size as of
49c92056d0SCorentin Jabot     // Unicode 14.
50c92056d0SCorentin Jabot     S.reserve(46);
51c92056d0SCorentin Jabot     const Node *N = this;
52c92056d0SCorentin Jabot     while (N) {
53c92056d0SCorentin Jabot       std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
54c92056d0SCorentin Jabot       N = N->Parent;
55c92056d0SCorentin Jabot     }
56c92056d0SCorentin Jabot     std::reverse(S.begin(), S.end());
57c92056d0SCorentin Jabot     return S;
58c92056d0SCorentin Jabot   }
59c92056d0SCorentin Jabot };
60c92056d0SCorentin Jabot 
createRoot()61c92056d0SCorentin Jabot static Node createRoot() {
62c92056d0SCorentin Jabot   Node N;
63c92056d0SCorentin Jabot   N.IsRoot = true;
64c92056d0SCorentin Jabot   N.ChildrenOffset = 1;
65c92056d0SCorentin Jabot   N.Size = 1;
66c92056d0SCorentin Jabot   return N;
67c92056d0SCorentin Jabot }
68c92056d0SCorentin Jabot 
readNode(uint32_t Offset,const Node * Parent=nullptr)69c92056d0SCorentin Jabot static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70c92056d0SCorentin Jabot   if (Offset == 0)
71c92056d0SCorentin Jabot     return createRoot();
72c92056d0SCorentin Jabot 
73c92056d0SCorentin Jabot   uint32_t Origin = Offset;
74c92056d0SCorentin Jabot   Node N;
75c92056d0SCorentin Jabot   N.Parent = Parent;
76c92056d0SCorentin Jabot   uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77c92056d0SCorentin Jabot   if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
78c92056d0SCorentin Jabot     return N;
79c92056d0SCorentin Jabot 
80c92056d0SCorentin Jabot   bool LongName = NameInfo & 0x40;
81c92056d0SCorentin Jabot   bool HasValue = NameInfo & 0x80;
82c92056d0SCorentin Jabot   std::size_t Size = NameInfo & ~0xC0;
83c92056d0SCorentin Jabot   if (LongName) {
84c92056d0SCorentin Jabot     uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85c92056d0SCorentin Jabot     NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86c92056d0SCorentin Jabot     N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87c92056d0SCorentin Jabot   } else {
88c92056d0SCorentin Jabot     N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
89c92056d0SCorentin Jabot   }
90c92056d0SCorentin Jabot   if (HasValue) {
91c92056d0SCorentin Jabot     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92c92056d0SCorentin Jabot     uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93c92056d0SCorentin Jabot     uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94c92056d0SCorentin Jabot     N.Value = ((H << 16) | (M << 8) | L) >> 3;
95c92056d0SCorentin Jabot 
96c92056d0SCorentin Jabot     bool HasChildren = L & 0x02;
97c92056d0SCorentin Jabot     N.HasSibling = L & 0x01;
98c92056d0SCorentin Jabot 
99c92056d0SCorentin Jabot     if (HasChildren) {
100c92056d0SCorentin Jabot       N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101c92056d0SCorentin Jabot       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102c92056d0SCorentin Jabot       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103c92056d0SCorentin Jabot     }
104c92056d0SCorentin Jabot   } else {
105c92056d0SCorentin Jabot     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106c92056d0SCorentin Jabot     N.HasSibling = H & 0x80;
107c92056d0SCorentin Jabot     bool HasChildren = H & 0x40;
108c92056d0SCorentin Jabot     H &= ~0xC0;
109c92056d0SCorentin Jabot     if (HasChildren) {
110c92056d0SCorentin Jabot       N.ChildrenOffset = (H << 16);
111c92056d0SCorentin Jabot       N.ChildrenOffset |=
112c92056d0SCorentin Jabot           (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
113c92056d0SCorentin Jabot       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114c92056d0SCorentin Jabot     }
115c92056d0SCorentin Jabot   }
116c92056d0SCorentin Jabot   N.Size = Offset - Origin;
117c92056d0SCorentin Jabot   return N;
118c92056d0SCorentin Jabot }
119c92056d0SCorentin Jabot 
startsWith(StringRef Name,StringRef Needle,bool Strict,std::size_t & Consummed,char & PreviousCharInName,char & PreviousCharInNeedle,bool IsPrefix=false)120c92056d0SCorentin Jabot static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121c92056d0SCorentin Jabot                        std::size_t &Consummed, char &PreviousCharInName,
122c92056d0SCorentin Jabot                        char &PreviousCharInNeedle, bool IsPrefix = false) {
123c92056d0SCorentin Jabot 
124c92056d0SCorentin Jabot   Consummed = 0;
125c92056d0SCorentin Jabot   if (Strict) {
126c92056d0SCorentin Jabot     if (!Name.startswith(Needle))
127c92056d0SCorentin Jabot       return false;
128c92056d0SCorentin Jabot     Consummed = Needle.size();
129c92056d0SCorentin Jabot     return true;
130c92056d0SCorentin Jabot   }
131c92056d0SCorentin Jabot   if (Needle.empty())
132c92056d0SCorentin Jabot     return true;
133c92056d0SCorentin Jabot 
134c92056d0SCorentin Jabot   auto NamePos = Name.begin();
135c92056d0SCorentin Jabot   auto NeedlePos = Needle.begin();
136c92056d0SCorentin Jabot 
137c92056d0SCorentin Jabot   char PreviousCharInNameOrigin = PreviousCharInName;
138c92056d0SCorentin Jabot   char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
139c92056d0SCorentin Jabot 
140c92056d0SCorentin Jabot   auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
141c92056d0SCorentin Jabot                          bool IgnoreEnd = false) {
142c92056d0SCorentin Jabot     while (It != End) {
143c92056d0SCorentin Jabot       const auto Next = std::next(It);
144c92056d0SCorentin Jabot       // Ignore spaces, underscore, medial hyphens
145c92056d0SCorentin Jabot       // https://unicode.org/reports/tr44/#UAX44-LM2.
146c92056d0SCorentin Jabot       bool Ignore =
147c92056d0SCorentin Jabot           *It == ' ' || *It == '_' ||
148c92056d0SCorentin Jabot           (*It == '-' && isAlnum(PreviousChar) &&
149c92056d0SCorentin Jabot            ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
150c92056d0SCorentin Jabot       PreviousChar = *It;
151c92056d0SCorentin Jabot       if (!Ignore)
152c92056d0SCorentin Jabot         break;
153c92056d0SCorentin Jabot       ++It;
154c92056d0SCorentin Jabot     }
155c92056d0SCorentin Jabot     return It;
156c92056d0SCorentin Jabot   };
157c92056d0SCorentin Jabot 
158c92056d0SCorentin Jabot   while (true) {
159c92056d0SCorentin Jabot     NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160c92056d0SCorentin Jabot     NeedlePos =
161c92056d0SCorentin Jabot         IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162c92056d0SCorentin Jabot     if (NeedlePos == Needle.end())
163c92056d0SCorentin Jabot       break;
164c92056d0SCorentin Jabot     if (NamePos == Name.end())
165c92056d0SCorentin Jabot       break;
166c92056d0SCorentin Jabot     if (toUpper(*NeedlePos) != toUpper(*NamePos))
167c92056d0SCorentin Jabot       break;
168c92056d0SCorentin Jabot     NeedlePos++;
169c92056d0SCorentin Jabot     NamePos++;
170c92056d0SCorentin Jabot   }
171c92056d0SCorentin Jabot   Consummed = std::distance(Name.begin(), NamePos);
172c92056d0SCorentin Jabot   if (NeedlePos != Needle.end()) {
173c92056d0SCorentin Jabot     PreviousCharInName = PreviousCharInNameOrigin;
174c92056d0SCorentin Jabot     PreviousCharInNeedle = PreviousCharInNeedleOrigin;
175c92056d0SCorentin Jabot   }
176c92056d0SCorentin Jabot   return NeedlePos == Needle.end();
177c92056d0SCorentin Jabot }
178c92056d0SCorentin Jabot 
179c92056d0SCorentin Jabot static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,char PreviousCharInName,char PreviousCharInNeedle,BufferType & Buffer,const Node * Parent=nullptr)180c92056d0SCorentin Jabot compareNode(uint32_t Offset, StringRef Name, bool Strict,
181c92056d0SCorentin Jabot             char PreviousCharInName, char PreviousCharInNeedle,
182c92056d0SCorentin Jabot             BufferType &Buffer, const Node *Parent = nullptr) {
183c92056d0SCorentin Jabot   Node N = readNode(Offset, Parent);
184c92056d0SCorentin Jabot   std::size_t Consummed = 0;
185c92056d0SCorentin Jabot   bool DoesStartWith =
186c92056d0SCorentin Jabot       N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
187c92056d0SCorentin Jabot                              PreviousCharInName, PreviousCharInNeedle);
188c92056d0SCorentin Jabot   if (!DoesStartWith)
189*f5cd172eSBenjamin Kramer     return std::make_tuple(N, false, 0);
190c92056d0SCorentin Jabot 
191c92056d0SCorentin Jabot   if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
192*f5cd172eSBenjamin Kramer     return std::make_tuple(N, true, N.Value);
193c92056d0SCorentin Jabot 
194c92056d0SCorentin Jabot   if (N.hasChildren()) {
195c92056d0SCorentin Jabot     uint32_t ChildOffset = N.ChildrenOffset;
196c92056d0SCorentin Jabot     for (;;) {
197c92056d0SCorentin Jabot       Node C;
198c92056d0SCorentin Jabot       bool Matches;
199c92056d0SCorentin Jabot       uint32_t Value;
200c92056d0SCorentin Jabot       std::tie(C, Matches, Value) =
201c92056d0SCorentin Jabot           compareNode(ChildOffset, Name.substr(Consummed), Strict,
202c92056d0SCorentin Jabot                       PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
203c92056d0SCorentin Jabot       if (Matches) {
204c92056d0SCorentin Jabot         std::reverse_copy(C.Name.begin(), C.Name.end(),
205c92056d0SCorentin Jabot                           std::back_inserter(Buffer));
206*f5cd172eSBenjamin Kramer         return std::make_tuple(N, true, Value);
207c92056d0SCorentin Jabot       }
208c92056d0SCorentin Jabot       ChildOffset += C.Size;
209c92056d0SCorentin Jabot       if (!C.HasSibling)
210c92056d0SCorentin Jabot         break;
211c92056d0SCorentin Jabot     }
212c92056d0SCorentin Jabot   }
213*f5cd172eSBenjamin Kramer   return std::make_tuple(N, false, 0);
214c92056d0SCorentin Jabot }
215c92056d0SCorentin Jabot 
216c92056d0SCorentin Jabot static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,BufferType & Buffer)217c92056d0SCorentin Jabot compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
218c92056d0SCorentin Jabot   return compareNode(Offset, Name, Strict, 0, 0, Buffer);
219c92056d0SCorentin Jabot }
220c92056d0SCorentin Jabot 
221c92056d0SCorentin Jabot // clang-format off
222c92056d0SCorentin Jabot constexpr const char *const HangulSyllables[][3] = {
223c92056d0SCorentin Jabot     { "G",  "A",   ""   },
224c92056d0SCorentin Jabot     { "GG", "AE",  "G"  },
225c92056d0SCorentin Jabot     { "N",  "YA",  "GG" },
226c92056d0SCorentin Jabot     { "D",  "YAE", "GS" },
227c92056d0SCorentin Jabot     { "DD", "EO",  "N", },
228c92056d0SCorentin Jabot     { "R",  "E",   "NJ" },
229c92056d0SCorentin Jabot     { "M",  "YEO", "NH" },
230c92056d0SCorentin Jabot     { "B",  "YE",  "D"  },
231c92056d0SCorentin Jabot     { "BB", "O",   "L"  },
232c92056d0SCorentin Jabot     { "S",  "WA",  "LG" },
233c92056d0SCorentin Jabot     { "SS", "WAE", "LM" },
234c92056d0SCorentin Jabot     { "",   "OE",  "LB" },
235c92056d0SCorentin Jabot     { "J",  "YO",  "LS" },
236c92056d0SCorentin Jabot     { "JJ", "U",   "LT" },
237c92056d0SCorentin Jabot     { "C",  "WEO", "LP" },
238c92056d0SCorentin Jabot     { "K",  "WE",  "LH" },
239c92056d0SCorentin Jabot     { "T",  "WI",  "M"  },
240c92056d0SCorentin Jabot     { "P",  "YU",  "B"  },
241c92056d0SCorentin Jabot     { "H",  "EU",  "BS" },
242c92056d0SCorentin Jabot     { 0,    "YI",  "S"  },
243c92056d0SCorentin Jabot     { 0,    "I",   "SS" },
244c92056d0SCorentin Jabot     { 0,    0,     "NG" },
245c92056d0SCorentin Jabot     { 0,    0,     "J"  },
246c92056d0SCorentin Jabot     { 0,    0,     "C"  },
247c92056d0SCorentin Jabot     { 0,    0,     "K"  },
248c92056d0SCorentin Jabot     { 0,    0,     "T"  },
249c92056d0SCorentin Jabot     { 0,    0,     "P"  },
250c92056d0SCorentin Jabot     { 0,    0,     "H"  }
251c92056d0SCorentin Jabot     };
252c92056d0SCorentin Jabot // clang-format on
253c92056d0SCorentin Jabot 
254c92056d0SCorentin Jabot // Unicode 14.0
255c92056d0SCorentin Jabot // 3.12 Conjoining Jamo Behavior Common constants
256c92056d0SCorentin Jabot constexpr const char32_t SBase = 0xAC00;
257c92056d0SCorentin Jabot constexpr const uint32_t LCount = 19;
258c92056d0SCorentin Jabot constexpr const uint32_t VCount = 21;
259c92056d0SCorentin Jabot constexpr const uint32_t TCount = 28;
260c92056d0SCorentin Jabot 
findSyllable(StringRef Name,bool Strict,char & PreviousInName,int & Pos,int Column)261c92056d0SCorentin Jabot static std::size_t findSyllable(StringRef Name, bool Strict,
262c92056d0SCorentin Jabot                                 char &PreviousInName, int &Pos, int Column) {
263c92056d0SCorentin Jabot   assert(Column == 0 || Column == 1 || Column == 2);
264c92056d0SCorentin Jabot   static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
265c92056d0SCorentin Jabot   char NeedleStart = 0;
266c92056d0SCorentin Jabot   int Len = -1;
267c92056d0SCorentin Jabot   int Prev = PreviousInName;
268c92056d0SCorentin Jabot   for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
269c92056d0SCorentin Jabot     StringRef Syllable(HangulSyllables[I][Column]);
270c92056d0SCorentin Jabot     if (int(Syllable.size()) <= Len)
271c92056d0SCorentin Jabot       continue;
272c92056d0SCorentin Jabot     std::size_t Consummed = 0;
273c92056d0SCorentin Jabot     char PreviousInNameCopy = PreviousInName;
274c92056d0SCorentin Jabot     bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
275c92056d0SCorentin Jabot                                     PreviousInNameCopy, NeedleStart);
276c92056d0SCorentin Jabot     if (!DoesStartWith)
277c92056d0SCorentin Jabot       continue;
278c92056d0SCorentin Jabot     Len = Consummed;
279c92056d0SCorentin Jabot     Pos = I;
280c92056d0SCorentin Jabot     Prev = PreviousInNameCopy;
281c92056d0SCorentin Jabot   }
282c92056d0SCorentin Jabot   if (Len == -1)
283c92056d0SCorentin Jabot     return 0;
284c92056d0SCorentin Jabot   PreviousInName = Prev;
285c92056d0SCorentin Jabot   return size_t(Len);
286c92056d0SCorentin Jabot }
287c92056d0SCorentin Jabot 
288c92056d0SCorentin Jabot static llvm::Optional<char32_t>
nameToHangulCodePoint(StringRef Name,bool Strict,BufferType & Buffer)289c92056d0SCorentin Jabot nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
290c92056d0SCorentin Jabot   Buffer.clear();
291c92056d0SCorentin Jabot   // Hangul Syllable Decomposition
292c92056d0SCorentin Jabot   std::size_t Consummed = 0;
293c92056d0SCorentin Jabot   char NameStart = 0, NeedleStart = 0;
294c92056d0SCorentin Jabot   bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
295c92056d0SCorentin Jabot                                   NameStart, NeedleStart);
296c92056d0SCorentin Jabot   if (!DoesStartWith)
297c92056d0SCorentin Jabot     return None;
298c92056d0SCorentin Jabot   Name = Name.substr(Consummed);
299c92056d0SCorentin Jabot   int L = -1, V = -1, T = -1;
300c92056d0SCorentin Jabot   Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
301c92056d0SCorentin Jabot   Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
302c92056d0SCorentin Jabot   Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
303c92056d0SCorentin Jabot   if (L != -1 && V != -1 && T != -1 && Name.empty()) {
304c92056d0SCorentin Jabot     if (!Strict) {
305c92056d0SCorentin Jabot       Buffer.append("HANGUL SYLLABLE ");
306c92056d0SCorentin Jabot       if (L != -1)
307c92056d0SCorentin Jabot         Buffer.append(HangulSyllables[L][0]);
308c92056d0SCorentin Jabot       if (V != -1)
309c92056d0SCorentin Jabot         Buffer.append(HangulSyllables[V][1]);
310c92056d0SCorentin Jabot       if (T != -1)
311c92056d0SCorentin Jabot         Buffer.append(HangulSyllables[T][2]);
312c92056d0SCorentin Jabot     }
313c92056d0SCorentin Jabot     return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
314c92056d0SCorentin Jabot            std::uint32_t(T);
315c92056d0SCorentin Jabot   }
316c92056d0SCorentin Jabot   // Otherwise, it's an illegal syllable name.
317c92056d0SCorentin Jabot   return None;
318c92056d0SCorentin Jabot }
319c92056d0SCorentin Jabot 
320c92056d0SCorentin Jabot struct GeneratedNamesData {
321c92056d0SCorentin Jabot   StringRef Prefix;
322c92056d0SCorentin Jabot   uint32_t Start;
323c92056d0SCorentin Jabot   uint32_t End;
324c92056d0SCorentin Jabot };
325c92056d0SCorentin Jabot 
326c92056d0SCorentin Jabot // Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings
327c92056d0SCorentin Jabot // This needs to be kept in sync with
328c92056d0SCorentin Jabot // llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
329c92056d0SCorentin Jabot static const GeneratedNamesData GeneratedNamesDataTable[] = {
330c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
331c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC},
332c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD},
333c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734},
334c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
335c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
336c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
337c92056d0SCorentin Jabot     {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
338c92056d0SCorentin Jabot     {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
339c92056d0SCorentin Jabot     {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
340c92056d0SCorentin Jabot     {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
341c92056d0SCorentin Jabot     {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
342c92056d0SCorentin Jabot     {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
343c92056d0SCorentin Jabot     {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
344c92056d0SCorentin Jabot     {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
345c92056d0SCorentin Jabot };
346c92056d0SCorentin Jabot 
347c92056d0SCorentin Jabot static llvm::Optional<char32_t>
nameToGeneratedCodePoint(StringRef Name,bool Strict,BufferType & Buffer)348c92056d0SCorentin Jabot nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
349c92056d0SCorentin Jabot   for (auto &&Item : GeneratedNamesDataTable) {
350c92056d0SCorentin Jabot     Buffer.clear();
351c92056d0SCorentin Jabot     std::size_t Consummed = 0;
352c92056d0SCorentin Jabot     char NameStart = 0, NeedleStart = 0;
353c92056d0SCorentin Jabot     bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
354c92056d0SCorentin Jabot                                     NameStart, NeedleStart, /*isPrefix*/ true);
355c92056d0SCorentin Jabot     if (!DoesStartWith)
356c92056d0SCorentin Jabot       continue;
357c92056d0SCorentin Jabot     auto Number = Name.substr(Consummed);
358c92056d0SCorentin Jabot     unsigned long long V = 0;
359c92056d0SCorentin Jabot     // Be consistent about mandating upper casing.
360c92056d0SCorentin Jabot     if (Strict &&
361c92056d0SCorentin Jabot         llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
362c92056d0SCorentin Jabot       return {};
363c92056d0SCorentin Jabot     if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
364c92056d0SCorentin Jabot       continue;
365c92056d0SCorentin Jabot     if (!Strict) {
366c92056d0SCorentin Jabot       Buffer.append(Item.Prefix);
367c92056d0SCorentin Jabot       Buffer.append(utohexstr(V, true));
368c92056d0SCorentin Jabot     }
369c92056d0SCorentin Jabot     return V;
370c92056d0SCorentin Jabot   }
371c92056d0SCorentin Jabot   return None;
372c92056d0SCorentin Jabot }
373c92056d0SCorentin Jabot 
nameToCodepoint(StringRef Name,bool Strict,BufferType & Buffer)374c92056d0SCorentin Jabot static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
375c92056d0SCorentin Jabot                                                 BufferType &Buffer) {
376c92056d0SCorentin Jabot   if (Name.empty())
377c92056d0SCorentin Jabot     return None;
378c92056d0SCorentin Jabot 
379c92056d0SCorentin Jabot   llvm::Optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
380c92056d0SCorentin Jabot   if (!Res)
381c92056d0SCorentin Jabot     Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
382c92056d0SCorentin Jabot   if (Res)
383c92056d0SCorentin Jabot     return *Res;
384c92056d0SCorentin Jabot 
385c92056d0SCorentin Jabot   Buffer.clear();
386c92056d0SCorentin Jabot   Node Node;
387c92056d0SCorentin Jabot   bool Matches;
388c92056d0SCorentin Jabot   uint32_t Value;
389c92056d0SCorentin Jabot   std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
390c92056d0SCorentin Jabot   if (Matches) {
391c92056d0SCorentin Jabot     std::reverse(Buffer.begin(), Buffer.end());
392c92056d0SCorentin Jabot     // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
393c92056d0SCorentin Jabot     // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
394c92056d0SCorentin Jabot     if (!Strict && Value == 0x116c &&
395c92056d0SCorentin Jabot         Name.find_insensitive("O-E") != StringRef::npos) {
396c92056d0SCorentin Jabot       Buffer = "HANGUL JUNGSEONG O-E";
397c92056d0SCorentin Jabot       Value = 0x1180;
398c92056d0SCorentin Jabot     }
399c92056d0SCorentin Jabot     return Value;
400c92056d0SCorentin Jabot   }
401c92056d0SCorentin Jabot   return None;
402c92056d0SCorentin Jabot }
403c92056d0SCorentin Jabot 
nameToCodepointStrict(StringRef Name)404c92056d0SCorentin Jabot llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) {
405c92056d0SCorentin Jabot 
406c92056d0SCorentin Jabot   BufferType Buffer;
407c92056d0SCorentin Jabot   auto Opt = nameToCodepoint(Name, true, Buffer);
408c92056d0SCorentin Jabot   return Opt;
409c92056d0SCorentin Jabot }
410c92056d0SCorentin Jabot 
411c92056d0SCorentin Jabot llvm::Optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name)412c92056d0SCorentin Jabot nameToCodepointLooseMatching(StringRef Name) {
413c92056d0SCorentin Jabot   BufferType Buffer;
414c92056d0SCorentin Jabot   auto Opt = nameToCodepoint(Name, false, Buffer);
415c92056d0SCorentin Jabot   if (!Opt)
416c92056d0SCorentin Jabot     return None;
417c92056d0SCorentin Jabot   return LooseMatchingResult{*Opt, Buffer};
418c92056d0SCorentin Jabot }
419c92056d0SCorentin Jabot 
420c92056d0SCorentin Jabot // Find the unicode character whose editing distance to Pattern
421c92056d0SCorentin Jabot // is shortest, using the Wagner–Fischer algorithm.
422c92056d0SCorentin Jabot llvm::SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern,std::size_t MaxMatchesCount)423c92056d0SCorentin Jabot nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
424c92056d0SCorentin Jabot   // We maintain a fixed size vector of matches,
425c92056d0SCorentin Jabot   // sorted by distance
426c92056d0SCorentin Jabot   // The worst match (with the biggest distance) are discarded when new elements
427c92056d0SCorentin Jabot   // are added.
428c92056d0SCorentin Jabot   std::size_t LargestEditDistance = 0;
429c92056d0SCorentin Jabot   llvm::SmallVector<MatchForCodepointName> Matches;
430c92056d0SCorentin Jabot   Matches.reserve(MaxMatchesCount + 1);
431c92056d0SCorentin Jabot 
432c92056d0SCorentin Jabot   auto Insert = [&](const Node &Node, uint32_t Distance,
433c92056d0SCorentin Jabot                     char32_t Value) -> bool {
434c92056d0SCorentin Jabot     if (Distance > LargestEditDistance) {
435c92056d0SCorentin Jabot       if (Matches.size() == MaxMatchesCount)
436c92056d0SCorentin Jabot         return false;
437c92056d0SCorentin Jabot       LargestEditDistance = Distance;
438c92056d0SCorentin Jabot     }
439c92056d0SCorentin Jabot     // To avoid allocations, the creation of the name is delayed
440c92056d0SCorentin Jabot     // as much as possible.
441c92056d0SCorentin Jabot     std::string Name;
442c92056d0SCorentin Jabot     auto GetName = [&] {
443c92056d0SCorentin Jabot       if (Name.empty())
444c92056d0SCorentin Jabot         Name = Node.fullName();
445c92056d0SCorentin Jabot       return Name;
446c92056d0SCorentin Jabot     };
447c92056d0SCorentin Jabot 
448c92056d0SCorentin Jabot     auto It = std::lower_bound(
449c92056d0SCorentin Jabot         Matches.begin(), Matches.end(), Distance,
450c92056d0SCorentin Jabot         [&](const MatchForCodepointName &a, std::size_t Distance) {
451c92056d0SCorentin Jabot           if (Distance == a.Distance)
452c92056d0SCorentin Jabot             return a.Name < GetName();
453c92056d0SCorentin Jabot           return a.Distance < Distance;
454c92056d0SCorentin Jabot         });
455c92056d0SCorentin Jabot     if (It == Matches.end() && Matches.size() == MaxMatchesCount)
456c92056d0SCorentin Jabot       return false;
457c92056d0SCorentin Jabot 
458c92056d0SCorentin Jabot     MatchForCodepointName M{GetName(), Distance, Value};
459c92056d0SCorentin Jabot     Matches.insert(It, std::move(M));
460c92056d0SCorentin Jabot     if (Matches.size() > MaxMatchesCount)
461c92056d0SCorentin Jabot       Matches.pop_back();
462c92056d0SCorentin Jabot     return true;
463c92056d0SCorentin Jabot   };
464c92056d0SCorentin Jabot 
465c92056d0SCorentin Jabot   // We ignore case, space, hyphens, etc,
466c92056d0SCorentin Jabot   // in both the search pattern and the prospective names.
467c92056d0SCorentin Jabot   auto Normalize = [](StringRef Name) {
468c92056d0SCorentin Jabot     std::string Out;
469c92056d0SCorentin Jabot     Out.reserve(Name.size());
470c92056d0SCorentin Jabot     for (char C : Name) {
471c92056d0SCorentin Jabot       if (isAlnum(C))
472c92056d0SCorentin Jabot         Out.push_back(toUpper(C));
473c92056d0SCorentin Jabot     }
474c92056d0SCorentin Jabot     return Out;
475c92056d0SCorentin Jabot   };
476c92056d0SCorentin Jabot   std::string NormalizedName = Normalize(Pattern);
477c92056d0SCorentin Jabot 
478c92056d0SCorentin Jabot   // Allocate a matrix big enough for longest names.
479c92056d0SCorentin Jabot   const std::size_t Columns =
480c92056d0SCorentin Jabot       std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
481c92056d0SCorentin Jabot       1;
482c92056d0SCorentin Jabot 
483c92056d0SCorentin Jabot   LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
484c92056d0SCorentin Jabot       UnicodeNameToCodepointLargestNameSize + 1;
485c92056d0SCorentin Jabot 
486c92056d0SCorentin Jabot   std::vector<char> Distances(
487c92056d0SCorentin Jabot       Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
488c92056d0SCorentin Jabot 
489c92056d0SCorentin Jabot   auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
490c92056d0SCorentin Jabot     assert(Column < Columns);
491c92056d0SCorentin Jabot     assert(Row < Rows);
492c92056d0SCorentin Jabot     return Distances[Row * Columns + Column];
493c92056d0SCorentin Jabot   };
494c92056d0SCorentin Jabot 
495c92056d0SCorentin Jabot   for (std::size_t I = 0; I < Columns; I++)
496c92056d0SCorentin Jabot     Get(I, 0) = I;
497c92056d0SCorentin Jabot 
498c92056d0SCorentin Jabot   // Visit the childrens,
499c92056d0SCorentin Jabot   // Filling (and overriding) the matrix for the name fragment of each node
500c92056d0SCorentin Jabot   // iteratively. CompleteName is used to collect the actual name of potential
501c92056d0SCorentin Jabot   // match, respecting case and spacing.
502c92056d0SCorentin Jabot   auto VisitNode = [&](const Node &N, std::size_t Row,
503c92056d0SCorentin Jabot                        auto &VisitNode) -> void {
504c92056d0SCorentin Jabot     std::size_t J = 0;
505c92056d0SCorentin Jabot     for (; J < N.Name.size(); J++) {
506c92056d0SCorentin Jabot       if (!isAlnum(N.Name[J]))
507c92056d0SCorentin Jabot         continue;
508c92056d0SCorentin Jabot 
509c92056d0SCorentin Jabot       Get(0, Row) = Row;
510c92056d0SCorentin Jabot 
511c92056d0SCorentin Jabot       for (std::size_t I = 1; I < Columns; I++) {
512c92056d0SCorentin Jabot         const int Delete = Get(I - 1, Row) + 1;
513c92056d0SCorentin Jabot         const int Insert = Get(I, Row - 1) + 1;
514c92056d0SCorentin Jabot 
515c92056d0SCorentin Jabot         const int Replace =
516c92056d0SCorentin Jabot             Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
517c92056d0SCorentin Jabot 
518c92056d0SCorentin Jabot         Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
519c92056d0SCorentin Jabot       }
520c92056d0SCorentin Jabot 
521c92056d0SCorentin Jabot       Row++;
522c92056d0SCorentin Jabot     }
523c92056d0SCorentin Jabot 
524c92056d0SCorentin Jabot     unsigned Cost = Get(Columns - 1, Row - 1);
525c92056d0SCorentin Jabot     if (N.Value != 0xFFFFFFFF) {
526c92056d0SCorentin Jabot       Insert(N, Cost, N.Value);
527c92056d0SCorentin Jabot     }
528c92056d0SCorentin Jabot 
529c92056d0SCorentin Jabot     if (N.hasChildren()) {
530c92056d0SCorentin Jabot       auto ChildOffset = N.ChildrenOffset;
531c92056d0SCorentin Jabot       for (;;) {
532c92056d0SCorentin Jabot         Node C = readNode(ChildOffset, &N);
533c92056d0SCorentin Jabot         ChildOffset += C.Size;
534c92056d0SCorentin Jabot         if (!C.isValid())
535c92056d0SCorentin Jabot           break;
536c92056d0SCorentin Jabot         VisitNode(C, Row, VisitNode);
537c92056d0SCorentin Jabot         if (!C.HasSibling)
538c92056d0SCorentin Jabot           break;
539c92056d0SCorentin Jabot       }
540c92056d0SCorentin Jabot     }
541c92056d0SCorentin Jabot   };
542c92056d0SCorentin Jabot 
543c92056d0SCorentin Jabot   Node Root = createRoot();
544c92056d0SCorentin Jabot   VisitNode(Root, 1, VisitNode);
545c92056d0SCorentin Jabot   return Matches;
546c92056d0SCorentin Jabot }
547c92056d0SCorentin Jabot 
548c92056d0SCorentin Jabot } // namespace unicode
549c92056d0SCorentin Jabot 
550c92056d0SCorentin Jabot } // namespace sys
551c92056d0SCorentin Jabot } // namespace llvm
552