xref: /llvm-project-15.0.7/flang/runtime/utf.cpp (revision bafbae23)
1 //===-- runtime/utf.cpp ---------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "utf.h"
10 
11 namespace Fortran::runtime {
12 
13 // clang-format off
// Indexed by the first byte of an encoded character; each entry is the total
// number of bytes in the sequence that such a byte introduces.  An entry of 0
// marks a byte that cannot begin a sequence.  Each row below covers sixteen
// consecutive byte values; the label gives the high hexadecimal digit.
const std::uint8_t UTF8FirstByteTable[256]{
  // 0x00-0x7F: one byte on its own, 7 payload bits
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x80-0xBF: continuation bytes; never valid in first position
  /* 8x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 9x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* Ax */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* Bx */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xC0-0xDF: two bytes, 11 payload bits
  /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xE0-0xEF: three bytes, 16 payload bits
  /* Ex */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  // 0xF0-0xF7: four bytes (21 bits); 0xF8-0xFB: five bytes (26 bits);
  // 0xFC-0xFD: six bytes (31 bits); 0xFE: seven bytes (32 bits);
  // 0xFF: invalid
  /* Fx */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 0
};
40 // clang-format on
41 
42 // Non-minimal encodings are accepted.
DecodeUTF8(const char * p0)43 std::optional<char32_t> DecodeUTF8(const char *p0) {
44   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
45   std::size_t bytes{MeasureUTF8Bytes(*p0)};
46   if (bytes == 1) {
47     return char32_t{*p};
48   } else if (bytes > 1) {
49     std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
50     for (std::size_t j{1}; j < bytes; ++j) {
51       std::uint8_t next{p[j]};
52       if (next < 0x80 || next > 0xbf) {
53         return std::nullopt;
54       }
55       result = (result << 6) | (next & 0x3f);
56     }
57     if (result <= 0xffffffff) {
58       return static_cast<char32_t>(result);
59     }
60   }
61   return std::nullopt;
62 }
63 
// Encodes the value ucs at p0 using the shortest form of the extended UTF-8
// scheme (1 to 7 bytes) and returns the number of bytes written.
//
//   p0:  destination buffer; up to 7 bytes are stored, so the caller must
//        provide at least that much room for arbitrary input.
//   ucs: the 32-bit value to encode; every value is encodable.
//
// Lead-byte markers and payload capacities:
//   1 byte   0xxxxxxx   7 bits      5 bytes  111110xx  26 bits
//   2 bytes  110xxxxx  11 bits      6 bytes  1111110x  31 bits
//   3 bytes  1110xxxx  16 bits      7 bytes  11111110  32 bits
//   4 bytes  11110xxx  21 bits
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = static_cast<std::uint8_t>(ucs);
    return 1;
  }
  // Choose the shortest multi-byte form able to hold the value.  The
  // defaults describe the 7-byte form, which is the only one left once
  // every shorter form has been ruled out.
  std::size_t bytes{7};
  std::uint8_t lead{0xfe};
  if (ucs <= 0x7ff) {
    bytes = 2;
    lead = 0xc0;
  } else if (ucs <= 0xffff) {
    bytes = 3;
    lead = 0xe0;
  } else if (ucs <= 0x1fffff) {
    bytes = 4;
    lead = 0xf0;
  } else if (ucs <= 0x3ffffff) {
    bytes = 5;
    lead = 0xf8;
  } else if (ucs <= 0x7fffffff) {
    // 31-bit payload; the marker must be 0xfc so that it is distinguishable
    // from the 5-byte marker 0xf8.
    bytes = 6;
    lead = 0xfc;
  }
  // Fill continuation bytes from last to first, six payload bits apiece.
  for (std::size_t j{bytes - 1}; j > 0; --j) {
    p[j] = static_cast<std::uint8_t>(0x80 | (ucs & 0x3f));
    ucs >>= 6;
  }
  // What remains of ucs is exactly the payload that belongs in the lead byte
  // (nothing at all in the 7-byte case).
  p[0] = static_cast<std::uint8_t>(lead | ucs);
  return bytes;
}
110 
111 } // namespace Fortran::runtime
112