xref: /llvm-project-15.0.7/flang/runtime/utf.cpp (revision bafbae23)
//===-- runtime/utf.cpp ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8*bafbae23SPeter Klausler 
9*bafbae23SPeter Klausler #include "utf.h"
10*bafbae23SPeter Klausler 
11*bafbae23SPeter Klausler namespace Fortran::runtime {
12*bafbae23SPeter Klausler 
// Table indexed by the first byte of a UTF-8 sequence.  Each entry is the
// total number of bytes in a sequence introduced by that byte; the nonzero
// values line up with the lengths that EncodeUTF8() below produces for the
// corresponding lead bytes.  An entry of 0 marks a byte that cannot begin a
// sequence: the continuation bytes 0x80-0xBF, and 0xFF.
// The 5-, 6- and 7-byte rows go beyond standard UTF-8 so that every 32-bit
// value has an encoding.
// clang-format off
const std::uint8_t UTF8FirstByteTable[256]{
  /* 00 - 7F:  7 bit payload in single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 80 - BF: invalid first byte, valid later byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* C0 - DF: 11 bit payload */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* E0 - EF: 16 bit payload */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
  /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
  /* FC - FD: 31 bit payload */ 6, 6,
  /* FE:      32 bit payload */ 7,
  /* FF:      invalid */ 0
};
// clang-format on
41*bafbae23SPeter Klausler 
42*bafbae23SPeter Klausler // Non-minimal encodings are accepted.
DecodeUTF8(const char * p0)43*bafbae23SPeter Klausler std::optional<char32_t> DecodeUTF8(const char *p0) {
44*bafbae23SPeter Klausler   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
45*bafbae23SPeter Klausler   std::size_t bytes{MeasureUTF8Bytes(*p0)};
46*bafbae23SPeter Klausler   if (bytes == 1) {
47*bafbae23SPeter Klausler     return char32_t{*p};
48*bafbae23SPeter Klausler   } else if (bytes > 1) {
49*bafbae23SPeter Klausler     std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
50*bafbae23SPeter Klausler     for (std::size_t j{1}; j < bytes; ++j) {
51*bafbae23SPeter Klausler       std::uint8_t next{p[j]};
52*bafbae23SPeter Klausler       if (next < 0x80 || next > 0xbf) {
53*bafbae23SPeter Klausler         return std::nullopt;
54*bafbae23SPeter Klausler       }
55*bafbae23SPeter Klausler       result = (result << 6) | (next & 0x3f);
56*bafbae23SPeter Klausler     }
57*bafbae23SPeter Klausler     if (result <= 0xffffffff) {
58*bafbae23SPeter Klausler       return static_cast<char32_t>(result);
59*bafbae23SPeter Klausler     }
60*bafbae23SPeter Klausler   }
61*bafbae23SPeter Klausler   return std::nullopt;
62*bafbae23SPeter Klausler }
63*bafbae23SPeter Klausler 
// Encodes the value ucs as (extended) UTF-8 starting at p0 and returns the
// number of bytes stored, from 1 to 7.  The shortest form that can hold the
// value is always chosen.  Nothing is written beyond the returned count (in
// particular, no terminator), and because no buffer size is passed in, the
// caller must provide room for the form selected (seven bytes covers every
// input).
//
// Values up to 0x1fffff use the familiar one- to four-byte forms; larger
// values use five-, six- and seven-byte forms with lead bytes 0xf8-0xfb,
// 0xfc-0xfd and 0xfe, matching the lengths in UTF8FirstByteTable.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) { // 7 bits
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) { // 11 bits
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) { // 16 bits
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) { // 21 bits
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) { // 26 bits
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) { // 31 bits
    // The lead byte has to be 0xfc or 0xfd: a lead byte in 0xf8-0xfb would
    // announce a five-byte sequence and the six bytes written here would not
    // decode back to ucs.  The bound is the full 31-bit payload of this form,
    // so (ucs >> 30) supplies the single payload bit of the lead byte.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else { // all 32 bits
    // The 0xfe lead byte carries no payload; the top two bits of ucs land in
    // the first continuation byte.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
110*bafbae23SPeter Klausler 
111*bafbae23SPeter Klausler } // namespace Fortran::runtime
112