//===-- runtime/utf.cpp ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "utf.h"

namespace Fortran::runtime {

// Lookup table indexed by the value of the first byte of a UTF-8 sequence.
// Each entry is the total number of bytes in the sequence that the byte
// begins (1 through 7), or 0 when the byte cannot begin a sequence (a
// continuation byte 0x80 - 0xBF, or 0xFF).
// The "payload" widths in the row comments are the number of code point bits
// that a sequence of that length carries.  Lengths of 5, 6, and 7 bytes cover
// the full 32-bit range of values that EncodeUTF8() below can emit.
// NOTE(review): presumably this is the table consulted by MeasureUTF8Bytes()
// in utf.h -- confirm against the header.
// clang-format off
const std::uint8_t UTF8FirstByteTable[256]{
  /* 00 - 7F:  7 bit payload in single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 80 - BF: invalid first byte, valid later byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* C0 - DF: 11 bit payload */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* E0 - EF: 16 bit payload */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
  /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
  /* FC - FD: 31 bit payload */ 6, 6,
  /* FE:      32 bit payload */ 7,
  /* FF:      invalid */ 0
};
// clang-format on

// Non-minimal encodings are accepted.
43 std::optional<char32_t> DecodeUTF8(const char *p0) { 44 const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)}; 45 std::size_t bytes{MeasureUTF8Bytes(*p0)}; 46 if (bytes == 1) { 47 return char32_t{*p}; 48 } else if (bytes > 1) { 49 std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; 50 for (std::size_t j{1}; j < bytes; ++j) { 51 std::uint8_t next{p[j]}; 52 if (next < 0x80 || next > 0xbf) { 53 return std::nullopt; 54 } 55 result = (result << 6) | (next & 0x3f); 56 } 57 if (result <= 0xffffffff) { 58 return static_cast<char32_t>(result); 59 } 60 } 61 return std::nullopt; 62 } 63 64 std::size_t EncodeUTF8(char *p0, char32_t ucs) { 65 std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)}; 66 if (ucs <= 0x7f) { 67 p[0] = ucs; 68 return 1; 69 } else if (ucs <= 0x7ff) { 70 p[0] = 0xc0 | (ucs >> 6); 71 p[1] = 0x80 | (ucs & 0x3f); 72 return 2; 73 } else if (ucs <= 0xffff) { 74 p[0] = 0xe0 | (ucs >> 12); 75 p[1] = 0x80 | ((ucs >> 6) & 0x3f); 76 p[2] = 0x80 | (ucs & 0x3f); 77 return 3; 78 } else if (ucs <= 0x1fffff) { 79 p[0] = 0xf0 | (ucs >> 18); 80 p[1] = 0x80 | ((ucs >> 12) & 0x3f); 81 p[2] = 0x80 | ((ucs >> 6) & 0x3f); 82 p[3] = 0x80 | (ucs & 0x3f); 83 return 4; 84 } else if (ucs <= 0x3ffffff) { 85 p[0] = 0xf8 | (ucs >> 24); 86 p[1] = 0x80 | ((ucs >> 18) & 0x3f); 87 p[2] = 0x80 | ((ucs >> 12) & 0x3f); 88 p[3] = 0x80 | ((ucs >> 6) & 0x3f); 89 p[4] = 0x80 | (ucs & 0x3f); 90 return 5; 91 } else if (ucs <= 0x7ffffff) { 92 p[0] = 0xf8 | (ucs >> 30); 93 p[1] = 0x80 | ((ucs >> 24) & 0x3f); 94 p[2] = 0x80 | ((ucs >> 18) & 0x3f); 95 p[3] = 0x80 | ((ucs >> 12) & 0x3f); 96 p[4] = 0x80 | ((ucs >> 6) & 0x3f); 97 p[5] = 0x80 | (ucs & 0x3f); 98 return 6; 99 } else { 100 p[0] = 0xfe; 101 p[1] = 0x80 | ((ucs >> 30) & 0x3f); 102 p[2] = 0x80 | ((ucs >> 24) & 0x3f); 103 p[3] = 0x80 | ((ucs >> 18) & 0x3f); 104 p[4] = 0x80 | ((ucs >> 12) & 0x3f); 105 p[5] = 0x80 | ((ucs >> 6) & 0x3f); 106 p[6] = 0x80 | (ucs & 0x3f); 107 return 7; 108 } 109 } 110 111 } // namespace 
Fortran::runtime 112