1 //===-- lib/Parser/characters.cpp -----------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "flang/Parser/characters.h" 10 #include "flang/Common/idioms.h" 11 #include <algorithm> 12 #include <cstddef> 13 #include <optional> 14 #include <type_traits> 15 16 namespace Fortran::parser { 17 18 bool useHexadecimalEscapeSequences{false}; 19 20 int UTF_8CharacterBytes(const char *p) { 21 if ((*p & 0x80) == 0) { 22 return 1; 23 } else if ((*p & 0xe0) == 0xc0) { 24 return 2; 25 } else if ((*p & 0xf0) == 0xe0) { 26 return 3; 27 } else if ((*p & 0xf8) == 0xf0) { 28 return 4; 29 } else if ((*p & 0xfc) == 0xf8) { 30 return 5; 31 } else { 32 return 6; 33 } 34 } 35 36 template <typename STRING> 37 std::string QuoteCharacterLiteralHelper( 38 const STRING &str, bool backslashEscapes, Encoding encoding) { 39 std::string result{'"'}; 40 const auto emit{[&](char ch) { result += ch; }}; 41 for (auto ch : str) { 42 using CharT = std::decay_t<decltype(ch)>; 43 char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)}; 44 if (ch32 == static_cast<unsigned char>('"')) { 45 emit('"'); // double the " when it appears in the text 46 } 47 EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding); 48 } 49 result += '"'; 50 return result; 51 } 52 53 std::string QuoteCharacterLiteral( 54 const std::string &str, bool backslashEscapes, Encoding encoding) { 55 return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); 56 } 57 58 std::string QuoteCharacterLiteral( 59 const std::u16string &str, bool backslashEscapes, Encoding encoding) { 60 return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); 61 } 62 63 std::string QuoteCharacterLiteral( 64 const std::u32string &str, bool backslashEscapes, Encoding encoding) { 65 return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); 66 } 67 68 template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) { 69 CHECK(ucs <= 0xff); 70 EncodedCharacter result; 71 result.buffer[0] = ucs; 72 result.bytes = 1; 73 return result; 74 } 75 76 template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) { 77 // N.B. char32_t is unsigned 78 EncodedCharacter result; 79 if (ucs <= 0x7f) { 80 result.buffer[0] = ucs; 81 result.bytes = 1; 82 } else if (ucs <= 0x7ff) { 83 result.buffer[0] = 0xc0 | (ucs >> 6); 84 result.buffer[1] = 0x80 | (ucs & 0x3f); 85 result.bytes = 2; 86 } else if (ucs <= 0xffff) { 87 result.buffer[0] = 0xe0 | (ucs >> 12); 88 result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f); 89 result.buffer[2] = 0x80 | (ucs & 0x3f); 90 result.bytes = 3; 91 } else if (ucs <= 0x1fffff) { 92 // UCS actually only goes up to 0x10ffff, but the 93 // UTF-8 encoding can handle 32 bits. 94 result.buffer[0] = 0xf0 | (ucs >> 18); 95 result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f); 96 result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f); 97 result.buffer[3] = 0x80 | (ucs & 0x3f); 98 result.bytes = 4; 99 } else if (ucs <= 0x3ffffff) { 100 result.buffer[0] = 0xf8 | (ucs >> 24); 101 result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f); 102 result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f); 103 result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f); 104 result.buffer[4] = 0x80 | (ucs & 0x3f); 105 result.bytes = 5; 106 } else { 107 result.buffer[0] = 0xfc | (ucs >> 30); 108 result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f); 109 result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f); 110 result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f); 111 result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f); 112 result.buffer[5] = 0x80 | (ucs & 0x3f); 113 result.bytes = 6; 114 } 115 return result; 116 } 117 118 EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) { 119 switch (encoding) { 120 SWITCH_COVERS_ALL_CASES 121 case Encoding::LATIN_1: 122 return EncodeCharacter<Encoding::LATIN_1>(ucs); 123 case Encoding::UTF_8: 124 return EncodeCharacter<Encoding::UTF_8>(ucs); 125 } 126 } 127 128 template <Encoding ENCODING, typename STRING> 129 std::string EncodeString(const STRING &str) { 130 std::string result; 131 for (auto ch : str) { 132 char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)}; 133 EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)}; 134 result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes)); 135 } 136 return result; 137 } 138 139 template std::string EncodeString<Encoding::LATIN_1, std::string>( 140 const std::string &); 141 template std::string EncodeString<Encoding::UTF_8, std::u16string>( 142 const std::u16string &); 143 template std::string EncodeString<Encoding::UTF_8, std::u32string>( 144 const std::u32string &); 145 146 template <> 147 DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>( 148 const char *cp, std::size_t bytes) { 149 if (bytes >= 1) { 150 return {*reinterpret_cast<const std::uint8_t *>(cp), 1}; 151 } else { 152 return {}; 153 } 154 } 155 156 template <> 157 DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>( 158 const char *cp, std::size_t bytes) { 159 auto p{reinterpret_cast<const std::uint8_t *>(cp)}; 160 char32_t ch{*p}; 161 if (ch <= 0x7f) { 162 return {ch, 1}; 163 } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && 164 ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) { 165 ch = ((ch & 7) << 6) | (p[1] & 0x3f); 166 ch = (ch << 6) | (p[2] & 0x3f); 167 ch = (ch << 6) | (p[3] & 0x3f); 168 return {ch, 4}; 169 } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && 170 ((p[1] | p[2]) & 0xc0) == 0x80) { 171 ch = ((ch & 0xf) << 6) | (p[1] & 0x3f); 172 ch = (ch << 6) | (p[2] & 0x3f); 173 return {ch, 3}; 174 } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && 175 (p[1] & 0xc0) == 0x80) { 176 ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f); 177 return {ch, 2}; 178 } else { 179 return {}; // not valid UTF-8 180 } 181 } 182 183 static DecodedCharacter DecodeEscapedCharacter( 184 const char *cp, std::size_t bytes) { 185 if (cp[0] == '\\' && bytes >= 2) { 186 if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) { 187 return {static_cast<unsigned char>(*escChar), 2}; 188 } else if (IsOctalDigit(cp[1])) { 189 std::size_t maxLen{std::min(std::size_t{4}, bytes)}; 190 char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))}; 191 std::size_t len{2}; // so far 192 for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) { 193 code = 8 * code + DecimalDigitValue(cp[len]); 194 } 195 return {code, static_cast<int>(len)}; 196 } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' && 197 IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) { 198 return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) + 199 HexadecimalDigitValue(cp[3])), 200 4}; 201 } else if (IsLetter(cp[1])) { 202 // Unknown escape - ignore the '\' (PGI compatibility) 203 return {static_cast<unsigned char>(cp[1]), 2}; 204 } else { 205 // Not an escape character. 206 return {'\\', 1}; 207 } 208 } 209 return {static_cast<unsigned char>(cp[0]), 1}; 210 } 211 212 template <Encoding ENCODING> 213 static DecodedCharacter DecodeEscapedCharacters( 214 const char *cp, std::size_t bytes) { 215 char buffer[EncodedCharacter::maxEncodingBytes]; 216 int count[EncodedCharacter::maxEncodingBytes]; 217 std::size_t at{0}, len{0}; 218 for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) { 219 DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)}; 220 buffer[len] = code.codepoint; 221 at += code.bytes; 222 count[len] = at; 223 } 224 DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)}; 225 if (code.bytes > 0) { 226 code.bytes = count[code.bytes - 1]; 227 } else { 228 code.codepoint = buffer[0] & 0xff; 229 code.bytes = count[0]; 230 } 231 return code; 232 } 233 234 template <Encoding ENCODING> 235 DecodedCharacter DecodeCharacter( 236 const char *cp, std::size_t bytes, bool backslashEscapes) { 237 if (backslashEscapes && bytes >= 2 && *cp == '\\') { 238 return DecodeEscapedCharacters<ENCODING>(cp, bytes); 239 } else { 240 return DecodeRawCharacter<ENCODING>(cp, bytes); 241 } 242 } 243 244 template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>( 245 const char *, std::size_t, bool); 246 template DecodedCharacter DecodeCharacter<Encoding::UTF_8>( 247 const char *, std::size_t, bool); 248 249 DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp, 250 std::size_t bytes, bool backslashEscapes) { 251 switch (encoding) { 252 SWITCH_COVERS_ALL_CASES 253 case Encoding::LATIN_1: 254 return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes); 255 case Encoding::UTF_8: 256 return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes); 257 } 258 } 259 260 template <typename RESULT, Encoding ENCODING> 261 RESULT DecodeString(const std::string &s, bool backslashEscapes) { 262 RESULT result; 263 const char *p{s.c_str()}; 264 for (auto bytes{s.size()}; bytes != 0;) { 265 DecodedCharacter decoded{ 266 DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)}; 267 if (decoded.bytes > 0) { 268 if (static_cast<std::size_t>(decoded.bytes) <= bytes) { 269 result.append(1, decoded.codepoint); 270 bytes -= decoded.bytes; 271 p += decoded.bytes; 272 continue; 273 } 274 } 275 result.append(1, static_cast<uint8_t>(*p)); 276 ++p; 277 --bytes; 278 } 279 return result; 280 } 281 282 template std::string DecodeString<std::string, Encoding::LATIN_1>( 283 const std::string &, bool); 284 template std::u16string DecodeString<std::u16string, Encoding::UTF_8>( 285 const std::string &, bool); 286 template std::u32string DecodeString<std::u32string, Encoding::UTF_8>( 287 const std::string &, bool); 288 } // namespace Fortran::parser 289