//===-- lib/Parser/characters.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
864ab3302SCarolineConcatto 
964ab3302SCarolineConcatto #include "flang/Parser/characters.h"
1064ab3302SCarolineConcatto #include "flang/Common/idioms.h"
1164ab3302SCarolineConcatto #include <algorithm>
1264ab3302SCarolineConcatto #include <cstddef>
1364ab3302SCarolineConcatto #include <optional>
1464ab3302SCarolineConcatto #include <type_traits>
1564ab3302SCarolineConcatto 
1664ab3302SCarolineConcatto namespace Fortran::parser {
1764ab3302SCarolineConcatto 
// Namespace-scope flag, initially false.  It is only defined here; nothing in
// this file reads it.
// NOTE(review): presumably selects \xHH over octal escapes when characters
// are quoted -- confirm against the code that reads it.
bool useHexadecimalEscapeSequences{false};
1964ab3302SCarolineConcatto 
// Reports how many bytes (1 through 6) the UTF-8 sequence starting at *p
// occupies, judged only from the high-order bits of that first byte.
// Any byte that matches none of the 1- to 5-byte lead patterns -- which
// includes continuation bytes (10xxxxxx) -- is reported as 6.
int UTF_8CharacterBytes(const char *p) {
  const unsigned lead{static_cast<unsigned char>(*p)};
  if ((lead & 0x80) == 0) {
    return 1; // 0xxxxxxx
  }
  if ((lead & 0xe0) == 0xc0) {
    return 2; // 110xxxxx
  }
  if ((lead & 0xf0) == 0xe0) {
    return 3; // 1110xxxx
  }
  if ((lead & 0xf8) == 0xf0) {
    return 4; // 11110xxx
  }
  if ((lead & 0xfc) == 0xf8) {
    return 5; // 111110xx
  }
  return 6;
}
3564ab3302SCarolineConcatto 
3664ab3302SCarolineConcatto template <typename STRING>
QuoteCharacterLiteralHelper(const STRING & str,bool backslashEscapes,Encoding encoding)3764ab3302SCarolineConcatto std::string QuoteCharacterLiteralHelper(
3864ab3302SCarolineConcatto     const STRING &str, bool backslashEscapes, Encoding encoding) {
3964ab3302SCarolineConcatto   std::string result{'"'};
4064ab3302SCarolineConcatto   const auto emit{[&](char ch) { result += ch; }};
4164ab3302SCarolineConcatto   for (auto ch : str) {
4264ab3302SCarolineConcatto     using CharT = std::decay_t<decltype(ch)>;
4364ab3302SCarolineConcatto     char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)};
4464ab3302SCarolineConcatto     if (ch32 == static_cast<unsigned char>('"')) {
4564ab3302SCarolineConcatto       emit('"'); // double the " when it appears in the text
4664ab3302SCarolineConcatto     }
4764ab3302SCarolineConcatto     EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding);
4864ab3302SCarolineConcatto   }
4964ab3302SCarolineConcatto   result += '"';
5064ab3302SCarolineConcatto   return result;
5164ab3302SCarolineConcatto }
5264ab3302SCarolineConcatto 
QuoteCharacterLiteral(const std::string & str,bool backslashEscapes,Encoding encoding)5364ab3302SCarolineConcatto std::string QuoteCharacterLiteral(
5464ab3302SCarolineConcatto     const std::string &str, bool backslashEscapes, Encoding encoding) {
5564ab3302SCarolineConcatto   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
5664ab3302SCarolineConcatto }
5764ab3302SCarolineConcatto 
QuoteCharacterLiteral(const std::u16string & str,bool backslashEscapes,Encoding encoding)5864ab3302SCarolineConcatto std::string QuoteCharacterLiteral(
5964ab3302SCarolineConcatto     const std::u16string &str, bool backslashEscapes, Encoding encoding) {
6064ab3302SCarolineConcatto   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
6164ab3302SCarolineConcatto }
6264ab3302SCarolineConcatto 
QuoteCharacterLiteral(const std::u32string & str,bool backslashEscapes,Encoding encoding)6364ab3302SCarolineConcatto std::string QuoteCharacterLiteral(
6464ab3302SCarolineConcatto     const std::u32string &str, bool backslashEscapes, Encoding encoding) {
6564ab3302SCarolineConcatto   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
6664ab3302SCarolineConcatto }
6764ab3302SCarolineConcatto 
EncodeCharacter(char32_t ucs)6864ab3302SCarolineConcatto template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) {
6964ab3302SCarolineConcatto   CHECK(ucs <= 0xff);
7064ab3302SCarolineConcatto   EncodedCharacter result;
7164ab3302SCarolineConcatto   result.buffer[0] = ucs;
7264ab3302SCarolineConcatto   result.bytes = 1;
7364ab3302SCarolineConcatto   return result;
7464ab3302SCarolineConcatto }
7564ab3302SCarolineConcatto 
EncodeCharacter(char32_t ucs)7664ab3302SCarolineConcatto template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
7764ab3302SCarolineConcatto   // N.B. char32_t is unsigned
7864ab3302SCarolineConcatto   EncodedCharacter result;
7964ab3302SCarolineConcatto   if (ucs <= 0x7f) {
8064ab3302SCarolineConcatto     result.buffer[0] = ucs;
8164ab3302SCarolineConcatto     result.bytes = 1;
8264ab3302SCarolineConcatto   } else if (ucs <= 0x7ff) {
8364ab3302SCarolineConcatto     result.buffer[0] = 0xc0 | (ucs >> 6);
8464ab3302SCarolineConcatto     result.buffer[1] = 0x80 | (ucs & 0x3f);
8564ab3302SCarolineConcatto     result.bytes = 2;
8664ab3302SCarolineConcatto   } else if (ucs <= 0xffff) {
8764ab3302SCarolineConcatto     result.buffer[0] = 0xe0 | (ucs >> 12);
8864ab3302SCarolineConcatto     result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f);
8964ab3302SCarolineConcatto     result.buffer[2] = 0x80 | (ucs & 0x3f);
9064ab3302SCarolineConcatto     result.bytes = 3;
9164ab3302SCarolineConcatto   } else if (ucs <= 0x1fffff) {
9264ab3302SCarolineConcatto     // UCS actually only goes up to 0x10ffff, but the
9364ab3302SCarolineConcatto     // UTF-8 encoding can handle 32 bits.
9464ab3302SCarolineConcatto     result.buffer[0] = 0xf0 | (ucs >> 18);
9564ab3302SCarolineConcatto     result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f);
9664ab3302SCarolineConcatto     result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f);
9764ab3302SCarolineConcatto     result.buffer[3] = 0x80 | (ucs & 0x3f);
9864ab3302SCarolineConcatto     result.bytes = 4;
9964ab3302SCarolineConcatto   } else if (ucs <= 0x3ffffff) {
10064ab3302SCarolineConcatto     result.buffer[0] = 0xf8 | (ucs >> 24);
10164ab3302SCarolineConcatto     result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f);
10264ab3302SCarolineConcatto     result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f);
10364ab3302SCarolineConcatto     result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f);
10464ab3302SCarolineConcatto     result.buffer[4] = 0x80 | (ucs & 0x3f);
10564ab3302SCarolineConcatto     result.bytes = 5;
10664ab3302SCarolineConcatto   } else {
10764ab3302SCarolineConcatto     result.buffer[0] = 0xfc | (ucs >> 30);
10864ab3302SCarolineConcatto     result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f);
10964ab3302SCarolineConcatto     result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f);
11064ab3302SCarolineConcatto     result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f);
11164ab3302SCarolineConcatto     result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f);
11264ab3302SCarolineConcatto     result.buffer[5] = 0x80 | (ucs & 0x3f);
11364ab3302SCarolineConcatto     result.bytes = 6;
11464ab3302SCarolineConcatto   }
11564ab3302SCarolineConcatto   return result;
11664ab3302SCarolineConcatto }
11764ab3302SCarolineConcatto 
EncodeCharacter(Encoding encoding,char32_t ucs)11864ab3302SCarolineConcatto EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
11964ab3302SCarolineConcatto   switch (encoding) {
12064ab3302SCarolineConcatto     SWITCH_COVERS_ALL_CASES
121*1f879005STim Keith   case Encoding::LATIN_1:
122*1f879005STim Keith     return EncodeCharacter<Encoding::LATIN_1>(ucs);
123*1f879005STim Keith   case Encoding::UTF_8:
124*1f879005STim Keith     return EncodeCharacter<Encoding::UTF_8>(ucs);
12564ab3302SCarolineConcatto   }
12664ab3302SCarolineConcatto }
12764ab3302SCarolineConcatto 
12864ab3302SCarolineConcatto template <Encoding ENCODING, typename STRING>
EncodeString(const STRING & str)12964ab3302SCarolineConcatto std::string EncodeString(const STRING &str) {
13064ab3302SCarolineConcatto   std::string result;
13164ab3302SCarolineConcatto   for (auto ch : str) {
13264ab3302SCarolineConcatto     char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)};
13364ab3302SCarolineConcatto     EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)};
13464ab3302SCarolineConcatto     result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes));
13564ab3302SCarolineConcatto   }
13664ab3302SCarolineConcatto   return result;
13764ab3302SCarolineConcatto }
13864ab3302SCarolineConcatto 
13964ab3302SCarolineConcatto template std::string EncodeString<Encoding::LATIN_1, std::string>(
14064ab3302SCarolineConcatto     const std::string &);
14164ab3302SCarolineConcatto template std::string EncodeString<Encoding::UTF_8, std::u16string>(
14264ab3302SCarolineConcatto     const std::u16string &);
14364ab3302SCarolineConcatto template std::string EncodeString<Encoding::UTF_8, std::u32string>(
14464ab3302SCarolineConcatto     const std::u32string &);
14564ab3302SCarolineConcatto 
14664ab3302SCarolineConcatto template <>
DecodeRawCharacter(const char * cp,std::size_t bytes)14764ab3302SCarolineConcatto DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
14864ab3302SCarolineConcatto     const char *cp, std::size_t bytes) {
14964ab3302SCarolineConcatto   if (bytes >= 1) {
15064ab3302SCarolineConcatto     return {*reinterpret_cast<const std::uint8_t *>(cp), 1};
15164ab3302SCarolineConcatto   } else {
15264ab3302SCarolineConcatto     return {};
15364ab3302SCarolineConcatto   }
15464ab3302SCarolineConcatto }
15564ab3302SCarolineConcatto 
15664ab3302SCarolineConcatto template <>
DecodeRawCharacter(const char * cp,std::size_t bytes)15764ab3302SCarolineConcatto DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
15864ab3302SCarolineConcatto     const char *cp, std::size_t bytes) {
15964ab3302SCarolineConcatto   auto p{reinterpret_cast<const std::uint8_t *>(cp)};
16064ab3302SCarolineConcatto   char32_t ch{*p};
16164ab3302SCarolineConcatto   if (ch <= 0x7f) {
16264ab3302SCarolineConcatto     return {ch, 1};
16364ab3302SCarolineConcatto   } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
16464ab3302SCarolineConcatto       ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
16564ab3302SCarolineConcatto     ch = ((ch & 7) << 6) | (p[1] & 0x3f);
16664ab3302SCarolineConcatto     ch = (ch << 6) | (p[2] & 0x3f);
16764ab3302SCarolineConcatto     ch = (ch << 6) | (p[3] & 0x3f);
16864ab3302SCarolineConcatto     return {ch, 4};
16964ab3302SCarolineConcatto   } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
17064ab3302SCarolineConcatto       ((p[1] | p[2]) & 0xc0) == 0x80) {
17164ab3302SCarolineConcatto     ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
17264ab3302SCarolineConcatto     ch = (ch << 6) | (p[2] & 0x3f);
17364ab3302SCarolineConcatto     return {ch, 3};
17464ab3302SCarolineConcatto   } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
17564ab3302SCarolineConcatto       (p[1] & 0xc0) == 0x80) {
17664ab3302SCarolineConcatto     ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
17764ab3302SCarolineConcatto     return {ch, 2};
17864ab3302SCarolineConcatto   } else {
17964ab3302SCarolineConcatto     return {}; // not valid UTF-8
18064ab3302SCarolineConcatto   }
18164ab3302SCarolineConcatto }
18264ab3302SCarolineConcatto 
DecodeEscapedCharacter(const char * cp,std::size_t bytes)18364ab3302SCarolineConcatto static DecodedCharacter DecodeEscapedCharacter(
18464ab3302SCarolineConcatto     const char *cp, std::size_t bytes) {
18564ab3302SCarolineConcatto   if (cp[0] == '\\' && bytes >= 2) {
18664ab3302SCarolineConcatto     if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) {
18764ab3302SCarolineConcatto       return {static_cast<unsigned char>(*escChar), 2};
18864ab3302SCarolineConcatto     } else if (IsOctalDigit(cp[1])) {
18964ab3302SCarolineConcatto       std::size_t maxLen{std::min(std::size_t{4}, bytes)};
19064ab3302SCarolineConcatto       char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))};
19164ab3302SCarolineConcatto       std::size_t len{2}; // so far
19264ab3302SCarolineConcatto       for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) {
19364ab3302SCarolineConcatto         code = 8 * code + DecimalDigitValue(cp[len]);
19464ab3302SCarolineConcatto       }
19564ab3302SCarolineConcatto       return {code, static_cast<int>(len)};
19664ab3302SCarolineConcatto     } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' &&
19764ab3302SCarolineConcatto         IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) {
19864ab3302SCarolineConcatto       return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) +
19964ab3302SCarolineConcatto                   HexadecimalDigitValue(cp[3])),
20064ab3302SCarolineConcatto           4};
20164ab3302SCarolineConcatto     } else if (IsLetter(cp[1])) {
20264ab3302SCarolineConcatto       // Unknown escape - ignore the '\' (PGI compatibility)
20364ab3302SCarolineConcatto       return {static_cast<unsigned char>(cp[1]), 2};
20464ab3302SCarolineConcatto     } else {
20564ab3302SCarolineConcatto       // Not an escape character.
20664ab3302SCarolineConcatto       return {'\\', 1};
20764ab3302SCarolineConcatto     }
20864ab3302SCarolineConcatto   }
20964ab3302SCarolineConcatto   return {static_cast<unsigned char>(cp[0]), 1};
21064ab3302SCarolineConcatto }
21164ab3302SCarolineConcatto 
21264ab3302SCarolineConcatto template <Encoding ENCODING>
DecodeEscapedCharacters(const char * cp,std::size_t bytes)21364ab3302SCarolineConcatto static DecodedCharacter DecodeEscapedCharacters(
21464ab3302SCarolineConcatto     const char *cp, std::size_t bytes) {
21564ab3302SCarolineConcatto   char buffer[EncodedCharacter::maxEncodingBytes];
21664ab3302SCarolineConcatto   int count[EncodedCharacter::maxEncodingBytes];
21764ab3302SCarolineConcatto   std::size_t at{0}, len{0};
21864ab3302SCarolineConcatto   for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
21964ab3302SCarolineConcatto     DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
22064ab3302SCarolineConcatto     buffer[len] = code.codepoint;
22164ab3302SCarolineConcatto     at += code.bytes;
22264ab3302SCarolineConcatto     count[len] = at;
22364ab3302SCarolineConcatto   }
22464ab3302SCarolineConcatto   DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)};
22564ab3302SCarolineConcatto   if (code.bytes > 0) {
22664ab3302SCarolineConcatto     code.bytes = count[code.bytes - 1];
22764ab3302SCarolineConcatto   } else {
22864ab3302SCarolineConcatto     code.codepoint = buffer[0] & 0xff;
22964ab3302SCarolineConcatto     code.bytes = count[0];
23064ab3302SCarolineConcatto   }
23164ab3302SCarolineConcatto   return code;
23264ab3302SCarolineConcatto }
23364ab3302SCarolineConcatto 
23464ab3302SCarolineConcatto template <Encoding ENCODING>
DecodeCharacter(const char * cp,std::size_t bytes,bool backslashEscapes)23564ab3302SCarolineConcatto DecodedCharacter DecodeCharacter(
23664ab3302SCarolineConcatto     const char *cp, std::size_t bytes, bool backslashEscapes) {
23764ab3302SCarolineConcatto   if (backslashEscapes && bytes >= 2 && *cp == '\\') {
23864ab3302SCarolineConcatto     return DecodeEscapedCharacters<ENCODING>(cp, bytes);
23964ab3302SCarolineConcatto   } else {
24064ab3302SCarolineConcatto     return DecodeRawCharacter<ENCODING>(cp, bytes);
24164ab3302SCarolineConcatto   }
24264ab3302SCarolineConcatto }
24364ab3302SCarolineConcatto 
24464ab3302SCarolineConcatto template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
24564ab3302SCarolineConcatto     const char *, std::size_t, bool);
24664ab3302SCarolineConcatto template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
24764ab3302SCarolineConcatto     const char *, std::size_t, bool);
24864ab3302SCarolineConcatto 
DecodeCharacter(Encoding encoding,const char * cp,std::size_t bytes,bool backslashEscapes)24964ab3302SCarolineConcatto DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
25064ab3302SCarolineConcatto     std::size_t bytes, bool backslashEscapes) {
25164ab3302SCarolineConcatto   switch (encoding) {
25264ab3302SCarolineConcatto     SWITCH_COVERS_ALL_CASES
25364ab3302SCarolineConcatto   case Encoding::LATIN_1:
25464ab3302SCarolineConcatto     return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
25564ab3302SCarolineConcatto   case Encoding::UTF_8:
25664ab3302SCarolineConcatto     return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
25764ab3302SCarolineConcatto   }
25864ab3302SCarolineConcatto }
25964ab3302SCarolineConcatto 
26064ab3302SCarolineConcatto template <typename RESULT, Encoding ENCODING>
DecodeString(const std::string & s,bool backslashEscapes)26164ab3302SCarolineConcatto RESULT DecodeString(const std::string &s, bool backslashEscapes) {
26264ab3302SCarolineConcatto   RESULT result;
26364ab3302SCarolineConcatto   const char *p{s.c_str()};
26464ab3302SCarolineConcatto   for (auto bytes{s.size()}; bytes != 0;) {
26564ab3302SCarolineConcatto     DecodedCharacter decoded{
26664ab3302SCarolineConcatto         DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)};
26764ab3302SCarolineConcatto     if (decoded.bytes > 0) {
26864ab3302SCarolineConcatto       if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
26964ab3302SCarolineConcatto         result.append(1, decoded.codepoint);
27064ab3302SCarolineConcatto         bytes -= decoded.bytes;
27164ab3302SCarolineConcatto         p += decoded.bytes;
27264ab3302SCarolineConcatto         continue;
27364ab3302SCarolineConcatto       }
27464ab3302SCarolineConcatto     }
27564ab3302SCarolineConcatto     result.append(1, static_cast<uint8_t>(*p));
27664ab3302SCarolineConcatto     ++p;
27764ab3302SCarolineConcatto     --bytes;
27864ab3302SCarolineConcatto   }
27964ab3302SCarolineConcatto   return result;
28064ab3302SCarolineConcatto }
28164ab3302SCarolineConcatto 
28264ab3302SCarolineConcatto template std::string DecodeString<std::string, Encoding::LATIN_1>(
28364ab3302SCarolineConcatto     const std::string &, bool);
28464ab3302SCarolineConcatto template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
28564ab3302SCarolineConcatto     const std::string &, bool);
28664ab3302SCarolineConcatto template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
28764ab3302SCarolineConcatto     const std::string &, bool);
288*1f879005STim Keith } // namespace Fortran::parser
289