//===------- JITLoaderPerf.cpp - Register profiler objects ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Register objects for access by profilers via the perf JIT interface.
//
//===----------------------------------------------------------------------===//

#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"

#include "llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h"

#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Threading.h"

#include <mutex>
#include <optional>

#ifdef __linux__

#include <sys/mman.h> // mmap()
#include <time.h>     // clock_gettime(), time(), localtime_r() */
#include <unistd.h>   // for read(), close()

#define DEBUG_TYPE "orc"

// language identifier (XXX: should we generate something better from debug
// info?)
#define JIT_LANG "llvm-IR"
#define LLVM_PERF_JIT_MAGIC                                                    \
  ((uint32_t)'J' << 24 | (uint32_t)'i' << 16 | (uint32_t)'T' << 8 |            \
   (uint32_t)'D')
#define LLVM_PERF_JIT_VERSION 1

using namespace llvm;
using namespace llvm::orc;

struct PerfState {
  // cache lookups
  uint32_t Pid;

  // base directory for output data
  std::string JitPath;

  // output data stream, closed via Dumpstream
  int DumpFd = -1;

  // output data stream
  std::unique_ptr<raw_fd_ostream> Dumpstream;

  // perf mmap marker
  void *MarkerAddr = NULL;
};

// prevent concurrent dumps from messing up the output file
static std::mutex Mutex;
static std::optional<PerfState> State;

struct RecHeader {
  uint32_t Id;
  uint32_t TotalSize;
  uint64_t Timestamp;
};

struct DIR {
  RecHeader Prefix;
  uint64_t CodeAddr;
  uint64_t NrEntry;
};

struct DIE {
  uint64_t CodeAddr;
  uint32_t Line;
  uint32_t Discrim;
};

struct CLR {
  RecHeader Prefix;
  uint32_t Pid;
  uint32_t Tid;
  uint64_t Vma;
  uint64_t CodeAddr;
  uint64_t CodeSize;
  uint64_t CodeIndex;
};

struct UWR {
  RecHeader Prefix;
  uint64_t UnwindDataSize;
  uint64_t EhFrameHeaderSize;
  uint64_t MappedSize;
};

static inline uint64_t timespec_to_ns(const struct timespec *TS) {
  const uint64_t NanoSecPerSec = 1000000000;
  return ((uint64_t)TS->tv_sec * NanoSecPerSec) + TS->tv_nsec;
}

static inline uint64_t perf_get_timestamp() {
  timespec TS;
  if (clock_gettime(CLOCK_MONOTONIC, &TS))
    return 0;

  return timespec_to_ns(&TS);
}

static void writeDebugRecord(const PerfJITDebugInfoRecord &DebugRecord) {
  assert(State && "PerfState not initialized");
  LLVM_DEBUG(dbgs() << "Writing debug record with "
                    << DebugRecord.Entries.size() << " entries\n");
  [[maybe_unused]] size_t Written = 0;
  DIR Dir{RecHeader{static_cast<uint32_t>(DebugRecord.Prefix.Id),
                    DebugRecord.Prefix.TotalSize, perf_get_timestamp()},
          DebugRecord.CodeAddr, DebugRecord.Entries.size()};
  State->Dumpstream->write(reinterpret_cast<const char *>(&Dir), sizeof(Dir));
  Written += sizeof(Dir);
  for (auto &Die : DebugRecord.Entries) {
    DIE d{Die.Addr, Die.Lineno, Die.Discrim};
    State->Dumpstream->write(reinterpret_cast<const char *>(&d), sizeof(d));
    State->Dumpstream->write(Die.Name.data(), Die.Name.size() + 1);
    Written += sizeof(d) + Die.Name.size() + 1;
  }
  LLVM_DEBUG(dbgs() << "wrote " << Written << " bytes of debug info\n");
}

static void writeCodeRecord(const PerfJITCodeLoadRecord &CodeRecord) {
  assert(State && "PerfState not initialized");
  uint32_t Tid = get_threadid();
  LLVM_DEBUG(dbgs() << "Writing code record with code size "
                    << CodeRecord.CodeSize << " and code index "
                    << CodeRecord.CodeIndex << "\n");
  CLR Clr{RecHeader{static_cast<uint32_t>(CodeRecord.Prefix.Id),
                    CodeRecord.Prefix.TotalSize, perf_get_timestamp()},
          State->Pid,
          Tid,
          CodeRecord.Vma,
          CodeRecord.CodeAddr,
          CodeRecord.CodeSize,
          CodeRecord.CodeIndex};
  LLVM_DEBUG(dbgs() << "wrote " << sizeof(Clr) << " bytes of CLR, "
                    << CodeRecord.Name.size() + 1 << " bytes of name, "
                    << CodeRecord.CodeSize << " bytes of code\n");
  State->Dumpstream->write(reinterpret_cast<const char *>(&Clr), sizeof(Clr));
  State->Dumpstream->write(CodeRecord.Name.data(), CodeRecord.Name.size() + 1);
  State->Dumpstream->write((const char *)CodeRecord.CodeAddr,
                           CodeRecord.CodeSize);
}

static void
writeUnwindRecord(const PerfJITCodeUnwindingInfoRecord &UnwindRecord) {
  assert(State && "PerfState not initialized");
  dbgs() << "Writing unwind record with unwind data size "
         << UnwindRecord.UnwindDataSize << " and EH frame header size "
         << UnwindRecord.EHFrameHdrSize << " and mapped size "
         << UnwindRecord.MappedSize << "\n";
  UWR Uwr{RecHeader{static_cast<uint32_t>(UnwindRecord.Prefix.Id),
                    UnwindRecord.Prefix.TotalSize, perf_get_timestamp()},
          UnwindRecord.UnwindDataSize, UnwindRecord.EHFrameHdrSize,
          UnwindRecord.MappedSize};
  LLVM_DEBUG(dbgs() << "wrote " << sizeof(Uwr) << " bytes of UWR, "
                    << UnwindRecord.EHFrameHdrSize
                    << " bytes of EH frame header, "
                    << UnwindRecord.UnwindDataSize - UnwindRecord.EHFrameHdrSize
                    << " bytes of EH frame\n");
  State->Dumpstream->write(reinterpret_cast<const char *>(&Uwr), sizeof(Uwr));
  if (UnwindRecord.EHFrameHdrAddr)
    State->Dumpstream->write((const char *)UnwindRecord.EHFrameHdrAddr,
                             UnwindRecord.EHFrameHdrSize);
  else
    State->Dumpstream->write(UnwindRecord.EHFrameHdr.data(),
                             UnwindRecord.EHFrameHdrSize);
  State->Dumpstream->write((const char *)UnwindRecord.EHFrameAddr,
                           UnwindRecord.UnwindDataSize -
                               UnwindRecord.EHFrameHdrSize);
}

static Error registerJITLoaderPerfImpl(const PerfJITRecordBatch &Batch) {
  if (!State)
    return make_error<StringError>("PerfState not initialized",
                                   inconvertibleErrorCode());

  // Serialize the batch
  std::lock_guard<std::mutex> Lock(Mutex);
  if (Batch.UnwindingRecord.Prefix.TotalSize > 0)
    writeUnwindRecord(Batch.UnwindingRecord);

  for (const auto &DebugInfo : Batch.DebugInfoRecords)
    writeDebugRecord(DebugInfo);

  for (const auto &CodeLoad : Batch.CodeLoadRecords)
    writeCodeRecord(CodeLoad);

  State->Dumpstream->flush();

  return Error::success();
}

struct Header {
  uint32_t Magic;     // characters "JiTD"
  uint32_t Version;   // header version
  uint32_t TotalSize; // total size of header
  uint32_t ElfMach;   // elf mach target
  uint32_t Pad1;      // reserved
  uint32_t Pid;
  uint64_t Timestamp; // timestamp
  uint64_t Flags;     // flags
};

static Error OpenMarker(PerfState &State) {
  // We mmap the jitdump to create an MMAP RECORD in perf.data file.  The mmap
  // is captured either live (perf record running when we mmap) or in deferred
  // mode, via /proc/PID/maps. The MMAP record is used as a marker of a jitdump
  // file for more meta data info about the jitted code. Perf report/annotate
  // detect this special filename and process the jitdump file.
  //
  // Mapping must be PROT_EXEC to ensure it is captured by perf record
  // even when not using -d option.
  State.MarkerAddr =
      ::mmap(NULL, sys::Process::getPageSizeEstimate(), PROT_READ | PROT_EXEC,
             MAP_PRIVATE, State.DumpFd, 0);

  if (State.MarkerAddr == MAP_FAILED)
    return make_error<llvm::StringError>("could not mmap JIT marker",
                                         inconvertibleErrorCode());

  return Error::success();
}

void CloseMarker(PerfState &State) {
  if (!State.MarkerAddr)
    return;

  munmap(State.MarkerAddr, sys::Process::getPageSizeEstimate());
  State.MarkerAddr = nullptr;
}

static Expected<Header> FillMachine(PerfState &State) {
  Header Hdr;
  Hdr.Magic = LLVM_PERF_JIT_MAGIC;
  Hdr.Version = LLVM_PERF_JIT_VERSION;
  Hdr.TotalSize = sizeof(Hdr);
  Hdr.Pid = State.Pid;
  Hdr.Timestamp = perf_get_timestamp();

  char Id[16];
  struct {
    uint16_t e_type;
    uint16_t e_machine;
  } Info;

  size_t RequiredMemory = sizeof(Id) + sizeof(Info);

  ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
      MemoryBuffer::getFileSlice("/proc/self/exe", RequiredMemory, 0);

  // This'll not guarantee that enough data was actually read from the
  // underlying file. Instead the trailing part of the buffer would be
  // zeroed. Given the ELF signature check below that seems ok though,
  // it's unlikely that the file ends just after that, and the
  // consequence would just be that perf wouldn't recognize the
  // signature.
  if (!MB)
    return make_error<llvm::StringError>("could not open /proc/self/exe",
                                         MB.getError());

  memcpy(&Id, (*MB)->getBufferStart(), sizeof(Id));
  memcpy(&Info, (*MB)->getBufferStart() + sizeof(Id), sizeof(Info));

  // check ELF signature
  if (Id[0] != 0x7f || Id[1] != 'E' || Id[2] != 'L' || Id[3] != 'F')
    return make_error<llvm::StringError>("invalid ELF signature",
                                         inconvertibleErrorCode());

  Hdr.ElfMach = Info.e_machine;

  return Hdr;
}

static Error InitDebuggingDir(PerfState &State) {
  time_t Time;
  struct tm LocalTime;
  char TimeBuffer[sizeof("YYYYMMDD")];
  SmallString<64> Path;

  // search for location to dump data to
  if (const char *BaseDir = getenv("JITDUMPDIR"))
    Path.append(BaseDir);
  else if (!sys::path::home_directory(Path))
    Path = ".";

  // create debug directory
  Path += "/.debug/jit/";
  if (auto EC = sys::fs::create_directories(Path)) {
    std::string ErrStr;
    raw_string_ostream ErrStream(ErrStr);
    ErrStream << "could not create jit cache directory " << Path << ": "
              << EC.message() << "\n";
    return make_error<StringError>(std::move(ErrStr), inconvertibleErrorCode());
  }

  // create unique directory for dump data related to this process
  time(&Time);
  localtime_r(&Time, &LocalTime);
  strftime(TimeBuffer, sizeof(TimeBuffer), "%Y%m%d", &LocalTime);
  Path += JIT_LANG "-jit-";
  Path += TimeBuffer;

  SmallString<128> UniqueDebugDir;

  using sys::fs::createUniqueDirectory;
  if (auto EC = createUniqueDirectory(Path, UniqueDebugDir)) {
    std::string ErrStr;
    raw_string_ostream ErrStream(ErrStr);
    ErrStream << "could not create unique jit cache directory "
              << UniqueDebugDir << ": " << EC.message() << "\n";
    return make_error<StringError>(std::move(ErrStr), inconvertibleErrorCode());
  }

  State.JitPath = std::string(UniqueDebugDir.str());

  return Error::success();
}

static Error registerJITLoaderPerfStartImpl() {
  PerfState Tentative;
  Tentative.Pid = sys::Process::getProcessId();
  // check if clock-source is supported
  if (!perf_get_timestamp())
    return make_error<StringError>("kernel does not support CLOCK_MONOTONIC",
                                   inconvertibleErrorCode());

  if (auto Err = InitDebuggingDir(Tentative))
    return Err;

  std::string Filename;
  raw_string_ostream FilenameBuf(Filename);
  FilenameBuf << Tentative.JitPath << "/jit-" << Tentative.Pid << ".dump";

  // Need to open ourselves, because we need to hand the FD to OpenMarker() and
  // raw_fd_ostream doesn't expose the FD.
  using sys::fs::openFileForWrite;
  if (auto EC = openFileForReadWrite(FilenameBuf.str(), Tentative.DumpFd,
                                     sys::fs::CD_CreateNew, sys::fs::OF_None)) {
    std::string ErrStr;
    raw_string_ostream ErrStream(ErrStr);
    ErrStream << "could not open JIT dump file " << FilenameBuf.str() << ": "
              << EC.message() << "\n";
    return make_error<StringError>(std::move(ErrStr), inconvertibleErrorCode());
  }

  Tentative.Dumpstream =
      std::make_unique<raw_fd_ostream>(Tentative.DumpFd, true);

  auto Header = FillMachine(Tentative);
  if (!Header)
    return Header.takeError();

  // signal this process emits JIT information
  if (auto Err = OpenMarker(Tentative))
    return Err;

  Tentative.Dumpstream->write(reinterpret_cast<const char *>(&Header.get()),
                              sizeof(*Header));

  // Everything initialized, can do profiling now.
  if (Tentative.Dumpstream->has_error())
    return make_error<StringError>("could not write JIT dump header",
                                   inconvertibleErrorCode());

  State = std::move(Tentative);
  return Error::success();
}

static Error registerJITLoaderPerfEndImpl() {
  if (!State)
    return make_error<StringError>("PerfState not initialized",
                                   inconvertibleErrorCode());

  RecHeader Close;
  Close.Id = static_cast<uint32_t>(PerfJITRecordType::JIT_CODE_CLOSE);
  Close.TotalSize = sizeof(Close);
  Close.Timestamp = perf_get_timestamp();
  State->Dumpstream->write(reinterpret_cast<const char *>(&Close),
                           sizeof(Close));
  if (State->MarkerAddr)
    CloseMarker(*State);

  State.reset();
  return Error::success();
}

extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderPerfImpl(const char *Data, uint64_t Size) {
  using namespace orc::shared;
  return WrapperFunction<SPSError(SPSPerfJITRecordBatch)>::handle(
             Data, Size, registerJITLoaderPerfImpl)
      .release();
}

extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderPerfStart(const char *Data, uint64_t Size) {
  using namespace orc::shared;
  return WrapperFunction<SPSError()>::handle(Data, Size,
                                             registerJITLoaderPerfStartImpl)
      .release();
}

extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderPerfEnd(const char *Data, uint64_t Size) {
  using namespace orc::shared;
  return WrapperFunction<SPSError()>::handle(Data, Size,
                                             registerJITLoaderPerfEndImpl)
      .release();
}

#else

using namespace llvm;
using namespace llvm::orc;

static Error badOS() {
  using namespace llvm;
  return llvm::make_error<StringError>(
      "unsupported OS (perf support is only available on linux!)",
      inconvertibleErrorCode());
}

static Error badOSBatch(PerfJITRecordBatch &Batch) { return badOS(); }

extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderPerfImpl(const char *Data, uint64_t Size) {
  using namespace shared;
  return WrapperFunction<SPSError(SPSPerfJITRecordBatch)>::handle(Data, Size,
                                                                  badOSBatch)
      .release();
}

extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderPerfStart(const char *Data, uint64_t Size) {
  using namespace shared;
  return WrapperFunction<SPSError()>::handle(Data, Size, badOS).release();
}

extern "C" llvm::orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderPerfEnd(const char *Data, uint64_t Size) {
  using namespace shared;
  return WrapperFunction<SPSError()>::handle(Data, Size, badOS).release();
}

#endif
