//===- bolt/Core/DynoStats.cpp - Dynamic execution stats ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the DynoStats class.
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/DynoStats.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <string>

#undef  DEBUG_TYPE
#define DEBUG_TYPE "bolt"

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltCategory;

static cl::opt<uint32_t>
DynoStatsScale("dyno-stats-scale",
  cl::desc("scale to be applied while reporting dyno stats"),
  cl::Optional,
  cl::init(1),
  cl::Hidden,
  cl::cat(BoltCategory));

static cl::opt<uint32_t>
PrintDynoOpcodeStat("print-dyno-opcode-stats",
  cl::desc("print per instruction opcode dyno stats and the function"
              "names:BB offsets of the nth highest execution counts"),
  cl::init(0),
  cl::Hidden,
  cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {

constexpr const char *DynoStats::Desc[];

bool DynoStats::operator<(const DynoStats &Other) const {
  return std::lexicographical_compare(
      &Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT],
      &Other.Stats[FIRST_DYNO_STAT], &Other.Stats[LAST_DYNO_STAT]);
}

bool DynoStats::operator==(const DynoStats &Other) const {
  return std::equal(&Stats[FIRST_DYNO_STAT], &Stats[LAST_DYNO_STAT],
                    &Other.Stats[FIRST_DYNO_STAT]);
}

bool DynoStats::lessThan(const DynoStats &Other,
                         ArrayRef<Category> Keys) const {
  return std::lexicographical_compare(
      Keys.begin(), Keys.end(), Keys.begin(), Keys.end(),
      [this, &Other](const Category A, const Category) {
        return Stats[A] < Other.Stats[A];
      });
}

void DynoStats::print(raw_ostream &OS, const DynoStats *Other,
                      MCInstPrinter *Printer) const {
  auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
                                uint64_t OtherStat) {
    OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name;
    if (Other) {
      if (Stat != OtherStat) {
        OtherStat = std::max(OtherStat, uint64_t(1)); // to prevent divide by 0
        OS << format(" (%+.1f%%)", ((float)Stat - (float)OtherStat) * 100.0 /
                                       (float)(OtherStat));
      } else {
        OS << " (=)";
      }
    }
    OS << '\n';
  };

  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
       Stat < DynoStats::LAST_DYNO_STAT; ++Stat) {

    if (!PrintAArch64Stats && Stat == DynoStats::VENEER_CALLS_AARCH64)
      continue;

    printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
  }
  if (opts::PrintDynoOpcodeStat && Printer) {
    outs() << "\nProgram-wide opcode histogram:\n";
    OS << "              Opcode,   Execution Count,     Max Exec Count, "
          "Function Name:Offset ...\n";
    std::vector<std::pair<uint64_t, unsigned>> SortedHistogram;
    for (const OpcodeStatTy &Stat : OpcodeHistogram)
      SortedHistogram.emplace_back(Stat.second.first, Stat.first);

    // Sort using lexicographic ordering
    std::sort(SortedHistogram.begin(), SortedHistogram.end());

    // Dump in ascending order: Start with Opcode with Highest execution
    // count.
    for (auto Stat = SortedHistogram.rbegin(); Stat != SortedHistogram.rend();
         ++Stat) {
      OS << format("%20s,%'18lld", Printer->getOpcodeName(Stat->second).data(),
                   Stat->first * opts::DynoStatsScale);

      MaxOpcodeHistogramTy MaxMultiMap =
          OpcodeHistogram.at(Stat->second).second;
      // Start with function name:BB offset with highest execution count.
      for (auto Max = MaxMultiMap.rbegin(); Max != MaxMultiMap.rend(); ++Max) {
        OS << format(", %'18lld, ", Max->first * opts::DynoStatsScale)
           << Max->second.first.str() << ':' << Max->second.second;
      }
      OS << '\n';
    }
  }
}

void DynoStats::operator+=(const DynoStats &Other) {
  for (auto Stat = DynoStats::FIRST_DYNO_STAT + 1;
       Stat < DynoStats::LAST_DYNO_STAT; ++Stat) {
    Stats[Stat] += Other[Stat];
  }
  for (const OpcodeStatTy &Stat : Other.OpcodeHistogram) {
    auto I = OpcodeHistogram.find(Stat.first);
    if (I == OpcodeHistogram.end()) {
      OpcodeHistogram.emplace(Stat);
    } else {
      // Merge Other Historgrams, log only the opts::PrintDynoOpcodeStat'th
      // maximum counts.
      I->second.first += Stat.second.first;
      auto &MMap = I->second.second;
      auto &OtherMMap = Stat.second.second;
      auto Size = MMap.size();
      assert(Size <= opts::PrintDynoOpcodeStat);
      for (auto Iter = OtherMMap.rbegin(); Iter != OtherMMap.rend(); ++Iter) {
        if (Size++ >= opts::PrintDynoOpcodeStat) {
          auto First = MMap.begin();
          if (Iter->first <= First->first)
            break;
          MMap.erase(First);
        }
        MMap.emplace(*Iter);
      }
    }
  }
}

DynoStats getDynoStats(const BinaryFunction &BF) {
  auto &BC = BF.getBinaryContext();

  DynoStats Stats(/*PrintAArch64Stats*/ BC.isAArch64());

  // Return empty-stats about the function we don't completely understand.
  if (!BF.isSimple() || !BF.hasValidProfile() || !BF.hasCanonicalCFG())
    return Stats;

  // Update enumeration of basic blocks for correct detection of branch'
  // direction.
  BF.updateLayoutIndices();

  for (BinaryBasicBlock *const &BB : BF.layout()) {
    // The basic block execution count equals to the sum of incoming branch
    // frequencies. This may deviate from the sum of outgoing branches of the
    // basic block especially since the block may contain a function that
    // does not return or a function that throws an exception.
    const uint64_t BBExecutionCount = BB->getKnownExecutionCount();

    // Ignore empty blocks and blocks that were not executed.
    if (BB->getNumNonPseudos() == 0 || BBExecutionCount == 0)
      continue;

    // Count AArch64 linker-inserted veneers
    if (BF.isAArch64Veneer())
      Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();

    // Count various instruction types by iterating through all instructions.
    // When -print-dyno-opcode-stats is on, count per each opcode and record
    // maximum execution counts.
    for (const MCInst &Instr : *BB) {
      if (opts::PrintDynoOpcodeStat) {
        unsigned Opcode = Instr.getOpcode();
        auto I = Stats.OpcodeHistogram.find(Opcode);
        if (I == Stats.OpcodeHistogram.end()) {
          DynoStats::MaxOpcodeHistogramTy MMap;
          MMap.emplace(BBExecutionCount,
                       std::make_pair(BF.getOneName(), BB->getOffset()));
          Stats.OpcodeHistogram.emplace(Opcode,
                                        std::make_pair(BBExecutionCount, MMap));
        } else {
          I->second.first += BBExecutionCount;
          bool Insert = true;
          if (I->second.second.size() == opts::PrintDynoOpcodeStat) {
            auto First = I->second.second.begin();
            if (First->first < BBExecutionCount)
              I->second.second.erase(First);
            else
              Insert = false;
          }
          if (Insert) {
            I->second.second.emplace(
                BBExecutionCount,
                std::make_pair(BF.getOneName(), BB->getOffset()));
          }
        }
      }

      if (BC.MIB->isStore(Instr)) {
        Stats[DynoStats::STORES] += BBExecutionCount;
      }
      if (BC.MIB->isLoad(Instr)) {
        Stats[DynoStats::LOADS] += BBExecutionCount;
      }
      if (!BC.MIB->isCall(Instr))
        continue;

      uint64_t CallFreq = BBExecutionCount;
      if (BC.MIB->getConditionalTailCall(Instr)) {
        CallFreq =
            BC.MIB->getAnnotationWithDefault<uint64_t>(Instr, "CTCTakenCount");
      }
      Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
      if (BC.MIB->isIndirectCall(Instr)) {
        Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
      } else if (const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr)) {
        const BinaryFunction *BF = BC.getFunctionForSymbol(CallSymbol);
        if (BF && BF->isPLTFunction()) {
          Stats[DynoStats::PLT_CALLS] += CallFreq;

          // We don't process PLT functions and hence have to adjust relevant
          // dynostats here for:
          //
          //   jmp *GOT_ENTRY(%rip)
          //
          // NOTE: this is arch-specific.
          Stats[DynoStats::FUNCTION_CALLS] += CallFreq;
          Stats[DynoStats::INDIRECT_CALLS] += CallFreq;
          Stats[DynoStats::LOADS] += CallFreq;
          Stats[DynoStats::INSTRUCTIONS] += CallFreq;
        }
      }
    }

    Stats[DynoStats::INSTRUCTIONS] += BB->getNumNonPseudos() * BBExecutionCount;

    // Jump tables.
    const MCInst *LastInstr = BB->getLastNonPseudoInstr();
    if (BC.MIB->getJumpTable(*LastInstr)) {
      Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
      LLVM_DEBUG(
        static uint64_t MostFrequentJT;
        if (BBExecutionCount > MostFrequentJT) {
          MostFrequentJT = BBExecutionCount;
          dbgs() << "BOLT-INFO: most frequently executed jump table is in "
                 << "function " << BF << " in basic block " << BB->getName()
                 << " executed totally " << BBExecutionCount << " times.\n";
        }
      );
      continue;
    }

    if (BC.MIB->isIndirectBranch(*LastInstr) && !BC.MIB->isCall(*LastInstr)) {
      Stats[DynoStats::UNKNOWN_INDIRECT_BRANCHES] += BBExecutionCount;
      continue;
    }

    // Update stats for branches.
    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    if (!BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch))
      continue;

    if (!CondBranch && !UncondBranch)
      continue;

    // Simple unconditional branch.
    if (!CondBranch) {
      Stats[DynoStats::UNCOND_BRANCHES] += BBExecutionCount;
      continue;
    }

    // CTCs: instruction annotations could be stripped, hence check the number
    // of successors to identify conditional tail calls.
    if (BB->succ_size() == 1) {
      if (BB->branch_info_begin() != BB->branch_info_end())
        Stats[DynoStats::UNCOND_BRANCHES] += BB->branch_info_begin()->Count;
      continue;
    }

    // Conditional branch that could be followed by an unconditional branch.
    uint64_t TakenCount = BB->getTakenBranchInfo().Count;
    if (TakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
      TakenCount = 0;

    uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
    if (NonTakenCount == BinaryBasicBlock::COUNT_NO_PROFILE)
      NonTakenCount = 0;

    if (BF.isForwardBranch(BB, BB->getConditionalSuccessor(true))) {
      Stats[DynoStats::FORWARD_COND_BRANCHES] += BBExecutionCount;
      Stats[DynoStats::FORWARD_COND_BRANCHES_TAKEN] += TakenCount;
    } else {
      Stats[DynoStats::BACKWARD_COND_BRANCHES] += BBExecutionCount;
      Stats[DynoStats::BACKWARD_COND_BRANCHES_TAKEN] += TakenCount;
    }

    if (UncondBranch) {
      Stats[DynoStats::UNCOND_BRANCHES] += NonTakenCount;
    }
  }

  return Stats;
}

} // namespace bolt
} // namespace llvm
