//===-- AMDGPUAnnotateUniformValues.cpp - Add AMDGPU uniform metadata -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds amdgpu.uniform and amdgpu.noclobber metadata to IR values
/// so that this information can be used during instruction selection.
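///
/// For example (illustrative IR, hypothetical value names), a load through a
/// uniform pointer that is provably not clobbered may be annotated as:
/// \code
///   %v = load i32, i32 addrspace(1)* %ptr, !amdgpu.uniform !0, !amdgpu.noclobber !0
///   ...
///   !0 = !{}
/// \endcode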
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-annotate-uniform"

using namespace llvm;

namespace {

class AMDGPUAnnotateUniformValues
    : public FunctionPass,
      public InstVisitor<AMDGPUAnnotateUniformValues> {
  LegacyDivergenceAnalysis *DA;
  MemorySSA *MSSA;
  AliasAnalysis *AA;
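  // No-op GEP clones created to carry metadata for pointers that are
  // Arguments or GlobalValues and therefore cannot hold instruction metadata
  // themselves; one clone per pointer.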
  DenseMap<Value *, GetElementPtrInst *> noClobberClones;
  bool isEntryFunc;

public:
  static char ID;
  AMDGPUAnnotateUniformValues() : FunctionPass(ID) {}
  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
  StringRef getPassName() const override {
    return "AMDGPU Annotate Uniform Values";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<MemorySSAWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.setPreservesAll();
  }

  void visitBranchInst(BranchInst &I);
  void visitLoadInst(LoadInst &I);
  bool isClobberedInFunction(LoadInst *Load);
};

} // End anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                      "Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                    "Add AMDGPU uniform metadata", false, false)

char AMDGPUAnnotateUniformValues::ID = 0;

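// Both annotations are pure flags: an empty MDNode attached under a known
// name. Consumers only test for the presence of the metadata, not its
// contents.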
static void setUniformMetadata(Instruction *I) {
  I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
}
static void setNoClobberMetadata(Instruction *I) {
  I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
}

bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
  MemorySSAWalker *Walker = MSSA->getWalker();
  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
  SmallSet<MemoryAccess *, 8> Visited;
  MemoryLocation Loc(MemoryLocation::get(Load));

  const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
    Instruction *DefInst = Def->getMemoryInst();
    LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');

    if (isa<FenceInst>(DefInst))
      return false;

    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
      switch (II->getIntrinsicID()) {
      case Intrinsic::amdgcn_s_barrier:
      case Intrinsic::amdgcn_wave_barrier:
        return false;
      default:
        break;
      }
    }

    // Ignore atomics that do not alias the original load. From MemorySSA's
    // point of view any atomic is a universal MemoryDef, just like a fence,
    // even when it cannot touch this particular location.
    const auto checkNoAlias = [this, Load](auto I) -> bool {
      return I && AA->isNoAlias(I->getPointerOperand(),
                                Load->getPointerOperand());
    };

    if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
        checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
      return false;

    return true;
  };

  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');

  // Start with the nearest dominating clobbering access: it is either
  // live-on-entry (nothing to do, the load is not clobbered), a MemoryDef,
  // or a MemoryPhi if several MemoryDefs can define this memory state. In
  // the Phi case, add all incoming Defs to the worklist and keep walking up,
  // checking every definition of this memory location until the root. If all
  // defs are exhausted and we reach the live-on-entry state, there is no
  // clobber. Along the scan, ignore barriers and fences, which MemorySSA
  // treats as clobbers even though they do not actually write memory.
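  //
  // For example (hypothetical MemorySSA dump):
  //   1 = MemoryDef(liveOnEntry)                      ; a fence
  //   2 = MemoryPhi({entry, liveOnEntry}, {loop, 1})
  // Starting from the Phi, the walk follows both incoming values, skips the
  // fence Def as not a real clobber, reaches liveOnEntry on every path, and
  // reports no clobber.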
  while (!WorkList.empty()) {
    MemoryAccess *MA = WorkList.pop_back_val();
    if (!Visited.insert(MA).second)
      continue;

    if (MSSA->isLiveOnEntryDef(MA))
      continue;

    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
      if (isReallyAClobber(Def)) {
        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
        return true;
      }

      WorkList.push_back(
          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
      continue;
    }

    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
    for (auto &Use : Phi->incoming_values())
      WorkList.push_back(cast<MemoryAccess>(&Use));
  }

  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
  return false;
}

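// Mark uniform branches so that later stages can select a scalar branch
// instead of lowering the control flow as divergent.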
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
  if (DA->isUniform(&I))
    setUniformMetadata(&I);
}

void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
  Value *Ptr = I.getPointerOperand();
  if (!DA->isUniform(Ptr))
    return;
  // We are tracking up to the function boundary and cannot go beyond it
  // because of FunctionPass restrictions, so we can only prove that memory
  // is not clobbered for memory operations that are live into entry points.
  Instruction *PtrI = dyn_cast<Instruction>(Ptr);

  if (!isEntryFunc) {
    if (PtrI)
      setUniformMetadata(PtrI);
    return;
  }

  bool NotClobbered = false;
  bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
  if (PtrI)
    NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
  else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
    if (GlobalLoad && !isClobberedInFunction(&I)) {
      NotClobbered = true;
      // Reuse an existing no-op GEP clone for this pointer, if any.
      if (noClobberClones.count(Ptr)) {
        PtrI = noClobberClones[Ptr];
      } else {
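        // Ptr is an Argument or GlobalValue here, not an Instruction, so it
        // cannot carry instruction metadata. Materialize a no-op GEP of the
        // pointer so there is an instruction to attach the annotation to.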
        // Create a zero-index (no-op) GEP of the pointer value.
        Function *F = I.getParent()->getParent();
        Value *Idx = Constant::getIntegerValue(
            Type::getInt64Ty(Ptr->getContext()), APInt(64, 0));
        // Insert the GEP at the function entry so it dominates all uses.
        PtrI = GetElementPtrInst::Create(I.getType(), Ptr,
                                         ArrayRef<Value *>(Idx), Twine(""),
                                         F->getEntryBlock().getFirstNonPHI());
        // Cache the clone so later loads from the same pointer reuse it.
        noClobberClones[Ptr] = PtrI;
      }
      I.replaceUsesOfWith(Ptr, PtrI);
    }
  }

  if (PtrI)
    setUniformMetadata(PtrI);

  if (NotClobbered)
    setNoClobberMetadata(&I);
}

bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
  return false;
}

bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());

  visit(F);
  noClobberClones.clear();
  return true;
}

FunctionPass *llvm::createAMDGPUAnnotateUniformValues() {
  return new AMDGPUAnnotateUniformValues();
}
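
// Typical use (a sketch; the exact hook depends on the target pass config):
//   addPass(createAMDGPUAnnotateUniformValues());
// e.g. from the GCN pass config shortly before instruction selection.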
