1 //===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // 11 // Arguments to kernel and device functions are passed via param space, 12 // which imposes certain restrictions: 13 // http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces 14 // 15 // Kernel parameters are read-only and accessible only via ld.param 16 // instruction, directly or via a pointer. Pointers to kernel 17 // arguments can't be converted to generic address space. 18 // 19 // Device function parameters are directly accessible via 20 // ld.param/st.param, but taking the address of one returns a pointer 21 // to a copy created in local space which *can't* be used with 22 // ld.param/st.param. 23 // 24 // Copying a byval struct into local memory in IR allows us to enforce 25 // the param space restrictions, gives the rest of IR a pointer w/o 26 // param space restrictions, and gives us an opportunity to eliminate 27 // the copy. 28 // 29 // Pointer arguments to kernel functions need more work to be lowered: 30 // 31 // 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the 32 // global address space. This allows later optimizations to emit 33 // ld.global.*/st.global.* for accessing these pointer arguments. For 34 // example, 35 // 36 // define void @foo(float* %input) { 37 // %v = load float, float* %input, align 4 38 // ... 39 // } 40 // 41 // becomes 42 // 43 // define void @foo(float* %input) { 44 // %input2 = addrspacecast float* %input to float addrspace(1)* 45 // %input3 = addrspacecast float addrspace(1)* %input2 to float* 46 // %v = load float, float* %input3, align 4 47 // ... 48 // } 49 // 50 // Later, NVPTXInferAddressSpaces will optimize it to 51 // 52 // define void @foo(float* %input) { 53 // %input2 = addrspacecast float* %input to float addrspace(1)* 54 // %v = load float, float addrspace(1)* %input2, align 4 55 // ... 56 // } 57 // 58 // 2. Convert pointers in a byval kernel parameter to pointers in the global 59 // address space. As #2, it allows NVPTX to emit more ld/st.global. E.g., 60 // 61 // struct S { 62 // int *x; 63 // int *y; 64 // }; 65 // __global__ void foo(S s) { 66 // int *b = s.y; 67 // // use b 68 // } 69 // 70 // "b" points to the global address space. In the IR level, 71 // 72 // define void @foo({i32*, i32*}* byval %input) { 73 // %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 74 // %b = load i32*, i32** %b_ptr 75 // ; use %b 76 // } 77 // 78 // becomes 79 // 80 // define void @foo({i32*, i32*}* byval %input) { 81 // %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 82 // %b = load i32*, i32** %b_ptr 83 // %b_global = addrspacecast i32* %b to i32 addrspace(1)* 84 // %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32* 85 // ; use %b_generic 86 // } 87 // 88 // TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't 89 // cancel the addrspacecast pair this pass emits. 90 //===----------------------------------------------------------------------===// 91 92 #include "NVPTX.h" 93 #include "NVPTXUtilities.h" 94 #include "NVPTXTargetMachine.h" 95 #include "llvm/Analysis/ValueTracking.h" 96 #include "llvm/IR/Function.h" 97 #include "llvm/IR/Instructions.h" 98 #include "llvm/IR/Module.h" 99 #include "llvm/IR/Type.h" 100 #include "llvm/Pass.h" 101 102 using namespace llvm; 103 104 namespace llvm { 105 void initializeNVPTXLowerArgsPass(PassRegistry &); 106 } 107 108 namespace { 109 class NVPTXLowerArgs : public FunctionPass { 110 bool runOnFunction(Function &F) override; 111 112 bool runOnKernelFunction(Function &F); 113 bool runOnDeviceFunction(Function &F); 114 115 // handle byval parameters 116 void handleByValParam(Argument *Arg); 117 // Knowing Ptr must point to the global address space, this function 118 // addrspacecasts Ptr to global and then back to generic. This allows 119 // NVPTXInferAddressSpaces to fold the global-to-generic cast into 120 // loads/stores that appear later. 121 void markPointerAsGlobal(Value *Ptr); 122 123 public: 124 static char ID; // Pass identification, replacement for typeid 125 NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr) 126 : FunctionPass(ID), TM(TM) {} 127 StringRef getPassName() const override { 128 return "Lower pointer arguments of CUDA kernels"; 129 } 130 131 private: 132 const NVPTXTargetMachine *TM; 133 }; 134 } // namespace 135 136 char NVPTXLowerArgs::ID = 1; 137 138 INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args", 139 "Lower arguments (NVPTX)", false, false) 140 141 // ============================================================================= 142 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d), 143 // then add the following instructions to the first basic block: 144 // 145 // %temp = alloca %struct.x, align 8 146 // %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)* 147 // %tv = load %struct.x addrspace(101)* %tempd 148 // store %struct.x %tv, %struct.x* %temp, align 8 149 // 150 // The above code allocates some space in the stack and copies the incoming 151 // struct from param space to local space. 152 // Then replace all occurrences of %d by %temp. 153 // ============================================================================= 154 void NVPTXLowerArgs::handleByValParam(Argument *Arg) { 155 Function *Func = Arg->getParent(); 156 Instruction *FirstInst = &(Func->getEntryBlock().front()); 157 PointerType *PType = dyn_cast<PointerType>(Arg->getType()); 158 159 assert(PType && "Expecting pointer type in handleByValParam"); 160 161 Type *StructType = PType->getElementType(); 162 AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst); 163 // Set the alignment to alignment of the byval parameter. This is because, 164 // later load/stores assume that alignment, and we are going to replace 165 // the use of the byval parameter with this alloca instruction. 166 AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1)); 167 Arg->replaceAllUsesWith(AllocA); 168 169 Value *ArgInParam = new AddrSpaceCastInst( 170 Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), 171 FirstInst); 172 LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst); 173 new StoreInst(LI, AllocA, FirstInst); 174 } 175 176 void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) { 177 if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) 178 return; 179 180 // Deciding where to emit the addrspacecast pair. 181 BasicBlock::iterator InsertPt; 182 if (Argument *Arg = dyn_cast<Argument>(Ptr)) { 183 // Insert at the functon entry if Ptr is an argument. 184 InsertPt = Arg->getParent()->getEntryBlock().begin(); 185 } else { 186 // Insert right after Ptr if Ptr is an instruction. 187 InsertPt = ++cast<Instruction>(Ptr)->getIterator(); 188 assert(InsertPt != InsertPt->getParent()->end() && 189 "We don't call this function with Ptr being a terminator."); 190 } 191 192 Instruction *PtrInGlobal = new AddrSpaceCastInst( 193 Ptr, PointerType::get(Ptr->getType()->getPointerElementType(), 194 ADDRESS_SPACE_GLOBAL), 195 Ptr->getName(), &*InsertPt); 196 Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(), 197 Ptr->getName(), &*InsertPt); 198 // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal. 199 Ptr->replaceAllUsesWith(PtrInGeneric); 200 PtrInGlobal->setOperand(0, Ptr); 201 } 202 203 // ============================================================================= 204 // Main function for this pass. 205 // ============================================================================= 206 bool NVPTXLowerArgs::runOnKernelFunction(Function &F) { 207 if (TM && TM->getDrvInterface() == NVPTX::CUDA) { 208 // Mark pointers in byval structs as global. 209 for (auto &B : F) { 210 for (auto &I : B) { 211 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 212 if (LI->getType()->isPointerTy()) { 213 Value *UO = GetUnderlyingObject(LI->getPointerOperand(), 214 F.getParent()->getDataLayout()); 215 if (Argument *Arg = dyn_cast<Argument>(UO)) { 216 if (Arg->hasByValAttr()) { 217 // LI is a load from a pointer within a byval kernel parameter. 218 markPointerAsGlobal(LI); 219 } 220 } 221 } 222 } 223 } 224 } 225 } 226 227 for (Argument &Arg : F.args()) { 228 if (Arg.getType()->isPointerTy()) { 229 if (Arg.hasByValAttr()) 230 handleByValParam(&Arg); 231 else if (TM && TM->getDrvInterface() == NVPTX::CUDA) 232 markPointerAsGlobal(&Arg); 233 } 234 } 235 return true; 236 } 237 238 // Device functions only need to copy byval args into local memory. 239 bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) { 240 for (Argument &Arg : F.args()) 241 if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) 242 handleByValParam(&Arg); 243 return true; 244 } 245 246 bool NVPTXLowerArgs::runOnFunction(Function &F) { 247 return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F); 248 } 249 250 FunctionPass * 251 llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) { 252 return new NVPTXLowerArgs(TM); 253 } 254