1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to create parallel loops as LLVM-IR. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "polly/CodeGen/LoopGeneratorsKMP.h" 14 #include "polly/Options.h" 15 #include "polly/ScopDetection.h" 16 #include "llvm/Analysis/LoopInfo.h" 17 #include "llvm/IR/DataLayout.h" 18 #include "llvm/IR/Dominators.h" 19 #include "llvm/IR/Module.h" 20 #include "llvm/Support/CommandLine.h" 21 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 22 23 using namespace llvm; 24 using namespace polly; 25 26 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, 27 Value *SubFnParam, 28 Value *LB, Value *UB, 29 Value *Stride) { 30 const std::string Name = "__kmpc_fork_call"; 31 Function *F = M->getFunction(Name); 32 Type *KMPCMicroTy = M->getTypeByName("kmpc_micro"); 33 34 if (!KMPCMicroTy) { 35 // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) 36 Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(), 37 Builder.getInt32Ty()->getPointerTo()}; 38 39 KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); 40 } 41 42 // If F is not available, declare it. 43 if (!F) { 44 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 45 46 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 47 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 48 KMPCMicroTy->getPointerTo()}; 49 50 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); 51 F = Function::Create(Ty, Linkage, Name, M); 52 } 53 54 Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast( 55 SubFn, KMPCMicroTy->getPointerTo()); 56 57 Value *Args[] = {SourceLocationInfo, 58 Builder.getInt32(4) /* Number of arguments (w/o Task) */, 59 Task, 60 LB, 61 UB, 62 Stride, 63 SubFnParam}; 64 65 Builder.CreateCall(F, Args); 66 } 67 68 void ParallelLoopGeneratorKMP::deployParallelExecution(Value *SubFn, 69 Value *SubFnParam, 70 Value *LB, Value *UB, 71 Value *Stride) { 72 // Inform OpenMP runtime about the number of threads if greater than zero 73 if (PollyNumThreads > 0) { 74 Value *GlobalThreadID = createCallGlobalThreadNum(); 75 createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads)); 76 } 77 78 // Tell the runtime we start a parallel loop 79 createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride); 80 } 81 82 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { 83 std::vector<Type *> Arguments = {Builder.getInt32Ty()->getPointerTo(), 84 Builder.getInt32Ty()->getPointerTo(), 85 LongType, 86 LongType, 87 LongType, 88 Builder.getInt8PtrTy()}; 89 90 FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); 91 Function *SubFn = Function::Create(FT, Function::InternalLinkage, 92 F->getName() + "_polly_subfn", M); 93 // Name the function's arguments 94 Function::arg_iterator AI = SubFn->arg_begin(); 95 AI->setName("polly.kmpc.global_tid"); 96 std::advance(AI, 1); 97 AI->setName("polly.kmpc.bound_tid"); 98 std::advance(AI, 1); 99 AI->setName("polly.kmpc.lb"); 100 std::advance(AI, 1); 101 AI->setName("polly.kmpc.ub"); 102 std::advance(AI, 1); 103 AI->setName("polly.kmpc.inc"); 104 std::advance(AI, 1); 105 AI->setName("polly.kmpc.shared"); 106 107 return SubFn; 108 } 109 110 // Create a subfunction of the following (preliminary) structure: 111 // 112 // PrevBB 113 // | 114 // v 115 // HeaderBB 116 // | _____ 117 // v v | 118 // CheckNextBB PreHeaderBB 119 // |\ | 120 // | \______/ 121 // | 122 // v 123 // ExitBB 124 // 125 // HeaderBB will hold allocations, loading of variables and kmp-init calls. 126 // CheckNextBB will check for more work (dynamic) or will be "empty" (static). 127 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB. 128 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on). 129 // Just like CheckNextBB: PreHeaderBB is empty in the static scheduling case. 130 // ExitBB marks the end of the parallel execution. 131 // The possibly empty BasicBlocks will automatically be removed. 132 std::tuple<Value *, Function *> 133 ParallelLoopGeneratorKMP::createSubFn(Value *StrideNotUsed, 134 AllocaInst *StructData, 135 SetVector<Value *> Data, ValueMapT &Map) { 136 Function *SubFn = createSubFnDefinition(); 137 LLVMContext &Context = SubFn->getContext(); 138 139 // Store the previous basic block. 140 BasicBlock *PrevBB = Builder.GetInsertBlock(); 141 142 // Create basic blocks. 143 BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn); 144 BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn); 145 BasicBlock *CheckNextBB = 146 BasicBlock::Create(Context, "polly.par.checkNext", SubFn); 147 BasicBlock *PreHeaderBB = 148 BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn); 149 150 DT.addNewBlock(HeaderBB, PrevBB); 151 DT.addNewBlock(ExitBB, HeaderBB); 152 DT.addNewBlock(CheckNextBB, HeaderBB); 153 DT.addNewBlock(PreHeaderBB, HeaderBB); 154 155 // Fill up basic block HeaderBB. 156 Builder.SetInsertPoint(HeaderBB); 157 Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr"); 158 Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr"); 159 Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, 160 "polly.par.lastIterPtr"); 161 Value *StridePtr = 162 Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr"); 163 164 // Get iterator for retrieving the previously defined parameters. 165 Function::arg_iterator AI = SubFn->arg_begin(); 166 // First argument holds "global thread ID". 167 Value *IDPtr = &*AI; 168 // Skip "bound thread ID" since it is not used (but had to be defined). 169 std::advance(AI, 2); 170 // Move iterator to: LB, UB, Stride, Shared variable struct. 171 Value *LB = &*AI; 172 std::advance(AI, 1); 173 Value *UB = &*AI; 174 std::advance(AI, 1); 175 Value *Stride = &*AI; 176 std::advance(AI, 1); 177 Value *Shared = &*AI; 178 179 Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(), 180 "polly.par.userContext"); 181 182 extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext, 183 Map); 184 185 const int Alignment = (is64BitArch()) ? 8 : 4; 186 Value *ID = 187 Builder.CreateAlignedLoad(IDPtr, Alignment, "polly.par.global_tid"); 188 189 Builder.CreateAlignedStore(LB, LBPtr, Alignment); 190 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 191 Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment); 192 Builder.CreateAlignedStore(Stride, StridePtr, Alignment); 193 194 // Subtract one as the upper bound provided by openmp is a < comparison 195 // whereas the codegenForSequential function creates a <= comparison. 196 Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1), 197 "polly.indvar.UBAdjusted"); 198 199 Value *ChunkSize = 200 ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1)); 201 202 switch (PollyScheduling) { 203 case OMPGeneralSchedulingType::Dynamic: 204 case OMPGeneralSchedulingType::Guided: 205 case OMPGeneralSchedulingType::Runtime: 206 // "DYNAMIC" scheduling types are handled below (including 'runtime') 207 { 208 UB = AdjustedUB; 209 createCallDispatchInit(ID, LB, UB, Stride, ChunkSize); 210 Value *HasWork = 211 createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 212 Value *HasIteration = 213 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 214 Builder.getInt32(1), "polly.hasIteration"); 215 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 216 217 Builder.SetInsertPoint(CheckNextBB); 218 HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 219 HasIteration = 220 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 221 Builder.getInt32(1), "polly.hasWork"); 222 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 223 224 Builder.SetInsertPoint(PreHeaderBB); 225 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 226 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB"); 227 } 228 break; 229 case OMPGeneralSchedulingType::StaticChunked: 230 case OMPGeneralSchedulingType::StaticNonChunked: 231 // "STATIC" scheduling types are handled below 232 { 233 createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize); 234 235 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 236 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB"); 237 238 Value *AdjUBOutOfBounds = 239 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT, UB, AdjustedUB, 240 "polly.adjustedUBOutOfBounds"); 241 242 UB = Builder.CreateSelect(AdjUBOutOfBounds, UB, AdjustedUB); 243 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 244 245 Value *HasIteration = Builder.CreateICmp( 246 llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration"); 247 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 248 249 Builder.SetInsertPoint(CheckNextBB); 250 Builder.CreateBr(ExitBB); 251 252 Builder.SetInsertPoint(PreHeaderBB); 253 } 254 break; 255 } 256 257 Builder.CreateBr(CheckNextBB); 258 Builder.SetInsertPoint(&*--Builder.GetInsertPoint()); 259 BasicBlock *AfterBB; 260 Value *IV = createLoop(LB, UB, Stride, Builder, LI, DT, AfterBB, 261 ICmpInst::ICMP_SLE, nullptr, true, 262 /* UseGuard */ false); 263 264 BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); 265 266 // Add code to terminate this subfunction. 267 Builder.SetInsertPoint(ExitBB); 268 // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call 269 if (PollyScheduling == OMPGeneralSchedulingType::StaticChunked) { 270 createCallStaticFini(ID); 271 } 272 Builder.CreateRetVoid(); 273 Builder.SetInsertPoint(&*LoopBody); 274 275 return std::make_tuple(IV, SubFn); 276 } 277 278 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { 279 const std::string Name = "__kmpc_global_thread_num"; 280 Function *F = M->getFunction(Name); 281 282 // If F is not available, declare it. 283 if (!F) { 284 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 285 286 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 287 Type *Params[] = {IdentTy->getPointerTo()}; 288 289 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 290 F = Function::Create(Ty, Linkage, Name, M); 291 } 292 293 return Builder.CreateCall(F, {SourceLocationInfo}); 294 } 295 296 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, 297 Value *NumThreads) { 298 const std::string Name = "__kmpc_push_num_threads"; 299 Function *F = M->getFunction(Name); 300 301 // If F is not available, declare it. 302 if (!F) { 303 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 304 305 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 306 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 307 Builder.getInt32Ty()}; 308 309 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 310 F = Function::Create(Ty, Linkage, Name, M); 311 } 312 313 Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads}; 314 315 Builder.CreateCall(F, Args); 316 } 317 318 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, 319 Value *IsLastPtr, 320 Value *LBPtr, Value *UBPtr, 321 Value *StridePtr, 322 Value *ChunkSize) { 323 const std::string Name = 324 is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; 325 Function *F = M->getFunction(Name); 326 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 327 328 // If F is not available, declare it. 329 if (!F) { 330 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 331 332 Type *Params[] = {IdentTy->getPointerTo(), 333 Builder.getInt32Ty(), 334 Builder.getInt32Ty(), 335 Builder.getInt32Ty()->getPointerTo(), 336 LongType->getPointerTo(), 337 LongType->getPointerTo(), 338 LongType->getPointerTo(), 339 LongType, 340 LongType}; 341 342 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 343 F = Function::Create(Ty, Linkage, Name, M); 344 } 345 346 // The parameter 'ChunkSize' will hold strictly positive integer values, 347 // regardless of PollyChunkSize's value 348 Value *Args[] = { 349 SourceLocationInfo, 350 GlobalThreadID, 351 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 352 IsLastPtr, 353 LBPtr, 354 UBPtr, 355 StridePtr, 356 ConstantInt::get(LongType, 1), 357 ChunkSize}; 358 359 Builder.CreateCall(F, Args); 360 } 361 362 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { 363 const std::string Name = "__kmpc_for_static_fini"; 364 Function *F = M->getFunction(Name); 365 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 366 367 // If F is not available, declare it. 368 if (!F) { 369 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 370 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()}; 371 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 372 F = Function::Create(Ty, Linkage, Name, M); 373 } 374 375 Value *Args[] = {SourceLocationInfo, GlobalThreadID}; 376 377 Builder.CreateCall(F, Args); 378 } 379 380 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID, 381 Value *LB, Value *UB, 382 Value *Inc, 383 Value *ChunkSize) { 384 const std::string Name = 385 is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4"; 386 Function *F = M->getFunction(Name); 387 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 388 389 // If F is not available, declare it. 390 if (!F) { 391 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 392 393 Type *Params[] = {IdentTy->getPointerTo(), 394 Builder.getInt32Ty(), 395 Builder.getInt32Ty(), 396 LongType, 397 LongType, 398 LongType, 399 LongType}; 400 401 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 402 F = Function::Create(Ty, Linkage, Name, M); 403 } 404 405 // The parameter 'ChunkSize' will hold strictly positive integer values, 406 // regardless of PollyChunkSize's value 407 Value *Args[] = { 408 SourceLocationInfo, 409 GlobalThreadID, 410 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 411 LB, 412 UB, 413 Inc, 414 ChunkSize}; 415 416 Builder.CreateCall(F, Args); 417 } 418 419 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, 420 Value *IsLastPtr, 421 Value *LBPtr, 422 Value *UBPtr, 423 Value *StridePtr) { 424 const std::string Name = 425 is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; 426 Function *F = M->getFunction(Name); 427 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 428 429 // If F is not available, declare it. 430 if (!F) { 431 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 432 433 Type *Params[] = {IdentTy->getPointerTo(), 434 Builder.getInt32Ty(), 435 Builder.getInt32Ty()->getPointerTo(), 436 LongType->getPointerTo(), 437 LongType->getPointerTo(), 438 LongType->getPointerTo()}; 439 440 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 441 F = Function::Create(Ty, Linkage, Name, M); 442 } 443 444 Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr, 445 StridePtr}; 446 447 return Builder.CreateCall(F, Args); 448 } 449 450 // TODO: This function currently creates a source location dummy. It might be 451 // necessary to (actually) provide information, in the future. 452 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() { 453 const std::string LocName = ".loc.dummy"; 454 GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName); 455 456 if (SourceLocDummy == nullptr) { 457 const std::string StructName = "struct.ident_t"; 458 StructType *IdentTy = M->getTypeByName(StructName); 459 460 // If the ident_t StructType is not available, declare it. 461 // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* } 462 if (!IdentTy) { 463 Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(), 464 Builder.getInt32Ty(), Builder.getInt32Ty(), 465 Builder.getInt8PtrTy()}; 466 467 IdentTy = 468 StructType::create(M->getContext(), LocMembers, StructName, false); 469 } 470 471 const auto ArrayType = 472 llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23); 473 474 // Global Variable Definitions 475 GlobalVariable *StrVar = new GlobalVariable( 476 *M, ArrayType, true, GlobalValue::PrivateLinkage, 0, ".str.ident"); 477 StrVar->setAlignment(1); 478 479 SourceLocDummy = new GlobalVariable( 480 *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName); 481 SourceLocDummy->setAlignment(8); 482 483 // Constant Definitions 484 Constant *InitStr = ConstantDataArray::getString( 485 M->getContext(), "Source location dummy.", true); 486 487 Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP( 488 ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)})); 489 490 Constant *LocInitStruct = ConstantStruct::get( 491 IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0), 492 Builder.getInt32(0), StrPtr}); 493 494 // Initialize variables 495 StrVar->setInitializer(InitStr); 496 SourceLocDummy->setInitializer(LocInitStruct); 497 } 498 499 return SourceLocDummy; 500 } 501 502 bool ParallelLoopGeneratorKMP::is64BitArch() { 503 return (LongType->getIntegerBitWidth() == 64); 504 } 505 506 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType( 507 int ChunkSize, OMPGeneralSchedulingType Scheduling) const { 508 if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked) 509 return OMPGeneralSchedulingType::StaticNonChunked; 510 511 return Scheduling; 512 } 513