1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to create parallel loops as LLVM-IR. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "polly/CodeGen/LoopGeneratorsKMP.h" 14 #include "llvm/IR/Dominators.h" 15 #include "llvm/IR/Module.h" 16 17 using namespace llvm; 18 using namespace polly; 19 20 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, 21 Value *SubFnParam, 22 Value *LB, Value *UB, 23 Value *Stride) { 24 const std::string Name = "__kmpc_fork_call"; 25 Function *F = M->getFunction(Name); 26 Type *KMPCMicroTy = M->getTypeByName("kmpc_micro"); 27 28 if (!KMPCMicroTy) { 29 // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) 30 Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(), 31 Builder.getInt32Ty()->getPointerTo()}; 32 33 KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); 34 } 35 36 // If F is not available, declare it. 37 if (!F) { 38 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 39 40 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 41 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 42 KMPCMicroTy->getPointerTo()}; 43 44 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); 45 F = Function::Create(Ty, Linkage, Name, M); 46 } 47 48 Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast( 49 SubFn, KMPCMicroTy->getPointerTo()); 50 51 Value *Args[] = {SourceLocationInfo, 52 Builder.getInt32(4) /* Number of arguments (w/o Task) */, 53 Task, 54 LB, 55 UB, 56 Stride, 57 SubFnParam}; 58 59 Builder.CreateCall(F, Args); 60 } 61 62 void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn, 63 Value *SubFnParam, 64 Value *LB, Value *UB, 65 Value *Stride) { 66 // Inform OpenMP runtime about the number of threads if greater than zero 67 if (PollyNumThreads > 0) { 68 Value *GlobalThreadID = createCallGlobalThreadNum(); 69 createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads)); 70 } 71 72 // Tell the runtime we start a parallel loop 73 createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride); 74 } 75 76 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { 77 std::vector<Type *> Arguments = {Builder.getInt32Ty()->getPointerTo(), 78 Builder.getInt32Ty()->getPointerTo(), 79 LongType, 80 LongType, 81 LongType, 82 Builder.getInt8PtrTy()}; 83 84 FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); 85 Function *SubFn = Function::Create(FT, Function::InternalLinkage, 86 F->getName() + "_polly_subfn", M); 87 // Name the function's arguments 88 Function::arg_iterator AI = SubFn->arg_begin(); 89 AI->setName("polly.kmpc.global_tid"); 90 std::advance(AI, 1); 91 AI->setName("polly.kmpc.bound_tid"); 92 std::advance(AI, 1); 93 AI->setName("polly.kmpc.lb"); 94 std::advance(AI, 1); 95 AI->setName("polly.kmpc.ub"); 96 std::advance(AI, 1); 97 AI->setName("polly.kmpc.inc"); 98 std::advance(AI, 1); 99 AI->setName("polly.kmpc.shared"); 100 101 return SubFn; 102 } 103 104 // Create a subfunction of the following (preliminary) structure: 105 // 106 // PrevBB 107 // | 108 // v 109 // HeaderBB 110 // / | _____ 111 // / v v | 112 // / PreHeaderBB | 113 // | | | 114 // | v | 115 // | CheckNextBB | 116 // \ | \_____/ 117 // \ | 118 // v v 119 // ExitBB 120 // 121 // HeaderBB will hold allocations, loading of variables and kmp-init calls. 122 // CheckNextBB will check for more work (dynamic / static chunked) or will be 123 // empty (static non chunked). 124 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB. 125 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on). 126 // Just like CheckNextBB: PreHeaderBB is (preliminary) empty in the static non 127 // chunked scheduling case. ExitBB marks the end of the parallel execution. 128 // The possibly empty BasicBlocks will automatically be removed. 129 std::tuple<Value *, Function *> 130 ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride, 131 AllocaInst *StructData, 132 SetVector<Value *> Data, ValueMapT &Map) { 133 Function *SubFn = createSubFnDefinition(); 134 LLVMContext &Context = SubFn->getContext(); 135 136 // Store the previous basic block. 137 BasicBlock *PrevBB = Builder.GetInsertBlock(); 138 139 // Create basic blocks. 140 BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn); 141 BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn); 142 BasicBlock *CheckNextBB = 143 BasicBlock::Create(Context, "polly.par.checkNext", SubFn); 144 BasicBlock *PreHeaderBB = 145 BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn); 146 147 DT.addNewBlock(HeaderBB, PrevBB); 148 DT.addNewBlock(ExitBB, HeaderBB); 149 DT.addNewBlock(CheckNextBB, HeaderBB); 150 DT.addNewBlock(PreHeaderBB, HeaderBB); 151 152 // Fill up basic block HeaderBB. 153 Builder.SetInsertPoint(HeaderBB); 154 Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr"); 155 Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr"); 156 Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, 157 "polly.par.lastIterPtr"); 158 Value *StridePtr = 159 Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr"); 160 161 // Get iterator for retrieving the previously defined parameters. 162 Function::arg_iterator AI = SubFn->arg_begin(); 163 // First argument holds "global thread ID". 164 Value *IDPtr = &*AI; 165 // Skip "bound thread ID" since it is not used (but had to be defined). 166 std::advance(AI, 2); 167 // Move iterator to: LB, UB, Stride, Shared variable struct. 168 Value *LB = &*AI; 169 std::advance(AI, 1); 170 Value *UB = &*AI; 171 std::advance(AI, 1); 172 Value *Stride = &*AI; 173 std::advance(AI, 1); 174 Value *Shared = &*AI; 175 176 Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(), 177 "polly.par.userContext"); 178 179 extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext, 180 Map); 181 182 const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4); 183 Value *ID = 184 Builder.CreateAlignedLoad(IDPtr, Alignment, "polly.par.global_tid"); 185 186 Builder.CreateAlignedStore(LB, LBPtr, Alignment); 187 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 188 Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment); 189 Builder.CreateAlignedStore(Stride, StridePtr, Alignment); 190 191 // Subtract one as the upper bound provided by openmp is a < comparison 192 // whereas the codegenForSequential function creates a <= comparison. 193 Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1), 194 "polly.indvar.UBAdjusted"); 195 196 Value *ChunkSize = 197 ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1)); 198 199 OMPGeneralSchedulingType Scheduling = 200 getSchedType(PollyChunkSize, PollyScheduling); 201 202 switch (Scheduling) { 203 case OMPGeneralSchedulingType::Dynamic: 204 case OMPGeneralSchedulingType::Guided: 205 case OMPGeneralSchedulingType::Runtime: 206 // "DYNAMIC" scheduling types are handled below (including 'runtime') 207 { 208 UB = AdjustedUB; 209 createCallDispatchInit(ID, LB, UB, Stride, ChunkSize); 210 Value *HasWork = 211 createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 212 Value *HasIteration = 213 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 214 Builder.getInt32(1), "polly.hasIteration"); 215 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 216 217 Builder.SetInsertPoint(CheckNextBB); 218 HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 219 HasIteration = 220 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 221 Builder.getInt32(1), "polly.hasWork"); 222 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 223 224 Builder.SetInsertPoint(PreHeaderBB); 225 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 226 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB"); 227 } 228 break; 229 case OMPGeneralSchedulingType::StaticChunked: 230 case OMPGeneralSchedulingType::StaticNonChunked: 231 // "STATIC" scheduling types are handled below 232 { 233 Builder.CreateAlignedStore(AdjustedUB, UBPtr, Alignment); 234 createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize); 235 236 Value *ChunkedStride = 237 Builder.CreateAlignedLoad(StridePtr, Alignment, "polly.kmpc.stride"); 238 239 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 240 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB.temp"); 241 242 Value *UBInRange = 243 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, UB, AdjustedUB, 244 "polly.indvar.UB.inRange"); 245 UB = Builder.CreateSelect(UBInRange, UB, AdjustedUB, "polly.indvar.UB"); 246 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 247 248 Value *HasIteration = Builder.CreateICmp( 249 llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration"); 250 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 251 252 if (Scheduling == OMPGeneralSchedulingType::StaticChunked) { 253 Builder.SetInsertPoint(PreHeaderBB); 254 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, 255 "polly.indvar.LB.entry"); 256 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, 257 "polly.indvar.UB.entry"); 258 } 259 260 Builder.SetInsertPoint(CheckNextBB); 261 262 if (Scheduling == OMPGeneralSchedulingType::StaticChunked) { 263 Value *NextLB = 264 Builder.CreateAdd(LB, ChunkedStride, "polly.indvar.nextLB"); 265 Value *NextUB = Builder.CreateAdd(UB, ChunkedStride); 266 267 Value *NextUBOutOfBounds = 268 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SGT, NextUB, 269 AdjustedUB, "polly.indvar.nextUB.outOfBounds"); 270 NextUB = Builder.CreateSelect(NextUBOutOfBounds, AdjustedUB, NextUB, 271 "polly.indvar.nextUB"); 272 273 Builder.CreateAlignedStore(NextLB, LBPtr, Alignment); 274 Builder.CreateAlignedStore(NextUB, UBPtr, Alignment); 275 276 Value *HasWork = 277 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, NextLB, 278 AdjustedUB, "polly.hasWork"); 279 Builder.CreateCondBr(HasWork, PreHeaderBB, ExitBB); 280 } else { 281 Builder.CreateBr(ExitBB); 282 } 283 284 Builder.SetInsertPoint(PreHeaderBB); 285 } 286 break; 287 } 288 289 Builder.CreateBr(CheckNextBB); 290 Builder.SetInsertPoint(&*--Builder.GetInsertPoint()); 291 BasicBlock *AfterBB; 292 Value *IV = createLoop(LB, UB, SequentialLoopStride, Builder, LI, DT, AfterBB, 293 ICmpInst::ICMP_SLE, nullptr, true, 294 /* UseGuard */ false); 295 296 BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); 297 298 // Add code to terminate this subfunction. 299 Builder.SetInsertPoint(ExitBB); 300 // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call 301 if (Scheduling == OMPGeneralSchedulingType::StaticChunked || 302 Scheduling == OMPGeneralSchedulingType::StaticNonChunked) { 303 createCallStaticFini(ID); 304 } 305 Builder.CreateRetVoid(); 306 Builder.SetInsertPoint(&*LoopBody); 307 308 return std::make_tuple(IV, SubFn); 309 } 310 311 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { 312 const std::string Name = "__kmpc_global_thread_num"; 313 Function *F = M->getFunction(Name); 314 315 // If F is not available, declare it. 316 if (!F) { 317 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 318 319 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 320 Type *Params[] = {IdentTy->getPointerTo()}; 321 322 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 323 F = Function::Create(Ty, Linkage, Name, M); 324 } 325 326 return Builder.CreateCall(F, {SourceLocationInfo}); 327 } 328 329 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, 330 Value *NumThreads) { 331 const std::string Name = "__kmpc_push_num_threads"; 332 Function *F = M->getFunction(Name); 333 334 // If F is not available, declare it. 335 if (!F) { 336 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 337 338 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 339 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 340 Builder.getInt32Ty()}; 341 342 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 343 F = Function::Create(Ty, Linkage, Name, M); 344 } 345 346 Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads}; 347 348 Builder.CreateCall(F, Args); 349 } 350 351 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, 352 Value *IsLastPtr, 353 Value *LBPtr, Value *UBPtr, 354 Value *StridePtr, 355 Value *ChunkSize) { 356 const std::string Name = 357 is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; 358 Function *F = M->getFunction(Name); 359 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 360 361 // If F is not available, declare it. 362 if (!F) { 363 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 364 365 Type *Params[] = {IdentTy->getPointerTo(), 366 Builder.getInt32Ty(), 367 Builder.getInt32Ty(), 368 Builder.getInt32Ty()->getPointerTo(), 369 LongType->getPointerTo(), 370 LongType->getPointerTo(), 371 LongType->getPointerTo(), 372 LongType, 373 LongType}; 374 375 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 376 F = Function::Create(Ty, Linkage, Name, M); 377 } 378 379 // The parameter 'ChunkSize' will hold strictly positive integer values, 380 // regardless of PollyChunkSize's value 381 Value *Args[] = { 382 SourceLocationInfo, 383 GlobalThreadID, 384 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 385 IsLastPtr, 386 LBPtr, 387 UBPtr, 388 StridePtr, 389 ConstantInt::get(LongType, 1), 390 ChunkSize}; 391 392 Builder.CreateCall(F, Args); 393 } 394 395 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { 396 const std::string Name = "__kmpc_for_static_fini"; 397 Function *F = M->getFunction(Name); 398 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 399 400 // If F is not available, declare it. 401 if (!F) { 402 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 403 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()}; 404 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 405 F = Function::Create(Ty, Linkage, Name, M); 406 } 407 408 Value *Args[] = {SourceLocationInfo, GlobalThreadID}; 409 410 Builder.CreateCall(F, Args); 411 } 412 413 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID, 414 Value *LB, Value *UB, 415 Value *Inc, 416 Value *ChunkSize) { 417 const std::string Name = 418 is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4"; 419 Function *F = M->getFunction(Name); 420 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 421 422 // If F is not available, declare it. 423 if (!F) { 424 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 425 426 Type *Params[] = {IdentTy->getPointerTo(), 427 Builder.getInt32Ty(), 428 Builder.getInt32Ty(), 429 LongType, 430 LongType, 431 LongType, 432 LongType}; 433 434 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 435 F = Function::Create(Ty, Linkage, Name, M); 436 } 437 438 // The parameter 'ChunkSize' will hold strictly positive integer values, 439 // regardless of PollyChunkSize's value 440 Value *Args[] = { 441 SourceLocationInfo, 442 GlobalThreadID, 443 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 444 LB, 445 UB, 446 Inc, 447 ChunkSize}; 448 449 Builder.CreateCall(F, Args); 450 } 451 452 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, 453 Value *IsLastPtr, 454 Value *LBPtr, 455 Value *UBPtr, 456 Value *StridePtr) { 457 const std::string Name = 458 is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; 459 Function *F = M->getFunction(Name); 460 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 461 462 // If F is not available, declare it. 463 if (!F) { 464 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 465 466 Type *Params[] = {IdentTy->getPointerTo(), 467 Builder.getInt32Ty(), 468 Builder.getInt32Ty()->getPointerTo(), 469 LongType->getPointerTo(), 470 LongType->getPointerTo(), 471 LongType->getPointerTo()}; 472 473 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 474 F = Function::Create(Ty, Linkage, Name, M); 475 } 476 477 Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr, 478 StridePtr}; 479 480 return Builder.CreateCall(F, Args); 481 } 482 483 // TODO: This function currently creates a source location dummy. It might be 484 // necessary to (actually) provide information, in the future. 485 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() { 486 const std::string LocName = ".loc.dummy"; 487 GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName); 488 489 if (SourceLocDummy == nullptr) { 490 const std::string StructName = "struct.ident_t"; 491 StructType *IdentTy = M->getTypeByName(StructName); 492 493 // If the ident_t StructType is not available, declare it. 494 // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* } 495 if (!IdentTy) { 496 Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(), 497 Builder.getInt32Ty(), Builder.getInt32Ty(), 498 Builder.getInt8PtrTy()}; 499 500 IdentTy = 501 StructType::create(M->getContext(), LocMembers, StructName, false); 502 } 503 504 const auto ArrayType = 505 llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23); 506 507 // Global Variable Definitions 508 GlobalVariable *StrVar = new GlobalVariable( 509 *M, ArrayType, true, GlobalValue::PrivateLinkage, 0, ".str.ident"); 510 StrVar->setAlignment(llvm::Align(1)); 511 512 SourceLocDummy = new GlobalVariable( 513 *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName); 514 SourceLocDummy->setAlignment(llvm::Align(8)); 515 516 // Constant Definitions 517 Constant *InitStr = ConstantDataArray::getString( 518 M->getContext(), "Source location dummy.", true); 519 520 Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP( 521 ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)})); 522 523 Constant *LocInitStruct = ConstantStruct::get( 524 IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0), 525 Builder.getInt32(0), StrPtr}); 526 527 // Initialize variables 528 StrVar->setInitializer(InitStr); 529 SourceLocDummy->setInitializer(LocInitStruct); 530 } 531 532 return SourceLocDummy; 533 } 534 535 bool ParallelLoopGeneratorKMP::is64BitArch() { 536 return (LongType->getIntegerBitWidth() == 64); 537 } 538 539 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType( 540 int ChunkSize, OMPGeneralSchedulingType Scheduling) const { 541 if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked) 542 return OMPGeneralSchedulingType::StaticNonChunked; 543 544 return Scheduling; 545 } 546