1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to create parallel loops as LLVM-IR. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "polly/CodeGen/LoopGeneratorsKMP.h" 14 #include "llvm/IR/Dominators.h" 15 #include "llvm/IR/Module.h" 16 17 using namespace llvm; 18 using namespace polly; 19 20 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, 21 Value *SubFnParam, 22 Value *LB, Value *UB, 23 Value *Stride) { 24 const std::string Name = "__kmpc_fork_call"; 25 Function *F = M->getFunction(Name); 26 Type *KMPCMicroTy = StructType::getTypeByName(M->getContext(), "kmpc_micro"); 27 28 if (!KMPCMicroTy) { 29 // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) 30 Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(), 31 Builder.getInt32Ty()->getPointerTo()}; 32 33 KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); 34 } 35 36 // If F is not available, declare it. 37 if (!F) { 38 StructType *IdentTy = 39 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 40 41 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 42 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 43 KMPCMicroTy->getPointerTo()}; 44 45 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); 46 F = Function::Create(Ty, Linkage, Name, M); 47 } 48 49 Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast( 50 SubFn, KMPCMicroTy->getPointerTo()); 51 52 Value *Args[] = {SourceLocationInfo, 53 Builder.getInt32(4) /* Number of arguments (w/o Task) */, 54 Task, 55 LB, 56 UB, 57 Stride, 58 SubFnParam}; 59 60 Builder.CreateCall(F, Args); 61 } 62 63 void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn, 64 Value *SubFnParam, 65 Value *LB, Value *UB, 66 Value *Stride) { 67 // Inform OpenMP runtime about the number of threads if greater than zero 68 if (PollyNumThreads > 0) { 69 Value *GlobalThreadID = createCallGlobalThreadNum(); 70 createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads)); 71 } 72 73 // Tell the runtime we start a parallel loop 74 createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride); 75 } 76 77 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { 78 std::vector<Type *> Arguments = {Builder.getInt32Ty()->getPointerTo(), 79 Builder.getInt32Ty()->getPointerTo(), 80 LongType, 81 LongType, 82 LongType, 83 Builder.getInt8PtrTy()}; 84 85 FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); 86 Function *SubFn = Function::Create(FT, Function::InternalLinkage, 87 F->getName() + "_polly_subfn", M); 88 // Name the function's arguments 89 Function::arg_iterator AI = SubFn->arg_begin(); 90 AI->setName("polly.kmpc.global_tid"); 91 std::advance(AI, 1); 92 AI->setName("polly.kmpc.bound_tid"); 93 std::advance(AI, 1); 94 AI->setName("polly.kmpc.lb"); 95 std::advance(AI, 1); 96 AI->setName("polly.kmpc.ub"); 97 std::advance(AI, 1); 98 AI->setName("polly.kmpc.inc"); 99 std::advance(AI, 1); 100 AI->setName("polly.kmpc.shared"); 101 102 return SubFn; 103 } 104 105 // Create a subfunction of the following (preliminary) structure: 106 // 107 // PrevBB 108 // | 109 // v 110 // HeaderBB 111 // / | _____ 112 // / v v | 113 // / PreHeaderBB | 114 // | | | 115 // | v | 116 // | CheckNextBB | 117 // \ | \_____/ 118 // \ | 119 // v v 120 // ExitBB 121 // 122 // HeaderBB will hold allocations, loading of variables and kmp-init calls. 123 // CheckNextBB will check for more work (dynamic / static chunked) or will be 124 // empty (static non chunked). 125 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB. 126 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on). 127 // Just like CheckNextBB: PreHeaderBB is (preliminary) empty in the static non 128 // chunked scheduling case. ExitBB marks the end of the parallel execution. 129 // The possibly empty BasicBlocks will automatically be removed. 130 std::tuple<Value *, Function *> 131 ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride, 132 AllocaInst *StructData, 133 SetVector<Value *> Data, ValueMapT &Map) { 134 Function *SubFn = createSubFnDefinition(); 135 LLVMContext &Context = SubFn->getContext(); 136 137 // Store the previous basic block. 138 BasicBlock *PrevBB = Builder.GetInsertBlock(); 139 140 // Create basic blocks. 141 BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn); 142 BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn); 143 BasicBlock *CheckNextBB = 144 BasicBlock::Create(Context, "polly.par.checkNext", SubFn); 145 BasicBlock *PreHeaderBB = 146 BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn); 147 148 DT.addNewBlock(HeaderBB, PrevBB); 149 DT.addNewBlock(ExitBB, HeaderBB); 150 DT.addNewBlock(CheckNextBB, HeaderBB); 151 DT.addNewBlock(PreHeaderBB, HeaderBB); 152 153 // Fill up basic block HeaderBB. 154 Builder.SetInsertPoint(HeaderBB); 155 Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr"); 156 Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr"); 157 Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, 158 "polly.par.lastIterPtr"); 159 Value *StridePtr = 160 Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr"); 161 162 // Get iterator for retrieving the previously defined parameters. 163 Function::arg_iterator AI = SubFn->arg_begin(); 164 // First argument holds "global thread ID". 165 Value *IDPtr = &*AI; 166 // Skip "bound thread ID" since it is not used (but had to be defined). 167 std::advance(AI, 2); 168 // Move iterator to: LB, UB, Stride, Shared variable struct. 169 Value *LB = &*AI; 170 std::advance(AI, 1); 171 Value *UB = &*AI; 172 std::advance(AI, 1); 173 Value *Stride = &*AI; 174 std::advance(AI, 1); 175 Value *Shared = &*AI; 176 177 Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(), 178 "polly.par.userContext"); 179 180 extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext, 181 Map); 182 183 const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4); 184 Value *ID = 185 Builder.CreateAlignedLoad(IDPtr, Alignment, "polly.par.global_tid"); 186 187 Builder.CreateAlignedStore(LB, LBPtr, Alignment); 188 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 189 Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment); 190 Builder.CreateAlignedStore(Stride, StridePtr, Alignment); 191 192 // Subtract one as the upper bound provided by openmp is a < comparison 193 // whereas the codegenForSequential function creates a <= comparison. 194 Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1), 195 "polly.indvar.UBAdjusted"); 196 197 Value *ChunkSize = 198 ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1)); 199 200 OMPGeneralSchedulingType Scheduling = 201 getSchedType(PollyChunkSize, PollyScheduling); 202 203 switch (Scheduling) { 204 case OMPGeneralSchedulingType::Dynamic: 205 case OMPGeneralSchedulingType::Guided: 206 case OMPGeneralSchedulingType::Runtime: 207 // "DYNAMIC" scheduling types are handled below (including 'runtime') 208 { 209 UB = AdjustedUB; 210 createCallDispatchInit(ID, LB, UB, Stride, ChunkSize); 211 Value *HasWork = 212 createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 213 Value *HasIteration = 214 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 215 Builder.getInt32(1), "polly.hasIteration"); 216 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 217 218 Builder.SetInsertPoint(CheckNextBB); 219 HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 220 HasIteration = 221 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 222 Builder.getInt32(1), "polly.hasWork"); 223 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 224 225 Builder.SetInsertPoint(PreHeaderBB); 226 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 227 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB"); 228 } 229 break; 230 case OMPGeneralSchedulingType::StaticChunked: 231 case OMPGeneralSchedulingType::StaticNonChunked: 232 // "STATIC" scheduling types are handled below 233 { 234 Builder.CreateAlignedStore(AdjustedUB, UBPtr, Alignment); 235 createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize); 236 237 Value *ChunkedStride = 238 Builder.CreateAlignedLoad(StridePtr, Alignment, "polly.kmpc.stride"); 239 240 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 241 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB.temp"); 242 243 Value *UBInRange = 244 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, UB, AdjustedUB, 245 "polly.indvar.UB.inRange"); 246 UB = Builder.CreateSelect(UBInRange, UB, AdjustedUB, "polly.indvar.UB"); 247 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 248 249 Value *HasIteration = Builder.CreateICmp( 250 llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration"); 251 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 252 253 if (Scheduling == OMPGeneralSchedulingType::StaticChunked) { 254 Builder.SetInsertPoint(PreHeaderBB); 255 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, 256 "polly.indvar.LB.entry"); 257 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, 258 "polly.indvar.UB.entry"); 259 } 260 261 Builder.SetInsertPoint(CheckNextBB); 262 263 if (Scheduling == OMPGeneralSchedulingType::StaticChunked) { 264 Value *NextLB = 265 Builder.CreateAdd(LB, ChunkedStride, "polly.indvar.nextLB"); 266 Value *NextUB = Builder.CreateAdd(UB, ChunkedStride); 267 268 Value *NextUBOutOfBounds = 269 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SGT, NextUB, 270 AdjustedUB, "polly.indvar.nextUB.outOfBounds"); 271 NextUB = Builder.CreateSelect(NextUBOutOfBounds, AdjustedUB, NextUB, 272 "polly.indvar.nextUB"); 273 274 Builder.CreateAlignedStore(NextLB, LBPtr, Alignment); 275 Builder.CreateAlignedStore(NextUB, UBPtr, Alignment); 276 277 Value *HasWork = 278 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, NextLB, 279 AdjustedUB, "polly.hasWork"); 280 Builder.CreateCondBr(HasWork, PreHeaderBB, ExitBB); 281 } else { 282 Builder.CreateBr(ExitBB); 283 } 284 285 Builder.SetInsertPoint(PreHeaderBB); 286 } 287 break; 288 } 289 290 Builder.CreateBr(CheckNextBB); 291 Builder.SetInsertPoint(&*--Builder.GetInsertPoint()); 292 BasicBlock *AfterBB; 293 Value *IV = createLoop(LB, UB, SequentialLoopStride, Builder, LI, DT, AfterBB, 294 ICmpInst::ICMP_SLE, nullptr, true, 295 /* UseGuard */ false); 296 297 BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); 298 299 // Add code to terminate this subfunction. 300 Builder.SetInsertPoint(ExitBB); 301 // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call 302 if (Scheduling == OMPGeneralSchedulingType::StaticChunked || 303 Scheduling == OMPGeneralSchedulingType::StaticNonChunked) { 304 createCallStaticFini(ID); 305 } 306 Builder.CreateRetVoid(); 307 Builder.SetInsertPoint(&*LoopBody); 308 309 return std::make_tuple(IV, SubFn); 310 } 311 312 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { 313 const std::string Name = "__kmpc_global_thread_num"; 314 Function *F = M->getFunction(Name); 315 316 // If F is not available, declare it. 317 if (!F) { 318 StructType *IdentTy = 319 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 320 321 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 322 Type *Params[] = {IdentTy->getPointerTo()}; 323 324 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 325 F = Function::Create(Ty, Linkage, Name, M); 326 } 327 328 return Builder.CreateCall(F, {SourceLocationInfo}); 329 } 330 331 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, 332 Value *NumThreads) { 333 const std::string Name = "__kmpc_push_num_threads"; 334 Function *F = M->getFunction(Name); 335 336 // If F is not available, declare it. 337 if (!F) { 338 StructType *IdentTy = 339 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 340 341 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 342 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 343 Builder.getInt32Ty()}; 344 345 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 346 F = Function::Create(Ty, Linkage, Name, M); 347 } 348 349 Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads}; 350 351 Builder.CreateCall(F, Args); 352 } 353 354 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, 355 Value *IsLastPtr, 356 Value *LBPtr, Value *UBPtr, 357 Value *StridePtr, 358 Value *ChunkSize) { 359 const std::string Name = 360 is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; 361 Function *F = M->getFunction(Name); 362 StructType *IdentTy = 363 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 364 365 // If F is not available, declare it. 366 if (!F) { 367 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 368 369 Type *Params[] = {IdentTy->getPointerTo(), 370 Builder.getInt32Ty(), 371 Builder.getInt32Ty(), 372 Builder.getInt32Ty()->getPointerTo(), 373 LongType->getPointerTo(), 374 LongType->getPointerTo(), 375 LongType->getPointerTo(), 376 LongType, 377 LongType}; 378 379 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 380 F = Function::Create(Ty, Linkage, Name, M); 381 } 382 383 // The parameter 'ChunkSize' will hold strictly positive integer values, 384 // regardless of PollyChunkSize's value 385 Value *Args[] = { 386 SourceLocationInfo, 387 GlobalThreadID, 388 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 389 IsLastPtr, 390 LBPtr, 391 UBPtr, 392 StridePtr, 393 ConstantInt::get(LongType, 1), 394 ChunkSize}; 395 396 Builder.CreateCall(F, Args); 397 } 398 399 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { 400 const std::string Name = "__kmpc_for_static_fini"; 401 Function *F = M->getFunction(Name); 402 StructType *IdentTy = 403 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 404 405 // If F is not available, declare it. 406 if (!F) { 407 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 408 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()}; 409 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 410 F = Function::Create(Ty, Linkage, Name, M); 411 } 412 413 Value *Args[] = {SourceLocationInfo, GlobalThreadID}; 414 415 Builder.CreateCall(F, Args); 416 } 417 418 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID, 419 Value *LB, Value *UB, 420 Value *Inc, 421 Value *ChunkSize) { 422 const std::string Name = 423 is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4"; 424 Function *F = M->getFunction(Name); 425 StructType *IdentTy = 426 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 427 428 // If F is not available, declare it. 429 if (!F) { 430 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 431 432 Type *Params[] = {IdentTy->getPointerTo(), 433 Builder.getInt32Ty(), 434 Builder.getInt32Ty(), 435 LongType, 436 LongType, 437 LongType, 438 LongType}; 439 440 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 441 F = Function::Create(Ty, Linkage, Name, M); 442 } 443 444 // The parameter 'ChunkSize' will hold strictly positive integer values, 445 // regardless of PollyChunkSize's value 446 Value *Args[] = { 447 SourceLocationInfo, 448 GlobalThreadID, 449 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 450 LB, 451 UB, 452 Inc, 453 ChunkSize}; 454 455 Builder.CreateCall(F, Args); 456 } 457 458 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, 459 Value *IsLastPtr, 460 Value *LBPtr, 461 Value *UBPtr, 462 Value *StridePtr) { 463 const std::string Name = 464 is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; 465 Function *F = M->getFunction(Name); 466 StructType *IdentTy = 467 StructType::getTypeByName(M->getContext(), "struct.ident_t"); 468 469 // If F is not available, declare it. 470 if (!F) { 471 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 472 473 Type *Params[] = {IdentTy->getPointerTo(), 474 Builder.getInt32Ty(), 475 Builder.getInt32Ty()->getPointerTo(), 476 LongType->getPointerTo(), 477 LongType->getPointerTo(), 478 LongType->getPointerTo()}; 479 480 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 481 F = Function::Create(Ty, Linkage, Name, M); 482 } 483 484 Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr, 485 StridePtr}; 486 487 return Builder.CreateCall(F, Args); 488 } 489 490 // TODO: This function currently creates a source location dummy. It might be 491 // necessary to (actually) provide information, in the future. 492 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() { 493 const std::string LocName = ".loc.dummy"; 494 GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName); 495 496 if (SourceLocDummy == nullptr) { 497 const std::string StructName = "struct.ident_t"; 498 StructType *IdentTy = 499 StructType::getTypeByName(M->getContext(), StructName); 500 501 // If the ident_t StructType is not available, declare it. 502 // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* } 503 if (!IdentTy) { 504 Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(), 505 Builder.getInt32Ty(), Builder.getInt32Ty(), 506 Builder.getInt8PtrTy()}; 507 508 IdentTy = 509 StructType::create(M->getContext(), LocMembers, StructName, false); 510 } 511 512 const auto ArrayType = 513 llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23); 514 515 // Global Variable Definitions 516 GlobalVariable *StrVar = new GlobalVariable( 517 *M, ArrayType, true, GlobalValue::PrivateLinkage, 0, ".str.ident"); 518 StrVar->setAlignment(llvm::Align(1)); 519 520 SourceLocDummy = new GlobalVariable( 521 *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName); 522 SourceLocDummy->setAlignment(llvm::Align(8)); 523 524 // Constant Definitions 525 Constant *InitStr = ConstantDataArray::getString( 526 M->getContext(), "Source location dummy.", true); 527 528 Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP( 529 ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)})); 530 531 Constant *LocInitStruct = ConstantStruct::get( 532 IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0), 533 Builder.getInt32(0), StrPtr}); 534 535 // Initialize variables 536 StrVar->setInitializer(InitStr); 537 SourceLocDummy->setInitializer(LocInitStruct); 538 } 539 540 return SourceLocDummy; 541 } 542 543 bool ParallelLoopGeneratorKMP::is64BitArch() { 544 return (LongType->getIntegerBitWidth() == 64); 545 } 546 547 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType( 548 int ChunkSize, OMPGeneralSchedulingType Scheduling) const { 549 if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked) 550 return OMPGeneralSchedulingType::StaticNonChunked; 551 552 return Scheduling; 553 } 554