1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to create parallel loops as LLVM-IR. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "polly/CodeGen/LoopGeneratorsKMP.h" 14 #include "llvm/IR/Dominators.h" 15 #include "llvm/IR/Module.h" 16 17 using namespace llvm; 18 using namespace polly; 19 20 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, 21 Value *SubFnParam, 22 Value *LB, Value *UB, 23 Value *Stride) { 24 const std::string Name = "__kmpc_fork_call"; 25 Function *F = M->getFunction(Name); 26 Type *KMPCMicroTy = M->getTypeByName("kmpc_micro"); 27 28 if (!KMPCMicroTy) { 29 // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) 30 Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(), 31 Builder.getInt32Ty()->getPointerTo()}; 32 33 KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); 34 } 35 36 // If F is not available, declare it. 37 if (!F) { 38 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 39 40 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 41 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 42 KMPCMicroTy->getPointerTo()}; 43 44 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); 45 F = Function::Create(Ty, Linkage, Name, M); 46 } 47 48 Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast( 49 SubFn, KMPCMicroTy->getPointerTo()); 50 51 Value *Args[] = {SourceLocationInfo, 52 Builder.getInt32(4) /* Number of arguments (w/o Task) */, 53 Task, 54 LB, 55 UB, 56 Stride, 57 SubFnParam}; 58 59 Builder.CreateCall(F, Args); 60 } 61 62 void ParallelLoopGeneratorKMP::deployParallelExecution(Value *SubFn, 63 Value *SubFnParam, 64 Value *LB, Value *UB, 65 Value *Stride) { 66 // Inform OpenMP runtime about the number of threads if greater than zero 67 if (PollyNumThreads > 0) { 68 Value *GlobalThreadID = createCallGlobalThreadNum(); 69 createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads)); 70 } 71 72 // Tell the runtime we start a parallel loop 73 createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride); 74 } 75 76 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { 77 std::vector<Type *> Arguments = {Builder.getInt32Ty()->getPointerTo(), 78 Builder.getInt32Ty()->getPointerTo(), 79 LongType, 80 LongType, 81 LongType, 82 Builder.getInt8PtrTy()}; 83 84 FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); 85 Function *SubFn = Function::Create(FT, Function::InternalLinkage, 86 F->getName() + "_polly_subfn", M); 87 // Name the function's arguments 88 Function::arg_iterator AI = SubFn->arg_begin(); 89 AI->setName("polly.kmpc.global_tid"); 90 std::advance(AI, 1); 91 AI->setName("polly.kmpc.bound_tid"); 92 std::advance(AI, 1); 93 AI->setName("polly.kmpc.lb"); 94 std::advance(AI, 1); 95 AI->setName("polly.kmpc.ub"); 96 std::advance(AI, 1); 97 AI->setName("polly.kmpc.inc"); 98 std::advance(AI, 1); 99 AI->setName("polly.kmpc.shared"); 100 101 return SubFn; 102 } 103 104 // Create a subfunction of the following (preliminary) structure: 105 // 106 // PrevBB 107 // | 108 // v 109 // HeaderBB 110 // | _____ 111 // v v | 112 // CheckNextBB PreHeaderBB 113 // |\ | 114 // | \______/ 115 // | 116 // v 117 // ExitBB 118 // 119 // HeaderBB will hold allocations, loading of variables and kmp-init calls. 120 // CheckNextBB will check for more work (dynamic) or will be "empty" (static). 121 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB. 122 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on). 123 // Just like CheckNextBB: PreHeaderBB is empty in the static scheduling case. 124 // ExitBB marks the end of the parallel execution. 125 // The possibly empty BasicBlocks will automatically be removed. 126 std::tuple<Value *, Function *> 127 ParallelLoopGeneratorKMP::createSubFn(Value *StrideNotUsed, 128 AllocaInst *StructData, 129 SetVector<Value *> Data, ValueMapT &Map) { 130 Function *SubFn = createSubFnDefinition(); 131 LLVMContext &Context = SubFn->getContext(); 132 133 // Store the previous basic block. 134 BasicBlock *PrevBB = Builder.GetInsertBlock(); 135 136 // Create basic blocks. 137 BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn); 138 BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn); 139 BasicBlock *CheckNextBB = 140 BasicBlock::Create(Context, "polly.par.checkNext", SubFn); 141 BasicBlock *PreHeaderBB = 142 BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn); 143 144 DT.addNewBlock(HeaderBB, PrevBB); 145 DT.addNewBlock(ExitBB, HeaderBB); 146 DT.addNewBlock(CheckNextBB, HeaderBB); 147 DT.addNewBlock(PreHeaderBB, HeaderBB); 148 149 // Fill up basic block HeaderBB. 150 Builder.SetInsertPoint(HeaderBB); 151 Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr"); 152 Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr"); 153 Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, 154 "polly.par.lastIterPtr"); 155 Value *StridePtr = 156 Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr"); 157 158 // Get iterator for retrieving the previously defined parameters. 159 Function::arg_iterator AI = SubFn->arg_begin(); 160 // First argument holds "global thread ID". 161 Value *IDPtr = &*AI; 162 // Skip "bound thread ID" since it is not used (but had to be defined). 163 std::advance(AI, 2); 164 // Move iterator to: LB, UB, Stride, Shared variable struct. 165 Value *LB = &*AI; 166 std::advance(AI, 1); 167 Value *UB = &*AI; 168 std::advance(AI, 1); 169 Value *Stride = &*AI; 170 std::advance(AI, 1); 171 Value *Shared = &*AI; 172 173 Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(), 174 "polly.par.userContext"); 175 176 extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext, 177 Map); 178 179 const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4); 180 Value *ID = 181 Builder.CreateAlignedLoad(IDPtr, Alignment, "polly.par.global_tid"); 182 183 Builder.CreateAlignedStore(LB, LBPtr, Alignment); 184 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 185 Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment); 186 Builder.CreateAlignedStore(Stride, StridePtr, Alignment); 187 188 // Subtract one as the upper bound provided by openmp is a < comparison 189 // whereas the codegenForSequential function creates a <= comparison. 190 Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1), 191 "polly.indvar.UBAdjusted"); 192 193 Value *ChunkSize = 194 ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1)); 195 196 switch (PollyScheduling) { 197 case OMPGeneralSchedulingType::Dynamic: 198 case OMPGeneralSchedulingType::Guided: 199 case OMPGeneralSchedulingType::Runtime: 200 // "DYNAMIC" scheduling types are handled below (including 'runtime') 201 { 202 UB = AdjustedUB; 203 createCallDispatchInit(ID, LB, UB, Stride, ChunkSize); 204 Value *HasWork = 205 createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 206 Value *HasIteration = 207 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 208 Builder.getInt32(1), "polly.hasIteration"); 209 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 210 211 Builder.SetInsertPoint(CheckNextBB); 212 HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 213 HasIteration = 214 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 215 Builder.getInt32(1), "polly.hasWork"); 216 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 217 218 Builder.SetInsertPoint(PreHeaderBB); 219 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 220 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB"); 221 } 222 break; 223 case OMPGeneralSchedulingType::StaticChunked: 224 case OMPGeneralSchedulingType::StaticNonChunked: 225 // "STATIC" scheduling types are handled below 226 { 227 createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize); 228 229 LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB"); 230 UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB"); 231 232 Value *AdjUBOutOfBounds = 233 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT, UB, AdjustedUB, 234 "polly.adjustedUBOutOfBounds"); 235 236 UB = Builder.CreateSelect(AdjUBOutOfBounds, UB, AdjustedUB); 237 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 238 239 Value *HasIteration = Builder.CreateICmp( 240 llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration"); 241 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 242 243 Builder.SetInsertPoint(CheckNextBB); 244 Builder.CreateBr(ExitBB); 245 246 Builder.SetInsertPoint(PreHeaderBB); 247 } 248 break; 249 } 250 251 Builder.CreateBr(CheckNextBB); 252 Builder.SetInsertPoint(&*--Builder.GetInsertPoint()); 253 BasicBlock *AfterBB; 254 Value *IV = createLoop(LB, UB, Stride, Builder, LI, DT, AfterBB, 255 ICmpInst::ICMP_SLE, nullptr, true, 256 /* UseGuard */ false); 257 258 BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); 259 260 // Add code to terminate this subfunction. 261 Builder.SetInsertPoint(ExitBB); 262 // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call 263 if (PollyScheduling == OMPGeneralSchedulingType::StaticChunked) { 264 createCallStaticFini(ID); 265 } 266 Builder.CreateRetVoid(); 267 Builder.SetInsertPoint(&*LoopBody); 268 269 return std::make_tuple(IV, SubFn); 270 } 271 272 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { 273 const std::string Name = "__kmpc_global_thread_num"; 274 Function *F = M->getFunction(Name); 275 276 // If F is not available, declare it. 277 if (!F) { 278 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 279 280 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 281 Type *Params[] = {IdentTy->getPointerTo()}; 282 283 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 284 F = Function::Create(Ty, Linkage, Name, M); 285 } 286 287 return Builder.CreateCall(F, {SourceLocationInfo}); 288 } 289 290 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, 291 Value *NumThreads) { 292 const std::string Name = "__kmpc_push_num_threads"; 293 Function *F = M->getFunction(Name); 294 295 // If F is not available, declare it. 296 if (!F) { 297 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 298 299 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 300 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), 301 Builder.getInt32Ty()}; 302 303 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 304 F = Function::Create(Ty, Linkage, Name, M); 305 } 306 307 Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads}; 308 309 Builder.CreateCall(F, Args); 310 } 311 312 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, 313 Value *IsLastPtr, 314 Value *LBPtr, Value *UBPtr, 315 Value *StridePtr, 316 Value *ChunkSize) { 317 const std::string Name = 318 is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; 319 Function *F = M->getFunction(Name); 320 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 321 322 // If F is not available, declare it. 323 if (!F) { 324 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 325 326 Type *Params[] = {IdentTy->getPointerTo(), 327 Builder.getInt32Ty(), 328 Builder.getInt32Ty(), 329 Builder.getInt32Ty()->getPointerTo(), 330 LongType->getPointerTo(), 331 LongType->getPointerTo(), 332 LongType->getPointerTo(), 333 LongType, 334 LongType}; 335 336 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 337 F = Function::Create(Ty, Linkage, Name, M); 338 } 339 340 // The parameter 'ChunkSize' will hold strictly positive integer values, 341 // regardless of PollyChunkSize's value 342 Value *Args[] = { 343 SourceLocationInfo, 344 GlobalThreadID, 345 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 346 IsLastPtr, 347 LBPtr, 348 UBPtr, 349 StridePtr, 350 ConstantInt::get(LongType, 1), 351 ChunkSize}; 352 353 Builder.CreateCall(F, Args); 354 } 355 356 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { 357 const std::string Name = "__kmpc_for_static_fini"; 358 Function *F = M->getFunction(Name); 359 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 360 361 // If F is not available, declare it. 362 if (!F) { 363 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 364 Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()}; 365 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 366 F = Function::Create(Ty, Linkage, Name, M); 367 } 368 369 Value *Args[] = {SourceLocationInfo, GlobalThreadID}; 370 371 Builder.CreateCall(F, Args); 372 } 373 374 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID, 375 Value *LB, Value *UB, 376 Value *Inc, 377 Value *ChunkSize) { 378 const std::string Name = 379 is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4"; 380 Function *F = M->getFunction(Name); 381 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 382 383 // If F is not available, declare it. 384 if (!F) { 385 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 386 387 Type *Params[] = {IdentTy->getPointerTo(), 388 Builder.getInt32Ty(), 389 Builder.getInt32Ty(), 390 LongType, 391 LongType, 392 LongType, 393 LongType}; 394 395 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 396 F = Function::Create(Ty, Linkage, Name, M); 397 } 398 399 // The parameter 'ChunkSize' will hold strictly positive integer values, 400 // regardless of PollyChunkSize's value 401 Value *Args[] = { 402 SourceLocationInfo, 403 GlobalThreadID, 404 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 405 LB, 406 UB, 407 Inc, 408 ChunkSize}; 409 410 Builder.CreateCall(F, Args); 411 } 412 413 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, 414 Value *IsLastPtr, 415 Value *LBPtr, 416 Value *UBPtr, 417 Value *StridePtr) { 418 const std::string Name = 419 is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; 420 Function *F = M->getFunction(Name); 421 StructType *IdentTy = M->getTypeByName("struct.ident_t"); 422 423 // If F is not available, declare it. 424 if (!F) { 425 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 426 427 Type *Params[] = {IdentTy->getPointerTo(), 428 Builder.getInt32Ty(), 429 Builder.getInt32Ty()->getPointerTo(), 430 LongType->getPointerTo(), 431 LongType->getPointerTo(), 432 LongType->getPointerTo()}; 433 434 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 435 F = Function::Create(Ty, Linkage, Name, M); 436 } 437 438 Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr, 439 StridePtr}; 440 441 return Builder.CreateCall(F, Args); 442 } 443 444 // TODO: This function currently creates a source location dummy. It might be 445 // necessary to (actually) provide information, in the future. 446 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() { 447 const std::string LocName = ".loc.dummy"; 448 GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName); 449 450 if (SourceLocDummy == nullptr) { 451 const std::string StructName = "struct.ident_t"; 452 StructType *IdentTy = M->getTypeByName(StructName); 453 454 // If the ident_t StructType is not available, declare it. 455 // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* } 456 if (!IdentTy) { 457 Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(), 458 Builder.getInt32Ty(), Builder.getInt32Ty(), 459 Builder.getInt8PtrTy()}; 460 461 IdentTy = 462 StructType::create(M->getContext(), LocMembers, StructName, false); 463 } 464 465 const auto ArrayType = 466 llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23); 467 468 // Global Variable Definitions 469 GlobalVariable *StrVar = new GlobalVariable( 470 *M, ArrayType, true, GlobalValue::PrivateLinkage, 0, ".str.ident"); 471 StrVar->setAlignment(llvm::Align(1)); 472 473 SourceLocDummy = new GlobalVariable( 474 *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName); 475 SourceLocDummy->setAlignment(llvm::Align(8)); 476 477 // Constant Definitions 478 Constant *InitStr = ConstantDataArray::getString( 479 M->getContext(), "Source location dummy.", true); 480 481 Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP( 482 ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)})); 483 484 Constant *LocInitStruct = ConstantStruct::get( 485 IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0), 486 Builder.getInt32(0), StrPtr}); 487 488 // Initialize variables 489 StrVar->setInitializer(InitStr); 490 SourceLocDummy->setInitializer(LocInitStruct); 491 } 492 493 return SourceLocDummy; 494 } 495 496 bool ParallelLoopGeneratorKMP::is64BitArch() { 497 return (LongType->getIntegerBitWidth() == 64); 498 } 499 500 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType( 501 int ChunkSize, OMPGeneralSchedulingType Scheduling) const { 502 if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked) 503 return OMPGeneralSchedulingType::StaticNonChunked; 504 505 return Scheduling; 506 } 507