1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains functions to create parallel loops as LLVM-IR.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "polly/CodeGen/LoopGeneratorsKMP.h"
14 #include "polly/Options.h"
15 #include "polly/ScopDetection.h"
16 #include "llvm/Analysis/LoopInfo.h"
17 #include "llvm/IR/DataLayout.h"
18 #include "llvm/IR/Dominators.h"
19 #include "llvm/IR/Module.h"
20 #include "llvm/Support/CommandLine.h"
21 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
22 
23 using namespace llvm;
24 using namespace polly;
25 
26 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn,
27                                                       Value *SubFnParam,
28                                                       Value *LB, Value *UB,
29                                                       Value *Stride) {
30   const std::string Name = "__kmpc_fork_call";
31   Function *F = M->getFunction(Name);
32   Type *KMPCMicroTy = M->getTypeByName("kmpc_micro");
33 
34   if (!KMPCMicroTy) {
35     // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...)
36     Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(),
37                            Builder.getInt32Ty()->getPointerTo()};
38 
39     KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true);
40   }
41 
42   // If F is not available, declare it.
43   if (!F) {
44     StructType *IdentTy = M->getTypeByName("struct.ident_t");
45 
46     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
47     Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
48                       KMPCMicroTy->getPointerTo()};
49 
50     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true);
51     F = Function::Create(Ty, Linkage, Name, M);
52   }
53 
54   Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast(
55       SubFn, KMPCMicroTy->getPointerTo());
56 
57   Value *Args[] = {SourceLocationInfo,
58                    Builder.getInt32(4) /* Number of arguments (w/o Task) */,
59                    Task,
60                    LB,
61                    UB,
62                    Stride,
63                    SubFnParam};
64 
65   Builder.CreateCall(F, Args);
66 }
67 
68 void ParallelLoopGeneratorKMP::deployParallelExecution(Value *SubFn,
69                                                        Value *SubFnParam,
70                                                        Value *LB, Value *UB,
71                                                        Value *Stride) {
72   // Inform OpenMP runtime about the number of threads if greater than zero
73   if (PollyNumThreads > 0) {
74     Value *GlobalThreadID = createCallGlobalThreadNum();
75     createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads));
76   }
77 
78   // Tell the runtime we start a parallel loop
79   createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
80 }
81 
82 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const {
83   std::vector<Type *> Arguments = {Builder.getInt32Ty()->getPointerTo(),
84                                    Builder.getInt32Ty()->getPointerTo(),
85                                    LongType,
86                                    LongType,
87                                    LongType,
88                                    Builder.getInt8PtrTy()};
89 
90   FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false);
91   Function *SubFn = Function::Create(FT, Function::InternalLinkage,
92                                      F->getName() + "_polly_subfn", M);
93   // Name the function's arguments
94   Function::arg_iterator AI = SubFn->arg_begin();
95   AI->setName("polly.kmpc.global_tid");
96   std::advance(AI, 1);
97   AI->setName("polly.kmpc.bound_tid");
98   std::advance(AI, 1);
99   AI->setName("polly.kmpc.lb");
100   std::advance(AI, 1);
101   AI->setName("polly.kmpc.ub");
102   std::advance(AI, 1);
103   AI->setName("polly.kmpc.inc");
104   std::advance(AI, 1);
105   AI->setName("polly.kmpc.shared");
106 
107   return SubFn;
108 }
109 
110 // Create a subfunction of the following (preliminary) structure:
111 //
112 //    PrevBB
113 //       |
114 //       v
115 //    HeaderBB
116 //       |   _____
117 //       v  v    |
118 //   CheckNextBB  PreHeaderBB
119 //       |\       |
120 //       | \______/
121 //       |
122 //       v
123 //     ExitBB
124 //
125 // HeaderBB will hold allocations, loading of variables and kmp-init calls.
126 // CheckNextBB will check for more work (dynamic) or will be "empty" (static).
127 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB.
128 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on).
129 // Just like CheckNextBB: PreHeaderBB is empty in the static scheduling case.
130 // ExitBB marks the end of the parallel execution.
131 // The possibly empty BasicBlocks will automatically be removed.
132 std::tuple<Value *, Function *>
133 ParallelLoopGeneratorKMP::createSubFn(Value *StrideNotUsed,
134                                       AllocaInst *StructData,
135                                       SetVector<Value *> Data, ValueMapT &Map) {
136   Function *SubFn = createSubFnDefinition();
137   LLVMContext &Context = SubFn->getContext();
138 
139   // Store the previous basic block.
140   BasicBlock *PrevBB = Builder.GetInsertBlock();
141 
142   // Create basic blocks.
143   BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
144   BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
145   BasicBlock *CheckNextBB =
146       BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
147   BasicBlock *PreHeaderBB =
148       BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);
149 
150   DT.addNewBlock(HeaderBB, PrevBB);
151   DT.addNewBlock(ExitBB, HeaderBB);
152   DT.addNewBlock(CheckNextBB, HeaderBB);
153   DT.addNewBlock(PreHeaderBB, HeaderBB);
154 
155   // Fill up basic block HeaderBB.
156   Builder.SetInsertPoint(HeaderBB);
157   Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
158   Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
159   Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
160                                           "polly.par.lastIterPtr");
161   Value *StridePtr =
162       Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr");
163 
164   // Get iterator for retrieving the previously defined parameters.
165   Function::arg_iterator AI = SubFn->arg_begin();
166   // First argument holds "global thread ID".
167   Value *IDPtr = &*AI;
168   // Skip "bound thread ID" since it is not used (but had to be defined).
169   std::advance(AI, 2);
170   // Move iterator to: LB, UB, Stride, Shared variable struct.
171   Value *LB = &*AI;
172   std::advance(AI, 1);
173   Value *UB = &*AI;
174   std::advance(AI, 1);
175   Value *Stride = &*AI;
176   std::advance(AI, 1);
177   Value *Shared = &*AI;
178 
179   Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(),
180                                              "polly.par.userContext");
181 
182   extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext,
183                           Map);
184 
185   const int Alignment = (is64BitArch()) ? 8 : 4;
186   Value *ID =
187       Builder.CreateAlignedLoad(IDPtr, Alignment, "polly.par.global_tid");
188 
189   Builder.CreateAlignedStore(LB, LBPtr, Alignment);
190   Builder.CreateAlignedStore(UB, UBPtr, Alignment);
191   Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment);
192   Builder.CreateAlignedStore(Stride, StridePtr, Alignment);
193 
194   // Subtract one as the upper bound provided by openmp is a < comparison
195   // whereas the codegenForSequential function creates a <= comparison.
196   Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1),
197                                         "polly.indvar.UBAdjusted");
198 
199   Value *ChunkSize =
200       ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1));
201 
202   switch (PollyScheduling) {
203   case OMPGeneralSchedulingType::Dynamic:
204   case OMPGeneralSchedulingType::Guided:
205   case OMPGeneralSchedulingType::Runtime:
206     // "DYNAMIC" scheduling types are handled below (including 'runtime')
207     {
208       UB = AdjustedUB;
209       createCallDispatchInit(ID, LB, UB, Stride, ChunkSize);
210       Value *HasWork =
211           createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
212       Value *HasIteration =
213           Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
214                              Builder.getInt32(1), "polly.hasIteration");
215       Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
216 
217       Builder.SetInsertPoint(CheckNextBB);
218       HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
219       HasIteration =
220           Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
221                              Builder.getInt32(1), "polly.hasWork");
222       Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
223 
224       Builder.SetInsertPoint(PreHeaderBB);
225       LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB");
226       UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB");
227     }
228     break;
229   case OMPGeneralSchedulingType::StaticChunked:
230   case OMPGeneralSchedulingType::StaticNonChunked:
231     // "STATIC" scheduling types are handled below
232     {
233       createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize);
234 
235       LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB");
236       UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB");
237 
238       Value *AdjUBOutOfBounds =
239           Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT, UB, AdjustedUB,
240                              "polly.adjustedUBOutOfBounds");
241 
242       UB = Builder.CreateSelect(AdjUBOutOfBounds, UB, AdjustedUB);
243       Builder.CreateAlignedStore(UB, UBPtr, Alignment);
244 
245       Value *HasIteration = Builder.CreateICmp(
246           llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration");
247       Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
248 
249       Builder.SetInsertPoint(CheckNextBB);
250       Builder.CreateBr(ExitBB);
251 
252       Builder.SetInsertPoint(PreHeaderBB);
253     }
254     break;
255   }
256 
257   Builder.CreateBr(CheckNextBB);
258   Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
259   BasicBlock *AfterBB;
260   Value *IV = createLoop(LB, UB, Stride, Builder, LI, DT, AfterBB,
261                          ICmpInst::ICMP_SLE, nullptr, true,
262                          /* UseGuard */ false);
263 
264   BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
265 
266   // Add code to terminate this subfunction.
267   Builder.SetInsertPoint(ExitBB);
268   // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call
269   if (PollyScheduling == OMPGeneralSchedulingType::StaticChunked) {
270     createCallStaticFini(ID);
271   }
272   Builder.CreateRetVoid();
273   Builder.SetInsertPoint(&*LoopBody);
274 
275   return std::make_tuple(IV, SubFn);
276 }
277 
278 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() {
279   const std::string Name = "__kmpc_global_thread_num";
280   Function *F = M->getFunction(Name);
281 
282   // If F is not available, declare it.
283   if (!F) {
284     StructType *IdentTy = M->getTypeByName("struct.ident_t");
285 
286     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
287     Type *Params[] = {IdentTy->getPointerTo()};
288 
289     FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false);
290     F = Function::Create(Ty, Linkage, Name, M);
291   }
292 
293   return Builder.CreateCall(F, {SourceLocationInfo});
294 }
295 
296 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID,
297                                                         Value *NumThreads) {
298   const std::string Name = "__kmpc_push_num_threads";
299   Function *F = M->getFunction(Name);
300 
301   // If F is not available, declare it.
302   if (!F) {
303     StructType *IdentTy = M->getTypeByName("struct.ident_t");
304 
305     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
306     Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
307                       Builder.getInt32Ty()};
308 
309     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
310     F = Function::Create(Ty, Linkage, Name, M);
311   }
312 
313   Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads};
314 
315   Builder.CreateCall(F, Args);
316 }
317 
318 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID,
319                                                     Value *IsLastPtr,
320                                                     Value *LBPtr, Value *UBPtr,
321                                                     Value *StridePtr,
322                                                     Value *ChunkSize) {
323   const std::string Name =
324       is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4";
325   Function *F = M->getFunction(Name);
326   StructType *IdentTy = M->getTypeByName("struct.ident_t");
327 
328   // If F is not available, declare it.
329   if (!F) {
330     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
331 
332     Type *Params[] = {IdentTy->getPointerTo(),
333                       Builder.getInt32Ty(),
334                       Builder.getInt32Ty(),
335                       Builder.getInt32Ty()->getPointerTo(),
336                       LongType->getPointerTo(),
337                       LongType->getPointerTo(),
338                       LongType->getPointerTo(),
339                       LongType,
340                       LongType};
341 
342     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
343     F = Function::Create(Ty, Linkage, Name, M);
344   }
345 
346   // The parameter 'ChunkSize' will hold strictly positive integer values,
347   // regardless of PollyChunkSize's value
348   Value *Args[] = {
349       SourceLocationInfo,
350       GlobalThreadID,
351       Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))),
352       IsLastPtr,
353       LBPtr,
354       UBPtr,
355       StridePtr,
356       ConstantInt::get(LongType, 1),
357       ChunkSize};
358 
359   Builder.CreateCall(F, Args);
360 }
361 
362 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) {
363   const std::string Name = "__kmpc_for_static_fini";
364   Function *F = M->getFunction(Name);
365   StructType *IdentTy = M->getTypeByName("struct.ident_t");
366 
367   // If F is not available, declare it.
368   if (!F) {
369     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
370     Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()};
371     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
372     F = Function::Create(Ty, Linkage, Name, M);
373   }
374 
375   Value *Args[] = {SourceLocationInfo, GlobalThreadID};
376 
377   Builder.CreateCall(F, Args);
378 }
379 
380 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID,
381                                                       Value *LB, Value *UB,
382                                                       Value *Inc,
383                                                       Value *ChunkSize) {
384   const std::string Name =
385       is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4";
386   Function *F = M->getFunction(Name);
387   StructType *IdentTy = M->getTypeByName("struct.ident_t");
388 
389   // If F is not available, declare it.
390   if (!F) {
391     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
392 
393     Type *Params[] = {IdentTy->getPointerTo(),
394                       Builder.getInt32Ty(),
395                       Builder.getInt32Ty(),
396                       LongType,
397                       LongType,
398                       LongType,
399                       LongType};
400 
401     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
402     F = Function::Create(Ty, Linkage, Name, M);
403   }
404 
405   // The parameter 'ChunkSize' will hold strictly positive integer values,
406   // regardless of PollyChunkSize's value
407   Value *Args[] = {
408       SourceLocationInfo,
409       GlobalThreadID,
410       Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))),
411       LB,
412       UB,
413       Inc,
414       ChunkSize};
415 
416   Builder.CreateCall(F, Args);
417 }
418 
419 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID,
420                                                         Value *IsLastPtr,
421                                                         Value *LBPtr,
422                                                         Value *UBPtr,
423                                                         Value *StridePtr) {
424   const std::string Name =
425       is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4";
426   Function *F = M->getFunction(Name);
427   StructType *IdentTy = M->getTypeByName("struct.ident_t");
428 
429   // If F is not available, declare it.
430   if (!F) {
431     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
432 
433     Type *Params[] = {IdentTy->getPointerTo(),
434                       Builder.getInt32Ty(),
435                       Builder.getInt32Ty()->getPointerTo(),
436                       LongType->getPointerTo(),
437                       LongType->getPointerTo(),
438                       LongType->getPointerTo()};
439 
440     FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false);
441     F = Function::Create(Ty, Linkage, Name, M);
442   }
443 
444   Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr,
445                    StridePtr};
446 
447   return Builder.CreateCall(F, Args);
448 }
449 
450 // TODO: This function currently creates a source location dummy. It might be
451 // necessary to (actually) provide information, in the future.
452 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() {
453   const std::string LocName = ".loc.dummy";
454   GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName);
455 
456   if (SourceLocDummy == nullptr) {
457     const std::string StructName = "struct.ident_t";
458     StructType *IdentTy = M->getTypeByName(StructName);
459 
460     // If the ident_t StructType is not available, declare it.
461     // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* }
462     if (!IdentTy) {
463       Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(),
464                             Builder.getInt32Ty(), Builder.getInt32Ty(),
465                             Builder.getInt8PtrTy()};
466 
467       IdentTy =
468           StructType::create(M->getContext(), LocMembers, StructName, false);
469     }
470 
471     const auto ArrayType =
472         llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23);
473 
474     // Global Variable Definitions
475     GlobalVariable *StrVar = new GlobalVariable(
476         *M, ArrayType, true, GlobalValue::PrivateLinkage, 0, ".str.ident");
477     StrVar->setAlignment(1);
478 
479     SourceLocDummy = new GlobalVariable(
480         *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName);
481     SourceLocDummy->setAlignment(8);
482 
483     // Constant Definitions
484     Constant *InitStr = ConstantDataArray::getString(
485         M->getContext(), "Source location dummy.", true);
486 
487     Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP(
488         ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)}));
489 
490     Constant *LocInitStruct = ConstantStruct::get(
491         IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0),
492                   Builder.getInt32(0), StrPtr});
493 
494     // Initialize variables
495     StrVar->setInitializer(InitStr);
496     SourceLocDummy->setInitializer(LocInitStruct);
497   }
498 
499   return SourceLocDummy;
500 }
501 
502 bool ParallelLoopGeneratorKMP::is64BitArch() {
503   return (LongType->getIntegerBitWidth() == 64);
504 }
505 
506 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType(
507     int ChunkSize, OMPGeneralSchedulingType Scheduling) const {
508   if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked)
509     return OMPGeneralSchedulingType::StaticNonChunked;
510 
511   return Scheduling;
512 }
513