1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains functions to create parallel loops as LLVM-IR.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "polly/CodeGen/LoopGeneratorsKMP.h"
14 #include "llvm/IR/Dominators.h"
15 #include "llvm/IR/Module.h"
16 
17 using namespace llvm;
18 using namespace polly;
19 
20 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn,
21                                                       Value *SubFnParam,
22                                                       Value *LB, Value *UB,
23                                                       Value *Stride) {
24   const std::string Name = "__kmpc_fork_call";
25   Function *F = M->getFunction(Name);
26   Type *KMPCMicroTy = M->getTypeByName("kmpc_micro");
27 
28   if (!KMPCMicroTy) {
29     // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...)
30     Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(),
31                            Builder.getInt32Ty()->getPointerTo()};
32 
33     KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true);
34   }
35 
36   // If F is not available, declare it.
37   if (!F) {
38     StructType *IdentTy = M->getTypeByName("struct.ident_t");
39 
40     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
41     Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
42                       KMPCMicroTy->getPointerTo()};
43 
44     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true);
45     F = Function::Create(Ty, Linkage, Name, M);
46   }
47 
48   Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast(
49       SubFn, KMPCMicroTy->getPointerTo());
50 
51   Value *Args[] = {SourceLocationInfo,
52                    Builder.getInt32(4) /* Number of arguments (w/o Task) */,
53                    Task,
54                    LB,
55                    UB,
56                    Stride,
57                    SubFnParam};
58 
59   Builder.CreateCall(F, Args);
60 }
61 
62 void ParallelLoopGeneratorKMP::deployParallelExecution(Value *SubFn,
63                                                        Value *SubFnParam,
64                                                        Value *LB, Value *UB,
65                                                        Value *Stride) {
66   // Inform OpenMP runtime about the number of threads if greater than zero
67   if (PollyNumThreads > 0) {
68     Value *GlobalThreadID = createCallGlobalThreadNum();
69     createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads));
70   }
71 
72   // Tell the runtime we start a parallel loop
73   createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
74 }
75 
76 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const {
77   std::vector<Type *> Arguments = {Builder.getInt32Ty()->getPointerTo(),
78                                    Builder.getInt32Ty()->getPointerTo(),
79                                    LongType,
80                                    LongType,
81                                    LongType,
82                                    Builder.getInt8PtrTy()};
83 
84   FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false);
85   Function *SubFn = Function::Create(FT, Function::InternalLinkage,
86                                      F->getName() + "_polly_subfn", M);
87   // Name the function's arguments
88   Function::arg_iterator AI = SubFn->arg_begin();
89   AI->setName("polly.kmpc.global_tid");
90   std::advance(AI, 1);
91   AI->setName("polly.kmpc.bound_tid");
92   std::advance(AI, 1);
93   AI->setName("polly.kmpc.lb");
94   std::advance(AI, 1);
95   AI->setName("polly.kmpc.ub");
96   std::advance(AI, 1);
97   AI->setName("polly.kmpc.inc");
98   std::advance(AI, 1);
99   AI->setName("polly.kmpc.shared");
100 
101   return SubFn;
102 }
103 
104 // Create a subfunction of the following (preliminary) structure:
105 //
106 //    PrevBB
107 //       |
108 //       v
109 //    HeaderBB
110 //       |   _____
111 //       v  v    |
112 //   CheckNextBB  PreHeaderBB
113 //       |\       |
114 //       | \______/
115 //       |
116 //       v
117 //     ExitBB
118 //
119 // HeaderBB will hold allocations, loading of variables and kmp-init calls.
120 // CheckNextBB will check for more work (dynamic) or will be "empty" (static).
121 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB.
122 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on).
123 // Just like CheckNextBB: PreHeaderBB is empty in the static scheduling case.
124 // ExitBB marks the end of the parallel execution.
125 // The possibly empty BasicBlocks will automatically be removed.
126 std::tuple<Value *, Function *>
127 ParallelLoopGeneratorKMP::createSubFn(Value *StrideNotUsed,
128                                       AllocaInst *StructData,
129                                       SetVector<Value *> Data, ValueMapT &Map) {
130   Function *SubFn = createSubFnDefinition();
131   LLVMContext &Context = SubFn->getContext();
132 
133   // Store the previous basic block.
134   BasicBlock *PrevBB = Builder.GetInsertBlock();
135 
136   // Create basic blocks.
137   BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
138   BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
139   BasicBlock *CheckNextBB =
140       BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
141   BasicBlock *PreHeaderBB =
142       BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);
143 
144   DT.addNewBlock(HeaderBB, PrevBB);
145   DT.addNewBlock(ExitBB, HeaderBB);
146   DT.addNewBlock(CheckNextBB, HeaderBB);
147   DT.addNewBlock(PreHeaderBB, HeaderBB);
148 
149   // Fill up basic block HeaderBB.
150   Builder.SetInsertPoint(HeaderBB);
151   Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
152   Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
153   Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
154                                           "polly.par.lastIterPtr");
155   Value *StridePtr =
156       Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr");
157 
158   // Get iterator for retrieving the previously defined parameters.
159   Function::arg_iterator AI = SubFn->arg_begin();
160   // First argument holds "global thread ID".
161   Value *IDPtr = &*AI;
162   // Skip "bound thread ID" since it is not used (but had to be defined).
163   std::advance(AI, 2);
164   // Move iterator to: LB, UB, Stride, Shared variable struct.
165   Value *LB = &*AI;
166   std::advance(AI, 1);
167   Value *UB = &*AI;
168   std::advance(AI, 1);
169   Value *Stride = &*AI;
170   std::advance(AI, 1);
171   Value *Shared = &*AI;
172 
173   Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(),
174                                              "polly.par.userContext");
175 
176   extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext,
177                           Map);
178 
179   const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4);
180   Value *ID =
181       Builder.CreateAlignedLoad(IDPtr, Alignment, "polly.par.global_tid");
182 
183   Builder.CreateAlignedStore(LB, LBPtr, Alignment);
184   Builder.CreateAlignedStore(UB, UBPtr, Alignment);
185   Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment);
186   Builder.CreateAlignedStore(Stride, StridePtr, Alignment);
187 
188   // Subtract one as the upper bound provided by openmp is a < comparison
189   // whereas the codegenForSequential function creates a <= comparison.
190   Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1),
191                                         "polly.indvar.UBAdjusted");
192 
193   Value *ChunkSize =
194       ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1));
195 
196   switch (PollyScheduling) {
197   case OMPGeneralSchedulingType::Dynamic:
198   case OMPGeneralSchedulingType::Guided:
199   case OMPGeneralSchedulingType::Runtime:
200     // "DYNAMIC" scheduling types are handled below (including 'runtime')
201     {
202       UB = AdjustedUB;
203       createCallDispatchInit(ID, LB, UB, Stride, ChunkSize);
204       Value *HasWork =
205           createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
206       Value *HasIteration =
207           Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
208                              Builder.getInt32(1), "polly.hasIteration");
209       Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
210 
211       Builder.SetInsertPoint(CheckNextBB);
212       HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
213       HasIteration =
214           Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
215                              Builder.getInt32(1), "polly.hasWork");
216       Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
217 
218       Builder.SetInsertPoint(PreHeaderBB);
219       LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB");
220       UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB");
221     }
222     break;
223   case OMPGeneralSchedulingType::StaticChunked:
224   case OMPGeneralSchedulingType::StaticNonChunked:
225     // "STATIC" scheduling types are handled below
226     {
227       createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize);
228 
229       LB = Builder.CreateAlignedLoad(LBPtr, Alignment, "polly.indvar.LB");
230       UB = Builder.CreateAlignedLoad(UBPtr, Alignment, "polly.indvar.UB");
231 
232       Value *AdjUBOutOfBounds =
233           Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLT, UB, AdjustedUB,
234                              "polly.adjustedUBOutOfBounds");
235 
236       UB = Builder.CreateSelect(AdjUBOutOfBounds, UB, AdjustedUB);
237       Builder.CreateAlignedStore(UB, UBPtr, Alignment);
238 
239       Value *HasIteration = Builder.CreateICmp(
240           llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration");
241       Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
242 
243       Builder.SetInsertPoint(CheckNextBB);
244       Builder.CreateBr(ExitBB);
245 
246       Builder.SetInsertPoint(PreHeaderBB);
247     }
248     break;
249   }
250 
251   Builder.CreateBr(CheckNextBB);
252   Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
253   BasicBlock *AfterBB;
254   Value *IV = createLoop(LB, UB, Stride, Builder, LI, DT, AfterBB,
255                          ICmpInst::ICMP_SLE, nullptr, true,
256                          /* UseGuard */ false);
257 
258   BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
259 
260   // Add code to terminate this subfunction.
261   Builder.SetInsertPoint(ExitBB);
262   // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call
263   if (PollyScheduling == OMPGeneralSchedulingType::StaticChunked) {
264     createCallStaticFini(ID);
265   }
266   Builder.CreateRetVoid();
267   Builder.SetInsertPoint(&*LoopBody);
268 
269   return std::make_tuple(IV, SubFn);
270 }
271 
272 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() {
273   const std::string Name = "__kmpc_global_thread_num";
274   Function *F = M->getFunction(Name);
275 
276   // If F is not available, declare it.
277   if (!F) {
278     StructType *IdentTy = M->getTypeByName("struct.ident_t");
279 
280     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
281     Type *Params[] = {IdentTy->getPointerTo()};
282 
283     FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false);
284     F = Function::Create(Ty, Linkage, Name, M);
285   }
286 
287   return Builder.CreateCall(F, {SourceLocationInfo});
288 }
289 
290 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID,
291                                                         Value *NumThreads) {
292   const std::string Name = "__kmpc_push_num_threads";
293   Function *F = M->getFunction(Name);
294 
295   // If F is not available, declare it.
296   if (!F) {
297     StructType *IdentTy = M->getTypeByName("struct.ident_t");
298 
299     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
300     Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
301                       Builder.getInt32Ty()};
302 
303     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
304     F = Function::Create(Ty, Linkage, Name, M);
305   }
306 
307   Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads};
308 
309   Builder.CreateCall(F, Args);
310 }
311 
312 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID,
313                                                     Value *IsLastPtr,
314                                                     Value *LBPtr, Value *UBPtr,
315                                                     Value *StridePtr,
316                                                     Value *ChunkSize) {
317   const std::string Name =
318       is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4";
319   Function *F = M->getFunction(Name);
320   StructType *IdentTy = M->getTypeByName("struct.ident_t");
321 
322   // If F is not available, declare it.
323   if (!F) {
324     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
325 
326     Type *Params[] = {IdentTy->getPointerTo(),
327                       Builder.getInt32Ty(),
328                       Builder.getInt32Ty(),
329                       Builder.getInt32Ty()->getPointerTo(),
330                       LongType->getPointerTo(),
331                       LongType->getPointerTo(),
332                       LongType->getPointerTo(),
333                       LongType,
334                       LongType};
335 
336     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
337     F = Function::Create(Ty, Linkage, Name, M);
338   }
339 
340   // The parameter 'ChunkSize' will hold strictly positive integer values,
341   // regardless of PollyChunkSize's value
342   Value *Args[] = {
343       SourceLocationInfo,
344       GlobalThreadID,
345       Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))),
346       IsLastPtr,
347       LBPtr,
348       UBPtr,
349       StridePtr,
350       ConstantInt::get(LongType, 1),
351       ChunkSize};
352 
353   Builder.CreateCall(F, Args);
354 }
355 
356 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) {
357   const std::string Name = "__kmpc_for_static_fini";
358   Function *F = M->getFunction(Name);
359   StructType *IdentTy = M->getTypeByName("struct.ident_t");
360 
361   // If F is not available, declare it.
362   if (!F) {
363     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
364     Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()};
365     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
366     F = Function::Create(Ty, Linkage, Name, M);
367   }
368 
369   Value *Args[] = {SourceLocationInfo, GlobalThreadID};
370 
371   Builder.CreateCall(F, Args);
372 }
373 
374 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID,
375                                                       Value *LB, Value *UB,
376                                                       Value *Inc,
377                                                       Value *ChunkSize) {
378   const std::string Name =
379       is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4";
380   Function *F = M->getFunction(Name);
381   StructType *IdentTy = M->getTypeByName("struct.ident_t");
382 
383   // If F is not available, declare it.
384   if (!F) {
385     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
386 
387     Type *Params[] = {IdentTy->getPointerTo(),
388                       Builder.getInt32Ty(),
389                       Builder.getInt32Ty(),
390                       LongType,
391                       LongType,
392                       LongType,
393                       LongType};
394 
395     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
396     F = Function::Create(Ty, Linkage, Name, M);
397   }
398 
399   // The parameter 'ChunkSize' will hold strictly positive integer values,
400   // regardless of PollyChunkSize's value
401   Value *Args[] = {
402       SourceLocationInfo,
403       GlobalThreadID,
404       Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))),
405       LB,
406       UB,
407       Inc,
408       ChunkSize};
409 
410   Builder.CreateCall(F, Args);
411 }
412 
413 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID,
414                                                         Value *IsLastPtr,
415                                                         Value *LBPtr,
416                                                         Value *UBPtr,
417                                                         Value *StridePtr) {
418   const std::string Name =
419       is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4";
420   Function *F = M->getFunction(Name);
421   StructType *IdentTy = M->getTypeByName("struct.ident_t");
422 
423   // If F is not available, declare it.
424   if (!F) {
425     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
426 
427     Type *Params[] = {IdentTy->getPointerTo(),
428                       Builder.getInt32Ty(),
429                       Builder.getInt32Ty()->getPointerTo(),
430                       LongType->getPointerTo(),
431                       LongType->getPointerTo(),
432                       LongType->getPointerTo()};
433 
434     FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false);
435     F = Function::Create(Ty, Linkage, Name, M);
436   }
437 
438   Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr,
439                    StridePtr};
440 
441   return Builder.CreateCall(F, Args);
442 }
443 
444 // TODO: This function currently creates a source location dummy. It might be
445 // necessary to (actually) provide information, in the future.
446 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() {
447   const std::string LocName = ".loc.dummy";
448   GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName);
449 
450   if (SourceLocDummy == nullptr) {
451     const std::string StructName = "struct.ident_t";
452     StructType *IdentTy = M->getTypeByName(StructName);
453 
454     // If the ident_t StructType is not available, declare it.
455     // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* }
456     if (!IdentTy) {
457       Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(),
458                             Builder.getInt32Ty(), Builder.getInt32Ty(),
459                             Builder.getInt8PtrTy()};
460 
461       IdentTy =
462           StructType::create(M->getContext(), LocMembers, StructName, false);
463     }
464 
465     const auto ArrayType =
466         llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23);
467 
468     // Global Variable Definitions
469     GlobalVariable *StrVar = new GlobalVariable(
470         *M, ArrayType, true, GlobalValue::PrivateLinkage, 0, ".str.ident");
471     StrVar->setAlignment(llvm::Align(1));
472 
473     SourceLocDummy = new GlobalVariable(
474         *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName);
475     SourceLocDummy->setAlignment(llvm::Align(8));
476 
477     // Constant Definitions
478     Constant *InitStr = ConstantDataArray::getString(
479         M->getContext(), "Source location dummy.", true);
480 
481     Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP(
482         ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)}));
483 
484     Constant *LocInitStruct = ConstantStruct::get(
485         IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0),
486                   Builder.getInt32(0), StrPtr});
487 
488     // Initialize variables
489     StrVar->setInitializer(InitStr);
490     SourceLocDummy->setInitializer(LocInitStruct);
491   }
492 
493   return SourceLocDummy;
494 }
495 
496 bool ParallelLoopGeneratorKMP::is64BitArch() {
497   return (LongType->getIntegerBitWidth() == 64);
498 }
499 
500 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType(
501     int ChunkSize, OMPGeneralSchedulingType Scheduling) const {
502   if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked)
503     return OMPGeneralSchedulingType::StaticNonChunked;
504 
505   return Scheduling;
506 }
507