//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

// TODO:
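// Per-thread tracker for a dynamically scheduled (dispatch) loop: it records
// the chunk size, the loop upper bound, the next lower bound to hand out, the
// stride, and the schedule type, plus a link to the enclosing tracker so that
// nested dispatch loops can push/pop their state (see pushDST/popDST below).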
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp begin declare target device_type(nohost)

// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] plower pointer to loop lower bound; it will contain the
   *  lower bound of the first chunk
   *  @param[in,out] pupper pointer to loop upper bound; it will contain the
   *  upper bound of the first chunk
   *  @param[in,out] pstride pointer to loop stride; it will contain the stride
   *  between two successive chunks executed by the same thread
   *  @param[in] incr loop increment bump
   *  @param[in] chunk chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // each thread executes multiple chunks all of the same size, except
    // the last one
    // distance between two successive chunks
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the stride equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
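
  // For example, with lb = 0, ub = 99, chunk = 10 and 4 entities, entity 1
  // gets stride = 40 and its first chunk is [10, 19]; it then executes
  // [50, 59] and [90, 99]. The last chunk begins at 90 and
  // (90 - 10) % 40 == 0, so entity 1 is also flagged as executing the last
  // iteration.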

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; all chunks are of nearly equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
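
  // For example, with lb = 0, ub = 9 and 4 entities, loopSize = 10, the base
  // chunk is 2 and leftOver is 2: entities 0 and 1 get the 3-iteration chunks
  // [0, 2] and [3, 5], while entities 2 and 3 get the 2-iteration chunks
  // [6, 7] and [8, 9]; entity 3 owns the last iteration.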

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype,
                              int32_t *plastiter, T *plower, T *pupper,
                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // Threads in excess of the maximum requested do not execute the loop
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);
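        // This bit trick rounds span up to a multiple of chunk; it is only
        // exact when chunk is a power of two (e.g. span = 13, chunk = 8 is
        // rounded up to 16).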

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
      [[fallthrough]];
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently we just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it does, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(__ATOMIC_SEQ_CST);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

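  // NextIter hands each active lane the next iteration index from the
  // team-shared counter Cnt using a warp-aggregated atomic: the lowest active
  // lane performs a single atomic::add on behalf of the whole warp, the old
  // counter value is shuffled from that leader to every active lane, and each
  // lane adds its rank among the active lanes. For example, with 32 active
  // lanes and Cnt == 40, one atomic bumps Cnt to 72 and the lanes receive
  // 40, 41, ..., 71.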
  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
    }
    warp_res = utils::shuffle(active, warp_res, leader);
    return warp_res + rank;
  }

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb <= loopUpperBound and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb <= loopUpperBound and ub >= loopUpperBound: this is the last
    //  chunk --> LAST_CHUNK
    //  c. lb > loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
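
  // For example, with loopLowerBound = 0, loopUpperBound = 99 and
  // chunkSize = 10: N == 3 yields [30, 39] (NOT_FINISHED), N == 9 yields
  // [90, 99] clamped to the bound (LAST_CHUNK), and N >= 10 yields an empty
  // chunk past the bound (FINISHED).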

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp

    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: This is a stopgap. We probably want to expand the dispatch API to take
//       a DST pointer which can then be allocated properly without malloc.
static DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});
  NewDST->NextDST = ThreadDSTPtr;
  ThreadDSTPtr = NewDST;
  return ThreadDSTPtr;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }

// Pop the current DST and restore the last one.
static void popDST() {
  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
  memory::freeGlobal(ThreadDSTPtr, "remove DST");
  ThreadDSTPtr = OldDST;
}

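// Illustrative sketch (not what the compiler emits verbatim) of how the
// dispatch entry points below fit together for one dynamically scheduled loop;
// the 4/4u/8/8u suffix and the actual arguments depend on the loop:
//
//   __kmpc_dispatch_init_4(loc, tid, schedule, lb, ub, st, chunk);   // pushDST
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))   // chunks
//     for (int32_t i = lb; i <= ub; i += st)
//       ; // loop body
//   __kmpc_dispatch_fini_4(loc, tid);                                // popDST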
extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
  popDST();
}

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

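// Illustrative sketch (not what the compiler emits verbatim) of how the static
// entry points below are used; the schedule constant, data size suffix, and
// bounds come from the worksharing construct:
//
//   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_nochunk, &last, &lb,
//                            &ub, &stride, /*incr=*/1, /*chunk=*/1);
//   for (int32_t i = lb; i <= ub; ++i)
//     ; // loop body for this thread's chunk
//   __kmpc_for_static_fini(loc, tid);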
void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
  FunctionTracingRAII();
}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
  FunctionTracingRAII();
}
}

#pragma omp end declare target