//===----- Workshare.cpp - OpenMP workshare implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

// Tracker for the state of a dynamically scheduled worksharing loop. One
// tracker is kept per thread; trackers are linked through NextDST to support
// nested worksharing regions (see pushDST/popDST below).
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

// Compile out the assertions in this file while keeping the call sites.
#define ASSERT0(...)

// Status codes returned by the dispatch interface to the application.
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// Internal chunk states used by dynamic scheduling.
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp declare target

// TODO: This variable is a hack inherited from the old runtime. It is the
// team-shared counter from which dynamic chunks are handed out (see NextIter).
uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] plower pointer to loop lower bound; it will contain the
   *  lower bound of the first chunk
   *  @param[in,out] pupper pointer to loop upper bound; it will contain the
   *  upper bound of the first chunk
   *  @param[in,out] pstride pointer to loop stride; it will contain the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] incr loop increment bump
   *  @param[in] chunk chunk size
   */

  // Helper function for static chunk.
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size except
    // possibly the last one.
    // The stride is the distance between two successive chunks of a thread.
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a lower
    // bound plus a multiple of the stride equal to ub' executes the last
    // chunk.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
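
  // Worked example (illustrative only): with lb = 0, ub = 99, chunk = 10, and
  // numberOfEntities = 4, the stride is 40 and entity 1 computes lb = 10 and
  // ub = 19, i.e. it executes the chunks [10,19], [50,59], and [90,99]. The
  // last chunk begins at 99 - (99 % 10) = 90, and (90 - 10) % 40 == 0, so
  // entity 1 also sets last.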

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // Helper function for static no chunk.
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one chunk;
    // chunks are of nearly equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
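
  // Worked example (illustrative only): with lb = 0, ub = 9, and
  // numberOfEntities = 4, chunk = 2 and leftOver = 2. Entities 0 and 1 take a
  // chunk of 3 ([0,2] and [3,5]); entities 2 and 3 take a chunk of 2 ([6,7]
  // and [8,9]). Entity 3 covers the original upper bound, so it sets last.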

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype,
                              int32_t *plastiter, T *plower, T *pupper,
                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // Threads in excess of the number of active OpenMP threads do not
    // execute the loop.
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // Note: if chunk <= 0, fall through and use the nochunk schedule.
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // Round up to make sure the chunk is big enough to cover all
        // iterations.
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // Round span up to a multiple of chunk; the bitmask trick assumes
        // chunk is a power of two.
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // Note: if chunk <= 0, fall through and use the nochunk schedule.
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      } // Note: if chunk <= 0, fall through and use the nochunk schedule.
    }
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }
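
  // Rough sketch (illustrative only; actual compiler codegen differs in
  // detail) of how the KMPC entry points below drive this function for
  //   #pragma omp for schedule(static, N)
  // with logical iterations 0 .. tripCount-1:
  //
  //   int32_t last = 0, lb = 0, ub = tripCount - 1, stride = 1;
  //   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_chunk, &last, &lb,
  //                            &ub, &stride, /*incr=*/1, /*chunk=*/N);
  //   for (; lb <= tripCount - 1; lb += stride, ub += stride)
  //     for (int32_t i = lb; i <= (ub < tripCount - 1 ? ub : tripCount - 1);
  //          ++i)
  //       body(i);
  //   __kmpc_for_static_fini(loc, tid);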

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on; they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // Round up to make sure the chunk is big enough to cover all iterations.
      T span = (tripCount + tnum - 1) / tnum;
      // Round span up to a multiple of chunk; the bitmask trick assumes chunk
      // is a power of two.
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        // Reset the shared iteration counter for this loop.
        Cnt = 0;
        fence::team(__ATOMIC_SEQ_CST);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  // Hand out the next loop iteration index. This is a warp-aggregated atomic:
  // one lane per warp performs a single atomic add on the team-shared counter
  // Cnt and broadcasts the result to the other active lanes.
  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    // The lowest active lane acts as the leader.
    uint32_t leader = utils::ffs(active) - 1;
    // Number of active lanes, i.e. iterations claimed by this warp.
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    // Rank of this lane among the active lanes.
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) { // The leader claims `change` iterations for the warp.
      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
    }
    // Broadcast the leader's base index; each lane offsets it by its rank.
    warp_res = utils::shuffle(active, warp_res, leader);
    return warp_res + rank;
  }
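
  // Example (illustrative only): if lanes 0, 3, and 5 are active and Cnt is
  // 7, the leader (lane 0) atomically adds 3 and receives 7; after the
  // shuffle, lanes 0, 3, and 5 return 7, 8, and 9 respectively.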

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // Three result cases:
    //  a. lb <= loopUpperBound and ub < loopUpperBound: full chunk -->
    //     NOT_FINISHED
    //  b. lb <= loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb > loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. The chunk is empty; move the bounds past the end of the loop.
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
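
  // Example (illustrative only): with loopLowerBound = 0, chunkSize = 4, and
  // loopUpperBound = 9, NextIter() values 0, 1, 2 yield the chunks [0,3]
  // (NOT_FINISHED), [4,7] (NOT_FINISHED), and [8,9] clipped from [8,11]
  // (LAST_CHUNK); any later chunk is FINISHED.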

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // TODO: Reduce the two static schedule paths below to a single one.
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // Not finished: this is either a regular chunk or the last chunk.
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // End of the template class that encapsulates all the helper functions.
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: This is a stopgap. We probably want to expand the dispatch API to take
//       a DST pointer which can then be allocated properly without malloc.
DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);

// Create a new DST, link it to the current one, and make it the current DST.
static DynamicScheduleTracker *pushDST() {
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});
  NewDST->NextDST = ThreadDSTPtr;
  ThreadDSTPtr = NewDST;
  return ThreadDSTPtr;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }

// Pop the current DST and restore the last one.
static void popDST() {
  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
  memory::freeGlobal(ThreadDSTPtr, "remove DST");
  ThreadDSTPtr = OldDST;
}
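
// Sketch of the intended pairing (illustrative only): a dynamically scheduled
// loop drives the dispatch interface roughly as
//   __kmpc_dispatch_init_4(loc, tid, sched, lb, ub, st, chunk);  // pushDST()
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     ...execute iterations lb..ub...                            // peekDST()
//   __kmpc_dispatch_fini_4(loc, tid);                            // popDST()
// Nested worksharing regions push and pop their own trackers via NextDST.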

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  FunctionTracingRAII();
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
  popDST();
}

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  FunctionTracingRAII();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
  FunctionTracingRAII();
}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
  FunctionTracingRAII();
}
}

#pragma omp end declare target