//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

// TODO:
struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

#define ASSERT0(...)

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp declare target

// TODO: This variable is a hack inherited from the old runtime.
uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds and stride for a statically scheduled loop
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to the last-iteration flag
   *  @param[in,out] plower pointer to the loop lower bound; on return it holds
   *  the lower bound of the first chunk
   *  @param[in,out] pupper pointer to the loop upper bound; on return it holds
   *  the upper bound of the first chunk
   *  @param[in,out] pstride pointer to the loop stride; on return it holds the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] incr loop increment bump
   *  @param[in] chunk chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size except
    // possibly the last one.
    // Distance between two successive chunks executed by the same entity:
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
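
  // Worked example (illustrative numbers, not taken from the source): for
  // lb = 0, ub = 99, chunk = 10 and numberOfEntities = 4 threads, thread 1
  // gets stride = 4 * 10 = 40, lb = 0 + 1 * 10 = 10 and ub = 10 + 10 - 1 = 19,
  // i.e. the iteration blocks [10,19], [50,59] and [90,99]. The last chunk
  // starts at 99 - (99 % 10) = 90 and (90 - 10) % 40 == 0, so thread 1 is also
  // the one flagged as executing the last iteration.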

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are of roughly equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
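
  // Worked example (illustrative numbers, not taken from the source): for
  // lb = 0, ub = 9 (loopSize = 10) and numberOfEntities = 4, the base chunk is
  // 10 / 4 = 2 with leftOver = 2. Entities 0 and 1 receive 3 iterations each
  // ([0,2] and [3,5]); entities 2 and 3 receive 2 each ([6,7] and [8,9]).
  // Entity 3 covers the original upper bound, so it alone sets last = 1.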

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t gtid, int32_t schedtype,
                              int32_t *plastiter, T *plower, T *pupper,
                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
    // When IsRuntimeUninitialized is true, we assume that the caller is
    // in an L0 parallel region and that all worker threads participate.

    // Assume we are in a teams region or that we use a single block
    // per target region
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All warps that are in excess of the maximum requested do
    // not execute the loop
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <= 0, fall through and use the nochunk path
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <= 0, fall through and use the nochunk path
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      } // note: if chunk <= 0, fall through and use the nochunk path
    }
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }
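
  // Rough sketch (not emitted by this file; names and exact argument values
  // are illustrative) of how a compiler typically lowers
  // `#pragma omp for schedule(static)` onto the __kmpc_for_static_init_4
  // wrapper defined below:
  //
  //   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
  //   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_nochunk, &last, &lb,
  //                            &ub, &stride, /*incr=*/1, /*chunk=*/0);
  //   for (int32_t i = lb; i <= ub; ++i)
  //     body(i);
  //   __kmpc_for_static_fini(loc, tid);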

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(__ATOMIC_SEQ_CST);
      }
      __kmpc_barrier(loc, threadId);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) {
      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
    }
    warp_res = utils::shuffle(active, warp_res, leader);
    return warp_res + rank;
  }
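
  // Worked example (illustrative, not taken from the source): if 8 lanes of a
  // warp are active and Cnt currently holds 32, the leader lane issues a
  // single atomic add of 8 and receives warp_res = 32; after the shuffle each
  // active lane returns 32 + rank, i.e. the chunk indices 32..39, using one
  // atomic operation for the whole warp instead of eight.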

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
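
  // Worked example (illustrative, not taken from the source): with
  // loopLowerBound = 0, loopUpperBound = 99 and chunkSize = 10, the chunk with
  // N = 9 gives lb = 90, ub = 99; lb <= 99 but ub is not < 99, so ub is
  // clamped to 99 and LAST_CHUNK is returned. Any later chunk (N >= 10) starts
  // past the upper bound and is reported as FINISHED with the empty range
  // [101, 100].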

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp

    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
                                    DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: This is a stopgap. We probably want to expand the dispatch API to take
//       a DST pointer which can then be allocated properly without malloc.
DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});
  NewDST->NextDST = ThreadDSTPtr;
  ThreadDSTPtr = NewDST;
  return ThreadDSTPtr;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }

// Pop the current DST and restore the last one.
static void popDST() {
  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
  memory::freeGlobal(ThreadDSTPtr, "remove DST");
  ThreadDSTPtr = OldDST;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

// next
int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

// fini
void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
  popDST();
}
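
// Rough sketch (names and exact argument values are illustrative, and the
// calling convention is ultimately defined by the compiler, not this file) of
// how the init/next/fini entry points above fit together for a
// `#pragma omp for schedule(dynamic, 4)` loop:
//
//   int32_t last = 0, lb = 0, ub = N - 1, st = 1;
//   __kmpc_dispatch_init_4(loc, tid, kmp_sched_dynamic, 0, N - 1, 1, /*chunk=*/4);
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
//     for (int32_t i = lb; i <= ub; ++i)
//       body(i);
//   __kmpc_dispatch_fini_4(loc, tid);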

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

#pragma omp end declare target