1 /*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 /* Dynamic scheduling initialization and dispatch.
14 *
* NOTE: __kmp_nth is a constant inside any dispatch loop, but its value may
* change between parallel regions. __kmp_max_nth is the largest value
* __kmp_nth may take; 1 is the smallest.
18 */
19
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56 }
57 }
58 }
59
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69 }
70
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // TODO: make nonmonotonic when static_steal is fixed
76 int monotonicity = SCHEDULE_MONOTONIC;
77
// Let the default be monotonic for executables
// compiled by compilers supporting OpenMP* 4.5 or earlier
80 if (loc != NULL && loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier || __kmp_force_monotonic)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91 }
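// Illustration of the mapping above (not exhaustive): a schedule carrying the
// nonmonotonic modifier yields SCHEDULE_NONMONOTONIC, one carrying the
// monotonic modifier yields SCHEDULE_MONOTONIC, and hierarchical scheduling
// (use_hier) or __kmp_force_monotonic force SCHEDULE_MONOTONIC regardless of
// the modifier.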
92
93 #if KMP_STATIC_STEAL_ENABLED
94 enum { // values for steal_flag (possible states of private per-loop buffer)
95 UNUSED = 0,
96 CLAIMED = 1, // owner thread started initialization
97 READY = 2, // available for stealing
98 THIEF = 3 // finished by owner, or claimed by thief
99 // possible state changes:
100 // 0 -> 1 owner only, sync
101 // 0 -> 3 thief only, sync
102 // 1 -> 2 owner only, async
103 // 2 -> 3 owner only, async
104 // 3 -> 2 owner only, async
105 // 3 -> 0 last thread finishing the loop, async
106 };
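// Example lifecycle (illustration only): the owner CASes UNUSED -> CLAIMED,
// fills its buffer, then publishes READY so others may steal from it; once the
// owner has no chunks left it marks itself THIEF and starts stealing, flipping
// back to READY if a steal refills its range. A thief that finds a victim
// still UNUSED may CAS it straight to THIEF and take over that whole range.
// The last thread finishing the loop resets the flag to UNUSED for reuse.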
107 #endif
108
109 // Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk. The loop description is found in lb (lower bound),
111 // ub (upper bound), and st (stride). nproc is the number of threads relevant
112 // to the scheduling (often the number of threads in a team, but not always if
113 // hierarchical scheduling is used). tid is the id of the thread calling
114 // the function within the group of nproc threads. It will have a value
115 // between 0 and nproc - 1. This is often just the thread id within a team, but
116 // is not necessarily the case when using hierarchical scheduling.
117 // loc is the source file location of the corresponding loop
118 // gtid is the global thread id
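// Example (illustration only): in a flat team of 8 threads with no
// hierarchical scheduling, nproc == 8 and tid is the caller's id within the
// team (0..7), while gtid is still its global thread id.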
119 template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121 dispatch_private_info_template<T> *pr,
122 enum sched_type schedule, T lb, T ub,
123 typename traits_t<T>::signed_t st,
124 #if USE_ITT_BUILD
125 kmp_uint64 *cur_chunk,
126 #endif
127 typename traits_t<T>::signed_t chunk,
128 T nproc, T tid) {
129 typedef typename traits_t<T>::unsigned_t UT;
130 typedef typename traits_t<T>::floating_t DBL;
131
132 int active;
133 T tc;
134 kmp_info_t *th;
135 kmp_team_t *team;
136 int monotonicity;
137 bool use_hier;
138
139 #ifdef KMP_DEBUG
140 typedef typename traits_t<T>::signed_t ST;
141 {
142 char *buff;
143 // create format specifiers before the debug output
144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147 traits_t<T>::spec, traits_t<T>::spec,
148 traits_t<ST>::spec, traits_t<ST>::spec,
149 traits_t<T>::spec, traits_t<T>::spec);
150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151 __kmp_str_free(&buff);
152 }
153 #endif
154 /* setup data */
155 th = __kmp_threads[gtid];
156 team = th->th.th_team;
157 active = !team->t.t_serialized;
158
159 #if USE_ITT_BUILD
160 int itt_need_metadata_reporting =
161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163 team->t.t_active_level == 1;
164 #endif
165
166 #if KMP_USE_HIER_SCHED
167 use_hier = pr->flags.use_hier;
168 #else
169 use_hier = false;
170 #endif
171
172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175
176 /* Pick up the nomerge/ordered bits from the scheduling type */
177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178 pr->flags.nomerge = TRUE;
179 schedule =
180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181 } else {
182 pr->flags.nomerge = FALSE;
183 }
184 pr->type_size = traits_t<T>::type_size; // remember the size of variables
185 if (kmp_ord_lower & schedule) {
186 pr->flags.ordered = TRUE;
187 schedule =
188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189 } else {
190 pr->flags.ordered = FALSE;
191 }
192 // Ordered overrides nonmonotonic
193 if (pr->flags.ordered) {
194 monotonicity = SCHEDULE_MONOTONIC;
195 }
196
197 if (schedule == kmp_sch_static) {
198 schedule = __kmp_static;
199 } else {
200 if (schedule == kmp_sch_runtime) {
201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202 // not specified)
203 schedule = team->t.t_sched.r_sched_type;
204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207 monotonicity = SCHEDULE_MONOTONIC;
208 // Detail the schedule if needed (global controls are differentiated
209 // appropriately)
210 if (schedule == kmp_sch_guided_chunked) {
211 schedule = __kmp_guided;
212 } else if (schedule == kmp_sch_static) {
213 schedule = __kmp_static;
214 }
215 // Use the chunk size specified by OMP_SCHEDULE (or default if not
216 // specified)
217 chunk = team->t.t_sched.chunk;
218 #if USE_ITT_BUILD
219 if (cur_chunk)
220 *cur_chunk = chunk;
221 #endif
222 #ifdef KMP_DEBUG
223 {
224 char *buff;
225 // create format specifiers before the debug output
226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227 "schedule:%%d chunk:%%%s\n",
228 traits_t<ST>::spec);
229 KD_TRACE(10, (buff, gtid, schedule, chunk));
230 __kmp_str_free(&buff);
231 }
232 #endif
233 } else {
234 if (schedule == kmp_sch_guided_chunked) {
235 schedule = __kmp_guided;
236 }
237 if (chunk <= 0) {
238 chunk = KMP_DEFAULT_CHUNK;
239 }
240 }
241
242 if (schedule == kmp_sch_auto) {
243 // mapping and differentiation: in the __kmp_do_serial_initialize()
244 schedule = __kmp_auto;
245 #ifdef KMP_DEBUG
246 {
247 char *buff;
248 // create format specifiers before the debug output
249 buff = __kmp_str_format(
250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251 "schedule:%%d chunk:%%%s\n",
252 traits_t<ST>::spec);
253 KD_TRACE(10, (buff, gtid, schedule, chunk));
254 __kmp_str_free(&buff);
255 }
256 #endif
257 }
258 #if KMP_STATIC_STEAL_ENABLED
259 // map nonmonotonic:dynamic to static steal
260 if (schedule == kmp_sch_dynamic_chunked) {
261 if (monotonicity == SCHEDULE_NONMONOTONIC)
262 schedule = kmp_sch_static_steal;
263 }
264 #endif
265 /* guided analytical not safe for too many threads */
266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267 schedule = kmp_sch_guided_iterative_chunked;
268 KMP_WARNING(DispatchManyThreads);
269 }
270 if (schedule == kmp_sch_runtime_simd) {
271 // compiler provides simd_width in the chunk parameter
272 schedule = team->t.t_sched.r_sched_type;
273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275 // Detail the schedule if needed (global controls are differentiated
276 // appropriately)
277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278 schedule == __kmp_static) {
279 schedule = kmp_sch_static_balanced_chunked;
280 } else {
281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282 schedule = kmp_sch_guided_simd;
283 }
284 chunk = team->t.t_sched.chunk * chunk;
285 }
286 #if USE_ITT_BUILD
287 if (cur_chunk)
288 *cur_chunk = chunk;
289 #endif
290 #ifdef KMP_DEBUG
291 {
292 char *buff;
293 // create format specifiers before the debug output
294 buff = __kmp_str_format(
295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296 " chunk:%%%s\n",
297 traits_t<ST>::spec);
298 KD_TRACE(10, (buff, gtid, schedule, chunk));
299 __kmp_str_free(&buff);
300 }
301 #endif
302 }
303 pr->u.p.parm1 = chunk;
304 }
305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306 "unknown scheduling type");
307
308 pr->u.p.count = 0;
309
310 if (__kmp_env_consistency_check) {
311 if (st == 0) {
312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314 }
315 }
316 // compute trip count
317 if (st == 1) { // most common case
318 if (ub >= lb) {
319 tc = ub - lb + 1;
320 } else { // ub < lb
321 tc = 0; // zero-trip
322 }
323 } else if (st < 0) {
324 if (lb >= ub) {
325 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326 // where the division needs to be unsigned regardless of the result type
327 tc = (UT)(lb - ub) / (-st) + 1;
328 } else { // lb < ub
329 tc = 0; // zero-trip
330 }
331 } else { // st > 0
332 if (ub >= lb) {
333 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334 // where the division needs to be unsigned regardless of the result type
335 tc = (UT)(ub - lb) / st + 1;
336 } else { // ub < lb
337 tc = 0; // zero-trip
338 }
339 }
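// Worked examples of the trip count formulas above (hypothetical bounds):
//   lb=0,  ub=9, st=1  -> tc = 9 - 0 + 1        = 10
//   lb=0,  ub=9, st=3  -> tc = (9 - 0) / 3 + 1  = 4   (0, 3, 6, 9)
//   lb=10, ub=1, st=-4 -> tc = (10 - 1) / 4 + 1 = 3   (10, 6, 2)
//   lb=5,  ub=1, st=1  -> tc = 0 (zero-trip)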
340
341 #if KMP_STATS_ENABLED
342 if (KMP_MASTER_GTID(gtid)) {
343 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344 }
345 #endif
346
347 pr->u.p.lb = lb;
348 pr->u.p.ub = ub;
349 pr->u.p.st = st;
350 pr->u.p.tc = tc;
351
352 #if KMP_OS_WINDOWS
353 pr->u.p.last_upper = ub + st;
354 #endif /* KMP_OS_WINDOWS */
355
/* NOTE: only active parallel regions have active ordered sections */
357
358 if (active) {
359 if (pr->flags.ordered) {
360 pr->ordered_bumped = 0;
361 pr->u.p.ordered_lower = 1;
362 pr->u.p.ordered_upper = 0;
363 }
364 }
365
366 switch (schedule) {
367 #if KMP_STATIC_STEAL_ENABLED
368 case kmp_sch_static_steal: {
369 T ntc, init;
370
371 KD_TRACE(100,
372 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373 gtid));
374
375 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376 if (nproc > 1 && ntc >= nproc) {
377 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378 T id = tid;
379 T small_chunk, extras;
380 kmp_uint32 old = UNUSED;
381 int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382 if (traits_t<T>::type_size > 4) {
383 // AC: TODO: check if 16-byte CAS available and use it to
384 // improve performance (probably wait for explicit request
385 // before spending time on this).
386 // For now use dynamically allocated per-private-buffer lock,
387 // free memory in __kmp_dispatch_next when status==0.
388 pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389 __kmp_init_lock(pr->u.p.steal_lock);
390 }
391 small_chunk = ntc / nproc;
392 extras = ntc % nproc;
393
394 init = id * small_chunk + (id < extras ? id : extras);
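// Worked example (hypothetical values): tc = 100, chunk = 7 -> ntc = 15
// chunks; with nproc = 4: small_chunk = 3, extras = 3, so threads 0..2 own
// 4 chunks each ([0,4), [4,8), [8,12)) and thread 3 owns 3 chunks ([12,15)).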
395 pr->u.p.count = init;
if (claimed) { // did we succeed in claiming our own buffer?
397 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398 // Other threads will inspect steal_flag when searching for a victim.
399 // READY means other threads may steal from this thread from now on.
400 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401 } else {
// another thread has stolen our whole range
KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
pr->u.p.ub = init; // mark that there are no iterations to work on
405 }
406 pr->u.p.parm2 = ntc; // save number of chunks
// parm3 is the number of times to attempt stealing, which is
// nproc (just a heuristic; it could be optimized later on).
409 pr->u.p.parm3 = nproc;
410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411 break;
412 } else {
413 /* too few chunks: switching to kmp_sch_dynamic_chunked */
414 schedule = kmp_sch_dynamic_chunked;
415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416 "kmp_sch_dynamic_chunked\n",
417 gtid));
418 goto dynamic_init;
419 break;
420 } // if
421 } // case
422 #endif
423 case kmp_sch_static_balanced: {
424 T init, limit;
425
426 KD_TRACE(
427 100,
428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429 gtid));
430
431 if (nproc > 1) {
432 T id = tid;
433
434 if (tc < nproc) {
435 if (id < tc) {
436 init = id;
437 limit = id;
438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439 } else {
440 pr->u.p.count = 1; /* means no more chunks to execute */
441 pr->u.p.parm1 = FALSE;
442 break;
443 }
444 } else {
445 T small_chunk = tc / nproc;
446 T extras = tc % nproc;
447 init = id * small_chunk + (id < extras ? id : extras);
448 limit = init + small_chunk - (id < extras ? 0 : 1);
449 pr->u.p.parm1 = (id == nproc - 1);
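// Worked example (hypothetical values): tc = 10, nproc = 4 -> small_chunk = 2,
// extras = 2, so the iterations split 3/3/2/2: thread 0 gets [0,2], thread 1
// gets [3,5], thread 2 gets [6,7], thread 3 gets [8,9].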
450 }
451 } else {
452 if (tc > 0) {
453 init = 0;
454 limit = tc - 1;
455 pr->u.p.parm1 = TRUE;
456 } else {
457 // zero trip count
458 pr->u.p.count = 1; /* means no more chunks to execute */
459 pr->u.p.parm1 = FALSE;
460 break;
461 }
462 }
463 #if USE_ITT_BUILD
464 // Calculate chunk for metadata report
465 if (itt_need_metadata_reporting)
466 if (cur_chunk)
467 *cur_chunk = limit - init + 1;
468 #endif
469 if (st == 1) {
470 pr->u.p.lb = lb + init;
471 pr->u.p.ub = lb + limit;
472 } else {
473 // calculated upper bound, "ub" is user-defined upper bound
474 T ub_tmp = lb + limit * st;
475 pr->u.p.lb = lb + init * st;
476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477 // it exactly
478 if (st > 0) {
479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480 } else {
481 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482 }
483 }
484 if (pr->flags.ordered) {
485 pr->u.p.ordered_lower = init;
486 pr->u.p.ordered_upper = limit;
487 }
488 break;
489 } // case
490 case kmp_sch_static_balanced_chunked: {
491 // similar to balanced, but chunk adjusted to multiple of simd width
492 T nth = nproc;
493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494 " -> falling-through to static_greedy\n",
495 gtid));
496 schedule = kmp_sch_static_greedy;
497 if (nth > 1)
498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499 else
500 pr->u.p.parm1 = tc;
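// Worked example for the nth > 1 branch (hypothetical values): tc = 1000,
// nth = 8, chunk (simd width) = 8 -> ceil(1000/8) = 125, rounded up to the
// next multiple of 8, giving parm1 = 128 iterations per chunk.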
501 break;
502 } // case
503 case kmp_sch_guided_simd:
504 case kmp_sch_guided_iterative_chunked: {
505 KD_TRACE(
506 100,
507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508 " case\n",
509 gtid));
510
511 if (nproc > 1) {
512 if ((2L * chunk + 1) * nproc >= tc) {
513 /* chunk size too large, switch to dynamic */
514 schedule = kmp_sch_dynamic_chunked;
515 goto dynamic_init;
516 } else {
517 // when remaining iters become less than parm2 - switch to dynamic
518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519 *(double *)&pr->u.p.parm3 =
520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4
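// Example (assuming the default guided_int_param == 2 and
// guided_flt_param == 0.5): nproc = 4, chunk = 4 -> parm2 = 2*4*5 = 40, so
// the schedule degrades to dynamic chunks once fewer than 40 iterations
// remain, and parm3 = 0.5/4 = 0.125, i.e. each grab takes about 1/8 of the
// remaining iterations.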
521 }
522 } else {
523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524 "kmp_sch_static_greedy\n",
525 gtid));
526 schedule = kmp_sch_static_greedy;
527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528 KD_TRACE(
529 100,
530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531 gtid));
532 pr->u.p.parm1 = tc;
533 } // if
534 } // case
535 break;
536 case kmp_sch_guided_analytical_chunked: {
537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538 "kmp_sch_guided_analytical_chunked case\n",
539 gtid));
540
541 if (nproc > 1) {
542 if ((2L * chunk + 1) * nproc >= tc) {
543 /* chunk size too large, switch to dynamic */
544 schedule = kmp_sch_dynamic_chunked;
545 goto dynamic_init;
546 } else {
547 /* commonly used term: (2 nproc - 1)/(2 nproc) */
548 DBL x;
549
550 #if KMP_USE_X87CONTROL
551 /* Linux* OS already has 64-bit computation by default for long double,
552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554 instead of the default 53-bit. Even though long double doesn't work
555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556 expected to impact the correctness of the algorithm, but this has not
557 been mathematically proven. */
558 // save original FPCW and set precision to 64-bit, as
559 // Windows* OS on IA-32 architecture defaults to 53-bit
560 unsigned int oldFpcw = _control87(0, 0);
561 _control87(_PC_64, _MCW_PC); // 0,0x30000
562 #endif
563 /* value used for comparison in solver for cross-over point */
564 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
565
566 /* crossover point--chunk indexes equal to or greater than
567 this point switch to dynamic-style scheduling */
568 UT cross;
569
570 /* commonly used term: (2 nproc - 1)/(2 nproc) */
571 x = 1.0 - 0.5 / (double)nproc;
572
573 #ifdef KMP_DEBUG
574 { // test natural alignment
575 struct _test_a {
576 char a;
577 union {
578 char b;
579 DBL d;
580 };
581 } t;
582 ptrdiff_t natural_alignment =
583 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
584 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
585 // long)natural_alignment );
586 KMP_DEBUG_ASSERT(
587 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
588 }
589 #endif // KMP_DEBUG
590
591 /* save the term in thread private dispatch structure */
592 *(DBL *)&pr->u.p.parm3 = x;
593
594 /* solve for the crossover point to the nearest integer i for which C_i
595 <= chunk */
596 {
597 UT left, right, mid;
598 long double p;
599
600 /* estimate initial upper and lower bound */
601
602 /* doesn't matter what value right is as long as it is positive, but
603 it affects performance of the solver */
604 right = 229;
605 p = __kmp_pow<UT>(x, right);
606 if (p > target) {
607 do {
608 p *= p;
609 right <<= 1;
610 } while (p > target && right < (1 << 27));
611 /* lower bound is previous (failed) estimate of upper bound */
612 left = right >> 1;
613 } else {
614 left = 0;
615 }
616
617 /* bisection root-finding method */
618 while (left + 1 < right) {
619 mid = (left + right) / 2;
620 if (__kmp_pow<UT>(x, mid) > target) {
621 left = mid;
622 } else {
623 right = mid;
624 }
625 } // while
626 cross = right;
627 }
628 /* assert sanity of computed crossover point */
629 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
630 __kmp_pow<UT>(x, cross) <= target);
631
632 /* save the crossover point in thread private dispatch structure */
633 pr->u.p.parm2 = cross;
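// Worked example (hypothetical values): nproc = 4, chunk = 1, tc = 1000 ->
// x = 1 - 0.5/4 = 0.875 and target = (2*1+1)*4/1000 = 0.012; the bisection
// yields cross = 34, since 0.875^33 ~ 0.0122 > 0.012 while
// 0.875^34 ~ 0.0107 <= 0.012.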
634
635 // C75803
636 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
637 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
638 #else
639 #define GUIDED_ANALYTICAL_WORKAROUND (x)
640 #endif
641 /* dynamic-style scheduling offset */
642 pr->u.p.count = tc -
643 __kmp_dispatch_guided_remaining(
644 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
645 cross * chunk;
646 #if KMP_USE_X87CONTROL
647 // restore FPCW
648 _control87(oldFpcw, _MCW_PC);
649 #endif
650 } // if
651 } else {
652 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
653 "kmp_sch_static_greedy\n",
654 gtid));
655 schedule = kmp_sch_static_greedy;
656 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
657 pr->u.p.parm1 = tc;
658 } // if
659 } // case
660 break;
661 case kmp_sch_static_greedy:
662 KD_TRACE(
663 100,
664 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
665 gtid));
666 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
667 break;
668 case kmp_sch_static_chunked:
669 case kmp_sch_dynamic_chunked:
670 dynamic_init:
671 if (pr->u.p.parm1 <= 0)
672 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
673 else if (pr->u.p.parm1 > tc)
674 pr->u.p.parm1 = tc;
675 // Store the total number of chunks to prevent integer overflow during
676 // bounds calculations in the get next chunk routine.
677 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
678 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
679 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
680 gtid));
681 break;
682 case kmp_sch_trapezoidal: {
683 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
684
685 T parm1, parm2, parm3, parm4;
686 KD_TRACE(100,
687 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
688 gtid));
689
690 parm1 = chunk;
691
692 /* F : size of the first cycle */
693 parm2 = (tc / (2 * nproc));
694
695 if (parm2 < 1) {
696 parm2 = 1;
697 }
698
699 /* L : size of the last cycle. Make sure the last cycle is not larger
700 than the first cycle. */
701 if (parm1 < 1) {
702 parm1 = 1;
703 } else if (parm1 > parm2) {
704 parm1 = parm2;
705 }
706
707 /* N : number of cycles */
708 parm3 = (parm2 + parm1);
709 parm3 = (2 * tc + parm3 - 1) / parm3;
710
711 if (parm3 < 2) {
712 parm3 = 2;
713 }
714
715 /* sigma : decreasing incr of the trapezoid */
716 parm4 = (parm3 - 1);
717 parm4 = (parm2 - parm1) / parm4;
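// Worked example (hypothetical values): tc = 1000, nproc = 4, chunk = 1 ->
// first cycle F = parm2 = 1000/8 = 125, minimum chunk parm1 = 1, number of
// cycles parm3 = (2000 + 126 - 1) / 126 = 16, and each cycle shrinks by
// parm4 = (125 - 1) / 15 = 8 iterations: 125, 117, 109, ...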
718
719 // pointless check, because parm4 >= 0 always
720 // if ( parm4 < 0 ) {
721 // parm4 = 0;
722 //}
723
724 pr->u.p.parm1 = parm1;
725 pr->u.p.parm2 = parm2;
726 pr->u.p.parm3 = parm3;
727 pr->u.p.parm4 = parm4;
728 } // case
729 break;
730
731 default: {
732 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
733 KMP_HNT(GetNewerLibrary), // Hint
734 __kmp_msg_null // Variadic argument list terminator
735 );
736 } break;
737 } // switch
738 pr->schedule = schedule;
739 }
740
741 #if KMP_USE_HIER_SCHED
742 template <typename T>
743 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
744 typename traits_t<T>::signed_t st);
745 template <>
746 inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
748 kmp_int32 ub, kmp_int32 st) {
749 __kmp_dispatch_init_hierarchy<kmp_int32>(
750 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
751 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
752 }
753 template <>
754 inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
756 kmp_uint32 ub, kmp_int32 st) {
757 __kmp_dispatch_init_hierarchy<kmp_uint32>(
758 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
759 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
760 }
761 template <>
762 inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
764 kmp_int64 ub, kmp_int64 st) {
765 __kmp_dispatch_init_hierarchy<kmp_int64>(
766 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
767 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
768 }
769 template <>
770 inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
772 kmp_uint64 ub, kmp_int64 st) {
773 __kmp_dispatch_init_hierarchy<kmp_uint64>(
774 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
775 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
776 }
777
778 // free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
780 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
781 for (int i = 0; i < num_disp_buff; ++i) {
782 // type does not matter here so use kmp_int32
783 auto sh =
784 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
785 &team->t.t_disp_buffer[i]);
786 if (sh->hier) {
787 sh->hier->deallocate();
788 __kmp_free(sh->hier);
789 }
790 }
791 }
792 #endif
793
794 // UT - unsigned flavor of T, ST - signed flavor of T,
795 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
796 template <typename T>
797 static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
799 T ub, typename traits_t<T>::signed_t st,
800 typename traits_t<T>::signed_t chunk, int push_ws) {
801 typedef typename traits_t<T>::unsigned_t UT;
802
803 int active;
804 kmp_info_t *th;
805 kmp_team_t *team;
806 kmp_uint32 my_buffer_index;
807 dispatch_private_info_template<T> *pr;
808 dispatch_shared_info_template<T> volatile *sh;
809
810 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
811 sizeof(dispatch_private_info));
812 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
813 sizeof(dispatch_shared_info));
814 __kmp_assert_valid_gtid(gtid);
815
816 if (!TCR_4(__kmp_init_parallel))
817 __kmp_parallel_initialize();
818
819 __kmp_resume_if_soft_paused();
820
821 #if INCLUDE_SSC_MARKS
822 SSC_MARK_DISPATCH_INIT();
823 #endif
824 #ifdef KMP_DEBUG
825 typedef typename traits_t<T>::signed_t ST;
826 {
827 char *buff;
828 // create format specifiers before the debug output
829 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
830 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
831 traits_t<ST>::spec, traits_t<T>::spec,
832 traits_t<T>::spec, traits_t<ST>::spec);
833 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
834 __kmp_str_free(&buff);
835 }
836 #endif
837 /* setup data */
838 th = __kmp_threads[gtid];
839 team = th->th.th_team;
840 active = !team->t.t_serialized;
841 th->th.th_ident = loc;
842
843 // Any half-decent optimizer will remove this test when the blocks are empty
844 // since the macros expand to nothing
845 // when statistics are disabled.
846 if (schedule == __kmp_static) {
847 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
848 } else {
849 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
850 }
851
852 #if KMP_USE_HIER_SCHED
// Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE environment variable.
854 // Hierarchical scheduling does not work with ordered, so if ordered is
855 // detected, then revert back to threaded scheduling.
856 bool ordered;
857 enum sched_type my_sched = schedule;
858 my_buffer_index = th->th.th_dispatch->th_disp_index;
859 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
860 &th->th.th_dispatch
861 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
862 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
863 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
864 my_sched =
865 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
866 ordered = (kmp_ord_lower & my_sched);
867 if (pr->flags.use_hier) {
868 if (ordered) {
869 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
870 "Disabling hierarchical scheduling.\n",
871 gtid));
872 pr->flags.use_hier = FALSE;
873 }
874 }
875 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
876 // Don't use hierarchical for ordered parallel loops and don't
877 // use the runtime hierarchy if one was specified in the program
878 if (!ordered && !pr->flags.use_hier)
879 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
880 }
881 #endif // KMP_USE_HIER_SCHED
882
883 #if USE_ITT_BUILD
884 kmp_uint64 cur_chunk = chunk;
885 int itt_need_metadata_reporting =
886 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
887 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
888 team->t.t_active_level == 1;
889 #endif
890 if (!active) {
891 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
892 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
893 } else {
894 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
895 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
896
897 my_buffer_index = th->th.th_dispatch->th_disp_index++;
898
899 /* What happens when number of threads changes, need to resize buffer? */
900 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
901 &th->th.th_dispatch
902 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
903 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
904 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
905 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
906 my_buffer_index));
907 if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
909 " sh->buffer_index:%d\n",
910 gtid, my_buffer_index, sh->buffer_index));
911 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
912 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
// Note: KMP_WAIT() cannot be used here: buffer index and
// my_buffer_index are *always* 32-bit integers.
915 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
916 "sh->buffer_index:%d\n",
917 gtid, my_buffer_index, sh->buffer_index));
918 }
919 }
920
921 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
922 #if USE_ITT_BUILD
923 &cur_chunk,
924 #endif
925 chunk, (T)th->th.th_team_nproc,
926 (T)th->th.th_info.ds.ds_tid);
927 if (active) {
928 if (pr->flags.ordered == 0) {
929 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
930 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
931 } else {
932 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
933 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
934 }
935 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
936 th->th.th_dispatch->th_dispatch_sh_current =
937 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
938 #if USE_ITT_BUILD
939 if (pr->flags.ordered) {
940 __kmp_itt_ordered_init(gtid);
941 }
942 // Report loop metadata
943 if (itt_need_metadata_reporting) {
944 // Only report metadata by primary thread of active team at level 1
945 kmp_uint64 schedtype = 0;
946 switch (schedule) {
947 case kmp_sch_static_chunked:
948 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
949 break;
950 case kmp_sch_static_greedy:
951 cur_chunk = pr->u.p.parm1;
952 break;
953 case kmp_sch_dynamic_chunked:
954 schedtype = 1;
955 break;
956 case kmp_sch_guided_iterative_chunked:
957 case kmp_sch_guided_analytical_chunked:
958 case kmp_sch_guided_simd:
959 schedtype = 2;
960 break;
961 default:
962 // Should we put this case under "static"?
963 // case kmp_sch_static_steal:
964 schedtype = 3;
965 break;
966 }
967 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
968 }
969 #if KMP_USE_HIER_SCHED
970 if (pr->flags.use_hier) {
971 pr->u.p.count = 0;
972 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
973 }
#endif // KMP_USE_HIER_SCHED
975 #endif /* USE_ITT_BUILD */
976 }
977
978 #ifdef KMP_DEBUG
979 {
980 char *buff;
981 // create format specifiers before the debug output
982 buff = __kmp_str_format(
983 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
984 "lb:%%%s ub:%%%s"
985 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
986 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
987 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
988 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
989 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
990 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
991 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
992 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
993 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
994 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
995 __kmp_str_free(&buff);
996 }
997 #endif
998 #if OMPT_SUPPORT && OMPT_OPTIONAL
999 if (ompt_enabled.ompt_callback_work) {
1000 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1001 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1002 ompt_callbacks.ompt_callback(ompt_callback_work)(
1003 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1004 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1005 }
1006 #endif
1007 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1008 }
1009
1010 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1011 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1012 * every chunk of iterations. If the ordered section(s) were not executed
1013 * for this iteration (or every iteration in this chunk), we need to set the
1014 * ordered iteration counters so that the next thread can proceed. */
1015 template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1017 typedef typename traits_t<UT>::signed_t ST;
1018 __kmp_assert_valid_gtid(gtid);
1019 kmp_info_t *th = __kmp_threads[gtid];
1020
1021 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1022 if (!th->th.th_team->t.t_serialized) {
1023
1024 dispatch_private_info_template<UT> *pr =
1025 reinterpret_cast<dispatch_private_info_template<UT> *>(
1026 th->th.th_dispatch->th_dispatch_pr_current);
1027 dispatch_shared_info_template<UT> volatile *sh =
1028 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1029 th->th.th_dispatch->th_dispatch_sh_current);
1030 KMP_DEBUG_ASSERT(pr);
1031 KMP_DEBUG_ASSERT(sh);
1032 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1033 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1034
1035 if (pr->ordered_bumped) {
1036 KD_TRACE(
1037 1000,
1038 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1039 gtid));
1040 pr->ordered_bumped = 0;
1041 } else {
1042 UT lower = pr->u.p.ordered_lower;
1043
1044 #ifdef KMP_DEBUG
1045 {
1046 char *buff;
1047 // create format specifiers before the debug output
1048 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1049 "ordered_iteration:%%%s lower:%%%s\n",
1050 traits_t<UT>::spec, traits_t<UT>::spec);
1051 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1052 __kmp_str_free(&buff);
1053 }
1054 #endif
1055
1056 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1057 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1058 KMP_MB(); /* is this necessary? */
1059 #ifdef KMP_DEBUG
1060 {
1061 char *buff;
1062 // create format specifiers before the debug output
1063 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1064 "ordered_iteration:%%%s lower:%%%s\n",
1065 traits_t<UT>::spec, traits_t<UT>::spec);
1066 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1067 __kmp_str_free(&buff);
1068 }
1069 #endif
1070
1071 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1072 } // if
1073 } // if
1074 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1075 }
1076
1077 #ifdef KMP_GOMP_COMPAT
1078
1079 template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1081 typedef typename traits_t<UT>::signed_t ST;
1082 __kmp_assert_valid_gtid(gtid);
1083 kmp_info_t *th = __kmp_threads[gtid];
1084
1085 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1086 if (!th->th.th_team->t.t_serialized) {
1087 dispatch_private_info_template<UT> *pr =
1088 reinterpret_cast<dispatch_private_info_template<UT> *>(
1089 th->th.th_dispatch->th_dispatch_pr_current);
1090 dispatch_shared_info_template<UT> volatile *sh =
1091 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1092 th->th.th_dispatch->th_dispatch_sh_current);
1093 KMP_DEBUG_ASSERT(pr);
1094 KMP_DEBUG_ASSERT(sh);
1095 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1096 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1097
1098 UT lower = pr->u.p.ordered_lower;
1099 UT upper = pr->u.p.ordered_upper;
1100 UT inc = upper - lower + 1;
1101
1102 if (pr->ordered_bumped == inc) {
1103 KD_TRACE(
1104 1000,
1105 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1106 gtid));
1107 pr->ordered_bumped = 0;
1108 } else {
1109 inc -= pr->ordered_bumped;
1110
1111 #ifdef KMP_DEBUG
1112 {
1113 char *buff;
1114 // create format specifiers before the debug output
1115 buff = __kmp_str_format(
1116 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1117 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1118 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1119 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1120 __kmp_str_free(&buff);
1121 }
1122 #endif
1123
1124 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1125 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1126
1127 KMP_MB(); /* is this necessary? */
1128 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1129 "ordered_bumped to zero\n",
1130 gtid));
1131 pr->ordered_bumped = 0;
1132 //!!!!! TODO check if the inc should be unsigned, or signed???
1133 #ifdef KMP_DEBUG
1134 {
1135 char *buff;
1136 // create format specifiers before the debug output
1137 buff = __kmp_str_format(
1138 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1139 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1140 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1141 traits_t<UT>::spec);
1142 KD_TRACE(1000,
1143 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1144 __kmp_str_free(&buff);
1145 }
1146 #endif
1147
1148 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1149 }
1150 // }
1151 }
1152 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1153 }
1154
1155 #endif /* KMP_GOMP_COMPAT */
1156
1157 template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
1159 dispatch_private_info_template<T> *pr,
1160 dispatch_shared_info_template<T> volatile *sh,
1161 kmp_int32 *p_last, T *p_lb, T *p_ub,
1162 typename traits_t<T>::signed_t *p_st, T nproc,
1163 T tid) {
1164 typedef typename traits_t<T>::unsigned_t UT;
1165 typedef typename traits_t<T>::signed_t ST;
1166 typedef typename traits_t<T>::floating_t DBL;
1167 int status = 0;
1168 bool last = false;
1169 T start;
1170 ST incr;
1171 UT limit, trip, init;
1172 kmp_info_t *th = __kmp_threads[gtid];
1173 kmp_team_t *team = th->th.th_team;
1174
1175 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1176 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1177 KMP_DEBUG_ASSERT(pr);
1178 KMP_DEBUG_ASSERT(sh);
1179 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1180 #ifdef KMP_DEBUG
1181 {
1182 char *buff;
1183 // create format specifiers before the debug output
1184 buff =
1185 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1186 "sh:%%p nproc:%%%s tid:%%%s\n",
1187 traits_t<T>::spec, traits_t<T>::spec);
1188 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1189 __kmp_str_free(&buff);
1190 }
1191 #endif
1192
1193 // zero trip count
1194 if (pr->u.p.tc == 0) {
1195 KD_TRACE(10,
1196 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1197 "zero status:%d\n",
1198 gtid, status));
1199 return 0;
1200 }
1201
1202 switch (pr->schedule) {
1203 #if KMP_STATIC_STEAL_ENABLED
1204 case kmp_sch_static_steal: {
1205 T chunk = pr->u.p.parm1;
1206 UT nchunks = pr->u.p.parm2;
1207 KD_TRACE(100,
1208 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1209 gtid));
1210
1211 trip = pr->u.p.tc - 1;
1212
1213 if (traits_t<T>::type_size > 4) {
1214 // use lock for 8-byte induction variable.
1215 // TODO (optional): check presence and use 16-byte CAS
1216 kmp_lock_t *lck = pr->u.p.steal_lock;
1217 KMP_DEBUG_ASSERT(lck != NULL);
1218 if (pr->u.p.count < (UT)pr->u.p.ub) {
1219 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1220 __kmp_acquire_lock(lck, gtid);
1221 // try to get own chunk of iterations
1222 init = (pr->u.p.count)++;
1223 status = (init < (UT)pr->u.p.ub);
1224 __kmp_release_lock(lck, gtid);
1225 } else {
1226 status = 0; // no own chunks
1227 }
1228 if (!status) { // try to steal
1229 kmp_lock_t *lckv; // victim buffer's lock
1230 T while_limit = pr->u.p.parm3;
1231 T while_index = 0;
1232 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1233 __kmp_dispatch_num_buffers; // current loop index
1234 // note: victim thread can potentially execute another loop
1235 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1236 while ((!status) && (while_limit != ++while_index)) {
1237 dispatch_private_info_template<T> *v;
1238 T remaining;
1239 T victimId = pr->u.p.parm4;
1240 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1241 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1242 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1243 KMP_DEBUG_ASSERT(v);
1244 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1245 oldVictimId != victimId) {
1246 victimId = (victimId + 1) % nproc;
1247 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1248 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1249 KMP_DEBUG_ASSERT(v);
1250 }
1251 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1252 continue; // try once more (nproc attempts in total)
1253 }
1254 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1255 kmp_uint32 old = UNUSED;
1256 // try to steal whole range from inactive victim
1257 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1258 if (status) {
1259 // initialize self buffer with victim's whole range of chunks
1260 T id = victimId;
1261 T small_chunk, extras;
1262 small_chunk = nchunks / nproc; // chunks per thread
1263 extras = nchunks % nproc;
1264 init = id * small_chunk + (id < extras ? id : extras);
1265 __kmp_acquire_lock(lck, gtid);
1266 pr->u.p.count = init + 1; // exclude one we execute immediately
1267 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1268 __kmp_release_lock(lck, gtid);
1269 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1270 // no need to reinitialize other thread invariants: lb, st, etc.
1271 #ifdef KMP_DEBUG
1272 {
1273 char *buff;
1274 // create format specifiers before the debug output
1275 buff = __kmp_str_format(
1276 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1277 "count:%%%s ub:%%%s\n",
1278 traits_t<UT>::spec, traits_t<T>::spec);
1279 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1280 __kmp_str_free(&buff);
1281 }
1282 #endif
1283 // activate non-empty buffer and let others steal from us
1284 if (pr->u.p.count < (UT)pr->u.p.ub)
1285 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1286 break;
1287 }
1288 }
1289 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1290 v->u.p.count >= (UT)v->u.p.ub) {
1291 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1292 continue; // no chunks to steal, try next victim
1293 }
1294 lckv = v->u.p.steal_lock;
1295 KMP_ASSERT(lckv != NULL);
1296 __kmp_acquire_lock(lckv, gtid);
1297 limit = v->u.p.ub; // keep initial ub
1298 if (v->u.p.count >= limit) {
1299 __kmp_release_lock(lckv, gtid);
1300 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1301 continue; // no chunks to steal, try next victim
1302 }
1303
// stealing succeeded; reduce victim's ub by 1/4 of the undone chunks
// TODO: is this heuristic good enough?
1306 remaining = limit - v->u.p.count;
1307 if (remaining > 7) {
1308 // steal 1/4 of remaining
1309 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1310 init = (v->u.p.ub -= (remaining >> 2));
1311 } else {
1312 // steal 1 chunk of 1..7 remaining
1313 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1314 init = (v->u.p.ub -= 1);
1315 }
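// Illustration (hypothetical counts): with 20 undone chunks the thief takes
// 20 >> 2 = 5 chunks off the tail of the victim's range; with only 3 left it
// takes a single chunk.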
1316 __kmp_release_lock(lckv, gtid);
1317 #ifdef KMP_DEBUG
1318 {
1319 char *buff;
1320 // create format specifiers before the debug output
1321 buff = __kmp_str_format(
1322 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1323 "count:%%%s ub:%%%s\n",
1324 traits_t<UT>::spec, traits_t<UT>::spec);
1325 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1326 __kmp_str_free(&buff);
1327 }
1328 #endif
1329 KMP_DEBUG_ASSERT(init + 1 <= limit);
1330 pr->u.p.parm4 = victimId; // remember victim to steal from
1331 status = 1;
1332 // now update own count and ub with stolen range excluding init chunk
1333 __kmp_acquire_lock(lck, gtid);
1334 pr->u.p.count = init + 1;
1335 pr->u.p.ub = limit;
1336 __kmp_release_lock(lck, gtid);
1337 // activate non-empty buffer and let others steal from us
1338 if (init + 1 < limit)
1339 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1340 } // while (search for victim)
1341 } // if (try to find victim and steal)
1342 } else {
1343 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1344 // as all operations on pair (count, ub) must be done atomically
1345 typedef union {
1346 struct {
1347 UT count;
1348 T ub;
1349 } p;
1350 kmp_int64 b;
1351 } union_i4;
1352 union_i4 vold, vnew;
1353 if (pr->u.p.count < (UT)pr->u.p.ub) {
1354 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1355 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1356 vnew.b = vold.b;
1357 vnew.p.count++; // get chunk from head of self range
1358 while (!KMP_COMPARE_AND_STORE_REL64(
1359 (volatile kmp_int64 *)&pr->u.p.count,
1360 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1361 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1362 KMP_CPU_PAUSE();
1363 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1364 vnew.b = vold.b;
1365 vnew.p.count++;
1366 }
1367 init = vold.p.count;
1368 status = (init < (UT)vold.p.ub);
1369 } else {
1370 status = 0; // no own chunks
1371 }
1372 if (!status) { // try to steal
1373 T while_limit = pr->u.p.parm3;
1374 T while_index = 0;
1375 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1376 __kmp_dispatch_num_buffers; // current loop index
1377 // note: victim thread can potentially execute another loop
1378 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1379 while ((!status) && (while_limit != ++while_index)) {
1380 dispatch_private_info_template<T> *v;
1381 T remaining;
1382 T victimId = pr->u.p.parm4;
1383 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1384 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1385 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1386 KMP_DEBUG_ASSERT(v);
1387 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1388 oldVictimId != victimId) {
1389 victimId = (victimId + 1) % nproc;
1390 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1391 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1392 KMP_DEBUG_ASSERT(v);
1393 }
1394 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1395 continue; // try once more (nproc attempts in total)
1396 }
1397 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1398 kmp_uint32 old = UNUSED;
1399 // try to steal whole range from inactive victim
1400 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1401 if (status) {
1402 // initialize self buffer with victim's whole range of chunks
1403 T id = victimId;
1404 T small_chunk, extras;
1405 small_chunk = nchunks / nproc; // chunks per thread
1406 extras = nchunks % nproc;
1407 init = id * small_chunk + (id < extras ? id : extras);
1408 vnew.p.count = init + 1;
1409 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1410 // write pair (count, ub) at once atomically
1411 #if KMP_ARCH_X86
1412 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1413 #else
1414 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1415 #endif
1416 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1417 // no need to initialize other thread invariants: lb, st, etc.
1418 #ifdef KMP_DEBUG
1419 {
1420 char *buff;
1421 // create format specifiers before the debug output
1422 buff = __kmp_str_format(
1423 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1424 "count:%%%s ub:%%%s\n",
1425 traits_t<UT>::spec, traits_t<T>::spec);
1426 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1427 __kmp_str_free(&buff);
1428 }
1429 #endif
1430 // activate non-empty buffer and let others steal from us
1431 if (pr->u.p.count < (UT)pr->u.p.ub)
1432 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1433 break;
1434 }
1435 }
1436 while (1) { // CAS loop with check if victim still has enough chunks
1437 // many threads may be stealing concurrently from same victim
1438 vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1439 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1440 vold.p.count >= (UT)vold.p.ub) {
1441 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1442 break; // no chunks to steal, try next victim
1443 }
1444 vnew.b = vold.b;
1445 remaining = vold.p.ub - vold.p.count;
1446 // try to steal 1/4 of remaining
// TODO: is this heuristic good enough?
1448 if (remaining > 7) {
1449 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1450 } else {
1451 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1452 }
1453 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1454 if (KMP_COMPARE_AND_STORE_REL64(
1455 (volatile kmp_int64 *)&v->u.p.count,
1456 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1457 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
// stealing succeeded
1459 #ifdef KMP_DEBUG
1460 {
1461 char *buff;
1462 // create format specifiers before the debug output
1463 buff = __kmp_str_format(
1464 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1465 "count:%%%s ub:%%%s\n",
1466 traits_t<T>::spec, traits_t<T>::spec);
1467 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1468 __kmp_str_free(&buff);
1469 }
1470 #endif
1471 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1472 vold.p.ub - vnew.p.ub);
1473 status = 1;
1474 pr->u.p.parm4 = victimId; // keep victim id
1475 // now update own count and ub
1476 init = vnew.p.ub;
1477 vold.p.count = init + 1;
1478 #if KMP_ARCH_X86
1479 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1480 #else
1481 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1482 #endif
1483 // activate non-empty buffer and let others steal from us
1484 if (vold.p.count < (UT)vold.p.ub)
1485 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1486 break;
1487 } // if (check CAS result)
KMP_CPU_PAUSE(); // CAS failed, retry
1489 } // while (try to steal from particular victim)
1490 } // while (search for victim)
1491 } // if (try to find victim and steal)
1492 } // if (4-byte induction variable)
1493 if (!status) {
1494 *p_lb = 0;
1495 *p_ub = 0;
1496 if (p_st != NULL)
1497 *p_st = 0;
1498 } else {
1499 start = pr->u.p.lb;
1500 init *= chunk;
1501 limit = chunk + init - 1;
1502 incr = pr->u.p.st;
1503 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1504
1505 KMP_DEBUG_ASSERT(init <= trip);
1506 // keep track of done chunks for possible early exit from stealing
1507 // TODO: count executed chunks locally with rare update of shared location
1508 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1509 if ((last = (limit >= trip)) != 0)
1510 limit = trip;
1511 if (p_st != NULL)
1512 *p_st = incr;
1513
1514 if (incr == 1) {
1515 *p_lb = start + init;
1516 *p_ub = start + limit;
1517 } else {
1518 *p_lb = start + init * incr;
1519 *p_ub = start + limit * incr;
1520 }
1521 } // if
1522 break;
1523 } // case
1524 #endif // KMP_STATIC_STEAL_ENABLED
1525 case kmp_sch_static_balanced: {
1526 KD_TRACE(
1527 10,
1528 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1529 gtid));
1530 /* check if thread has any iteration to do */
1531 if ((status = !pr->u.p.count) != 0) {
1532 pr->u.p.count = 1;
1533 *p_lb = pr->u.p.lb;
1534 *p_ub = pr->u.p.ub;
1535 last = (pr->u.p.parm1 != 0);
1536 if (p_st != NULL)
1537 *p_st = pr->u.p.st;
1538 } else { /* no iterations to do */
1539 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1540 }
1541 } // case
1542 break;
1543 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1544 merged here */
1545 case kmp_sch_static_chunked: {
1546 T parm1;
1547
1548 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1549 "kmp_sch_static_[affinity|chunked] case\n",
1550 gtid));
1551 parm1 = pr->u.p.parm1;
1552
1553 trip = pr->u.p.tc - 1;
1554 init = parm1 * (pr->u.p.count + tid);
1555
1556 if ((status = (init <= trip)) != 0) {
1557 start = pr->u.p.lb;
1558 incr = pr->u.p.st;
1559 limit = parm1 + init - 1;
1560
1561 if ((last = (limit >= trip)) != 0)
1562 limit = trip;
1563
1564 if (p_st != NULL)
1565 *p_st = incr;
1566
1567 pr->u.p.count += nproc;
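// Example (hypothetical values): chunk parm1 = 10, nproc = 4, tid = 1 ->
// this thread claims iteration offsets 10..19 first, then 50..59, 90..99, ...
// (its chunk index advances by nproc on every call).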
1568
1569 if (incr == 1) {
1570 *p_lb = start + init;
1571 *p_ub = start + limit;
1572 } else {
1573 *p_lb = start + init * incr;
1574 *p_ub = start + limit * incr;
1575 }
1576
1577 if (pr->flags.ordered) {
1578 pr->u.p.ordered_lower = init;
1579 pr->u.p.ordered_upper = limit;
1580 } // if
1581 } // if
1582 } // case
1583 break;
1584
1585 case kmp_sch_dynamic_chunked: {
1586 UT chunk_number;
1587 UT chunk_size = pr->u.p.parm1;
1588 UT nchunks = pr->u.p.parm2;
1589
1590 KD_TRACE(
1591 100,
1592 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1593 gtid));
1594
1595 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1596 status = (chunk_number < nchunks);
1597 if (!status) {
1598 *p_lb = 0;
1599 *p_ub = 0;
1600 if (p_st != NULL)
1601 *p_st = 0;
1602 } else {
1603 init = chunk_size * chunk_number;
1604 trip = pr->u.p.tc - 1;
1605 start = pr->u.p.lb;
1606 incr = pr->u.p.st;
1607
1608 if ((last = (trip - init < (UT)chunk_size)))
1609 limit = trip;
1610 else
1611 limit = chunk_size + init - 1;
1612
1613 if (p_st != NULL)
1614 *p_st = incr;
1615
1616 if (incr == 1) {
1617 *p_lb = start + init;
1618 *p_ub = start + limit;
1619 } else {
1620 *p_lb = start + init * incr;
1621 *p_ub = start + limit * incr;
1622 }
1623
1624 if (pr->flags.ordered) {
1625 pr->u.p.ordered_lower = init;
1626 pr->u.p.ordered_upper = limit;
1627 } // if
1628 } // if
1629 } // case
1630 break;
1631
1632 case kmp_sch_guided_iterative_chunked: {
1633 T chunkspec = pr->u.p.parm1;
1634 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1635 "iterative case\n",
1636 gtid));
1637 trip = pr->u.p.tc;
1638 // Start atomic part of calculations
1639 while (1) {
1640 ST remaining; // signed, because can be < 0
1641 init = sh->u.s.iteration; // shared value
1642 remaining = trip - init;
1643 if (remaining <= 0) { // AC: need to compare with 0 first
1644 // nothing to do, don't try atomic op
1645 status = 0;
1646 break;
1647 }
1648 if ((T)remaining <
1649 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1650 // use dynamic-style schedule
1651 // atomically increment iterations, get old value
1652 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1653 (ST)chunkspec);
1654 remaining = trip - init;
1655 if (remaining <= 0) {
1656 status = 0; // all iterations got by other threads
1657 } else {
1658 // got some iterations to work on
1659 status = 1;
1660 if ((T)remaining > chunkspec) {
1661 limit = init + chunkspec - 1;
1662 } else {
1663 last = true; // the last chunk
1664 limit = init + remaining - 1;
1665 } // if
1666 } // if
1667 break;
1668 } // if
1669 limit = init + (UT)((double)remaining *
1670 *(double *)&pr->u.p.parm3); // divide by K*nproc
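      // Illustrative numbers, assuming the default K=2 noted above: with
      // nproc=4, parm3 is about 1/(2*4)=1/8, so a thread seeing remaining=800
      // claims roughly 100 iterations; chunks shrink geometrically as the
      // shared iteration count advances. (hypothetical values)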
1671 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1672 (ST)init, (ST)limit)) {
1673 // CAS was successful, chunk obtained
1674 status = 1;
1675 --limit;
1676 break;
1677 } // if
1678 } // while
1679 if (status != 0) {
1680 start = pr->u.p.lb;
1681 incr = pr->u.p.st;
1682 if (p_st != NULL)
1683 *p_st = incr;
1684 *p_lb = start + init * incr;
1685 *p_ub = start + limit * incr;
1686 if (pr->flags.ordered) {
1687 pr->u.p.ordered_lower = init;
1688 pr->u.p.ordered_upper = limit;
1689 } // if
1690 } else {
1691 *p_lb = 0;
1692 *p_ub = 0;
1693 if (p_st != NULL)
1694 *p_st = 0;
1695 } // if
1696 } // case
1697 break;
1698
1699 case kmp_sch_guided_simd: {
1700 // same as iterative but curr-chunk adjusted to be multiple of given
1701 // chunk
1702 T chunk = pr->u.p.parm1;
1703 KD_TRACE(100,
1704 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1705 gtid));
1706 trip = pr->u.p.tc;
1707 // Start atomic part of calculations
1708 while (1) {
1709 ST remaining; // signed, because can be < 0
1710 init = sh->u.s.iteration; // shared value
1711 remaining = trip - init;
1712 if (remaining <= 0) { // AC: need to compare with 0 first
1713 status = 0; // nothing to do, don't try atomic op
1714 break;
1715 }
1716 KMP_DEBUG_ASSERT(init % chunk == 0);
1717 // compare with K*nproc*(chunk+1), K=2 by default
1718 if ((T)remaining < pr->u.p.parm2) {
1719 // use dynamic-style schedule
1720 // atomically increment iterations, get old value
1721 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1722 (ST)chunk);
1723 remaining = trip - init;
1724 if (remaining <= 0) {
1725 status = 0; // all iterations got by other threads
1726 } else {
1727 // got some iterations to work on
1728 status = 1;
1729 if ((T)remaining > chunk) {
1730 limit = init + chunk - 1;
1731 } else {
1732 last = true; // the last chunk
1733 limit = init + remaining - 1;
1734 } // if
1735 } // if
1736 break;
1737 } // if
1738 // divide by K*nproc
1739 UT span;
1740 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1741 &span);
1742 UT rem = span % chunk;
1743 if (rem) // adjust so that span%chunk == 0
1744 span += chunk - rem;
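      // e.g. (hypothetical values) chunk=8 and span=21: rem=5, so span is
      // rounded up to 24, keeping each claimed range a whole multiple of chunk.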
1745 limit = init + span;
1746 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1747 (ST)init, (ST)limit)) {
1748 // CAS was successful, chunk obtained
1749 status = 1;
1750 --limit;
1751 break;
1752 } // if
1753 } // while
1754 if (status != 0) {
1755 start = pr->u.p.lb;
1756 incr = pr->u.p.st;
1757 if (p_st != NULL)
1758 *p_st = incr;
1759 *p_lb = start + init * incr;
1760 *p_ub = start + limit * incr;
1761 if (pr->flags.ordered) {
1762 pr->u.p.ordered_lower = init;
1763 pr->u.p.ordered_upper = limit;
1764 } // if
1765 } else {
1766 *p_lb = 0;
1767 *p_ub = 0;
1768 if (p_st != NULL)
1769 *p_st = 0;
1770 } // if
1771 } // case
1772 break;
1773
1774 case kmp_sch_guided_analytical_chunked: {
1775 T chunkspec = pr->u.p.parm1;
1776 UT chunkIdx;
1777 #if KMP_USE_X87CONTROL
1778 /* for storing original FPCW value for Windows* OS on
1779 IA-32 architecture 8-byte version */
1780 unsigned int oldFpcw;
1781 unsigned int fpcwSet = 0;
1782 #endif
1783 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1784 "kmp_sch_guided_analytical_chunked case\n",
1785 gtid));
1786
1787 trip = pr->u.p.tc;
1788
1789 KMP_DEBUG_ASSERT(nproc > 1);
1790 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1791
1792 while (1) { /* this while loop is a safeguard against unexpected zero
1793 chunk sizes */
1794 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1795 if (chunkIdx >= (UT)pr->u.p.parm2) {
1796 --trip;
1797 /* use dynamic-style scheduling */
1798 init = chunkIdx * chunkspec + pr->u.p.count;
1799 /* need to verify init > 0 in case of overflow in the above
1800 * calculation */
1801 if ((status = (init > 0 && init <= trip)) != 0) {
1802 limit = init + chunkspec - 1;
1803
1804 if ((last = (limit >= trip)) != 0)
1805 limit = trip;
1806 }
1807 break;
1808 } else {
1809 /* use exponential-style scheduling */
1810          /* The following check works around the lack of long double precision on
1811          Windows* OS,
1812          which could otherwise cause init != 0 when chunkIdx == 0.
1813 */
1814 #if KMP_USE_X87CONTROL
1815 /* If we haven't already done so, save original
1816 FPCW and set precision to 64-bit, as Windows* OS
1817 on IA-32 architecture defaults to 53-bit */
1818 if (!fpcwSet) {
1819 oldFpcw = _control87(0, 0);
1820 _control87(_PC_64, _MCW_PC);
1821 fpcwSet = 0x30000;
1822 }
1823 #endif
1824 if (chunkIdx) {
1825 init = __kmp_dispatch_guided_remaining<T>(
1826 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1827 KMP_DEBUG_ASSERT(init);
1828 init = trip - init;
1829 } else
1830 init = 0;
1831 limit = trip - __kmp_dispatch_guided_remaining<T>(
1832 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1833 KMP_ASSERT(init <= limit);
1834 if (init < limit) {
1835 KMP_DEBUG_ASSERT(limit <= trip);
1836 --limit;
1837 status = 1;
1838 break;
1839 } // if
1840 } // if
1841 } // while (1)
1842 #if KMP_USE_X87CONTROL
1843 /* restore FPCW if necessary
1844 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1845 */
1846 if (fpcwSet && (oldFpcw & fpcwSet))
1847 _control87(oldFpcw, _MCW_PC);
1848 #endif
1849 if (status != 0) {
1850 start = pr->u.p.lb;
1851 incr = pr->u.p.st;
1852 if (p_st != NULL)
1853 *p_st = incr;
1854 *p_lb = start + init * incr;
1855 *p_ub = start + limit * incr;
1856 if (pr->flags.ordered) {
1857 pr->u.p.ordered_lower = init;
1858 pr->u.p.ordered_upper = limit;
1859 }
1860 } else {
1861 *p_lb = 0;
1862 *p_ub = 0;
1863 if (p_st != NULL)
1864 *p_st = 0;
1865 }
1866 } // case
1867 break;
1868
1869 case kmp_sch_trapezoidal: {
1870 UT index;
1871 T parm2 = pr->u.p.parm2;
1872 T parm3 = pr->u.p.parm3;
1873 T parm4 = pr->u.p.parm4;
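    // Trapezoid self-scheduling: parm2 is the size of the first (largest)
    // chunk, parm3 the total number of chunks, parm4 the per-chunk size
    // decrement, so chunk #index starts at the sum of the preceding chunk
    // sizes (an arithmetic series). Illustrative numbers: parm2=10, parm4=2
    // give chunks 10, 8, 6, ..., and chunk #2 starts at (2*(20-2))/2 = 18.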
1874 KD_TRACE(100,
1875 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1876 gtid));
1877
1878 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1879
1880 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1881 trip = pr->u.p.tc - 1;
1882
1883 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1884 *p_lb = 0;
1885 *p_ub = 0;
1886 if (p_st != NULL)
1887 *p_st = 0;
1888 } else {
1889 start = pr->u.p.lb;
1890 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1891 incr = pr->u.p.st;
1892
1893 if ((last = (limit >= trip)) != 0)
1894 limit = trip;
1895
1896 if (p_st != NULL)
1897 *p_st = incr;
1898
1899 if (incr == 1) {
1900 *p_lb = start + init;
1901 *p_ub = start + limit;
1902 } else {
1903 *p_lb = start + init * incr;
1904 *p_ub = start + limit * incr;
1905 }
1906
1907 if (pr->flags.ordered) {
1908 pr->u.p.ordered_lower = init;
1909 pr->u.p.ordered_upper = limit;
1910 } // if
1911 } // if
1912 } // case
1913 break;
1914 default: {
1915 status = 0; // to avoid complaints on uninitialized variable use
1916 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1917 KMP_HNT(GetNewerLibrary), // Hint
1918 __kmp_msg_null // Variadic argument list terminator
1919 );
1920 } break;
1921 } // switch
1922 if (p_last)
1923 *p_last = last;
1924 #ifdef KMP_DEBUG
1925 if (pr->flags.ordered) {
1926 char *buff;
1927 // create format specifiers before the debug output
1928 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1929 "ordered_lower:%%%s ordered_upper:%%%s\n",
1930 traits_t<UT>::spec, traits_t<UT>::spec);
1931 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1932 __kmp_str_free(&buff);
1933 }
1934 {
1935 char *buff;
1936 // create format specifiers before the debug output
1937 buff = __kmp_str_format(
1938 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1939 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1940 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1941 KMP_DEBUG_ASSERT(p_last);
1942 KMP_DEBUG_ASSERT(p_st);
1943 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1944 __kmp_str_free(&buff);
1945 }
1946 #endif
1947 return status;
1948 }
1949
1950 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1951 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1952 is not called. */
1953 #if OMPT_SUPPORT && OMPT_OPTIONAL
1954 #define OMPT_LOOP_END \
1955 if (status == 0) { \
1956 if (ompt_enabled.ompt_callback_work) { \
1957 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1958 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1959 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1960 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1961 &(task_info->task_data), 0, codeptr); \
1962 } \
1963 }
1964 // TODO: implement count
1965 #else
1966 #define OMPT_LOOP_END // no-op
1967 #endif
1968
1969 #if KMP_STATS_ENABLED
1970 #define KMP_STATS_LOOP_END \
1971 { \
1972 kmp_int64 u, l, t, i; \
1973 l = (kmp_int64)(*p_lb); \
1974 u = (kmp_int64)(*p_ub); \
1975 i = (kmp_int64)(pr->u.p.st); \
1976 if (status == 0) { \
1977 t = 0; \
1978 KMP_POP_PARTITIONED_TIMER(); \
1979 } else if (i == 1) { \
1980 if (u >= l) \
1981 t = u - l + 1; \
1982 else \
1983 t = 0; \
1984 } else if (i < 0) { \
1985 if (l >= u) \
1986 t = (l - u) / (-i) + 1; \
1987 else \
1988 t = 0; \
1989 } else { \
1990 if (u >= l) \
1991 t = (u - l) / i + 1; \
1992 else \
1993 t = 0; \
1994 } \
1995 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1996 }
1997 #else
1998 #define KMP_STATS_LOOP_END /* Nothing */
1999 #endif
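// Worked examples of the trip-count arithmetic in KMP_STATS_LOOP_END above
// (illustrative values): l=0, u=9, i=2 gives t = (9-0)/2 + 1 = 5 iterations
// (0,2,4,6,8); l=10, u=1, i=-3 gives t = (10-1)/3 + 1 = 4 iterations (10,7,4,1).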
2000
2001 template <typename T>
2002 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2003 T *p_lb, T *p_ub,
2004 typename traits_t<T>::signed_t *p_st
2005 #if OMPT_SUPPORT && OMPT_OPTIONAL
2006 ,
2007 void *codeptr
2008 #endif
2009 ) {
2010
2011 typedef typename traits_t<T>::unsigned_t UT;
2012 typedef typename traits_t<T>::signed_t ST;
2013   // This is potentially slightly misleading: schedule(runtime) will appear here
2014   // even if the actual runtime schedule is static. (Which points out a
2015   // disadvantage of schedule(runtime): even when static scheduling is used, it
2016   // costs more than a compile-time choice to use static scheduling would.)
2017 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2018
2019 int status;
2020 dispatch_private_info_template<T> *pr;
2021 __kmp_assert_valid_gtid(gtid);
2022 kmp_info_t *th = __kmp_threads[gtid];
2023 kmp_team_t *team = th->th.th_team;
2024
2025 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2026 KD_TRACE(
2027 1000,
2028 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2029 gtid, p_lb, p_ub, p_st, p_last));
2030
2031 if (team->t.t_serialized) {
2032 /* NOTE: serialize this dispatch because we are not at the active level */
2033 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2034 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2035 KMP_DEBUG_ASSERT(pr);
2036
2037 if ((status = (pr->u.p.tc != 0)) == 0) {
2038 *p_lb = 0;
2039 *p_ub = 0;
2040 // if ( p_last != NULL )
2041 // *p_last = 0;
2042 if (p_st != NULL)
2043 *p_st = 0;
2044 if (__kmp_env_consistency_check) {
2045 if (pr->pushed_ws != ct_none) {
2046 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2047 }
2048 }
2049 } else if (pr->flags.nomerge) {
2050 kmp_int32 last;
2051 T start;
2052 UT limit, trip, init;
2053 ST incr;
2054 T chunk = pr->u.p.parm1;
2055
2056 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2057 gtid));
2058
2059 init = chunk * pr->u.p.count++;
2060 trip = pr->u.p.tc - 1;
2061
2062 if ((status = (init <= trip)) == 0) {
2063 *p_lb = 0;
2064 *p_ub = 0;
2065 // if ( p_last != NULL )
2066 // *p_last = 0;
2067 if (p_st != NULL)
2068 *p_st = 0;
2069 if (__kmp_env_consistency_check) {
2070 if (pr->pushed_ws != ct_none) {
2071 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2072 }
2073 }
2074 } else {
2075 start = pr->u.p.lb;
2076 limit = chunk + init - 1;
2077 incr = pr->u.p.st;
2078
2079 if ((last = (limit >= trip)) != 0) {
2080 limit = trip;
2081 #if KMP_OS_WINDOWS
2082 pr->u.p.last_upper = pr->u.p.ub;
2083 #endif /* KMP_OS_WINDOWS */
2084 }
2085 if (p_last != NULL)
2086 *p_last = last;
2087 if (p_st != NULL)
2088 *p_st = incr;
2089 if (incr == 1) {
2090 *p_lb = start + init;
2091 *p_ub = start + limit;
2092 } else {
2093 *p_lb = start + init * incr;
2094 *p_ub = start + limit * incr;
2095 }
2096
2097 if (pr->flags.ordered) {
2098 pr->u.p.ordered_lower = init;
2099 pr->u.p.ordered_upper = limit;
2100 #ifdef KMP_DEBUG
2101 {
2102 char *buff;
2103 // create format specifiers before the debug output
2104 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2105 "ordered_lower:%%%s ordered_upper:%%%s\n",
2106 traits_t<UT>::spec, traits_t<UT>::spec);
2107 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2108 pr->u.p.ordered_upper));
2109 __kmp_str_free(&buff);
2110 }
2111 #endif
2112 } // if
2113 } // if
2114 } else {
2115 pr->u.p.tc = 0;
2116 *p_lb = pr->u.p.lb;
2117 *p_ub = pr->u.p.ub;
2118 #if KMP_OS_WINDOWS
2119 pr->u.p.last_upper = *p_ub;
2120 #endif /* KMP_OS_WINDOWS */
2121 if (p_last != NULL)
2122 *p_last = TRUE;
2123 if (p_st != NULL)
2124 *p_st = pr->u.p.st;
2125 } // if
2126 #ifdef KMP_DEBUG
2127 {
2128 char *buff;
2129 // create format specifiers before the debug output
2130 buff = __kmp_str_format(
2131 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2132 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2133 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2134 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2135 (p_last ? *p_last : 0), status));
2136 __kmp_str_free(&buff);
2137 }
2138 #endif
2139 #if INCLUDE_SSC_MARKS
2140 SSC_MARK_DISPATCH_NEXT();
2141 #endif
2142 OMPT_LOOP_END;
2143 KMP_STATS_LOOP_END;
2144 return status;
2145 } else {
2146 kmp_int32 last = 0;
2147 dispatch_shared_info_template<T> volatile *sh;
2148
2149 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2150 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2151
2152 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2153 th->th.th_dispatch->th_dispatch_pr_current);
2154 KMP_DEBUG_ASSERT(pr);
2155 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2156 th->th.th_dispatch->th_dispatch_sh_current);
2157 KMP_DEBUG_ASSERT(sh);
2158
2159 #if KMP_USE_HIER_SCHED
2160 if (pr->flags.use_hier)
2161 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2162 else
2163 #endif // KMP_USE_HIER_SCHED
2164 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2165 p_st, th->th.th_team_nproc,
2166 th->th.th_info.ds.ds_tid);
2167 // status == 0: no more iterations to execute
2168 if (status == 0) {
2169 ST num_done;
2170 num_done = test_then_inc<ST>(&sh->u.s.num_done);
2171 #ifdef KMP_DEBUG
2172 {
2173 char *buff;
2174 // create format specifiers before the debug output
2175 buff = __kmp_str_format(
2176 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2177 traits_t<ST>::spec);
2178 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2179 __kmp_str_free(&buff);
2180 }
2181 #endif
2182
2183 #if KMP_USE_HIER_SCHED
2184 pr->flags.use_hier = FALSE;
2185 #endif
2186 if (num_done == th->th.th_team_nproc - 1) {
2187 #if KMP_STATIC_STEAL_ENABLED
2188 if (pr->schedule == kmp_sch_static_steal) {
2189 int i;
2190 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2191 __kmp_dispatch_num_buffers; // current loop index
2192 // loop complete, safe to destroy locks used for stealing
2193 for (i = 0; i < th->th.th_team_nproc; ++i) {
2194 dispatch_private_info_template<T> *buf =
2195 reinterpret_cast<dispatch_private_info_template<T> *>(
2196 &team->t.t_dispatch[i].th_disp_buffer[idx]);
2197 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2198 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2199 if (traits_t<T>::type_size > 4) {
2200 // destroy locks used for stealing
2201 kmp_lock_t *lck = buf->u.p.steal_lock;
2202 KMP_ASSERT(lck != NULL);
2203 __kmp_destroy_lock(lck);
2204 __kmp_free(lck);
2205 buf->u.p.steal_lock = NULL;
2206 }
2207 }
2208 }
2209 #endif
2210 /* NOTE: release shared buffer to be reused */
2211
2212 KMP_MB(); /* Flush all pending memory write invalidates. */
2213
2214 sh->u.s.num_done = 0;
2215 sh->u.s.iteration = 0;
2216
2217 /* TODO replace with general release procedure? */
2218 if (pr->flags.ordered) {
2219 sh->u.s.ordered_iteration = 0;
2220 }
2221
2222 sh->buffer_index += __kmp_dispatch_num_buffers;
2223 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2224 gtid, sh->buffer_index));
2225
2226 KMP_MB(); /* Flush all pending memory write invalidates. */
2227
2228 } // if
2229 if (__kmp_env_consistency_check) {
2230 if (pr->pushed_ws != ct_none) {
2231 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2232 }
2233 }
2234
2235 th->th.th_dispatch->th_deo_fcn = NULL;
2236 th->th.th_dispatch->th_dxo_fcn = NULL;
2237 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2238 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2239 } // if (status == 0)
2240 #if KMP_OS_WINDOWS
2241 else if (last) {
2242 pr->u.p.last_upper = pr->u.p.ub;
2243 }
2244 #endif /* KMP_OS_WINDOWS */
2245 if (p_last != NULL && status != 0)
2246 *p_last = last;
2247 } // if
2248
2249 #ifdef KMP_DEBUG
2250 {
2251 char *buff;
2252 // create format specifiers before the debug output
2253 buff = __kmp_str_format(
2254 "__kmp_dispatch_next: T#%%d normal case: "
2255 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2256 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2257 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2258 (p_last ? *p_last : 0), status));
2259 __kmp_str_free(&buff);
2260 }
2261 #endif
2262 #if INCLUDE_SSC_MARKS
2263 SSC_MARK_DISPATCH_NEXT();
2264 #endif
2265 OMPT_LOOP_END;
2266 KMP_STATS_LOOP_END;
2267 return status;
2268 }
2269
2270 template <typename T>
2271 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2272 kmp_int32 *plastiter, T *plower, T *pupper,
2273 typename traits_t<T>::signed_t incr) {
2274 typedef typename traits_t<T>::unsigned_t UT;
2275 kmp_uint32 team_id;
2276 kmp_uint32 nteams;
2277 UT trip_count;
2278 kmp_team_t *team;
2279 kmp_info_t *th;
2280
2281 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2282 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2283 #ifdef KMP_DEBUG
2284 typedef typename traits_t<T>::signed_t ST;
2285 {
2286 char *buff;
2287 // create format specifiers before the debug output
2288 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2289 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2290 traits_t<T>::spec, traits_t<T>::spec,
2291 traits_t<ST>::spec, traits_t<T>::spec);
2292 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2293 __kmp_str_free(&buff);
2294 }
2295 #endif
2296
2297 if (__kmp_env_consistency_check) {
2298 if (incr == 0) {
2299 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2300 loc);
2301 }
2302 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2303 // The loop is illegal.
2304       // Some zero-trip loops are maintained by the compiler, e.g.:
2305 // for(i=10;i<0;++i) // lower >= upper - run-time check
2306 // for(i=0;i>10;--i) // lower <= upper - run-time check
2307 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2308 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2309 // Compiler does not check the following illegal loops:
2310 // for(i=0;i<10;i+=incr) // where incr<0
2311 // for(i=10;i>0;i-=incr) // where incr<0
2312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2313 }
2314 }
2315 __kmp_assert_valid_gtid(gtid);
2316 th = __kmp_threads[gtid];
2317 team = th->th.th_team;
2318 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2319 nteams = th->th.th_teams_size.nteams;
2320 team_id = team->t.t_master_tid;
2321 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2322
2323 // compute global trip count
2324 if (incr == 1) {
2325 trip_count = *pupper - *plower + 1;
2326 } else if (incr == -1) {
2327 trip_count = *plower - *pupper + 1;
2328 } else if (incr > 0) {
2329 // upper-lower can exceed the limit of signed type
2330 trip_count = (UT)(*pupper - *plower) / incr + 1;
2331 } else {
2332 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2333 }
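  // e.g. (illustrative values) *plower=0, *pupper=9, incr=3:
  // trip_count = 9/3 + 1 = 4 (iterations 0, 3, 6, 9)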
2334
2335 if (trip_count <= nteams) {
2336 KMP_DEBUG_ASSERT(
2337 __kmp_static == kmp_sch_static_greedy ||
2338 __kmp_static ==
2339 kmp_sch_static_balanced); // Unknown static scheduling type.
2340 // only some teams get single iteration, others get nothing
2341 if (team_id < trip_count) {
2342 *pupper = *plower = *plower + team_id * incr;
2343 } else {
2344 *plower = *pupper + incr; // zero-trip loop
2345 }
2346 if (plastiter != NULL)
2347 *plastiter = (team_id == trip_count - 1);
2348 } else {
2349 if (__kmp_static == kmp_sch_static_balanced) {
2350 UT chunk = trip_count / nteams;
2351 UT extras = trip_count % nteams;
2352 *plower +=
2353 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2354 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2355 if (plastiter != NULL)
2356 *plastiter = (team_id == nteams - 1);
2357 } else {
2358 T chunk_inc_count =
2359 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2360 T upper = *pupper;
2361 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2362 // Unknown static scheduling type.
2363 *plower += team_id * chunk_inc_count;
2364 *pupper = *plower + chunk_inc_count - incr;
2365 // Check/correct bounds if needed
2366 if (incr > 0) {
2367 if (*pupper < *plower)
2368 *pupper = traits_t<T>::max_value;
2369 if (plastiter != NULL)
2370 *plastiter = *plower <= upper && *pupper > upper - incr;
2371 if (*pupper > upper)
2372 *pupper = upper; // tracker C73258
2373 } else {
2374 if (*pupper > *plower)
2375 *pupper = traits_t<T>::min_value;
2376 if (plastiter != NULL)
2377 *plastiter = *plower >= upper && *pupper < upper - incr;
2378 if (*pupper < upper)
2379 *pupper = upper; // tracker C73258
2380 }
2381 }
2382 }
2383 }
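/* Worked example of the kmp_sch_static_balanced split above (illustrative
   values): trip_count=10, nteams=4, incr=1 gives chunk=2, extras=2, so teams
   0 and 1 get 3 iterations each and teams 2 and 3 get 2 each (3+3+2+2=10). */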
2384
2385 //-----------------------------------------------------------------------------
2386 // Dispatch routines
2387 // Transfer call to template< type T >
2388 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2389 // T lb, T ub, ST st, ST chunk )
2390 extern "C" {
2391
2392 /*!
2393 @ingroup WORK_SHARING
2394 @{
2395 @param loc Source location
2396 @param gtid Global thread id
2397 @param schedule Schedule type
2398 @param lb Lower bound
2399 @param ub Upper bound
2400 @param st Step (or increment if you prefer)
2401 @param chunk The chunk size to block with
2402
2403 This function prepares the runtime to start a dynamically scheduled for loop,
2404 saving the loop arguments.
2405 These functions are all identical apart from the types of the arguments.
2406 */
2407
2408 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2409 enum sched_type schedule, kmp_int32 lb,
2410 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2411 KMP_DEBUG_ASSERT(__kmp_init_serial);
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413 OMPT_STORE_RETURN_ADDRESS(gtid);
2414 #endif
2415 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2416 }
2417 /*!
2418 See @ref __kmpc_dispatch_init_4
2419 */
2420 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2421 enum sched_type schedule, kmp_uint32 lb,
2422 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2423 KMP_DEBUG_ASSERT(__kmp_init_serial);
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425 OMPT_STORE_RETURN_ADDRESS(gtid);
2426 #endif
2427 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2428 }
2429
2430 /*!
2431 See @ref __kmpc_dispatch_init_4
2432 */
2433 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2434 enum sched_type schedule, kmp_int64 lb,
2435 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2436 KMP_DEBUG_ASSERT(__kmp_init_serial);
2437 #if OMPT_SUPPORT && OMPT_OPTIONAL
2438 OMPT_STORE_RETURN_ADDRESS(gtid);
2439 #endif
2440 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2441 }
2442
2443 /*!
2444 See @ref __kmpc_dispatch_init_4
2445 */
2446 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2447 enum sched_type schedule, kmp_uint64 lb,
2448 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2449 KMP_DEBUG_ASSERT(__kmp_init_serial);
2450 #if OMPT_SUPPORT && OMPT_OPTIONAL
2451 OMPT_STORE_RETURN_ADDRESS(gtid);
2452 #endif
2453 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2454 }
2455
2456 /*!
2457 See @ref __kmpc_dispatch_init_4
2458
2459 These functions differ from the __kmpc_dispatch_init set in that they are
2460 called for the composite distribute parallel for construct. Thus, before
2461 dispatching the regular iterations, we need to compute the per-team iteration space.
2462
2463 These functions are all identical apart from the types of the arguments.
2464 */
2465 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2466 enum sched_type schedule, kmp_int32 *p_last,
2467 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2468 kmp_int32 chunk) {
2469 KMP_DEBUG_ASSERT(__kmp_init_serial);
2470 #if OMPT_SUPPORT && OMPT_OPTIONAL
2471 OMPT_STORE_RETURN_ADDRESS(gtid);
2472 #endif
2473 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2474 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2475 }
2476
2477 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2478 enum sched_type schedule, kmp_int32 *p_last,
2479 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2480 kmp_int32 chunk) {
2481 KMP_DEBUG_ASSERT(__kmp_init_serial);
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
2483 OMPT_STORE_RETURN_ADDRESS(gtid);
2484 #endif
2485 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2486 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2487 }
2488
2489 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2490 enum sched_type schedule, kmp_int32 *p_last,
2491 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2492 kmp_int64 chunk) {
2493 KMP_DEBUG_ASSERT(__kmp_init_serial);
2494 #if OMPT_SUPPORT && OMPT_OPTIONAL
2495 OMPT_STORE_RETURN_ADDRESS(gtid);
2496 #endif
2497 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2498 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2499 }
2500
2501 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2502 enum sched_type schedule, kmp_int32 *p_last,
2503 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2504 kmp_int64 chunk) {
2505 KMP_DEBUG_ASSERT(__kmp_init_serial);
2506 #if OMPT_SUPPORT && OMPT_OPTIONAL
2507 OMPT_STORE_RETURN_ADDRESS(gtid);
2508 #endif
2509 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2510 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2511 }
2512
2513 /*!
2514 @param loc Source code location
2515 @param gtid Global thread id
2516 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2517 otherwise
2518 @param p_lb Pointer to the lower bound for the next chunk of work
2519 @param p_ub Pointer to the upper bound for the next chunk of work
2520 @param p_st Pointer to the stride for the next chunk of work
2521 @return one if there is work to be done, zero otherwise
2522
2523 Get the next dynamically allocated chunk of work for this thread.
2524 If there is no more work, then lb, ub and stride need not be modified.
2525 */
2526 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2527 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2528 #if OMPT_SUPPORT && OMPT_OPTIONAL
2529 OMPT_STORE_RETURN_ADDRESS(gtid);
2530 #endif
2531 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2532 #if OMPT_SUPPORT && OMPT_OPTIONAL
2533 ,
2534 OMPT_LOAD_RETURN_ADDRESS(gtid)
2535 #endif
2536 );
2537 }
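/* A minimal sketch (not part of the runtime) of how compiler-generated code
   typically drives these entry points for a dynamically scheduled loop such
   as "#pragma omp for schedule(dynamic, chunk)"; loc, gtid, n, chunk and
   body() below are illustrative placeholders:

     kmp_int32 lb = 0, ub = n - 1, st = 1, last = 0;
     __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st,
                            chunk);
     while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   Depending on the loop (e.g. an ordered clause), the compiler may also emit
   calls to __kmpc_dispatch_fini_4; see below. */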
2538
2539 /*!
2540 See @ref __kmpc_dispatch_next_4
2541 */
2542 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2543 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2544 kmp_int32 *p_st) {
2545 #if OMPT_SUPPORT && OMPT_OPTIONAL
2546 OMPT_STORE_RETURN_ADDRESS(gtid);
2547 #endif
2548 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2549 #if OMPT_SUPPORT && OMPT_OPTIONAL
2550 ,
2551 OMPT_LOAD_RETURN_ADDRESS(gtid)
2552 #endif
2553 );
2554 }
2555
2556 /*!
2557 See @ref __kmpc_dispatch_next_4
2558 */
2559 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2560 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2561 #if OMPT_SUPPORT && OMPT_OPTIONAL
2562 OMPT_STORE_RETURN_ADDRESS(gtid);
2563 #endif
2564 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2565 #if OMPT_SUPPORT && OMPT_OPTIONAL
2566 ,
2567 OMPT_LOAD_RETURN_ADDRESS(gtid)
2568 #endif
2569 );
2570 }
2571
2572 /*!
2573 See @ref __kmpc_dispatch_next_4
2574 */
2575 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2576 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2577 kmp_int64 *p_st) {
2578 #if OMPT_SUPPORT && OMPT_OPTIONAL
2579 OMPT_STORE_RETURN_ADDRESS(gtid);
2580 #endif
2581 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2582 #if OMPT_SUPPORT && OMPT_OPTIONAL
2583 ,
2584 OMPT_LOAD_RETURN_ADDRESS(gtid)
2585 #endif
2586 );
2587 }
2588
2589 /*!
2590 @param loc Source code location
2591 @param gtid Global thread id
2592
2593 Mark the end of a dynamic loop.
2594 */
2595 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2596 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2597 }
2598
2599 /*!
2600 See @ref __kmpc_dispatch_fini_4
2601 */
2602 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2603 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2604 }
2605
2606 /*!
2607 See @ref __kmpc_dispatch_fini_4
2608 */
2609 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2610 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2611 }
2612
2613 /*!
2614 See @ref __kmpc_dispatch_fini_4
2615 */
2616 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2617 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2618 }
2619 /*! @} */
2620
2621 //-----------------------------------------------------------------------------
2622 // Non-template routines from kmp_dispatch.cpp used in other sources
2623
2624 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2625 return value == checker;
2626 }
2627
2628 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2629 return value != checker;
2630 }
2631
2632 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2633 return value < checker;
2634 }
2635
2636 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2637 return value >= checker;
2638 }
2639
2640 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2641 return value <= checker;
2642 }
2643
2644 kmp_uint32
2645 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2646 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2647 void *obj // Higher-level synchronization object, or NULL.
2648 ) {
2649 // note: we may not belong to a team at this point
2650 volatile kmp_uint32 *spin = spinner;
2651 kmp_uint32 check = checker;
2652 kmp_uint32 spins;
2653 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2654 kmp_uint32 r;
2655
2656 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2657 KMP_INIT_YIELD(spins);
2658 // main wait spin loop
2659 while (!f(r = TCR_4(*spin), check)) {
2660 KMP_FSYNC_SPIN_PREPARE(obj);
2661 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2662 split. It causes problems with infinite recursion because of exit lock */
2663 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2664 __kmp_abort_thread(); */
2665 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2666 }
2667 KMP_FSYNC_SPIN_ACQUIRED(obj);
2668 return r;
2669 }
2670
2671 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2672 kmp_uint32 (*pred)(void *, kmp_uint32),
2673 void *obj // Higher-level synchronization object, or NULL.
2674 ) {
2675 // note: we may not belong to a team at this point
2676 void *spin = spinner;
2677 kmp_uint32 check = checker;
2678 kmp_uint32 spins;
2679 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2680
2681 KMP_FSYNC_SPIN_INIT(obj, spin);
2682 KMP_INIT_YIELD(spins);
2683 // main wait spin loop
2684 while (!f(spin, check)) {
2685 KMP_FSYNC_SPIN_PREPARE(obj);
2686     /* if we have waited a bit, or are oversubscribed, yield */
2687 /* pause is in the following code */
2688 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2689 }
2690 KMP_FSYNC_SPIN_ACQUIRED(obj);
2691 }
2692
2693 } // extern "C"
2694
2695 #ifdef KMP_GOMP_COMPAT
2696
2697 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2698 enum sched_type schedule, kmp_int32 lb,
2699 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2700 int push_ws) {
2701 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2702 push_ws);
2703 }
2704
2705 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2706 enum sched_type schedule, kmp_uint32 lb,
2707 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2708 int push_ws) {
2709 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2710 push_ws);
2711 }
2712
2713 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2714 enum sched_type schedule, kmp_int64 lb,
2715 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2716 int push_ws) {
2717 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2718 push_ws);
2719 }
2720
2721 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2722 enum sched_type schedule, kmp_uint64 lb,
2723 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2724 int push_ws) {
2725 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2726 push_ws);
2727 }
2728
2729 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2730 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2731 }
2732
2733 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2734 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2735 }
2736
2737 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2738 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2739 }
2740
2741 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2742 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2743 }
2744
2745 #endif /* KMP_GOMP_COMPAT */
2746
2747 /* ------------------------------------------------------------------------ */
2748