1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28                                  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30                                    kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32                                            kmp_task_team_t *task_team);
33 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
34 
35 #ifdef BUILD_TIED_TASK_STACK
36 
//  __kmp_trace_task_stack: print the tied tasks from the task stack in order
//  from top to bottom
39 //
40 //  gtid: global thread identifier for thread containing stack
41 //  thread_data: thread data for task team thread containing stack
42 //  threshold: value above which the trace statement triggers
43 //  location: string identifying call site of this function (for trace)
44 static void __kmp_trace_task_stack(kmp_int32 gtid,
45                                    kmp_thread_data_t *thread_data,
46                                    int threshold, char *location) {
47   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
48   kmp_taskdata_t **stack_top = task_stack->ts_top;
49   kmp_int32 entries = task_stack->ts_entries;
50   kmp_taskdata_t *tied_task;
51 
52   KA_TRACE(
53       threshold,
54       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
55        "first_block = %p, stack_top = %p \n",
56        location, gtid, entries, task_stack->ts_first_block, stack_top));
57 
58   KMP_DEBUG_ASSERT(stack_top != NULL);
59   KMP_DEBUG_ASSERT(entries > 0);
60 
61   while (entries != 0) {
62     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
63     // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
65       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
66 
67       stack_block = stack_block->sb_prev;
68       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
69     }
70 
71     // finish bookkeeping
72     stack_top--;
73     entries--;
74 
75     tied_task = *stack_top;
76 
77     KMP_DEBUG_ASSERT(tied_task != NULL);
78     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
79 
80     KA_TRACE(threshold,
81              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
82               "stack_top=%p, tied_task=%p\n",
83               location, gtid, entries, stack_top, tied_task));
84   }
85   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
86 
87   KA_TRACE(threshold,
88            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
89             location, gtid));
90 }
91 
92 //  __kmp_init_task_stack: initialize the task stack for the first time
93 //  after a thread_data structure is created.
94 //  It should not be necessary to do this again (assuming the stack works).
95 //
96 //  gtid: global thread identifier of calling thread
97 //  thread_data: thread data for task team thread containing stack
98 static void __kmp_init_task_stack(kmp_int32 gtid,
99                                   kmp_thread_data_t *thread_data) {
100   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
101   kmp_stack_block_t *first_block;
102 
103   // set up the first block of the stack
104   first_block = &task_stack->ts_first_block;
105   task_stack->ts_top = (kmp_taskdata_t **)first_block;
106   memset((void *)first_block, '\0',
107          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
108 
109   // initialize the stack to be empty
110   task_stack->ts_entries = TASK_STACK_EMPTY;
111   first_block->sb_next = NULL;
112   first_block->sb_prev = NULL;
113 }
114 
115 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
116 //
117 //  gtid: global thread identifier for calling thread
118 //  thread_data: thread info for thread containing stack
119 static void __kmp_free_task_stack(kmp_int32 gtid,
120                                   kmp_thread_data_t *thread_data) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
122   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
123 
124   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free every block except the first, which is embedded in the task stack
126   while (stack_block != NULL) {
127     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
128 
129     stack_block->sb_next = NULL;
130     stack_block->sb_prev = NULL;
131     if (stack_block != &task_stack->ts_first_block) {
132       __kmp_thread_free(thread,
133                         stack_block); // free the block, if not the first
134     }
135     stack_block = next_block;
136   }
137   // initialize the stack to be empty
138   task_stack->ts_entries = 0;
139   task_stack->ts_top = NULL;
140 }
141 
142 //  __kmp_push_task_stack: Push the tied task onto the task stack.
143 //     Grow the stack if necessary by allocating another block.
144 //
145 //  gtid: global thread identifier for calling thread
146 //  thread: thread info for thread containing stack
147 //  tied_task: the task to push on the stack
148 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
149                                   kmp_taskdata_t *tied_task) {
150   // GEH - need to consider what to do if tt_threads_data not allocated yet
151   kmp_thread_data_t *thread_data =
152       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
153   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
154 
155   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
156     return; // Don't push anything on stack if team or team tasks are serialized
157   }
158 
159   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
160   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
161 
162   KA_TRACE(20,
163            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
164             gtid, thread, tied_task));
165   // Store entry
166   *(task_stack->ts_top) = tied_task;
167 
168   // Do bookkeeping for next push
169   task_stack->ts_top++;
170   task_stack->ts_entries++;
171 
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
173     // Find beginning of this task block
174     kmp_stack_block_t *stack_block =
175         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
176 
177     // Check if we already have a block
178     if (stack_block->sb_next !=
179         NULL) { // reset ts_top to beginning of next block
180       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
181     } else { // Alloc new block and link it up
182       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
183           thread, sizeof(kmp_stack_block_t));
184 
185       task_stack->ts_top = &new_block->sb_block[0];
186       stack_block->sb_next = new_block;
187       new_block->sb_prev = stack_block;
188       new_block->sb_next = NULL;
189 
190       KA_TRACE(
191           30,
192           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
193            gtid, tied_task, new_block));
194     }
195   }
196   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
197                 tied_task));
198 }
199 
//  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
//  the task; just check that it matches the ending task passed in.
//
//  gtid: global thread identifier for the calling thread
//  thread: thread info structure containing stack
//  ending_task: the task that is ending (should match the task popped off the
//  stack)
207 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
208                                  kmp_taskdata_t *ending_task) {
209   // GEH - need to consider what to do if tt_threads_data not allocated yet
210   kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
212   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
213   kmp_taskdata_t *tied_task;
214 
215   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
216     // Don't pop anything from stack if team or team tasks are serialized
217     return;
218   }
219 
220   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
221   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
222 
223   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
224                 thread));
225 
226   // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
228     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
229 
230     stack_block = stack_block->sb_prev;
231     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
232   }
233 
234   // finish bookkeeping
235   task_stack->ts_top--;
236   task_stack->ts_entries--;
237 
238   tied_task = *(task_stack->ts_top);
239 
240   KMP_DEBUG_ASSERT(tied_task != NULL);
241   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
242   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
243 
244   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
245                 tied_task));
246   return;
247 }
248 #endif /* BUILD_TIED_TASK_STACK */
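
// An illustrative sketch (comments only, not compiled): the tied-task stack
// above grows in fixed-size blocks chained through sb_next/sb_prev. Assuming
// TASK_STACK_BLOCK_SIZE is a power of two and TASK_STACK_INDEX_MASK equals
// TASK_STACK_BLOCK_SIZE - 1, a block boundary is detected whenever the entry
// count is a multiple of the block size:
//
//   if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
//     // ts_top sits at a block boundary: follow sb_next (or allocate a new
//     // kmp_stack_block_t) on push, or step back through sb_prev on pop.
//   }
//
// For example, with a hypothetical block size of 4, pushing 6 tied tasks
// fills the 4 slots of ts_first_block plus 2 slots of a second, dynamically
// allocated block; ts_top then points at the next free slot of that block.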
249 
250 // returns 1 if new task is allowed to execute, 0 otherwise
251 // checks Task Scheduling constraint (if requested) and
252 // mutexinoutset dependencies if any
253 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
254                                   const kmp_taskdata_t *tasknew,
255                                   const kmp_taskdata_t *taskcurr) {
256   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
257     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
258     // only descendant of all deferred tied tasks can be scheduled, checking
259     // the last one is enough, as it in turn is the descendant of all others
260     kmp_taskdata_t *current = taskcurr->td_last_tied;
261     KMP_DEBUG_ASSERT(current != NULL);
262     // check if the task is not suspended on barrier
263     if (current->td_flags.tasktype == TASK_EXPLICIT ||
264         current->td_taskwait_thread > 0) { // <= 0 on barrier
265       kmp_int32 level = current->td_level;
266       kmp_taskdata_t *parent = tasknew->td_parent;
267       while (parent != current && parent->td_level > level) {
268         // check generation up to the level of the current task
269         parent = parent->td_parent;
270         KMP_DEBUG_ASSERT(parent != NULL);
271       }
272       if (parent != current)
273         return false;
274     }
275   }
276   // Check mutexinoutset dependencies, acquire locks
277   kmp_depnode_t *node = tasknew->td_depnode;
278   if (node && (node->dn.mtx_num_locks > 0)) {
279     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
280       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
281       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
282         continue;
283       // could not get the lock, release previous locks
284       for (int j = i - 1; j >= 0; --j)
285         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
286       return false;
287     }
288     // negative num_locks means all locks acquired successfully
289     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
290   }
291   return true;
292 }
293 
294 // __kmp_realloc_task_deque:
295 // Re-allocates a task deque for a particular thread, copies the content from
296 // the old deque and adjusts the necessary data structures relating to the
297 // deque. This operation must be done with the deque_lock being held
298 static void __kmp_realloc_task_deque(kmp_info_t *thread,
299                                      kmp_thread_data_t *thread_data) {
300   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
301   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
302   kmp_int32 new_size = 2 * size;
303 
304   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
305                 "%d] for thread_data %p\n",
306                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
307 
308   kmp_taskdata_t **new_deque =
309       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
310 
311   int i, j;
312   for (i = thread_data->td.td_deque_head, j = 0; j < size;
313        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
314     new_deque[j] = thread_data->td.td_deque[i];
315 
316   __kmp_free(thread_data->td.td_deque);
317 
318   thread_data->td.td_deque_head = 0;
319   thread_data->td.td_deque_tail = size;
320   thread_data->td.td_deque = new_deque;
321   thread_data->td.td_deque_size = new_size;
322 }
323 
324 //  __kmp_push_task: Add a task to the thread's deque
325 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
326   kmp_info_t *thread = __kmp_threads[gtid];
327   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
328   kmp_task_team_t *task_team = thread->th.th_task_team;
329   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
330   kmp_thread_data_t *thread_data;
331 
332   KA_TRACE(20,
333            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
334 
335   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
336     // untied task needs to increment counter so that the task structure is not
337     // freed prematurely
338     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
339     KMP_DEBUG_USE_VAR(counter);
340     KA_TRACE(
341         20,
342         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
343          gtid, counter, taskdata));
344   }
345 
346   // The first check avoids building task_team thread data if serialized
347   if (taskdata->td_flags.task_serial) {
348     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
349                   "TASK_NOT_PUSHED for task %p\n",
350                   gtid, taskdata));
351     return TASK_NOT_PUSHED;
352   }
353 
354   // Now that serialized tasks have returned, we can assume that we are not in
355   // immediate exec mode
356   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
357   if (!KMP_TASKING_ENABLED(task_team)) {
358     __kmp_enable_tasking(task_team, thread);
359   }
360   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
361   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
362 
363   // Find tasking deque specific to encountering thread
364   thread_data = &task_team->tt.tt_threads_data[tid];
365 
366   // No lock needed since only owner can allocate
367   if (thread_data->td.td_deque == NULL) {
368     __kmp_alloc_task_deque(thread, thread_data);
369   }
370 
371   int locked = 0;
372   // Check if deque is full
373   if (TCR_4(thread_data->td.td_deque_ntasks) >=
374       TASK_DEQUE_SIZE(thread_data->td)) {
375     if (__kmp_enable_task_throttling &&
376         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
377                               thread->th.th_current_task)) {
378       KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
379                     "TASK_NOT_PUSHED for task %p\n",
380                     gtid, taskdata));
381       return TASK_NOT_PUSHED;
382     } else {
383       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
384       locked = 1;
385       if (TCR_4(thread_data->td.td_deque_ntasks) >=
386           TASK_DEQUE_SIZE(thread_data->td)) {
387         // expand deque to push the task which is not allowed to execute
388         __kmp_realloc_task_deque(thread, thread_data);
389       }
390     }
391   }
392   // Lock the deque for the task push operation
393   if (!locked) {
394     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
395     // Need to recheck as we can get a proxy task from thread outside of OpenMP
396     if (TCR_4(thread_data->td.td_deque_ntasks) >=
397         TASK_DEQUE_SIZE(thread_data->td)) {
398       if (__kmp_enable_task_throttling &&
399           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
400                                 thread->th.th_current_task)) {
401         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
402         KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
403                       "returning TASK_NOT_PUSHED for task %p\n",
404                       gtid, taskdata));
405         return TASK_NOT_PUSHED;
406       } else {
407         // expand deque to push the task which is not allowed to execute
408         __kmp_realloc_task_deque(thread, thread_data);
409       }
410     }
411   }
  // Must have room since no thread other than the calling thread can add tasks
413   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
414                    TASK_DEQUE_SIZE(thread_data->td));
415 
416   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
417       taskdata; // Push taskdata
418   // Wrap index.
419   thread_data->td.td_deque_tail =
420       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
421   TCW_4(thread_data->td.td_deque_ntasks,
422         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
423 
424   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
425                 "task=%p ntasks=%d head=%u tail=%u\n",
426                 gtid, taskdata, thread_data->td.td_deque_ntasks,
427                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
428 
429   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
430 
431   return TASK_SUCCESSFULLY_PUSHED;
432 }
433 
// __kmp_pop_current_task_from_thread: reset the given thread's current task
// to its parent when the team ends
436 //
437 // this_thr: thread structure to set current_task in.
438 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
439   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
440                 "this_thread=%p, curtask=%p, "
441                 "curtask_parent=%p\n",
442                 0, this_thr, this_thr->th.th_current_task,
443                 this_thr->th.th_current_task->td_parent));
444 
445   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
446 
447   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
448                 "this_thread=%p, curtask=%p, "
449                 "curtask_parent=%p\n",
450                 0, this_thr, this_thr->th.th_current_task,
451                 this_thr->th.th_current_task->td_parent));
452 }
453 
454 // __kmp_push_current_task_to_thread: set up current task in called thread for a
455 // new team
456 //
457 // this_thr: thread structure to set up
458 // team: team for implicit task data
459 // tid: thread within team to set up
460 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
461                                        int tid) {
  // the current task of the thread becomes the parent of the newly created
  // implicit tasks of the new team
464   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
465                 "curtask=%p "
466                 "parent_task=%p\n",
467                 tid, this_thr, this_thr->th.th_current_task,
468                 team->t.t_implicit_task_taskdata[tid].td_parent));
469 
470   KMP_DEBUG_ASSERT(this_thr != NULL);
471 
472   if (tid == 0) {
473     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
474       team->t.t_implicit_task_taskdata[0].td_parent =
475           this_thr->th.th_current_task;
476       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
477     }
478   } else {
479     team->t.t_implicit_task_taskdata[tid].td_parent =
480         team->t.t_implicit_task_taskdata[0].td_parent;
481     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
482   }
483 
484   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
485                 "curtask=%p "
486                 "parent_task=%p\n",
487                 tid, this_thr, this_thr->th.th_current_task,
488                 team->t.t_implicit_task_taskdata[tid].td_parent));
489 }
490 
491 // __kmp_task_start: bookkeeping for a task starting execution
492 //
493 // GTID: global thread id of calling thread
494 // task: task starting execution
495 // current_task: task suspending
496 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
497                              kmp_taskdata_t *current_task) {
498   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
499   kmp_info_t *thread = __kmp_threads[gtid];
500 
501   KA_TRACE(10,
502            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
503             gtid, taskdata, current_task));
504 
505   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
506 
507   // mark currently executing task as suspended
508   // TODO: GEH - make sure root team implicit task is initialized properly.
509   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
510   current_task->td_flags.executing = 0;
511 
512 // Add task to stack if tied
513 #ifdef BUILD_TIED_TASK_STACK
514   if (taskdata->td_flags.tiedness == TASK_TIED) {
515     __kmp_push_task_stack(gtid, thread, taskdata);
516   }
517 #endif /* BUILD_TIED_TASK_STACK */
518 
519   // mark starting task as executing and as current task
520   thread->th.th_current_task = taskdata;
521 
522   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
523                    taskdata->td_flags.tiedness == TASK_UNTIED);
524   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
525                    taskdata->td_flags.tiedness == TASK_UNTIED);
526   taskdata->td_flags.started = 1;
527   taskdata->td_flags.executing = 1;
528   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
529   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
530 
531   // GEH TODO: shouldn't we pass some sort of location identifier here?
532   // APT: yes, we will pass location here.
533   // need to store current thread state (in a thread or taskdata structure)
534   // before setting work_state, otherwise wrong state is set after end of task
535 
536   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
537 
538   return;
539 }
540 
541 #if OMPT_SUPPORT
542 //------------------------------------------------------------------------------
543 // __ompt_task_init:
544 //   Initialize OMPT fields maintained by a task. This will only be called after
545 //   ompt_start_tool, so we already know whether ompt is enabled or not.
546 
547 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
548   // The calls to __ompt_task_init already have the ompt_enabled condition.
549   task->ompt_task_info.task_data.value = 0;
550   task->ompt_task_info.frame.exit_frame = ompt_data_none;
551   task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
554   task->ompt_task_info.ndeps = 0;
555   task->ompt_task_info.deps = NULL;
556 }
557 
558 // __ompt_task_start:
559 //   Build and trigger task-begin event
560 static inline void __ompt_task_start(kmp_task_t *task,
561                                      kmp_taskdata_t *current_task,
562                                      kmp_int32 gtid) {
563   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
564   ompt_task_status_t status = ompt_task_switch;
565   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
566     status = ompt_task_yield;
567     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
568   }
569   /* let OMPT know that we're about to run this task */
570   if (ompt_enabled.ompt_callback_task_schedule) {
571     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
572         &(current_task->ompt_task_info.task_data), status,
573         &(taskdata->ompt_task_info.task_data));
574   }
575   taskdata->ompt_task_info.scheduling_parent = current_task;
576 }
577 
578 // __ompt_task_finish:
579 //   Build and trigger final task-schedule event
580 static inline void __ompt_task_finish(kmp_task_t *task,
581                                       kmp_taskdata_t *resumed_task,
582                                       ompt_task_status_t status) {
583   if (ompt_enabled.ompt_callback_task_schedule) {
584     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
585     if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
586         taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
587       status = ompt_task_cancel;
588     }
589 
590     /* let OMPT know that we're returning to the callee task */
591     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
592         &(taskdata->ompt_task_info.task_data), status,
593         (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
594   }
595 }
596 #endif
597 
598 template <bool ompt>
599 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
600                                                kmp_task_t *task,
601                                                void *frame_address,
602                                                void *return_address) {
603   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
604   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
605 
606   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
607                 "current_task=%p\n",
608                 gtid, loc_ref, taskdata, current_task));
609 
610   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
611     // untied task needs to increment counter so that the task structure is not
612     // freed prematurely
613     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
614     KMP_DEBUG_USE_VAR(counter);
615     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
616                   "incremented for task %p\n",
617                   gtid, counter, taskdata));
618   }
619 
620   taskdata->td_flags.task_serial =
621       1; // Execute this task immediately, not deferred.
622   __kmp_task_start(gtid, task, current_task);
623 
624 #if OMPT_SUPPORT
625   if (ompt) {
626     if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
627       current_task->ompt_task_info.frame.enter_frame.ptr =
628           taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
631     }
632     if (ompt_enabled.ompt_callback_task_create) {
633       ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
634       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
635           &(parent_info->task_data), &(parent_info->frame),
636           &(taskdata->ompt_task_info.task_data),
637           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
638           return_address);
639     }
640     __ompt_task_start(task, current_task, gtid);
641   }
642 #endif // OMPT_SUPPORT
643 
644   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
645                 loc_ref, taskdata));
646 }
647 
648 #if OMPT_SUPPORT
649 OMPT_NOINLINE
650 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
651                                            kmp_task_t *task,
652                                            void *frame_address,
653                                            void *return_address) {
654   __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
655                                            return_address);
656 }
657 #endif // OMPT_SUPPORT
658 
659 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
660 // execution
661 //
662 // loc_ref: source location information; points to beginning of task block.
663 // gtid: global thread number.
664 // task: task thunk for the started task.
665 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
666                                kmp_task_t *task) {
667 #if OMPT_SUPPORT
668   if (UNLIKELY(ompt_enabled.enabled)) {
669     OMPT_STORE_RETURN_ADDRESS(gtid);
670     __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
671                                    OMPT_GET_FRAME_ADDRESS(1),
672                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
673     return;
674   }
675 #endif
676   __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
677 }
678 
679 #ifdef TASK_UNUSED
680 // __kmpc_omp_task_begin: report that a given task has started execution
681 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
682 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
683   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
684 
685   KA_TRACE(
686       10,
687       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
688        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
689 
690   __kmp_task_start(gtid, task, current_task);
691 
692   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
693                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
694   return;
695 }
696 #endif // TASK_UNUSED
697 
698 // __kmp_free_task: free the current task space and the space for shareds
699 //
700 // gtid: Global thread ID of calling thread
701 // taskdata: task to free
702 // thread: thread data structure of caller
703 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
704                             kmp_info_t *thread) {
705   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
706                 taskdata));
707 
708   // Check to make sure all flags and counters have the correct values
709   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
710   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
711   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
712   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
713   KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
714                    taskdata->td_flags.task_serial == 1);
715   KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
716 
717   taskdata->td_flags.freed = 1;
718   ANNOTATE_HAPPENS_BEFORE(taskdata);
719 // deallocate the taskdata and shared variable blocks associated with this task
720 #if USE_FAST_MEMORY
721   __kmp_fast_free(thread, taskdata);
722 #else /* ! USE_FAST_MEMORY */
723   __kmp_thread_free(thread, taskdata);
724 #endif
725 
726   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
727 }
728 
729 // __kmp_free_task_and_ancestors: free the current task and ancestors without
730 // children
731 //
732 // gtid: Global thread ID of calling thread
733 // taskdata: task to free
734 // thread: thread data structure of caller
735 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
736                                           kmp_taskdata_t *taskdata,
737                                           kmp_info_t *thread) {
738   // Proxy tasks must always be allowed to free their parents
739   // because they can be run in background even in serial mode.
740   kmp_int32 team_serial =
741       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
742       !taskdata->td_flags.proxy;
743   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
744 
745   kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
746   KMP_DEBUG_ASSERT(children >= 0);
747 
748   // Now, go up the ancestor tree to see if any ancestors can now be freed.
749   while (children == 0) {
750     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
751 
752     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
753                   "and freeing itself\n",
754                   gtid, taskdata));
755 
756     // --- Deallocate my ancestor task ---
757     __kmp_free_task(gtid, taskdata, thread);
758 
759     taskdata = parent_taskdata;
760 
761     if (team_serial)
762       return;
763     // Stop checking ancestors at implicit task instead of walking up ancestor
764     // tree to avoid premature deallocation of ancestors.
765     if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
766       if (taskdata->td_dephash) { // do we need to cleanup dephash?
767         int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
768         kmp_tasking_flags_t flags_old = taskdata->td_flags;
769         if (children == 0 && flags_old.complete == 1) {
770           kmp_tasking_flags_t flags_new = flags_old;
771           flags_new.complete = 0;
772           if (KMP_COMPARE_AND_STORE_ACQ32(
773                   RCAST(kmp_int32 *, &taskdata->td_flags),
774                   *RCAST(kmp_int32 *, &flags_old),
775                   *RCAST(kmp_int32 *, &flags_new))) {
776             KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
777                            "dephash of implicit task %p\n",
778                            gtid, taskdata));
779             // cleanup dephash of finished implicit task
780             __kmp_dephash_free_entries(thread, taskdata->td_dephash);
781           }
782         }
783       }
784       return;
785     }
786     // Predecrement simulated by "- 1" calculation
787     children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
788     KMP_DEBUG_ASSERT(children >= 0);
789   }
790 
791   KA_TRACE(
792       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
793            "not freeing it yet\n",
794            gtid, taskdata, children));
795 }
796 
797 // __kmp_task_finish: bookkeeping to do when a task finishes execution
798 //
799 // gtid: global thread ID for calling thread
800 // task: task to be finished
801 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
802 //
// template<ompt>: effectively ompt_enabled.enabled != 0
// the version with ompt=false is inlined, allowing the compiler to optimize
// away all OMPT code in that case
806 template <bool ompt>
807 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
808                               kmp_taskdata_t *resumed_task) {
809   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
810   kmp_info_t *thread = __kmp_threads[gtid];
811   kmp_task_team_t *task_team =
812       thread->th.th_task_team; // might be NULL for serial teams...
813   kmp_int32 children = 0;
814 
815   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
816                 "task %p\n",
817                 gtid, taskdata, resumed_task));
818 
819   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
820 
821 // Pop task from stack if tied
822 #ifdef BUILD_TIED_TASK_STACK
823   if (taskdata->td_flags.tiedness == TASK_TIED) {
824     __kmp_pop_task_stack(gtid, thread, taskdata);
825   }
826 #endif /* BUILD_TIED_TASK_STACK */
827 
828   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
829     // untied task needs to check the counter so that the task structure is not
830     // freed prematurely
831     kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
832     KA_TRACE(
833         20,
834         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
835          gtid, counter, taskdata));
836     if (counter > 0) {
837       // untied task is not done, to be continued possibly by other thread, do
838       // not free it now
839       if (resumed_task == NULL) {
840         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
841         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
842         // task is the parent
843       }
844       thread->th.th_current_task = resumed_task; // restore current_task
845       resumed_task->td_flags.executing = 1; // resume previous task
846       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
847                     "resuming task %p\n",
848                     gtid, taskdata, resumed_task));
849       return;
850     }
851   }
852 
853   // Check mutexinoutset dependencies, release locks
854   kmp_depnode_t *node = taskdata->td_depnode;
855   if (node && (node->dn.mtx_num_locks < 0)) {
856     // negative num_locks means all locks were acquired
857     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
858     for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
859       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
860       __kmp_release_lock(node->dn.mtx_locks[i], gtid);
861     }
862   }
863 
864   // bookkeeping for resuming task:
865   // GEH - note tasking_ser => task_serial
866   KMP_DEBUG_ASSERT(
867       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
868       taskdata->td_flags.task_serial);
869   if (taskdata->td_flags.task_serial) {
870     if (resumed_task == NULL) {
871       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
872       // task is the parent
873     }
874   } else {
875     KMP_DEBUG_ASSERT(resumed_task !=
876                      NULL); // verify that resumed task is passed as argument
877   }
878 
  /* If the task's destructor thunk flag has been set, we need to invoke the
880      destructor thunk that has been generated by the compiler. The code is
881      placed here, since at this point other tasks might have been released
882      hence overlapping the destructor invocations with some other work in the
883      released tasks.  The OpenMP spec is not specific on when the destructors
884      are invoked, so we should be free to choose. */
885   if (taskdata->td_flags.destructors_thunk) {
886     kmp_routine_entry_t destr_thunk = task->data1.destructors;
887     KMP_ASSERT(destr_thunk);
888     destr_thunk(gtid, task);
889   }
890 
891   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
892   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
893   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
894 
895   bool detach = false;
896   if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
897     if (taskdata->td_allow_completion_event.type ==
898         KMP_EVENT_ALLOW_COMPLETION) {
899       // event hasn't been fulfilled yet. Try to detach task.
900       __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
901       if (taskdata->td_allow_completion_event.type ==
902           KMP_EVENT_ALLOW_COMPLETION) {
903         // task finished execution
904         KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
905         taskdata->td_flags.executing = 0; // suspend the finishing task
906 
907 #if OMPT_SUPPORT
        // For a detached task that has not completed, we switch back to the
        // resumed task here; omp_fulfill_event later signals completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
911         if (ompt)
912           __ompt_task_finish(task, resumed_task, ompt_task_detach);
913 #endif
914 
915         // no access to taskdata after this point!
916         // __kmp_fulfill_event might free taskdata at any time from now
917 
918         taskdata->td_flags.proxy = TASK_PROXY; // proxify!
919         detach = true;
920       }
921       __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
922     }
923   }
924 
925   if (!detach) {
926     taskdata->td_flags.complete = 1; // mark the task as completed
927 
928 #if OMPT_SUPPORT
929     // This is not a detached task, we are done here
930     if (ompt)
931       __ompt_task_finish(task, resumed_task, ompt_task_complete);
932 #endif
933 
934     // Only need to keep track of count if team parallel and tasking not
935     // serialized, or task is detachable and event has already been fulfilled
936     if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
937         taskdata->td_flags.detachable == TASK_DETACHABLE) {
938       // Predecrement simulated by "- 1" calculation
939       children =
940           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
941       KMP_DEBUG_ASSERT(children >= 0);
942       if (taskdata->td_taskgroup)
943         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
944       __kmp_release_deps(gtid, taskdata);
945     } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
946       // if we found proxy tasks there could exist a dependency chain
947       // with the proxy task as origin
948       __kmp_release_deps(gtid, taskdata);
949     }
950     // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
952     // release_deps code, the flag will be reset to 1 again by this same
953     // function
954     KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
955     taskdata->td_flags.executing = 0; // suspend the finishing task
956   }
957 
958 
959   KA_TRACE(
960       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
961            gtid, taskdata, children));
962 
963   // Free this task and then ancestor tasks if they have no children.
964   // Restore th_current_task first as suggested by John:
965   // johnmc: if an asynchronous inquiry peers into the runtime system
966   // it doesn't see the freed task as the current task.
967   thread->th.th_current_task = resumed_task;
968   if (!detach)
969     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
970 
971   // TODO: GEH - make sure root team implicit task is initialized properly.
972   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
973   resumed_task->td_flags.executing = 1; // resume previous task
974 
975   KA_TRACE(
976       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
977            gtid, taskdata, resumed_task));
978 
979   return;
980 }
981 
982 template <bool ompt>
983 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
984                                                   kmp_int32 gtid,
985                                                   kmp_task_t *task) {
986   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
987                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
988   // this routine will provide task to resume
989   __kmp_task_finish<ompt>(gtid, task, NULL);
990 
991   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
992                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
993 
994 #if OMPT_SUPPORT
995   if (ompt) {
996     ompt_frame_t *ompt_frame;
997     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
998     ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
1000   }
1001 #endif
1002 
1003   return;
1004 }
1005 
1006 #if OMPT_SUPPORT
1007 OMPT_NOINLINE
1008 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1009                                        kmp_task_t *task) {
1010   __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1011 }
1012 #endif // OMPT_SUPPORT
1013 
1014 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1015 //
1016 // loc_ref: source location information; points to end of task block.
1017 // gtid: global thread number.
1018 // task: task thunk for the completed task.
1019 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1020                                   kmp_task_t *task) {
1021 #if OMPT_SUPPORT
1022   if (UNLIKELY(ompt_enabled.enabled)) {
1023     __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1024     return;
1025   }
1026 #endif
1027   __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1028 }
1029 
1030 #ifdef TASK_UNUSED
1031 // __kmpc_omp_task_complete: report that a task has completed execution
1032 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1033 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1034                               kmp_task_t *task) {
1035   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1036                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1037 
1038   __kmp_task_finish<false>(gtid, task,
1039                            NULL); // Not sure how to find task to resume
1040 
1041   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1042                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1043   return;
1044 }
1045 #endif // TASK_UNUSED
1046 
1047 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1048 // task for a given thread
1049 //
1050 // loc_ref:  reference to source location of parallel region
1051 // this_thr:  thread data structure corresponding to implicit task
1052 // team: team for this_thr
1053 // tid: thread id of given thread within team
1054 // set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs.  This is assumed to
1056 // have already been done elsewhere.
1057 // TODO: Get better loc_ref.  Value passed in may be NULL
1058 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1059                               kmp_team_t *team, int tid, int set_curr_task) {
1060   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1061 
1062   KF_TRACE(
1063       10,
1064       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1065        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1066 
1067   task->td_task_id = KMP_GEN_TASK_ID();
1068   task->td_team = team;
1069   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1070   //    in debugger)
1071   task->td_ident = loc_ref;
1072   task->td_taskwait_ident = NULL;
1073   task->td_taskwait_counter = 0;
1074   task->td_taskwait_thread = 0;
1075 
1076   task->td_flags.tiedness = TASK_TIED;
1077   task->td_flags.tasktype = TASK_IMPLICIT;
1078   task->td_flags.proxy = TASK_FULL;
1079 
1080   // All implicit tasks are executed immediately, not deferred
1081   task->td_flags.task_serial = 1;
1082   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1083   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1084 
1085   task->td_flags.started = 1;
1086   task->td_flags.executing = 1;
1087   task->td_flags.complete = 0;
1088   task->td_flags.freed = 0;
1089 
1090   task->td_depnode = NULL;
1091   task->td_last_tied = task;
1092   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1093 
1094   if (set_curr_task) { // only do this init first time thread is created
1095     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1096     // Not used: don't need to deallocate implicit task
1097     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1098     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1099     task->td_dephash = NULL;
1100     __kmp_push_current_task_to_thread(this_thr, team, tid);
1101   } else {
1102     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1103     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1104   }
1105 
1106 #if OMPT_SUPPORT
1107   if (UNLIKELY(ompt_enabled.enabled))
1108     __ompt_task_init(task, tid);
1109 #endif
1110 
1111   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1112                 team, task));
1113 }
1114 
// __kmp_finish_implicit_task: Release resources associated with implicit tasks
1116 // at the end of parallel regions. Some resources are kept for reuse in the next
1117 // parallel region.
1118 //
1119 // thread:  thread data structure corresponding to implicit task
1120 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1121   kmp_taskdata_t *task = thread->th.th_current_task;
1122   if (task->td_dephash) {
1123     int children;
1124     task->td_flags.complete = 1;
1125     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1126     kmp_tasking_flags_t flags_old = task->td_flags;
1127     if (children == 0 && flags_old.complete == 1) {
1128       kmp_tasking_flags_t flags_new = flags_old;
1129       flags_new.complete = 0;
1130       if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1131                                       *RCAST(kmp_int32 *, &flags_old),
1132                                       *RCAST(kmp_int32 *, &flags_new))) {
1133         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1134                        "dephash of implicit task %p\n",
1135                        thread->th.th_info.ds.ds_gtid, task));
1136         __kmp_dephash_free_entries(thread, task->td_dephash);
1137       }
1138     }
1139   }
1140 }
1141 
// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these are destroyed
1144 //
1145 // thread:  thread data structure corresponding to implicit task
1146 void __kmp_free_implicit_task(kmp_info_t *thread) {
1147   kmp_taskdata_t *task = thread->th.th_current_task;
1148   if (task && task->td_dephash) {
1149     __kmp_dephash_free(thread, task->td_dephash);
1150     task->td_dephash = NULL;
1151   }
1152 }
1153 
// Round up a size to a multiple of val (a power of two): used to insert
// padding between structures co-allocated using a single malloc() call
1156 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1157   if (size & (val - 1)) {
1158     size &= ~(val - 1);
1159     if (size <= KMP_SIZE_T_MAX - val) {
1160       size += val; // Round up if there is no overflow.
1161     }
1162   }
1163   return size;
} // __kmp_round_up_to_val
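
// Illustrative arithmetic (comments only): with val == sizeof(void *) == 8,
//   __kmp_round_up_to_val(48, 8) == 48  (already a multiple, unchanged)
//   __kmp_round_up_to_val(52, 8) == 56  (52 & 7 != 0, so round up)
// This is how shareds_offset below is padded so the shareds pointer block
// starts on a pointer-aligned boundary.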
1165 
1166 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1167 //
1168 // loc_ref: source location information
1169 // gtid: global thread number.
1170 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1171 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1172 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
1173 // private vars accessed in task.
1174 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
1175 // in task.
1176 // task_entry: Pointer to task code entry point generated by compiler.
1177 // returns: a pointer to the allocated kmp_task_t structure (task).
1178 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1179                              kmp_tasking_flags_t *flags,
1180                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1181                              kmp_routine_entry_t task_entry) {
1182   kmp_task_t *task;
1183   kmp_taskdata_t *taskdata;
1184   kmp_info_t *thread = __kmp_threads[gtid];
1185   kmp_team_t *team = thread->th.th_team;
1186   kmp_taskdata_t *parent_task = thread->th.th_current_task;
1187   size_t shareds_offset;
1188 
1189   if (!TCR_4(__kmp_init_middle))
1190     __kmp_middle_initialize();
1191 
1192   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1193                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1194                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1195                 sizeof_shareds, task_entry));
1196 
1197   if (parent_task->td_flags.final) {
1198     if (flags->merged_if0) {
1199     }
1200     flags->final = 1;
1201   }
1202   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1203     // Untied task encountered causes the TSC algorithm to check entire deque of
1204     // the victim thread. If no untied task encountered, then checking the head
1205     // of the deque should be enough.
1206     KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1207   }
1208 
  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup when that happens is too late.
1212   if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
1213     if (flags->proxy == TASK_PROXY) {
1214       flags->tiedness = TASK_UNTIED;
1215       flags->merged_if0 = 1;
1216     }
    /* are we running in a serialized parallel region or in tskm_immediate_exec
       mode... we need tasking support enabled */
1219     if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized;
         set up a task team and propagate it to the thread */
1222       KMP_DEBUG_ASSERT(team->t.t_serialized);
1223       KA_TRACE(30,
1224                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1225                 gtid));
1226       __kmp_task_team_setup(
1227           thread, team,
1228           1); // 1 indicates setup the current team regardless of nthreads
1229       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1230     }
1231     kmp_task_team_t *task_team = thread->th.th_task_team;
1232 
1233     /* tasking must be enabled now as the task might not be pushed */
1234     if (!KMP_TASKING_ENABLED(task_team)) {
1235       KA_TRACE(
1236           30,
1237           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1238       __kmp_enable_tasking(task_team, thread);
1239       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1240       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1241       // No lock needed since only owner can allocate
1242       if (thread_data->td.td_deque == NULL) {
1243         __kmp_alloc_task_deque(thread, thread_data);
1244       }
1245     }
1246 
1247     if (task_team->tt.tt_found_proxy_tasks == FALSE)
1248       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1249   }
1250 
1251   // Calculate shared structure offset including padding after kmp_task_t struct
1252   // to align pointers in shared struct
1253   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1254   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1255 
1256   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1257   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1258                 shareds_offset));
1259   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1260                 sizeof_shareds));
1261 
1262 // Avoid double allocation here by combining shareds with taskdata
1263 #if USE_FAST_MEMORY
1264   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1265                                                                sizeof_shareds);
1266 #else /* ! USE_FAST_MEMORY */
1267   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1268                                                                sizeof_shareds);
1269 #endif /* USE_FAST_MEMORY */
1270   ANNOTATE_HAPPENS_AFTER(taskdata);
1271 
1272   task = KMP_TASKDATA_TO_TASK(taskdata);
1273 
1274 // Make sure task & taskdata are aligned appropriately
1275 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1276   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1277   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1278 #else
1279   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1280   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1281 #endif
1282   if (sizeof_shareds > 0) {
1283     // Avoid double allocation here by combining shareds with taskdata
1284     task->shareds = &((char *)taskdata)[shareds_offset];
1285     // Make sure shareds struct is aligned to pointer size
1286     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1287                      0);
1288   } else {
1289     task->shareds = NULL;
1290   }
1291   task->routine = task_entry;
1292   task->part_id = 0; // AC: Always start with 0 part id
1293 
1294   taskdata->td_task_id = KMP_GEN_TASK_ID();
1295   taskdata->td_team = team;
1296   taskdata->td_alloc_thread = thread;
1297   taskdata->td_parent = parent_task;
1298   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1299   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1300   taskdata->td_ident = loc_ref;
1301   taskdata->td_taskwait_ident = NULL;
1302   taskdata->td_taskwait_counter = 0;
1303   taskdata->td_taskwait_thread = 0;
1304   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1305   // avoid copying icvs for proxy tasks
1306   if (flags->proxy == TASK_FULL)
1307     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1308 
1309   taskdata->td_flags.tiedness = flags->tiedness;
1310   taskdata->td_flags.final = flags->final;
1311   taskdata->td_flags.merged_if0 = flags->merged_if0;
1312   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1313   taskdata->td_flags.proxy = flags->proxy;
1314   taskdata->td_flags.detachable = flags->detachable;
1315   taskdata->td_task_team = thread->th.th_task_team;
1316   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1317   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1318 
1319   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1320   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1321 
1322   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1323   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1324 
1325   // GEH - Note we serialize the task if the team is serialized to make sure
1326   // implicit parallel region tasks are not left until program termination to
1327   // execute. Also, it helps locality to execute immediately.
1328 
1329   taskdata->td_flags.task_serial =
1330       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1331        taskdata->td_flags.tasking_ser);
1332 
1333   taskdata->td_flags.started = 0;
1334   taskdata->td_flags.executing = 0;
1335   taskdata->td_flags.complete = 0;
1336   taskdata->td_flags.freed = 0;
1337 
1338   taskdata->td_flags.native = flags->native;
1339 
1340   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because it counts the current task and its children
1342   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1343   taskdata->td_taskgroup =
1344       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1345   taskdata->td_dephash = NULL;
1346   taskdata->td_depnode = NULL;
1347   if (flags->tiedness == TASK_UNTIED)
1348     taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1349   else
1350     taskdata->td_last_tied = taskdata;
1351   taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1352 #if OMPT_SUPPORT
1353   if (UNLIKELY(ompt_enabled.enabled))
1354     __ompt_task_init(taskdata, gtid);
1355 #endif
  // Only need to keep track of child task counts if the team is parallel and
  // tasking is not serialized, or if it is a proxy or detachable task
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
1362     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1363     if (parent_task->td_taskgroup)
1364       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks,
    // since implicit tasks are not deallocated
1367     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1368       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1369     }
1370   }
1371 
1372   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1373                 gtid, taskdata, taskdata->td_parent));
1374   ANNOTATE_HAPPENS_BEFORE(task);
1375 
1376   return task;
1377 }
1378 
1379 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1380                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1381                                   size_t sizeof_shareds,
1382                                   kmp_routine_entry_t task_entry) {
1383   kmp_task_t *retval;
1384   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1385 
1386   input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags
1388 
1389   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1390                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1391                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1392                 input_flags->proxy ? "proxy" : "",
1393                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1394                 sizeof_shareds, task_entry));
1395 
1396   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1397                             sizeof_shareds, task_entry);
1398 
1399   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1400 
1401   return retval;
1402 }
1403 
1404 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1405                                          kmp_int32 flags,
1406                                          size_t sizeof_kmp_task_t,
1407                                          size_t sizeof_shareds,
1408                                          kmp_routine_entry_t task_entry,
1409                                          kmp_int64 device_id) {
1410   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1411                                sizeof_shareds, task_entry);
1412 }
1413 
1414 /*!
1415 @ingroup TASKING
1416 @param loc_ref location of the original task directive
1417 @param gtid Global Thread ID of encountering thread
1418 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1419 task''
1420 @param naffins Number of affinity items
1421 @param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful.
1424 This entry registers the affinity information attached to a task with the task
1425 thunk structure kmp_taskdata_t.
1426 */
1427 kmp_int32
1428 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1429                                   kmp_task_t *new_task, kmp_int32 naffins,
1430                                   kmp_task_affinity_info_t *affin_list) {
1431   return 0;
1432 }
1433 
1434 //  __kmp_invoke_task: invoke the specified task
1435 //
1436 // gtid: global thread ID of caller
1437 // task: the task to invoke
1438 // current_task: the task to resume after task invocation
1439 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1440                               kmp_taskdata_t *current_task) {
1441   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1442   kmp_info_t *thread;
1443   int discard = 0 /* false */;
1444   KA_TRACE(
1445       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1446            gtid, taskdata, current_task));
1447   KMP_DEBUG_ASSERT(task);
1448   if (taskdata->td_flags.proxy == TASK_PROXY &&
1449       taskdata->td_flags.complete == 1) {
1450     // This is a proxy task that was already completed but it needs to run
1451     // its bottom-half finish
1452     KA_TRACE(
1453         30,
1454         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1455          gtid, taskdata));
1456 
1457     __kmp_bottom_half_finish_proxy(gtid, task);
1458 
1459     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1460                   "proxy task %p, resuming task %p\n",
1461                   gtid, taskdata, current_task));
1462 
1463     return;
1464   }
1465 
1466 #if OMPT_SUPPORT
1467   // For untied tasks, the first task executed only calls __kmpc_omp_task and
1468   // does not execute code.
1469   ompt_thread_info_t oldInfo;
1470   if (UNLIKELY(ompt_enabled.enabled)) {
1471     // Store the threads states and restore them after the task
1472     thread = __kmp_threads[gtid];
1473     oldInfo = thread->th.ompt_thread_info;
1474     thread->th.ompt_thread_info.wait_id = 0;
1475     thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1476                                             ? ompt_state_work_serial
1477                                             : ompt_state_work_parallel;
1478     taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1479   }
1480 #endif
1481 
1482   // Proxy tasks are not handled by the runtime
1483   if (taskdata->td_flags.proxy != TASK_PROXY) {
1484     ANNOTATE_HAPPENS_AFTER(task);
1485     __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1486   }
1487 
1488   // TODO: cancel tasks if the parallel region has also been cancelled
1489   // TODO: check if this sequence can be hoisted above __kmp_task_start
1490   // if cancellation has been enabled for this run ...
1491   if (__kmp_omp_cancellation) {
1492     thread = __kmp_threads[gtid];
1493     kmp_team_t *this_team = thread->th.th_team;
1494     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1495     if ((taskgroup && taskgroup->cancel_request) ||
1496         (this_team->t.t_cancel_request == cancel_parallel)) {
1497 #if OMPT_SUPPORT && OMPT_OPTIONAL
1498       ompt_data_t *task_data;
1499       if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1500         __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1501         ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1502             task_data,
1503             ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1504                                                       : ompt_cancel_parallel) |
1505                 ompt_cancel_discarded_task,
1506             NULL);
1507       }
1508 #endif
1509       KMP_COUNT_BLOCK(TASK_cancelled);
      // the task belongs to a cancelled taskgroup or parallel region;
      // discard it
1511       discard = 1 /* true */;
1512     }
1513   }
1514 
1515   // Invoke the task routine and pass in relevant data.
1516   // Thunks generated by gcc take a different argument list.
1517   if (!discard) {
1518     if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1519       taskdata->td_last_tied = current_task->td_last_tied;
1520       KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1521     }
1522 #if KMP_STATS_ENABLED
1523     KMP_COUNT_BLOCK(TASK_executed);
1524     switch (KMP_GET_THREAD_STATE()) {
1525     case FORK_JOIN_BARRIER:
1526       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1527       break;
1528     case PLAIN_BARRIER:
1529       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1530       break;
1531     case TASKYIELD:
1532       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1533       break;
1534     case TASKWAIT:
1535       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1536       break;
1537     case TASKGROUP:
1538       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1539       break;
1540     default:
1541       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1542       break;
1543     }
1544 #endif // KMP_STATS_ENABLED
1545 
1546 // OMPT task begin
1547 #if OMPT_SUPPORT
1548     if (UNLIKELY(ompt_enabled.enabled))
1549       __ompt_task_start(task, current_task, gtid);
1550 #endif
1551 
1552 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1553     kmp_uint64 cur_time;
1554     kmp_int32 kmp_itt_count_task =
1555         __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1556         current_task->td_flags.tasktype == TASK_IMPLICIT;
1557     if (kmp_itt_count_task) {
1558       thread = __kmp_threads[gtid];
1559       // Time outer level explicit task on barrier for adjusting imbalance time
1560       if (thread->th.th_bar_arrive_time)
1561         cur_time = __itt_get_timestamp();
1562       else
1563         kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1564     }
1565 #endif
1566 
1567 #ifdef KMP_GOMP_COMPAT
1568     if (taskdata->td_flags.native) {
1569       ((void (*)(void *))(*(task->routine)))(task->shareds);
1570     } else
1571 #endif /* KMP_GOMP_COMPAT */
1572     {
1573       (*(task->routine))(gtid, task);
1574     }
1575     KMP_POP_PARTITIONED_TIMER();
1576 
1577 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1578     if (kmp_itt_count_task) {
1579       // Barrier imbalance - adjust arrive time with the task duration
1580       thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1581     }
1582 #endif
1583 
1584   }
1585 
1586 
1587   // Proxy tasks are not handled by the runtime
1588   if (taskdata->td_flags.proxy != TASK_PROXY) {
1589     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1590 #if OMPT_SUPPORT
1591     if (UNLIKELY(ompt_enabled.enabled)) {
1592       thread->th.ompt_thread_info = oldInfo;
1593       if (taskdata->td_flags.tiedness == TASK_TIED) {
1594         taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1595       }
1596       __kmp_task_finish<true>(gtid, task, current_task);
1597     } else
1598 #endif
1599       __kmp_task_finish<false>(gtid, task, current_task);
1600   }
1601 
1602   KA_TRACE(
1603       30,
1604       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1605        gtid, taskdata, current_task));
1606   return;
1607 }
1608 
1609 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1610 //
1611 // loc_ref: location of original task pragma (ignored)
1612 // gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new
// task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
//    queued to be resumed later.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
//    be resumed later.
1619 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1620                                 kmp_task_t *new_task) {
1621   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1622 
1623   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1624                 loc_ref, new_taskdata));
1625 
1626 #if OMPT_SUPPORT
1627   kmp_taskdata_t *parent;
1628   if (UNLIKELY(ompt_enabled.enabled)) {
1629     parent = new_taskdata->td_parent;
1630     if (ompt_enabled.ompt_callback_task_create) {
1631       ompt_data_t task_data = ompt_data_none;
1632       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1633           parent ? &(parent->ompt_task_info.task_data) : &task_data,
1634           parent ? &(parent->ompt_task_info.frame) : NULL,
1635           &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1636           OMPT_GET_RETURN_ADDRESS(0));
1637     }
1638   }
1639 #endif
1640 
1641   /* Should we execute the new task or queue it? For now, let's just always try
1642      to queue it.  If the queue fills up, then we'll execute it.  */
1643 
1644   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1645   { // Execute this task immediately
1646     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1647     new_taskdata->td_flags.task_serial = 1;
1648     __kmp_invoke_task(gtid, new_task, current_task);
1649   }
1650 
  KA_TRACE(
      10,
      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
       "loc=%p task=%p\n",
       gtid, loc_ref, new_taskdata));
1656 
1657   ANNOTATE_HAPPENS_BEFORE(new_task);
1658 #if OMPT_SUPPORT
1659   if (UNLIKELY(ompt_enabled.enabled)) {
1660     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1661   }
1662 #endif
1663   return TASK_CURRENT_NOT_QUEUED;
1664 }
1665 
1666 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1667 //
1668 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
// serialize_immediate: if TRUE and the task is executed immediately, its
// execution will be serialized
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
//    queued to be resumed later.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
//    be resumed later.
1677 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1678                          bool serialize_immediate) {
1679   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1680 
1681   /* Should we execute the new task or queue it? For now, let's just always try
1682      to queue it.  If the queue fills up, then we'll execute it.  */
1683   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1684       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1685   { // Execute this task immediately
1686     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1687     if (serialize_immediate)
1688       new_taskdata->td_flags.task_serial = 1;
1689     __kmp_invoke_task(gtid, new_task, current_task);
1690   }
1691 
1692   ANNOTATE_HAPPENS_BEFORE(new_task);
1693   return TASK_CURRENT_NOT_QUEUED;
1694 }
1695 
1696 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1697 // non-thread-switchable task from the parent thread only!
1698 //
1699 // loc_ref: location of original task pragma (ignored)
1700 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
//    queued to be resumed later.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
//    be resumed later.
1708 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1709                           kmp_task_t *new_task) {
1710   kmp_int32 res;
1711   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1712 
1713 #if KMP_DEBUG || OMPT_SUPPORT
1714   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1715 #endif
1716   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1717                 new_taskdata));
1718 
1719 #if OMPT_SUPPORT
1720   kmp_taskdata_t *parent = NULL;
1721   if (UNLIKELY(ompt_enabled.enabled)) {
1722     if (!new_taskdata->td_flags.started) {
1723       OMPT_STORE_RETURN_ADDRESS(gtid);
1724       parent = new_taskdata->td_parent;
1725       if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1726         parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1727       }
1728       if (ompt_enabled.ompt_callback_task_create) {
1729         ompt_data_t task_data = ompt_data_none;
1730         ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1731             parent ? &(parent->ompt_task_info.task_data) : &task_data,
1732             parent ? &(parent->ompt_task_info.frame) : NULL,
1733             &(new_taskdata->ompt_task_info.task_data),
1734             ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1735             OMPT_LOAD_RETURN_ADDRESS(gtid));
1736       }
1737     } else {
1738       // We are scheduling the continuation of an UNTIED task.
1739       // Scheduling back to the parent task.
1740       __ompt_task_finish(new_task,
1741                          new_taskdata->ompt_task_info.scheduling_parent,
1742                          ompt_task_switch);
1743       new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1744     }
1745   }
1746 #endif
1747 
1748   res = __kmp_omp_task(gtid, new_task, true);
1749 
1750   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1751                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1752                 gtid, loc_ref, new_taskdata));
1753 #if OMPT_SUPPORT
1754   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1755     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1756   }
1757 #endif
1758   return res;
1759 }
1760 
1761 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1762 // a taskloop task with the correct OMPT return address
1763 //
1764 // loc_ref: location of original task pragma (ignored)
1765 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
//    queued to be resumed later.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
//    be resumed later.
1774 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1775                                   kmp_task_t *new_task, void *codeptr_ra) {
1776   kmp_int32 res;
1777   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1778 
1779 #if KMP_DEBUG || OMPT_SUPPORT
1780   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1781 #endif
1782   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1783                 new_taskdata));
1784 
1785 #if OMPT_SUPPORT
1786   kmp_taskdata_t *parent = NULL;
1787   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1788     parent = new_taskdata->td_parent;
1789     if (!parent->ompt_task_info.frame.enter_frame.ptr)
1790       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1791     if (ompt_enabled.ompt_callback_task_create) {
1792       ompt_data_t task_data = ompt_data_none;
1793       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1794           parent ? &(parent->ompt_task_info.task_data) : &task_data,
1795           parent ? &(parent->ompt_task_info.frame) : NULL,
1796           &(new_taskdata->ompt_task_info.task_data),
1797           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1798           codeptr_ra);
1799     }
1800   }
1801 #endif
1802 
1803   res = __kmp_omp_task(gtid, new_task, true);
1804 
1805   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1806                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1807                 gtid, loc_ref, new_taskdata));
1808 #if OMPT_SUPPORT
1809   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1810     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1811   }
1812 #endif
1813   return res;
1814 }
1815 
1816 template <bool ompt>
1817 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1818                                               void *frame_address,
1819                                               void *return_address) {
1820   kmp_taskdata_t *taskdata;
1821   kmp_info_t *thread;
1822   int thread_finished = FALSE;
1823   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1824 
1825   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1826 
1827   if (__kmp_tasking_mode != tskm_immediate_exec) {
1828     thread = __kmp_threads[gtid];
1829     taskdata = thread->th.th_current_task;
1830 
1831 #if OMPT_SUPPORT && OMPT_OPTIONAL
1832     ompt_data_t *my_task_data;
1833     ompt_data_t *my_parallel_data;
1834 
1835     if (ompt) {
1836       my_task_data = &(taskdata->ompt_task_info.task_data);
1837       my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1838 
1839       taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1840 
1841       if (ompt_enabled.ompt_callback_sync_region) {
1842         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1843             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1844             my_task_data, return_address);
1845       }
1846 
1847       if (ompt_enabled.ompt_callback_sync_region_wait) {
1848         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1849             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1850             my_task_data, return_address);
1851       }
1852     }
1853 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1854 
// Debugger: The taskwait is active. Store the location and the thread that
// encountered the taskwait.
1857 #if USE_ITT_BUILD
1858 // Note: These values are used by ITT events as well.
1859 #endif /* USE_ITT_BUILD */
1860     taskdata->td_taskwait_counter += 1;
1861     taskdata->td_taskwait_ident = loc_ref;
1862     taskdata->td_taskwait_thread = gtid + 1;
1863 
1864 #if USE_ITT_BUILD
1865     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1866     if (itt_sync_obj != NULL)
1867       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1868 #endif /* USE_ITT_BUILD */
1869 
1870     bool must_wait =
1871         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1872 
1873     must_wait = must_wait || (thread->th.th_task_team != NULL &&
1874                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
1875     if (must_wait) {
1876       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1877                              &(taskdata->td_incomplete_child_tasks)),
1878                        0U);
1879       while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1880         flag.execute_tasks(thread, gtid, FALSE,
1881                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1882                            __kmp_task_stealing_constraint);
1883       }
1884     }
1885 #if USE_ITT_BUILD
1886     if (itt_sync_obj != NULL)
1887       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1888 #endif /* USE_ITT_BUILD */
1889 
1890     // Debugger:  The taskwait is completed. Location remains, but thread is
1891     // negated.
1892     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1893 
1894 #if OMPT_SUPPORT && OMPT_OPTIONAL
1895     if (ompt) {
1896       if (ompt_enabled.ompt_callback_sync_region_wait) {
1897         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1898             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1899             my_task_data, return_address);
1900       }
1901       if (ompt_enabled.ompt_callback_sync_region) {
1902         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1903             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1904             my_task_data, return_address);
1905       }
1906       taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1907     }
1908 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1909 
1910     ANNOTATE_HAPPENS_AFTER(taskdata);
1911   }
1912 
1913   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1914                 "returning TASK_CURRENT_NOT_QUEUED\n",
1915                 gtid, taskdata));
1916 
1917   return TASK_CURRENT_NOT_QUEUED;
1918 }
1919 
1920 #if OMPT_SUPPORT && OMPT_OPTIONAL
1921 OMPT_NOINLINE
1922 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1923                                           void *frame_address,
1924                                           void *return_address) {
1925   return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1926                                             return_address);
1927 }
1928 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1929 
1930 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1931 // complete
1932 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1933 #if OMPT_SUPPORT && OMPT_OPTIONAL
1934   if (UNLIKELY(ompt_enabled.enabled)) {
1935     OMPT_STORE_RETURN_ADDRESS(gtid);
1936     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1937                                     OMPT_LOAD_RETURN_ADDRESS(gtid));
1938   }
1939 #endif
1940   return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1941 }
1942 
1943 // __kmpc_omp_taskyield: switch to a different task
1944 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1945   kmp_taskdata_t *taskdata;
1946   kmp_info_t *thread;
1947   int thread_finished = FALSE;
1948 
1949   KMP_COUNT_BLOCK(OMP_TASKYIELD);
1950   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1951 
1952   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1953                 gtid, loc_ref, end_part));
1954 
1955   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1956     thread = __kmp_threads[gtid];
1957     taskdata = thread->th.th_current_task;
1958 // Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store the location and the thread that
// encountered the taskwait.
1961 #if USE_ITT_BUILD
1962 // Note: These values are used by ITT events as well.
1963 #endif /* USE_ITT_BUILD */
1964     taskdata->td_taskwait_counter += 1;
1965     taskdata->td_taskwait_ident = loc_ref;
1966     taskdata->td_taskwait_thread = gtid + 1;
1967 
1968 #if USE_ITT_BUILD
1969     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1970     if (itt_sync_obj != NULL)
1971       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1972 #endif /* USE_ITT_BUILD */
1973     if (!taskdata->td_flags.team_serial) {
1974       kmp_task_team_t *task_team = thread->th.th_task_team;
1975       if (task_team != NULL) {
1976         if (KMP_TASKING_ENABLED(task_team)) {
1977 #if OMPT_SUPPORT
1978           if (UNLIKELY(ompt_enabled.enabled))
1979             thread->th.ompt_thread_info.ompt_task_yielded = 1;
1980 #endif
1981           __kmp_execute_tasks_32(
1982               thread, gtid, NULL, FALSE,
1983               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1984               __kmp_task_stealing_constraint);
1985 #if OMPT_SUPPORT
1986           if (UNLIKELY(ompt_enabled.enabled))
1987             thread->th.ompt_thread_info.ompt_task_yielded = 0;
1988 #endif
1989         }
1990       }
1991     }
1992 #if USE_ITT_BUILD
1993     if (itt_sync_obj != NULL)
1994       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1995 #endif /* USE_ITT_BUILD */
1996 
1997     // Debugger:  The taskwait is completed. Location remains, but thread is
1998     // negated.
1999     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2000   }
2001 
2002   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2003                 "returning TASK_CURRENT_NOT_QUEUED\n",
2004                 gtid, taskdata));
2005 
2006   return TASK_CURRENT_NOT_QUEUED;
2007 }
2008 
2009 // Task Reduction implementation
2010 //
// Note: the initial implementation did not take into account the possibility
// of specifying omp_orig for the initializer of a UDR (user-defined
// reduction). The corrected implementation takes the omp_orig object into
// account. The compiler is free to use the old implementation if omp_orig is
// not specified.
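// Illustrative sketch (an assumption about compiler-generated routines, not
// part of the runtime): the two initializer shapes dispatched on below are
//
//   void red_init_old(void *priv);             // old interface
//   void red_init_new(void *priv, void *orig); // new interface, omp_orig
//
// __kmp_call_init() casts reduce_init to the matching signature.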
2015 
2016 /*!
2017 @ingroup BASIC_TYPES
2018 @{
2019 */
2020 
2021 /*!
2022 Flags for special info per task reduction item.
2023 */
2024 typedef struct kmp_taskred_flags {
2025   /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
2026   unsigned lazy_priv : 1;
2027   unsigned reserved31 : 31;
2028 } kmp_taskred_flags_t;
2029 
2030 /*!
2031 Internal struct for reduction data item related info set up by compiler.
2032 */
2033 typedef struct kmp_task_red_input {
  void *reduce_shar; /**< item shared between tasks to reduce into */
2035   size_t reduce_size; /**< size of data item in bytes */
2036   // three compiler-generated routines (init, fini are optional):
2037   void *reduce_init; /**< data initialization routine (single parameter) */
2038   void *reduce_fini; /**< data finalization routine */
2039   void *reduce_comb; /**< data combiner routine */
2040   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2041 } kmp_task_red_input_t;
2042 
2043 /*!
2044 Internal struct for reduction data item related info saved by the library.
2045 */
2046 typedef struct kmp_taskred_data {
  void *reduce_shar; /**< item shared between tasks to reduce into */
2048   size_t reduce_size; /**< size of data item */
2049   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2050   void *reduce_priv; /**< array of thread specific items */
2051   void *reduce_pend; /**< end of private data for faster comparison op */
2052   // three compiler-generated routines (init, fini are optional):
2053   void *reduce_comb; /**< data combiner routine */
2054   void *reduce_init; /**< data initialization routine (two parameters) */
2055   void *reduce_fini; /**< data finalization routine */
2056   void *reduce_orig; /**< original item (can be used in UDR initializer) */
2057 } kmp_taskred_data_t;
2058 
2059 /*!
2060 Internal struct for reduction data item related info set up by compiler.
2061 
2062 New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2063 */
2064 typedef struct kmp_taskred_input {
  void *reduce_shar; /**< item shared between tasks to reduce into */
2066   void *reduce_orig; /**< original reduction item used for initialization */
2067   size_t reduce_size; /**< size of data item */
2068   // three compiler-generated routines (init, fini are optional):
2069   void *reduce_init; /**< data initialization routine (two parameters) */
2070   void *reduce_fini; /**< data finalization routine */
2071   void *reduce_comb; /**< data combiner routine */
2072   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2073 } kmp_taskred_input_t;
2074 /*!
2075 @}
2076 */
2077 
2078 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2079 template <>
2080 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2081                                              kmp_task_red_input_t &src) {
2082   item.reduce_orig = NULL;
2083 }
2084 template <>
2085 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2086                                             kmp_taskred_input_t &src) {
2087   if (src.reduce_orig != NULL) {
2088     item.reduce_orig = src.reduce_orig;
2089   } else {
2090     item.reduce_orig = src.reduce_shar;
2091   } // non-NULL reduce_orig means new interface used
2092 }
2093 
2094 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
2095 template <>
2096 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2097                                            int offset) {
2098   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2099 }
2100 template <>
2101 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2102                                           int offset) {
2103   ((void (*)(void *, void *))item.reduce_init)(
2104       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2105 }
2106 
2107 template <typename T>
2108 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2109   kmp_info_t *thread = __kmp_threads[gtid];
2110   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2111   kmp_int32 nth = thread->th.th_team_nproc;
2112   kmp_taskred_data_t *arr;
2113 
2114   // check input data just in case
2115   KMP_ASSERT(tg != NULL);
2116   KMP_ASSERT(data != NULL);
2117   KMP_ASSERT(num > 0);
2118   if (nth == 1) {
2119     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2120                   gtid, tg));
2121     return (void *)tg;
2122   }
2123   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2124                 gtid, tg, num));
2125   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2126       thread, num * sizeof(kmp_taskred_data_t));
2127   for (int i = 0; i < num; ++i) {
2128     size_t size = data[i].reduce_size - 1;
2129     // round the size up to cache line per thread-specific item
2130     size += CACHE_LINE - size % CACHE_LINE;
2131     KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2132     arr[i].reduce_shar = data[i].reduce_shar;
2133     arr[i].reduce_size = size;
2134     arr[i].flags = data[i].flags;
2135     arr[i].reduce_comb = data[i].reduce_comb;
2136     arr[i].reduce_init = data[i].reduce_init;
2137     arr[i].reduce_fini = data[i].reduce_fini;
2138     __kmp_assign_orig<T>(arr[i], data[i]);
2139     if (!arr[i].flags.lazy_priv) {
2140       // allocate cache-line aligned block and fill it with zeros
2141       arr[i].reduce_priv = __kmp_allocate(nth * size);
2142       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2143       if (arr[i].reduce_init != NULL) {
2144         // initialize all thread-specific items
2145         for (int j = 0; j < nth; ++j) {
2146           __kmp_call_init<T>(arr[i], j * size);
2147         }
2148       }
2149     } else {
2150       // only allocate space for pointers now,
2151       // objects will be lazily allocated/initialized if/when requested
2152       // note that __kmp_allocate zeroes the allocated memory
2153       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2154     }
2155   }
2156   tg->reduce_data = (void *)arr;
2157   tg->reduce_num_data = num;
2158   return (void *)tg;
2159 }
2160 
2161 /*!
2162 @ingroup TASKING
2163 @param gtid      Global thread ID
2164 @param num       Number of data items to reduce
2165 @param data      Array of data for reduction
2166 @return The taskgroup identifier
2167 
2168 Initialize task reduction for the taskgroup.
2169 
Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter - a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
2174 */
2175 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2176   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2177 }
2178 
2179 /*!
2180 @ingroup TASKING
2181 @param gtid      Global thread ID
2182 @param num       Number of data items to reduce
2183 @param data      Array of data for reduction
2184 @return The taskgroup identifier
2185 
2186 Initialize task reduction for the taskgroup.
2187 
Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer to
the omp_orig object.
2190 */
2191 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2192   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2193 }
2194 
2195 // Copy task reduction data (except for shared pointers).
2196 template <typename T>
2197 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2198                                     kmp_taskgroup_t *tg, void *reduce_data) {
2199   kmp_taskred_data_t *arr;
2200   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2201                 " from data %p\n",
2202                 thr, tg, reduce_data));
2203   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2204       thr, num * sizeof(kmp_taskred_data_t));
2205   // threads will share private copies, thunk routines, sizes, flags, etc.:
2206   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2207   for (int i = 0; i < num; ++i) {
2208     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2209   }
2210   tg->reduce_data = (void *)arr;
2211   tg->reduce_num_data = num;
2212 }
2213 
2214 /*!
2215 @ingroup TASKING
2216 @param gtid    Global thread ID
2217 @param tskgrp  The taskgroup ID (optional)
2218 @param data    Shared location of the item
2219 @return The pointer to per-thread data
2220 
2221 Get thread-specific location of data item
2222 */
2223 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2224   kmp_info_t *thread = __kmp_threads[gtid];
2225   kmp_int32 nth = thread->th.th_team_nproc;
2226   if (nth == 1)
2227     return data; // nothing to do
2228 
2229   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2230   if (tg == NULL)
2231     tg = thread->th.th_current_task->td_taskgroup;
2232   KMP_ASSERT(tg != NULL);
2233   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2234   kmp_int32 num = tg->reduce_num_data;
2235   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2236 
2237   KMP_ASSERT(data != NULL);
2238   while (tg != NULL) {
2239     for (int i = 0; i < num; ++i) {
2240       if (!arr[i].flags.lazy_priv) {
2241         if (data == arr[i].reduce_shar ||
2242             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2243           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2244       } else {
2245         // check shared location first
2246         void **p_priv = (void **)(arr[i].reduce_priv);
2247         if (data == arr[i].reduce_shar)
2248           goto found;
        // check if we were passed a thread-specific location as the parameter
2250         for (int j = 0; j < nth; ++j)
2251           if (data == p_priv[j])
2252             goto found;
2253         continue; // not found, continue search
2254       found:
2255         if (p_priv[tid] == NULL) {
2256           // allocate thread specific object lazily
2257           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2258           if (arr[i].reduce_init != NULL) {
2259             if (arr[i].reduce_orig != NULL) { // new interface
2260               ((void (*)(void *, void *))arr[i].reduce_init)(
2261                   p_priv[tid], arr[i].reduce_orig);
2262             } else { // old interface (single parameter)
2263               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2264             }
2265           }
2266         }
2267         return p_priv[tid];
2268       }
2269     }
2270     tg = tg->parent;
2271     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2272     num = tg->reduce_num_data;
2273   }
2274   KMP_ASSERT2(0, "Unknown task reduction item");
2275   return NULL; // ERROR, this line never executed
2276 }
2277 
2278 // Finalize task reduction.
2279 // Called from __kmpc_end_taskgroup()
2280 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2281   kmp_int32 nth = th->th.th_team_nproc;
2282   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2283   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2284   kmp_int32 num = tg->reduce_num_data;
2285   for (int i = 0; i < num; ++i) {
2286     void *sh_data = arr[i].reduce_shar;
2287     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2288     void (*f_comb)(void *, void *) =
2289         (void (*)(void *, void *))(arr[i].reduce_comb);
2290     if (!arr[i].flags.lazy_priv) {
2291       void *pr_data = arr[i].reduce_priv;
2292       size_t size = arr[i].reduce_size;
2293       for (int j = 0; j < nth; ++j) {
2294         void *priv_data = (char *)pr_data + j * size;
2295         f_comb(sh_data, priv_data); // combine results
2296         if (f_fini)
2297           f_fini(priv_data); // finalize if needed
2298       }
2299     } else {
2300       void **pr_data = (void **)(arr[i].reduce_priv);
2301       for (int j = 0; j < nth; ++j) {
2302         if (pr_data[j] != NULL) {
2303           f_comb(sh_data, pr_data[j]); // combine results
2304           if (f_fini)
2305             f_fini(pr_data[j]); // finalize if needed
2306           __kmp_free(pr_data[j]);
2307         }
2308       }
2309     }
2310     __kmp_free(arr[i].reduce_priv);
2311   }
2312   __kmp_thread_free(th, arr);
2313   tg->reduce_data = NULL;
2314   tg->reduce_num_data = 0;
2315 }
2316 
// Clean up task reduction data for a parallel or worksharing construct;
// do not touch task-private data that other threads are still working with.
2319 // Called from __kmpc_end_taskgroup()
2320 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2321   __kmp_thread_free(th, tg->reduce_data);
2322   tg->reduce_data = NULL;
2323   tg->reduce_num_data = 0;
2324 }
2325 
2326 template <typename T>
2327 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2328                                          int num, T *data) {
2329   kmp_info_t *thr = __kmp_threads[gtid];
2330   kmp_int32 nth = thr->th.th_team_nproc;
2331   __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2332   if (nth == 1) {
2333     KA_TRACE(10,
2334              ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2335               gtid, thr->th.th_current_task->td_taskgroup));
2336     return (void *)thr->th.th_current_task->td_taskgroup;
2337   }
2338   kmp_team_t *team = thr->th.th_team;
2339   void *reduce_data;
2340   kmp_taskgroup_t *tg;
2341   reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2342   if (reduce_data == NULL &&
2343       __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2344                                  (void *)1)) {
2345     // single thread enters this block to initialize common reduction data
2346     KMP_DEBUG_ASSERT(reduce_data == NULL);
2347     // first initialize own data, then make a copy other threads can use
2348     tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2349     reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2350     KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2351     // fini counters should be 0 at this point
2352     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2353     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2354     KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2355   } else {
2356     while (
2357         (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2358         (void *)1) { // wait for task reduction initialization
2359       KMP_CPU_PAUSE();
2360     }
2361     KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2362     tg = thr->th.th_current_task->td_taskgroup;
2363     __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2364   }
2365   return tg;
2366 }
2367 
2368 /*!
2369 @ingroup TASKING
2370 @param loc       Source location info
2371 @param gtid      Global thread ID
2372 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2373 @param num       Number of data items to reduce
2374 @param data      Array of data for reduction
2375 @return The taskgroup identifier
2376 
Initialize task reduction for a parallel or worksharing construct.

Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter - a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
2383 */
2384 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2385                                           int num, void *data) {
2386   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2387                                             (kmp_task_red_input_t *)data);
2388 }
2389 
2390 /*!
2391 @ingroup TASKING
2392 @param loc       Source location info
2393 @param gtid      Global thread ID
2394 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2395 @param num       Number of data items to reduce
2396 @param data      Array of data for reduction
2397 @return The taskgroup identifier
2398 
Initialize task reduction for a parallel or worksharing construct.

Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer to
the omp_orig object.
2403 */
2404 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2405                                    void *data) {
2406   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2407                                             (kmp_taskred_input_t *)data);
2408 }
2409 
2410 /*!
2411 @ingroup TASKING
2412 @param loc       Source location info
2413 @param gtid      Global thread ID
2414 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2415 
Finalize task reduction for a parallel or worksharing construct.
2417 */
2418 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2419   __kmpc_end_taskgroup(loc, gtid);
2420 }
2421 
2422 // __kmpc_taskgroup: Start a new taskgroup
2423 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2424   kmp_info_t *thread = __kmp_threads[gtid];
2425   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2426   kmp_taskgroup_t *tg_new =
2427       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2428   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2429   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2430   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2431   tg_new->parent = taskdata->td_taskgroup;
2432   tg_new->reduce_data = NULL;
2433   tg_new->reduce_num_data = 0;
2434   taskdata->td_taskgroup = tg_new;
2435 
2436 #if OMPT_SUPPORT && OMPT_OPTIONAL
2437   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2438     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2439     if (!codeptr)
2440       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2441     kmp_team_t *team = thread->th.th_team;
2442     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2443     // FIXME: I think this is wrong for lwt!
2444     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2445 
2446     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2447         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2448         &(my_task_data), codeptr);
2449   }
2450 #endif
2451 }
2452 
2453 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2454 //                       and its descendants are complete
2455 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2456   kmp_info_t *thread = __kmp_threads[gtid];
2457   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2458   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2459   int thread_finished = FALSE;
2460 
2461 #if OMPT_SUPPORT && OMPT_OPTIONAL
2462   kmp_team_t *team;
2463   ompt_data_t my_task_data;
2464   ompt_data_t my_parallel_data;
2465   void *codeptr;
2466   if (UNLIKELY(ompt_enabled.enabled)) {
2467     team = thread->th.th_team;
2468     my_task_data = taskdata->ompt_task_info.task_data;
2469     // FIXME: I think this is wrong for lwt!
2470     my_parallel_data = team->t.ompt_team_info.parallel_data;
2471     codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2472     if (!codeptr)
2473       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2474   }
2475 #endif
2476 
2477   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2478   KMP_DEBUG_ASSERT(taskgroup != NULL);
2479   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2480 
2481   if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark the task as waiting (not on a barrier)
2483     taskdata->td_taskwait_counter += 1;
2484     taskdata->td_taskwait_ident = loc;
2485     taskdata->td_taskwait_thread = gtid + 1;
2486 #if USE_ITT_BUILD
2487     // For ITT the taskgroup wait is similar to taskwait until we need to
2488     // distinguish them
2489     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2490     if (itt_sync_obj != NULL)
2491       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2492 #endif /* USE_ITT_BUILD */
2493 
2494 #if OMPT_SUPPORT && OMPT_OPTIONAL
2495     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2496       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2497           ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2498           &(my_task_data), codeptr);
2499     }
2500 #endif
2501 
2502     if (!taskdata->td_flags.team_serial ||
2503         (thread->th.th_task_team != NULL &&
2504          thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
2505       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2506                        0U);
2507       while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2508         flag.execute_tasks(thread, gtid, FALSE,
2509                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2510                            __kmp_task_stealing_constraint);
2511       }
2512     }
2513     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2514 
2515 #if OMPT_SUPPORT && OMPT_OPTIONAL
2516     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2517       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2518           ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2519           &(my_task_data), codeptr);
2520     }
2521 #endif
2522 
2523 #if USE_ITT_BUILD
2524     if (itt_sync_obj != NULL)
2525       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2526 #endif /* USE_ITT_BUILD */
2527   }
2528   KMP_DEBUG_ASSERT(taskgroup->count == 0);
2529 
2530   if (taskgroup->reduce_data != NULL) { // need to reduce?
2531     int cnt;
2532     void *reduce_data;
2533     kmp_team_t *t = thread->th.th_team;
2534     kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check if the <priv> data of the first reduction variable is shared for
    // the team
2536     void *priv0 = arr[0].reduce_priv;
2537     if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2538         ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2539       // finishing task reduction on parallel
2540       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2541       if (cnt == thread->th.th_team_nproc - 1) {
2542         // we are the last thread passing __kmpc_reduction_modifier_fini()
2543         // finalize task reduction:
2544         __kmp_task_reduction_fini(thread, taskgroup);
2545         // cleanup fields in the team structure:
2546         // TODO: is relaxed store enough here (whole barrier should follow)?
2547         __kmp_thread_free(thread, reduce_data);
2548         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2549         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2550       } else {
2551         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2552         // so do not finalize reduction, just clean own copy of the data
2553         __kmp_task_reduction_clean(thread, taskgroup);
2554       }
2555     } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2556                    NULL &&
2557                ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2558       // finishing task reduction on worksharing
2559       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2560       if (cnt == thread->th.th_team_nproc - 1) {
2561         // we are the last thread passing __kmpc_reduction_modifier_fini()
2562         __kmp_task_reduction_fini(thread, taskgroup);
2563         // cleanup fields in team structure:
2564         // TODO: is relaxed store enough here (whole barrier should follow)?
2565         __kmp_thread_free(thread, reduce_data);
2566         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2567         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2568       } else {
2569         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2570         // so do not finalize reduction, just clean own copy of the data
2571         __kmp_task_reduction_clean(thread, taskgroup);
2572       }
2573     } else {
2574       // finishing task reduction on taskgroup
2575       __kmp_task_reduction_fini(thread, taskgroup);
2576     }
2577   }
2578   // Restore parent taskgroup for the current task
2579   taskdata->td_taskgroup = taskgroup->parent;
2580   __kmp_thread_free(thread, taskgroup);
2581 
2582   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2583                 gtid, taskdata));
2584   ANNOTATE_HAPPENS_AFTER(taskdata);
2585 
2586 #if OMPT_SUPPORT && OMPT_OPTIONAL
2587   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2588     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2589         ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2590         &(my_task_data), codeptr);
2591   }
2592 #endif
2593 }
2594 
2595 // __kmp_remove_my_task: remove a task from my own deque
2596 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2597                                         kmp_task_team_t *task_team,
2598                                         kmp_int32 is_constrained) {
2599   kmp_task_t *task;
2600   kmp_taskdata_t *taskdata;
2601   kmp_thread_data_t *thread_data;
2602   kmp_uint32 tail;
2603 
2604   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2605   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2606                    NULL); // Caller should check this condition
2607 
2608   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2609 
2610   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2611                 gtid, thread_data->td.td_deque_ntasks,
2612                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2613 
2614   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2615     KA_TRACE(10,
2616              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2617               "ntasks=%d head=%u tail=%u\n",
2618               gtid, thread_data->td.td_deque_ntasks,
2619               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2620     return NULL;
2621   }
2622 
2623   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2624 
2625   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2626     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2627     KA_TRACE(10,
2628              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2629               "ntasks=%d head=%u tail=%u\n",
2630               gtid, thread_data->td.td_deque_ntasks,
2631               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2632     return NULL;
2633   }
2634 
2635   tail = (thread_data->td.td_deque_tail - 1) &
2636          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2637   taskdata = thread_data->td.td_deque[tail];
2638 
2639   if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2640                              thread->th.th_current_task)) {
    // The TSC does not allow removal of the tail task
2642     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2643     KA_TRACE(10,
2644              ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2645               "ntasks=%d head=%u tail=%u\n",
2646               gtid, thread_data->td.td_deque_ntasks,
2647               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2648     return NULL;
2649   }
2650 
2651   thread_data->td.td_deque_tail = tail;
2652   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2653 
2654   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2655 
2656   KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2657                 "ntasks=%d head=%u tail=%u\n",
2658                 gtid, taskdata, thread_data->td.td_deque_ntasks,
2659                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2660 
2661   task = KMP_TASKDATA_TO_TASK(taskdata);
2662   return task;
2663 }
2664 
2665 // __kmp_steal_task: remove a task from another thread's deque
// Assumes that the calling thread has already checked the existence of the
// task_team thread_data before calling this routine.
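// Note: the owner removes tasks from the tail of its own deque
// (__kmp_remove_my_task above), while thieves take from the head here. If the
// task scheduling constraint blocks the head task and an untied task has been
// encountered, the whole deque is scanned for any allowed task instead.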
2668 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2669                                     kmp_task_team_t *task_team,
2670                                     std::atomic<kmp_int32> *unfinished_threads,
2671                                     int *thread_finished,
2672                                     kmp_int32 is_constrained) {
2673   kmp_task_t *task;
2674   kmp_taskdata_t *taskdata;
2675   kmp_taskdata_t *current;
2676   kmp_thread_data_t *victim_td, *threads_data;
2677   kmp_int32 target;
2678   kmp_int32 victim_tid;
2679 
2680   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2681 
2682   threads_data = task_team->tt.tt_threads_data;
2683   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2684 
2685   victim_tid = victim_thr->th.th_info.ds.ds_tid;
2686   victim_td = &threads_data[victim_tid];
2687 
2688   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2689                 "task_team=%p ntasks=%d head=%u tail=%u\n",
2690                 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2691                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2692                 victim_td->td.td_deque_tail));
2693 
2694   if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2695     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2696                   "task_team=%p ntasks=%d head=%u tail=%u\n",
2697                   gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2698                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2699                   victim_td->td.td_deque_tail));
2700     return NULL;
2701   }
2702 
2703   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2704 
2705   int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2706   // Check again after we acquire the lock
2707   if (ntasks == 0) {
2708     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2709     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2710                   "task_team=%p ntasks=%d head=%u tail=%u\n",
2711                   gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2712                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2713     return NULL;
2714   }
2715 
2716   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2717   current = __kmp_threads[gtid]->th.th_current_task;
2718   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2719   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2720     // Bump head pointer and Wrap.
2721     victim_td->td.td_deque_head =
2722         (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2723   } else {
2724     if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow stealing the victim's task
2726       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2727       KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2728                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2729                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2730                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2731       return NULL;
2732     }
2733     int i;
2734     // walk through victim's deque trying to steal any task
2735     target = victim_td->td.td_deque_head;
2736     taskdata = NULL;
2737     for (i = 1; i < ntasks; ++i) {
2738       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2739       taskdata = victim_td->td.td_deque[target];
2740       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2741         break; // found victim task
2742       } else {
2743         taskdata = NULL;
2744       }
2745     }
2746     if (taskdata == NULL) {
2747       // No appropriate candidate to steal found
2748       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2749       KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2750                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2751                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2752                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2753       return NULL;
2754     }
2755     int prev = target;
2756     for (i = i + 1; i < ntasks; ++i) {
2757       // shift remaining tasks in the deque left by 1
2758       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2759       victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2760       prev = target;
2761     }
2762     KMP_DEBUG_ASSERT(
2763         victim_td->td.td_deque_tail ==
2764         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2766   }
2767   if (*thread_finished) {
2768     // We need to un-mark this victim as a finished victim.  This must be done
2769     // before releasing the lock, or else other threads (starting with the
2770     // master victim) might be prematurely released from the barrier!!!
2771     kmp_int32 count;
2772 
2773     count = KMP_ATOMIC_INC(unfinished_threads);
2774 
2775     KA_TRACE(
2776         20,
2777         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2778          gtid, count + 1, task_team));
2779 
2780     *thread_finished = FALSE;
2781   }
2782   TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2783 
2784   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2785 
2786   KMP_COUNT_BLOCK(TASK_stolen);
2787   KA_TRACE(10,
2788            ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2789             "task_team=%p ntasks=%d head=%u tail=%u\n",
2790             gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2791             ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2792 
2793   task = KMP_TASKDATA_TO_TASK(taskdata);
2794   return task;
2795 }
2796 
2797 // __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
2799 //
2800 // final_spin is TRUE if this is the spin at the release barrier.
2801 // thread_finished indicates whether the thread is finished executing all
2802 // the tasks it has on its deque, and is at the release barrier.
2803 // spinner is the location on which to spin.
2804 // spinner == NULL means only execute a single task and return.
2805 // checker is the value to check to terminate the spin.
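//
// In sketch form (a simplified view of the body below, not exact code):
//   while (1) {                      // outer: retry while tasks may appear
//     while (1) {                    // inner: find and run one task
//       task = pop from own deque, else steal from a victim;
//       if (task == NULL) break;
//       __kmp_invoke_task(gtid, task, current_task);
//       if (!final_spin && flag->done_check()) return TRUE;
//     }
//     if (final_spin && no incomplete child tasks) {
//       decrement unfinished_threads once; if (flag->done_check()) return TRUE;
//     }
//     if (th_task_team == NULL) return FALSE;
//     if (nthreads == 1) continue;   // may still get tasks from target regions
//     return FALSE;
//   }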
2806 template <class C>
2807 static inline int __kmp_execute_tasks_template(
2808     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2809     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2810     kmp_int32 is_constrained) {
2811   kmp_task_team_t *task_team = thread->th.th_task_team;
2812   kmp_thread_data_t *threads_data;
2813   kmp_task_t *task;
2814   kmp_info_t *other_thread;
2815   kmp_taskdata_t *current_task = thread->th.th_current_task;
2816   std::atomic<kmp_int32> *unfinished_threads;
2817   kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2818                       tid = thread->th.th_info.ds.ds_tid;
2819 
2820   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2821   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2822 
2823   if (task_team == NULL || current_task == NULL)
2824     return FALSE;
2825 
2826   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2827                 "*thread_finished=%d\n",
2828                 gtid, final_spin, *thread_finished));
2829 
2830   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2831   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2832   KMP_DEBUG_ASSERT(threads_data != NULL);
2833 
2834   nthreads = task_team->tt.tt_nproc;
2835   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2836   KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2837   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2838 
2839   while (1) { // Outer loop keeps trying to find tasks in case of single thread
2840     // getting tasks from target constructs
2841     while (1) { // Inner loop to find a task and execute it
2842       task = NULL;
2843       if (use_own_tasks) { // check on own queue first
2844         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2845       }
2846       if ((task == NULL) && (nthreads > 1)) { // Steal a task
2847         int asleep = 1;
2848         use_own_tasks = 0;
2849         // Try to steal from the last place I stole from successfully.
2850         if (victim_tid == -2) { // haven't stolen anything yet
2851           victim_tid = threads_data[tid].td.td_deque_last_stolen;
2852           if (victim_tid !=
2853               -1) // if we have a last stolen from victim, get the thread
2854             other_thread = threads_data[victim_tid].td.td_thr;
2855         }
2856         if (victim_tid != -1) { // found last victim
2857           asleep = 0;
2858         } else if (!new_victim) { // no recent steals and we haven't already
2859           // used a new victim; select a random thread
2860           do { // Find a different thread to steal work from.
2861             // Pick a random thread. Initial plan was to cycle through all the
2862             // threads, and only return if we tried to steal from every thread,
2863             // and failed.  Arch says that's not such a great idea.
2864             victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2865             if (victim_tid >= tid) {
2866               ++victim_tid; // Adjusts random distribution to exclude self
2867             }
2868             // Found a potential victim
2869             other_thread = threads_data[victim_tid].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not
            // wake up all threads waiting at the barrier.  If victim is
            // sleeping, then wake it up. Since we were going to pay the cache
            // miss penalty for referencing another thread's kmp_info_t struct
            // anyway, the check shouldn't cost too much performance at this
            // point. In extra barrier mode, tasks do not sleep at the separate
            // tasking barrier, so this isn't a problem.
2878             asleep = 0;
2879             if ((__kmp_tasking_mode == tskm_task_teams) &&
2880                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2881                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2882                  NULL)) {
2883               asleep = 1;
2884               __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2885                                         other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
2887               // There is a slight possibility that it resumes, steals a task
2888               // from another thread, which spawns more tasks, all in the time
2889               // that it takes this thread to check => don't write an assertion
2890               // that the victim's queue is empty.  Try stealing from a
2891               // different thread.
2892             }
2893           } while (asleep);
2894         }
2895 
2896         if (!asleep) {
2897           // We have a victim to try to steal from
2898           task = __kmp_steal_task(other_thread, gtid, task_team,
2899                                   unfinished_threads, thread_finished,
2900                                   is_constrained);
2901         }
2902         if (task != NULL) { // set last stolen to victim
2903           if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2904             threads_data[tid].td.td_deque_last_stolen = victim_tid;
2905             // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
2907             // new_victim keeps track of this
2908             new_victim = 1;
2909           }
2910         } else { // No tasks found; unset last_stolen
2911           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2912           victim_tid = -2; // no successful victim found
2913         }
2914       }
2915 
2916       if (task == NULL) // break out of tasking loop
2917         break;
2918 
2919 // Found a task; execute it
2920 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2921       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2922         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2923           // get the object reliably
2924           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2925         }
2926         __kmp_itt_task_starting(itt_sync_obj);
2927       }
2928 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2929       __kmp_invoke_task(gtid, task, current_task);
2930 #if USE_ITT_BUILD
2931       if (itt_sync_obj != NULL)
2932         __kmp_itt_task_finished(itt_sync_obj);
2933 #endif /* USE_ITT_BUILD */
2934       // If this thread is only partway through the barrier and the condition is
2935       // met, then return now, so that the barrier gather/release pattern can
2936       // proceed. If this thread is in the last spin loop in the barrier,
2937       // waiting to be released, we know that the termination condition will not
2938       // be satisfied, so don't waste any cycles checking it.
2939       if (flag == NULL || (!final_spin && flag->done_check())) {
2940         KA_TRACE(
2941             15,
2942             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2943              gtid));
2944         return TRUE;
2945       }
2946       if (thread->th.th_task_team == NULL) {
2947         break;
2948       }
2949       KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
2950       // If execution of a stolen task results in more tasks being placed on our
2951       // run queue, reset use_own_tasks
2952       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2953         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2954                       "other tasks, restart\n",
2955                       gtid));
2956         use_own_tasks = 1;
2957         new_victim = 0;
2958       }
2959     }
2960 
2961     // The task source has been exhausted. If in final spin loop of barrier,
2962     // check if termination condition is satisfied. The work queue may be empty
2963     // but there might be proxy tasks still executing.
2964     if (final_spin &&
2965         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
2966       // First, decrement the #unfinished threads, if that has not already been
2967       // done.  This decrement might be to the spin location, and result in the
2968       // termination condition being satisfied.
2969       if (!*thread_finished) {
2970         kmp_int32 count;
2971 
2972         count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2973         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2974                       "unfinished_threads to %d task_team=%p\n",
2975                       gtid, count, task_team));
2976         *thread_finished = TRUE;
2977       }
2978 
2979       // It is now unsafe to reference thread->th.th_team !!!
2980       // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2981       // thread to pass through the barrier, where it might reset each thread's
2982       // th.th_team field for the next parallel region. If we can steal more
2983       // work, we know that this has not happened yet.
2984       if (flag != NULL && flag->done_check()) {
2985         KA_TRACE(
2986             15,
2987             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2988              gtid));
2989         return TRUE;
2990       }
2991     }
2992 
2993     // If this thread's task team is NULL, master has recognized that there are
2994     // no more tasks; bail out
2995     if (thread->th.th_task_team == NULL) {
2996       KA_TRACE(15,
2997                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2998       return FALSE;
2999     }
3000 
3001     // We could be getting tasks from target constructs; if this is the only
3002     // thread, keep trying to execute tasks from own queue
3003     if (nthreads == 1)
3004       use_own_tasks = 1;
3005     else {
3006       KA_TRACE(15,
3007                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3008       return FALSE;
3009     }
3010   }
3011 }
3012 
3013 int __kmp_execute_tasks_32(
3014     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
3015     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3016     kmp_int32 is_constrained) {
3017   return __kmp_execute_tasks_template(
3018       thread, gtid, flag, final_spin,
3019       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3020 }
3021 
3022 int __kmp_execute_tasks_64(
3023     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3024     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3025     kmp_int32 is_constrained) {
3026   return __kmp_execute_tasks_template(
3027       thread, gtid, flag, final_spin,
3028       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3029 }
3030 
3031 int __kmp_execute_tasks_oncore(
3032     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3033     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3034     kmp_int32 is_constrained) {
3035   return __kmp_execute_tasks_template(
3036       thread, gtid, flag, final_spin,
3037       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3038 }
3039 
3040 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3041 // next barrier so they can assist in executing enqueued tasks.
3042 // First thread in allocates the task team atomically.
3043 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3044                                  kmp_info_t *this_thr) {
3045   kmp_thread_data_t *threads_data;
3046   int nthreads, i, is_init_thread;
3047 
3048   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3049                 __kmp_gtid_from_thread(this_thr)));
3050 
3051   KMP_DEBUG_ASSERT(task_team != NULL);
3052   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3053 
3054   nthreads = task_team->tt.tt_nproc;
3055   KMP_DEBUG_ASSERT(nthreads > 0);
3056   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3057 
3058   // Allocate or increase the size of threads_data if necessary
3059   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3060 
3061   if (!is_init_thread) {
3062     // Some other thread already set up the array.
3063     KA_TRACE(
3064         20,
3065         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3066          __kmp_gtid_from_thread(this_thr)));
3067     return;
3068   }
3069   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3070   KMP_DEBUG_ASSERT(threads_data != NULL);
3071 
3072   if (__kmp_tasking_mode == tskm_task_teams &&
3073       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3074     // Release any threads sleeping at the barrier, so that they can steal
3075     // tasks and execute them.  In extra barrier mode, tasks do not sleep
3076     // at the separate tasking barrier, so this isn't a problem.
3077     for (i = 0; i < nthreads; i++) {
3078       volatile void *sleep_loc;
3079       kmp_info_t *thread = threads_data[i].td.td_thr;
3080 
3081       if (i == this_thr->th.th_info.ds.ds_tid) {
3082         continue;
3083       }
3084       // Since we haven't locked the thread's suspend mutex lock at this
3085       // point, there is a small window where a thread might be putting
3086       // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically
      // checks to see if other threads are sleeping (using the same random
      // mechanism that is used for task stealing) and awakens them if they
      // are.
3090       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3091           NULL) {
3092         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3093                       __kmp_gtid_from_thread(this_thr),
3094                       __kmp_gtid_from_thread(thread)));
3095         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3096       } else {
3097         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3098                       __kmp_gtid_from_thread(this_thr),
3099                       __kmp_gtid_from_thread(thread)));
3100       }
3101     }
3102   }
3103 
3104   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3105                 __kmp_gtid_from_thread(this_thr)));
3106 }
3107 
/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
3137 
3138 static kmp_task_team_t *__kmp_free_task_teams =
3139     NULL; // Free list for task_team data structures
3140 // Lock for task team data structures
3141 kmp_bootstrap_lock_t __kmp_task_team_lock =
3142     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3143 
3144 // __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the necessary
3146 // data structures relating to the deque.  This only happens once per thread
3147 // per task team since task teams are recycled. No lock is needed during
3148 // allocation since each thread allocates its own deque.
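//
// Illustrative note: the deque is a power-of-two ring buffer, so index
// arithmetic elsewhere in this file wraps with a mask rather than a modulo,
// e.g.
//   tail = (tail + 1) & TASK_DEQUE_MASK(td);  // push at the tail
//   tail = (tail - 1) & TASK_DEQUE_MASK(td);  // owner pops from the tail
//   head = (head + 1) & TASK_DEQUE_MASK(td);  // thieves advance the head
// (td here stands for the thread_data->td seen at the real call sites.)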
3149 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3150                                    kmp_thread_data_t *thread_data) {
3151   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3152   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3153 
3154   // Initialize last stolen task field to "none"
3155   thread_data->td.td_deque_last_stolen = -1;
3156 
3157   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3158   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3159   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3160 
3161   KE_TRACE(
3162       10,
3163       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3164        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3165   // Allocate space for task deque, and zero the deque
3166   // Cannot use __kmp_thread_calloc() because threads not around for
3167   // kmp_reap_task_team( ).
3168   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3169       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3170   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3171 }
3172 
3173 // __kmp_free_task_deque:
3174 // Deallocates a task deque for a particular thread. Happens at library
// deallocation, so there is no need to reset all thread data fields.
3176 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3177   if (thread_data->td.td_deque != NULL) {
3178     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3179     TCW_4(thread_data->td.td_deque_ntasks, 0);
3180     __kmp_free(thread_data->td.td_deque);
3181     thread_data->td.td_deque = NULL;
3182     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3183   }
3184 
3185 #ifdef BUILD_TIED_TASK_STACK
3186   // GEH: Figure out what to do here for td_susp_tied_tasks
3187   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3188     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3189   }
3190 #endif // BUILD_TIED_TASK_STACK
3191 }
3192 
3193 // __kmp_realloc_task_threads_data:
3194 // Allocates a threads_data array for a task team, either by allocating an
3195 // initial array or enlarging an existing array.  Only the first thread to get
3196 // the lock allocs or enlarges the array and re-initializes the array elements.
3197 // That thread returns "TRUE", the rest return "FALSE".
3198 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3199 // The current size is given by task_team -> tt.tt_max_threads.
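//
// In sketch form, this is the usual double-checked initialization pattern:
//   if (TCR_4(task_team->tt.tt_found_tasks)) return FALSE;   // fast path
//   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
//   if (!TCR_4(task_team->tt.tt_found_tasks)) {
//     ... (re)allocate and initialize threads_data ...
//     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
//   }
//   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
// so only the first thread to take the lock reports itself as the initializer.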
3200 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3201                                            kmp_task_team_t *task_team) {
3202   kmp_thread_data_t **threads_data_p;
3203   kmp_int32 nthreads, maxthreads;
3204   int is_init_thread = FALSE;
3205 
3206   if (TCR_4(task_team->tt.tt_found_tasks)) {
3207     // Already reallocated and initialized.
3208     return FALSE;
3209   }
3210 
3211   threads_data_p = &task_team->tt.tt_threads_data;
3212   nthreads = task_team->tt.tt_nproc;
3213   maxthreads = task_team->tt.tt_max_threads;
3214 
3215   // All threads must lock when they encounter the first task of the implicit
  // task region to make sure threads_data fields are (re)initialized before
  // they are used.
3218   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3219 
3220   if (!TCR_4(task_team->tt.tt_found_tasks)) {
3221     // first thread to enable tasking
3222     kmp_team_t *team = thread->th.th_team;
3223     int i;
3224 
3225     is_init_thread = TRUE;
    if (maxthreads < nthreads) {
3228       if (*threads_data_p != NULL) {
3229         kmp_thread_data_t *old_data = *threads_data_p;
3230         kmp_thread_data_t *new_data = NULL;
3231 
3232         KE_TRACE(
3233             10,
3234             ("__kmp_realloc_task_threads_data: T#%d reallocating "
3235              "threads data for task_team %p, new_size = %d, old_size = %d\n",
3236              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3237         // Reallocate threads_data to have more elements than current array
3238         // Cannot use __kmp_thread_realloc() because threads not around for
3239         // kmp_reap_task_team( ).  Note all new array entries are initialized
3240         // to zero by __kmp_allocate().
3241         new_data = (kmp_thread_data_t *)__kmp_allocate(
3242             nthreads * sizeof(kmp_thread_data_t));
3243         // copy old data to new data
3244         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3245                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3246 
3247 #ifdef BUILD_TIED_TASK_STACK
3248         // GEH: Figure out if this is the right thing to do
3249         for (i = maxthreads; i < nthreads; i++) {
3250           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3251           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3252         }
3253 #endif // BUILD_TIED_TASK_STACK
3254         // Install the new data and free the old data
3255         (*threads_data_p) = new_data;
3256         __kmp_free(old_data);
3257       } else {
3258         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3259                       "threads data for task_team %p, size = %d\n",
3260                       __kmp_gtid_from_thread(thread), task_team, nthreads));
3261         // Make the initial allocate for threads_data array, and zero entries
3262         // Cannot use __kmp_thread_calloc() because threads not around for
3263         // kmp_reap_task_team( ).
3264         ANNOTATE_IGNORE_WRITES_BEGIN();
3265         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3266             nthreads * sizeof(kmp_thread_data_t));
3267         ANNOTATE_IGNORE_WRITES_END();
3268 #ifdef BUILD_TIED_TASK_STACK
3269         // GEH: Figure out if this is the right thing to do
3270         for (i = 0; i < nthreads; i++) {
3271           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3272           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3273         }
3274 #endif // BUILD_TIED_TASK_STACK
3275       }
3276       task_team->tt.tt_max_threads = nthreads;
3277     } else {
3278       // If array has (more than) enough elements, go ahead and use it
3279       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3280     }
3281 
3282     // initialize threads_data pointers back to thread_info structures
3283     for (i = 0; i < nthreads; i++) {
3284       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3285       thread_data->td.td_thr = team->t.t_threads[i];
3286 
3287       if (thread_data->td.td_deque_last_stolen >= nthreads) {
3288         // The last stolen field survives across teams / barrier, and the number
3289         // of threads may have changed.  It's possible (likely?) that a new
        // parallel region will exhibit the same behavior as the previous one.
3291         thread_data->td.td_deque_last_stolen = -1;
3292       }
3293     }
3294 
3295     KMP_MB();
3296     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3297   }
3298 
3299   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3300   return is_init_thread;
3301 }
3302 
3303 // __kmp_free_task_threads_data:
3304 // Deallocates a threads_data array for a task team, including any attached
3305 // tasking deques.  Only occurs at library shutdown.
3306 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3307   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3308   if (task_team->tt.tt_threads_data != NULL) {
3309     int i;
3310     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3311       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3312     }
3313     __kmp_free(task_team->tt.tt_threads_data);
3314     task_team->tt.tt_threads_data = NULL;
3315   }
3316   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3317 }
3318 
3319 // __kmp_allocate_task_team:
3320 // Allocates a task team associated with a specific team, taking it from
3321 // the global task team free list if possible.  Also initializes data
3322 // structures.
3323 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3324                                                  kmp_team_t *team) {
3325   kmp_task_team_t *task_team = NULL;
3326   int nthreads;
3327 
3328   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3329                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3330 
3331   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3332     // Take a task team from the task team pool
3333     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3334     if (__kmp_free_task_teams != NULL) {
3335       task_team = __kmp_free_task_teams;
3336       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3337       task_team->tt.tt_next = NULL;
3338     }
3339     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3340   }
3341 
3342   if (task_team == NULL) {
3343     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3344                   "task team for team %p\n",
3345                   __kmp_gtid_from_thread(thread), team));
3346     // Allocate a new task team if one is not available.
3347     // Cannot use __kmp_thread_malloc() because threads not around for
3348     // kmp_reap_task_team( ).
3349     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3350     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3351     // AC: __kmp_allocate zeroes returned memory
3352     // task_team -> tt.tt_threads_data = NULL;
3353     // task_team -> tt.tt_max_threads = 0;
3354     // task_team -> tt.tt_next = NULL;
3355   }
3356 
3357   TCW_4(task_team->tt.tt_found_tasks, FALSE);
3358   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3359   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3360 
3361   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3362   TCW_4(task_team->tt.tt_active, TRUE);
3363 
3364   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3365                 "unfinished_threads init'd to %d\n",
3366                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3367                 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3368   return task_team;
3369 }
3370 
3371 // __kmp_free_task_team:
3372 // Frees the task team associated with a specific thread, and adds it
3373 // to the global task team free list.
3374 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3375   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3376                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3377 
3378   // Put task team back on free list
3379   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3380 
3381   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3382   task_team->tt.tt_next = __kmp_free_task_teams;
3383   TCW_PTR(__kmp_free_task_teams, task_team);
3384 
3385   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3386 }
3387 
3388 // __kmp_reap_task_teams:
3389 // Free all the task teams on the task team free list.
3390 // Should only be done during library shutdown.
3391 // Cannot do anything that needs a thread structure or gtid since they are
3392 // already gone.
3393 void __kmp_reap_task_teams(void) {
3394   kmp_task_team_t *task_team;
3395 
3396   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3397     // Free all task_teams on the free list
3398     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3399     while ((task_team = __kmp_free_task_teams) != NULL) {
3400       __kmp_free_task_teams = task_team->tt.tt_next;
3401       task_team->tt.tt_next = NULL;
3402 
3403       // Free threads_data if necessary
3404       if (task_team->tt.tt_threads_data != NULL) {
3405         __kmp_free_task_threads_data(task_team);
3406       }
3407       __kmp_free(task_team);
3408     }
3409     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3410   }
3411 }
3412 
3413 // __kmp_wait_to_unref_task_teams:
3414 // Some threads could still be in the fork barrier release code, possibly
3415 // trying to steal tasks.  Wait for each thread to unreference its task team.
3416 void __kmp_wait_to_unref_task_teams(void) {
3417   kmp_info_t *thread;
3418   kmp_uint32 spins;
3419   int done;
3420 
3421   KMP_INIT_YIELD(spins);
3422 
3423   for (;;) {
3424     done = TRUE;
3425 
    // TODO: GEH - this may be wrong because some sync would be necessary
3427     // in case threads are added to the pool during the traversal. Need to
3428     // verify that lock for thread pool is held when calling this routine.
3429     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3430          thread = thread->th.th_next_pool) {
3431 #if KMP_OS_WINDOWS
3432       DWORD exit_val;
3433 #endif
3434       if (TCR_PTR(thread->th.th_task_team) == NULL) {
3435         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3436                       __kmp_gtid_from_thread(thread)));
3437         continue;
3438       }
3439 #if KMP_OS_WINDOWS
3440       // TODO: GEH - add this check for Linux* OS / OS X* as well?
3441       if (!__kmp_is_thread_alive(thread, &exit_val)) {
3442         thread->th.th_task_team = NULL;
3443         continue;
3444       }
3445 #endif
3446 
3447       done = FALSE; // Because th_task_team pointer is not NULL for this thread
3448 
3449       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3450                     "unreference task_team\n",
3451                     __kmp_gtid_from_thread(thread)));
3452 
3453       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3454         volatile void *sleep_loc;
3455         // If the thread is sleeping, awaken it.
3456         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3457             NULL) {
3458           KA_TRACE(
3459               10,
3460               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3461                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3462           __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3463         }
3464       }
3465     }
3466     if (done) {
3467       break;
3468     }
3469 
3470     // If oversubscribed or have waited a bit, yield.
3471     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3472   }
3473 }
3474 
3475 // __kmp_task_team_setup:  Create a task_team for the current team, but use
3476 // an already created, unused one if it already exists.
3477 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3478   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3479 
3480   // If this task_team hasn't been created yet, allocate it. It will be used in
3481   // the region after the next.
3482   // If it exists, it is the current task team and shouldn't be touched yet as
3483   // it may still be in use.
3484   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3485       (always || team->t.t_nproc > 1)) {
3486     team->t.t_task_team[this_thr->th.th_task_state] =
3487         __kmp_allocate_task_team(this_thr, team);
3488     KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3489                   "for team %d at parity=%d\n",
3490                   __kmp_gtid_from_thread(this_thr),
3491                   team->t.t_task_team[this_thr->th.th_task_state],
3492                   ((team != NULL) ? team->t.t_id : -1),
3493                   this_thr->th.th_task_state));
3494   }
3495 
3496   // After threads exit the release, they will call sync, and then point to this
3497   // other task_team; make sure it is allocated and properly initialized. As
3498   // threads spin in the barrier release phase, they will continue to use the
3499   // previous task_team struct(above), until they receive the signal to stop
3500   // checking for tasks (they can't safely reference the kmp_team_t struct,
3501   // which could be reallocated by the master thread). No task teams are formed
3502   // for serialized teams.
3503   if (team->t.t_nproc > 1) {
3504     int other_team = 1 - this_thr->th.th_task_state;
3505     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3506       team->t.t_task_team[other_team] =
3507           __kmp_allocate_task_team(this_thr, team);
3508       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3509                     "task_team %p for team %d at parity=%d\n",
3510                     __kmp_gtid_from_thread(this_thr),
3511                     team->t.t_task_team[other_team],
3512                     ((team != NULL) ? team->t.t_id : -1), other_team));
3513     } else { // Leave the old task team struct in place for the upcoming region;
3514       // adjust as needed
3515       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3516       if (!task_team->tt.tt_active ||
3517           team->t.t_nproc != task_team->tt.tt_nproc) {
3518         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3519         TCW_4(task_team->tt.tt_found_tasks, FALSE);
3520         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3521         KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3522                           team->t.t_nproc);
3523         TCW_4(task_team->tt.tt_active, TRUE);
3524       }
3525       // if team size has changed, the first thread to enable tasking will
3526       // realloc threads_data if necessary
3527       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3528                     "%p for team %d at parity=%d\n",
3529                     __kmp_gtid_from_thread(this_thr),
3530                     team->t.t_task_team[other_team],
3531                     ((team != NULL) ? team->t.t_id : -1), other_team));
3532     }
3533   }
3534 }
3535 
3536 // __kmp_task_team_sync: Propagation of task team data from team to threads
3537 // which happens just after the release phase of a team barrier.  This may be
3538 // called by any thread, but only for teams with # threads > 1.
3539 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3540   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3541 
3542   // Toggle the th_task_state field, to switch which task_team this thread
3543   // refers to
3544   this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3545   // It is now safe to propagate the task team pointer from the team struct to
3546   // the current thread.
3547   TCW_PTR(this_thr->th.th_task_team,
3548           team->t.t_task_team[this_thr->th.th_task_state]);
3549   KA_TRACE(20,
3550            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3551             "%p from Team #%d (parity=%d)\n",
3552             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3553             ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3554 }
3555 
3556 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3557 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3558 // if proxy tasks were created.
3559 //
3560 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3561 // by passing in 0 optionally as the last argument. When wait is zero, master
3562 // thread does not wait for unfinished_threads to reach 0.
3563 void __kmp_task_team_wait(
3564     kmp_info_t *this_thr,
3565     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3566   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3567 
3568   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3569   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3570 
3571   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3572     if (wait) {
3573       KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3574                     "(for unfinished_threads to reach 0) on task_team = %p\n",
3575                     __kmp_gtid_from_thread(this_thr), task_team));
3576       // Worker threads may have dropped through to release phase, but could
3577       // still be executing tasks. Wait here for tasks to complete. To avoid
3578       // memory contention, only master thread checks termination condition.
3579       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3580                              &task_team->tt.tt_unfinished_threads),
3581                        0U);
3582       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3583     }
3584     // Deactivate the old task team, so that the worker threads will stop
3585     // referencing it while spinning.
3586     KA_TRACE(
3587         20,
3588         ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3589          "setting active to false, setting local and team's pointer to NULL\n",
3590          __kmp_gtid_from_thread(this_thr), task_team));
3591     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3592                      task_team->tt.tt_found_proxy_tasks == TRUE);
3593     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3594     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3595     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3596     KMP_MB();
3597 
3598     TCW_PTR(this_thr->th.th_task_team, NULL);
3599   }
3600 }
3601 
3602 // __kmp_tasking_barrier:
// This routine may only be called when
// __kmp_tasking_mode == tskm_extra_barrier.
3604 // Internal function to execute all tasks prior to a regular barrier or a join
3605 // barrier. It is a full barrier itself, which unfortunately turns regular
3606 // barriers into double barriers and join barriers into 1 1/2 barriers.
3607 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3608   std::atomic<kmp_uint32> *spin = RCAST(
3609       std::atomic<kmp_uint32> *,
3610       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3611   int flag = FALSE;
3612   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3613 
3614 #if USE_ITT_BUILD
3615   KMP_FSYNC_SPIN_INIT(spin, NULL);
3616 #endif /* USE_ITT_BUILD */
3617   kmp_flag_32 spin_flag(spin, 0U);
3618   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3619                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3620 #if USE_ITT_BUILD
3621     // TODO: What about itt_sync_obj??
3622     KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3623 #endif /* USE_ITT_BUILD */
3624 
3625     if (TCR_4(__kmp_global.g.g_done)) {
3626       if (__kmp_global.g.g_abort)
3627         __kmp_abort_thread();
3628       break;
3629     }
3630     KMP_YIELD(TRUE);
3631   }
3632 #if USE_ITT_BUILD
3633   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3634 #endif /* USE_ITT_BUILD */
3635 }
3636 
3637 // __kmp_give_task puts a task into a given thread queue if:
3638 //  - the queue for that thread was created
3639 //  - there's space in that queue
3640 // Because of this, __kmp_push_task needs to check if there's space after
3641 // getting the lock
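//
// A sketch of the retry loop on the caller's side (see
// __kmpc_proxy_task_completed_ooo below): `pass` doubles after every full
// sweep over the team, so a full deque is only expanded while it is still
// smaller than `pass` times INITIAL_TASK_DEQUE_SIZE; otherwise another thread
// gets a chance:
//   do {
//     thread = team->t.t_threads[k];
//     k = (k + 1) % nthreads;
//     if (k == start_k)
//       pass = pass << 1;
//   } while (!__kmp_give_task(thread, k, ptask, pass));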
3642 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3643                             kmp_int32 pass) {
3644   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3645   kmp_task_team_t *task_team = taskdata->td_task_team;
3646 
3647   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3648                 taskdata, tid));
3649 
3650   // If task_team is NULL something went really bad...
3651   KMP_DEBUG_ASSERT(task_team != NULL);
3652 
3653   bool result = false;
3654   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3655 
3656   if (thread_data->td.td_deque == NULL) {
3657     // There's no queue in this thread, go find another one
3658     // We're guaranteed that at least one thread has a queue
3659     KA_TRACE(30,
3660              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3661               tid, taskdata));
3662     return result;
3663   }
3664 
3665   if (TCR_4(thread_data->td.td_deque_ntasks) >=
3666       TASK_DEQUE_SIZE(thread_data->td)) {
3667     KA_TRACE(
3668         30,
3669         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3670          taskdata, tid));
3671 
3672     // if this deque is bigger than the pass ratio give a chance to another
3673     // thread
3674     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3675       return result;
3676 
3677     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3678     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3679         TASK_DEQUE_SIZE(thread_data->td)) {
3680       // expand deque to push the task which is not allowed to execute
3681       __kmp_realloc_task_deque(thread, thread_data);
3682     }
3683 
  } else {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3688     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3689         TASK_DEQUE_SIZE(thread_data->td)) {
3690       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3691                     "thread %d.\n",
3692                     taskdata, tid));
3693 
3694       // if this deque is bigger than the pass ratio give a chance to another
3695       // thread
3696       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3697         goto release_and_exit;
3698 
3699       __kmp_realloc_task_deque(thread, thread_data);
3700     }
3701   }
3702 
3703   // lock is held here, and there is space in the deque
3704 
3705   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3706   // Wrap index.
3707   thread_data->td.td_deque_tail =
3708       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3709   TCW_4(thread_data->td.td_deque_ntasks,
3710         TCR_4(thread_data->td.td_deque_ntasks) + 1);
3711 
3712   result = true;
3713   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3714                 taskdata, tid));
3715 
3716 release_and_exit:
3717   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3718 
3719   return result;
3720 }
3721 
/* The finish of a proxy task is divided into two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_tasks counter of the
   parent is decremented the threads can leave the barriers. So, the bottom
   half needs to be queued before the counter is decremented. The top half is
   therefore divided into two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_tasks counter of the proxy task to synchronize the top
   and bottom halves. */
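
/* In sketch form, the resulting ordering (as used by
   __kmpc_proxy_task_completed_ooo below) is:
     __kmp_first_top_half_finish_proxy(taskdata);  // mark complete, add the
                                                   // imaginary child
     ... queue the bottom half via __kmp_give_task() ...
     __kmp_second_top_half_finish_proxy(taskdata); // release the parent, drop
                                                   // the imaginary child
   while __kmp_bottom_half_finish_proxy() spins on the imaginary child before
   freeing the task. */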
3738 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3739   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3740   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3741   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3742   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3743 
3744   taskdata->td_flags.complete = 1; // mark the task as completed
3745 
3746   if (taskdata->td_taskgroup)
3747     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3748 
  // Create an imaginary child for this task so the bottom half cannot
3750   // release the task before we have completed the second top half
3751   KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3752 }
3753 
3754 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3755   kmp_int32 children = 0;
3756 
3757   // Predecrement simulated by "- 1" calculation
3758   children =
3759       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3760   KMP_DEBUG_ASSERT(children >= 0);
3761 
  // Remove the imaginary child
3763   KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3764 }
3765 
3766 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3767   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3768   kmp_info_t *thread = __kmp_threads[gtid];
3769 
3770   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3771   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3772                    1); // top half must run before bottom half
3773 
3774   // We need to wait to make sure the top half is finished
3775   // Spinning here should be ok as this should happen quickly
3776   while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3777     ;
3778 
3779   __kmp_release_deps(gtid, taskdata);
3780   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3781 }
3782 
3783 /*!
3784 @ingroup TASKING
3785 @param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that is part of the team.
Runs the top and bottom halves directly.
3790 */
3791 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3792   KMP_DEBUG_ASSERT(ptask != NULL);
3793   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3794   KA_TRACE(
3795       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3796            gtid, taskdata));
3797 
3798   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3799 
3800   __kmp_first_top_half_finish_proxy(taskdata);
3801   __kmp_second_top_half_finish_proxy(taskdata);
3802   __kmp_bottom_half_finish_proxy(gtid, ptask);
3803 
3804   KA_TRACE(10,
3805            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3806             gtid, taskdata));
3807 }
3808 
3809 /*!
3810 @ingroup TASKING
@param ptask Task whose execution is completed
3812 
Execute the completion of a proxy task from a thread that may not belong to
the team.
3815 */
3816 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3817   KMP_DEBUG_ASSERT(ptask != NULL);
3818   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3819 
3820   KA_TRACE(
3821       10,
3822       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3823        taskdata));
3824 
3825   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3826 
3827   __kmp_first_top_half_finish_proxy(taskdata);
3828 
3829   // Enqueue task to complete bottom half completion from a thread within the
3830   // corresponding team
3831   kmp_team_t *team = taskdata->td_team;
3832   kmp_int32 nthreads = team->t.t_nproc;
3833   kmp_info_t *thread;
3834 
3835   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3836   // but we cannot use __kmp_get_random here
3837   kmp_int32 start_k = 0;
3838   kmp_int32 pass = 1;
3839   kmp_int32 k = start_k;
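  // Sweep the team round-robin starting from start_k. After every full pass,
  // 'pass' is doubled; __kmp_give_task takes it as a hint to accept the task
  // more aggressively (e.g. by growing a full deque) once the simple attempts
  // have failed.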
3840 
3841   do {
3842     // For now we're just linearly trying to find a thread
3843     thread = team->t.t_threads[k];
3844     k = (k + 1) % nthreads;
3845 
3846     // we did a full pass through all the threads
3847     if (k == start_k)
3848       pass = pass << 1;
3849 
3850   } while (!__kmp_give_task(thread, k, ptask, pass));
3851 
3852   __kmp_second_top_half_finish_proxy(taskdata);
3853 
3854   KA_TRACE(
3855       10,
3856       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3857        taskdata));
3858 }
3859 
3860 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3861                                                 kmp_task_t *task) {
3862   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3863   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3864     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
3865     td->td_allow_completion_event.ed.task = task;
3866     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
3867   }
3868   return &td->td_allow_completion_event;
3869 }
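// Illustrative lowering of the OpenMP 5.0 detach clause (a sketch; the calls
// are emitted by the compiler and start_async_work is a hypothetical user
// function):
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)    // event obtained via
//   {                              //   __kmpc_task_allow_completion_event
//     start_async_work(ev);        // task body may return before the
//   }                              //   asynchronous work is done
//   ...
//   omp_fulfill_event(ev);         // ends up in __kmp_fulfill_event below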
3870 
3871 void __kmp_fulfill_event(kmp_event_t *event) {
3872   if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
3873     kmp_task_t *ptask = event->ed.task;
3874     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3875     bool detached = false;
3876     int gtid = __kmp_get_gtid();
3877 
3878     // The associated task might have completed or could be completing at this
3879     // point.
3880     // We need to take the lock to avoid races
3881     __kmp_acquire_tas_lock(&event->lock, gtid);
3882     if (taskdata->td_flags.proxy == TASK_PROXY) {
3883       detached = true;
3884     } else {
3885 #if OMPT_SUPPORT
3886       // The OMPT event must occur under mutual exclusion,
3887       // otherwise the tool might access ptask after free
3888       if (UNLIKELY(ompt_enabled.enabled))
3889         __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
3890 #endif
3891     }
3892     event->type = KMP_EVENT_UNINITIALIZED;
3893     __kmp_release_tas_lock(&event->lock, gtid);
3894 
3895     if (detached) {
3896 #if OMPT_SUPPORT
3897       // We free ptask afterwards and know the task is finished,
3898       // so locking is not necessary
3899       if (UNLIKELY(ompt_enabled.enabled))
3900         __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
3901 #endif
      // If the task was detached, complete the proxy task
3903       if (gtid >= 0) {
3904         kmp_team_t *team = taskdata->td_team;
3905         kmp_info_t *thread = __kmp_get_thread();
3906         if (thread->th.th_team == team) {
3907           __kmpc_proxy_task_completed(gtid, ptask);
3908           return;
3909         }
3910       }
3911 
3912       // fallback
3913       __kmpc_proxy_task_completed_ooo(ptask);
3914     }
3915   }
3916 }
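// Note on the two paths above: if the event is fulfilled before the task body
// finishes, only the early-fulfill OMPT event fires here and the task later
// completes normally in __kmp_task_finish. If the body finished first, the
// task has already been turned into a proxy, so the late-fulfill path
// completes it either in-team or out-of-team.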
3917 
3918 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3919 // for taskloop
3920 //
3921 // thread:   allocating thread
3922 // task_src: pointer to source task to be duplicated
3923 // returns:  a pointer to the allocated kmp_task_t structure (task).
3924 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3925   kmp_task_t *task;
3926   kmp_taskdata_t *taskdata;
3927   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3928   kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
3929   size_t shareds_offset;
3930   size_t task_size;
3931 
3932   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3933                 task_src));
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be a proxy task
3936   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3937   task_size = taskdata_src->td_size_alloc;
3938 
3939   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3940   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3941                 task_size));
3942 #if USE_FAST_MEMORY
3943   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3944 #else
3945   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3946 #endif /* USE_FAST_MEMORY */
3947   KMP_MEMCPY(taskdata, taskdata_src, task_size);
3948 
3949   task = KMP_TASKDATA_TO_TASK(taskdata);
3950 
3951   // Initialize new task (only specific fields not affected by memcpy)
3952   taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need to set up the shareds pointer
3954     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3955     task->shareds = &((char *)taskdata)[shareds_offset];
3956     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3957                      0);
3958   }
3959   taskdata->td_alloc_thread = thread;
3960   taskdata->td_parent = parent_task;
3961   // task inherits the taskgroup from the parent task
3962   taskdata->td_taskgroup = parent_task->td_taskgroup;
3963   // tied task needs to initialize the td_last_tied at creation,
3964   // untied one does this when it is scheduled for execution
3965   if (taskdata->td_flags.tiedness == TASK_TIED)
3966     taskdata->td_last_tied = taskdata;
3967 
3968   // Only need to keep track of child task counts if team parallel and tasking
3969   // not serialized
3970   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3971     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3972     if (parent_task->td_taskgroup)
3973       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks
    // since implicit ones are not deallocated
3976     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3977       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3978   }
3979 
3980   KA_TRACE(20,
3981            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3982             thread, taskdata, taskdata->td_parent));
3983 #if OMPT_SUPPORT
3984   if (UNLIKELY(ompt_enabled.enabled))
3985     __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
3986 #endif
3987   return task;
3988 }
3989 
3990 // Routine optionally generated by the compiler for setting the lastprivate flag
3991 // and calling needed constructors for private/firstprivate objects
3992 // (used to form taskloop tasks from pattern task)
3993 // Parameters: dest task, src task, lastprivate flag.
3994 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3995 
3996 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3997 
// Class to encapsulate manipulating loop bounds in a taskloop task.
// This abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
4001 class kmp_taskloop_bounds_t {
4002   kmp_task_t *task;
4003   const kmp_taskdata_t *taskdata;
4004   size_t lower_offset;
4005   size_t upper_offset;
4006 
4007 public:
4008   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4009       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4010         lower_offset((char *)lb - (char *)task),
4011         upper_offset((char *)ub - (char *)task) {
4012     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4013     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4014   }
4015   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4016       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4017         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4018   size_t get_lower_offset() const { return lower_offset; }
4019   size_t get_upper_offset() const { return upper_offset; }
4020   kmp_uint64 get_lb() const {
4021     kmp_int64 retval;
4022 #if defined(KMP_GOMP_COMPAT)
4023     // Intel task just returns the lower bound normally
4024     if (!taskdata->td_flags.native) {
4025       retval = *(kmp_int64 *)((char *)task + lower_offset);
4026     } else {
4027       // GOMP task has to take into account the sizeof(long)
4028       if (taskdata->td_size_loop_bounds == 4) {
4029         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4030         retval = (kmp_int64)*lb;
4031       } else {
4032         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4033         retval = (kmp_int64)*lb;
4034       }
4035     }
4036 #else
4037     retval = *(kmp_int64 *)((char *)task + lower_offset);
4038 #endif // defined(KMP_GOMP_COMPAT)
4039     return retval;
4040   }
4041   kmp_uint64 get_ub() const {
4042     kmp_int64 retval;
4043 #if defined(KMP_GOMP_COMPAT)
4044     // Intel task just returns the upper bound normally
4045     if (!taskdata->td_flags.native) {
4046       retval = *(kmp_int64 *)((char *)task + upper_offset);
4047     } else {
4048       // GOMP task has to take into account the sizeof(long)
4049       if (taskdata->td_size_loop_bounds == 4) {
4050         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4051         retval = (kmp_int64)*ub;
4052       } else {
4053         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4054         retval = (kmp_int64)*ub;
4055       }
4056     }
4057 #else
4058     retval = *(kmp_int64 *)((char *)task + upper_offset);
4059 #endif // defined(KMP_GOMP_COMPAT)
4060     return retval;
4061   }
4062   void set_lb(kmp_uint64 lb) {
4063 #if defined(KMP_GOMP_COMPAT)
4064     // Intel task just sets the lower bound normally
4065     if (!taskdata->td_flags.native) {
4066       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4067     } else {
4068       // GOMP task has to take into account the sizeof(long)
4069       if (taskdata->td_size_loop_bounds == 4) {
4070         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4071         *lower = (kmp_uint32)lb;
4072       } else {
4073         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4074         *lower = (kmp_uint64)lb;
4075       }
4076     }
4077 #else
4078     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4079 #endif // defined(KMP_GOMP_COMPAT)
4080   }
4081   void set_ub(kmp_uint64 ub) {
4082 #if defined(KMP_GOMP_COMPAT)
4083     // Intel task just sets the upper bound normally
4084     if (!taskdata->td_flags.native) {
4085       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4086     } else {
4087       // GOMP task has to take into account the sizeof(long)
4088       if (taskdata->td_size_loop_bounds == 4) {
4089         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4090         *upper = (kmp_uint32)ub;
4091       } else {
4092         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4093         *upper = (kmp_uint64)ub;
4094       }
4095     }
4096 #else
4097     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4098 #endif // defined(KMP_GOMP_COMPAT)
4099   }
4100 };
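// Layout assumed by the accessors above (informal summary): an Intel-style
// task keeps the 64-bit lower/upper bounds at fixed byte offsets inside the
// kmp_task_t (captured from the lb/ub pointers passed in), while a GOMP-style
// ("native") task keeps them as the first two elements of task->shareds, each
// sizeof(long) bytes wide as recorded in td_size_loop_bounds.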
4101 
4102 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4103 //
4104 // loc        Source location information
4105 // gtid       Global thread ID
4106 // task       Pattern task, exposes the loop iteration range
4107 // lb         Pointer to loop lower bound in task structure
4108 // ub         Pointer to loop upper bound in task structure
4109 // st         Loop stride
4110 // ub_glob    Global upper bound (used for lastprivate check)
4111 // num_tasks  Number of tasks to execute
4112 // grainsize  Number of loop iterations per task
4113 // extras     Number of chunks with grainsize+1 iterations
4114 // tc         Iterations count
4115 // task_dup   Tasks duplication routine
4116 // codeptr_ra Return address for OMPT events
4117 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4118                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4119                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4120                            kmp_uint64 grainsize, kmp_uint64 extras,
4121                            kmp_uint64 tc,
4122 #if OMPT_SUPPORT
4123                            void *codeptr_ra,
4124 #endif
4125                            void *task_dup) {
4126   KMP_COUNT_BLOCK(OMP_TASKLOOP);
4127   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4128   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4129   // compiler provides global bounds here
4130   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4131   kmp_uint64 lower = task_bounds.get_lb();
4132   kmp_uint64 upper = task_bounds.get_ub();
4133   kmp_uint64 i;
4134   kmp_info_t *thread = __kmp_threads[gtid];
4135   kmp_taskdata_t *current_task = thread->th.th_current_task;
4136   kmp_task_t *next_task;
4137   kmp_int32 lastpriv = 0;
4138 
4139   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4140   KMP_DEBUG_ASSERT(num_tasks > extras);
4141   KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, i=%lld,%lld(%llu)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
                task_dup));
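  // Worked example of the chunk assignment below (illustrative values):
  // num_tasks=3, grainsize=3, extras=1, st=1, lower=0 means tc=10 iterations
  // split as [0..3] (grainsize+1 iterations), [4..6] and [7..9].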
4146 
4147   // Launch num_tasks tasks, assign grainsize iterations each task
4148   for (i = 0; i < num_tasks; ++i) {
4149     kmp_uint64 chunk_minus_1;
4150     if (extras == 0) {
4151       chunk_minus_1 = grainsize - 1;
4152     } else {
4153       chunk_minus_1 = grainsize;
      --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1)
4155     }
4156     upper = lower + st * chunk_minus_1;
4157     if (i == num_tasks - 1) {
4158       // schedule the last task, set lastprivate flag if needed
4159       if (st == 1) { // most common case
4160         KMP_DEBUG_ASSERT(upper == *ub);
4161         if (upper == ub_glob)
4162           lastpriv = 1;
4163       } else if (st > 0) { // positive loop stride
4164         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4165         if ((kmp_uint64)st > ub_glob - upper)
4166           lastpriv = 1;
4167       } else { // negative loop stride
4168         KMP_DEBUG_ASSERT(upper + st < *ub);
4169         if (upper - ub_glob < (kmp_uint64)(-st))
4170           lastpriv = 1;
4171       }
4172     }
4173     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4174     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4175     kmp_taskloop_bounds_t next_task_bounds =
4176         kmp_taskloop_bounds_t(next_task, task_bounds);
4177 
4178     // adjust task-specific bounds
4179     next_task_bounds.set_lb(lower);
4180     if (next_taskdata->td_flags.native) {
4181       next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4182     } else {
4183       next_task_bounds.set_ub(upper);
4184     }
4185     if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4186                            // etc.
4187       ptask_dup(next_task, task, lastpriv);
4188     KA_TRACE(40,
4189              ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4190               "upper %lld stride %lld, (offsets %p %p)\n",
4191               gtid, i, next_task, lower, upper, st,
4192               next_task_bounds.get_lower_offset(),
4193               next_task_bounds.get_upper_offset()));
4194 #if OMPT_SUPPORT
4195     __kmp_omp_taskloop_task(NULL, gtid, next_task,
4196                            codeptr_ra); // schedule new task
4197 #else
4198     __kmp_omp_task(gtid, next_task, true); // schedule new task
4199 #endif
4200     lower = upper + st; // adjust lower bound for the next iteration
4201   }
4202   // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // internal bookkeeping only
  // do not execute the pattern task, just do the internal bookkeeping
4205   __kmp_task_finish<false>(gtid, task, current_task);
4206 }
4207 
4208 // Structure to keep taskloop parameters for auxiliary task
4209 // kept in the shareds of the task structure.
4210 typedef struct __taskloop_params {
4211   kmp_task_t *task;
4212   kmp_uint64 *lb;
4213   kmp_uint64 *ub;
4214   void *task_dup;
4215   kmp_int64 st;
4216   kmp_uint64 ub_glob;
4217   kmp_uint64 num_tasks;
4218   kmp_uint64 grainsize;
4219   kmp_uint64 extras;
4220   kmp_uint64 tc;
4221   kmp_uint64 num_t_min;
4222 #if OMPT_SUPPORT
4223   void *codeptr_ra;
4224 #endif
4225 } __taskloop_params_t;
4226 
4227 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4228                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4229                           kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
4230 #if OMPT_SUPPORT
4231                           void *,
4232 #endif
4233                           void *);
4234 
4235 // Execute part of the taskloop submitted as a task.
4236 int __kmp_taskloop_task(int gtid, void *ptask) {
4237   __taskloop_params_t *p =
4238       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4239   kmp_task_t *task = p->task;
4240   kmp_uint64 *lb = p->lb;
4241   kmp_uint64 *ub = p->ub;
4242   void *task_dup = p->task_dup;
4243   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4244   kmp_int64 st = p->st;
4245   kmp_uint64 ub_glob = p->ub_glob;
4246   kmp_uint64 num_tasks = p->num_tasks;
4247   kmp_uint64 grainsize = p->grainsize;
4248   kmp_uint64 extras = p->extras;
4249   kmp_uint64 tc = p->tc;
4250   kmp_uint64 num_t_min = p->num_t_min;
4251 #if OMPT_SUPPORT
4252   void *codeptr_ra = p->codeptr_ra;
4253 #endif
4254 #if KMP_DEBUG
4255   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4256   KMP_DEBUG_ASSERT(task != NULL);
  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%lld), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
4261 #endif
4262   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4263   if (num_tasks > num_t_min)
4264     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4265                          grainsize, extras, tc, num_t_min,
4266 #if OMPT_SUPPORT
4267                          codeptr_ra,
4268 #endif
4269                          task_dup);
4270   else
4271     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4272                           grainsize, extras, tc,
4273 #if OMPT_SUPPORT
4274                           codeptr_ra,
4275 #endif
4276                           task_dup);
4277 
4278   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4279   return 0;
4280 }
4281 
4282 // Schedule part of the taskloop as a task,
4283 // execute the rest of the taskloop.
4284 //
4285 // loc        Source location information
4286 // gtid       Global thread ID
4287 // task       Pattern task, exposes the loop iteration range
4288 // lb         Pointer to loop lower bound in task structure
4289 // ub         Pointer to loop upper bound in task structure
4290 // st         Loop stride
4291 // ub_glob    Global upper bound (used for lastprivate check)
4292 // num_tasks  Number of tasks to execute
4293 // grainsize  Number of loop iterations per task
4294 // extras     Number of chunks with grainsize+1 iterations
4295 // tc         Iterations count
4296 // num_t_min  Threshold to launch tasks recursively
4297 // task_dup   Tasks duplication routine
4298 // codeptr_ra Return address for OMPT events
4299 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4300                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4301                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4302                           kmp_uint64 grainsize, kmp_uint64 extras,
4303                           kmp_uint64 tc, kmp_uint64 num_t_min,
4304 #if OMPT_SUPPORT
4305                           void *codeptr_ra,
4306 #endif
4307                           void *task_dup) {
4308   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4309   KMP_DEBUG_ASSERT(task != NULL);
4310   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%lld), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
4315   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4316   kmp_uint64 lower = *lb;
4317   kmp_info_t *thread = __kmp_threads[gtid];
4318   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
4319   kmp_task_t *next_task;
4320   size_t lower_offset =
4321       (char *)lb - (char *)task; // remember offset of lb in the task structure
4322   size_t upper_offset =
4323       (char *)ub - (char *)task; // remember offset of ub in the task structure
4324 
4325   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4326   KMP_DEBUG_ASSERT(num_tasks > extras);
4327   KMP_DEBUG_ASSERT(num_tasks > 0);
4328 
4329   // split the loop in two halves
4330   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4331   kmp_uint64 gr_size0 = grainsize;
4332   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4333   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4334   if (n_tsk0 <= extras) {
4335     gr_size0++; // integrate extras into grainsize
4336     ext0 = 0; // no extra iters in 1st half
4337     ext1 = extras - n_tsk0; // remaining extras
4338     tc0 = gr_size0 * n_tsk0;
4339     tc1 = tc - tc0;
4340   } else { // n_tsk0 > extras
4341     ext1 = 0; // no extra iters in 2nd half
4342     ext0 = extras;
4343     tc1 = grainsize * n_tsk1;
4344     tc0 = tc - tc1;
4345   }
4346   ub0 = lower + st * (tc0 - 1);
4347   lb1 = ub0 + st;
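  // Worked example of the split above (illustrative values): num_tasks=5,
  // grainsize=3, extras=2, st=1, tc=17 gives n_tsk0=2 and n_tsk1=3; since
  // n_tsk0 <= extras we get gr_size0=4, ext0=0, ext1=0, tc0=8, tc1=9, i.e.
  // this call keeps 8 iterations and the scheduled task covers the other 9.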
4348 
4349   // create pattern task for 2nd half of the loop
4350   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4351   // adjust lower bound (upper bound is not changed) for the 2nd half
4352   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4353   if (ptask_dup != NULL) // construct firstprivates, etc.
4354     ptask_dup(next_task, task, 0);
4355   *ub = ub0; // adjust upper bound for the 1st half
4356 
4357   // create auxiliary task for 2nd half of the loop
4358   // make sure new task has same parent task as the pattern task
4359   kmp_taskdata_t *current_task = thread->th.th_current_task;
4360   thread->th.th_current_task = taskdata->td_parent;
4361   kmp_task_t *new_task =
4362       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4363                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4364   // restore current task
4365   thread->th.th_current_task = current_task;
4366   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4367   p->task = next_task;
4368   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4369   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4370   p->task_dup = task_dup;
4371   p->st = st;
4372   p->ub_glob = ub_glob;
4373   p->num_tasks = n_tsk1;
4374   p->grainsize = grainsize;
4375   p->extras = ext1;
4376   p->tc = tc1;
4377   p->num_t_min = num_t_min;
4378 #if OMPT_SUPPORT
4379   p->codeptr_ra = codeptr_ra;
4380 #endif
4381 
4382 #if OMPT_SUPPORT
4383   // schedule new task with correct return address for OMPT events
4384   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4385 #else
4386   __kmp_omp_task(gtid, new_task, true); // schedule new task
4387 #endif
4388 
4389   // execute the 1st half of current subrange
4390   if (n_tsk0 > num_t_min)
4391     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4392                          ext0, tc0, num_t_min,
4393 #if OMPT_SUPPORT
4394                          codeptr_ra,
4395 #endif
4396                          task_dup);
4397   else
4398     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4399                           gr_size0, ext0, tc0,
4400 #if OMPT_SUPPORT
4401                           codeptr_ra,
4402 #endif
4403                           task_dup);
4404 
  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4406 }
4407 
4408 /*!
4409 @ingroup TASKING
4410 @param loc       Source location information
4411 @param gtid      Global thread ID
4412 @param task      Task structure
4413 @param if_val    Value of the if clause
4414 @param lb        Pointer to loop lower bound in task structure
4415 @param ub        Pointer to loop upper bound in task structure
4416 @param st        Loop stride
4417 @param nogroup   Flag, 1 if no taskgroup needs to be added, 0 otherwise
4418 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4419 @param grainsize Schedule value if specified
4420 @param task_dup  Tasks duplication routine
4421 
4422 Execute the taskloop construct.
4423 */
4424 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4425                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4426                      int sched, kmp_uint64 grainsize, void *task_dup) {
4427   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4428   KMP_DEBUG_ASSERT(task != NULL);
4429 
4430   if (nogroup == 0) {
4431 #if OMPT_SUPPORT && OMPT_OPTIONAL
4432     OMPT_STORE_RETURN_ADDRESS(gtid);
4433 #endif
4434     __kmpc_taskgroup(loc, gtid);
4435   }
4436 
4437   // =========================================================================
4438   // calculate loop parameters
4439   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4440   kmp_uint64 tc;
4441   // compiler provides global bounds here
4442   kmp_uint64 lower = task_bounds.get_lb();
4443   kmp_uint64 upper = task_bounds.get_ub();
4444   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4445   kmp_uint64 num_tasks = 0, extras = 0;
4446   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4447   kmp_info_t *thread = __kmp_threads[gtid];
4448   kmp_taskdata_t *current_task = thread->th.th_current_task;
4449 
4450   KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4451                 "grain %llu(%d), dup %p\n",
4452                 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4453 
4454   // compute trip count
4455   if (st == 1) { // most common case
4456     tc = upper - lower + 1;
4457   } else if (st < 0) {
4458     tc = (lower - upper) / (-st) + 1;
4459   } else { // st > 0
4460     tc = (upper - lower) / st + 1;
4461   }
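  // e.g. lower=0, upper=9, st=2 gives tc = (9 - 0) / 2 + 1 = 5 iterations
  // (0, 2, 4, 6, 8); negative strides use the symmetric formula above.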
4462   if (tc == 0) {
4463     KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4464     // free the pattern task and exit
4465     __kmp_task_start(gtid, task, current_task);
4466     // do not execute anything for zero-trip loop
4467     __kmp_task_finish<false>(gtid, task, current_task);
4468     return;
4469   }
4470 
4471 #if OMPT_SUPPORT && OMPT_OPTIONAL
4472   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4473   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4474   if (ompt_enabled.ompt_callback_work) {
4475     ompt_callbacks.ompt_callback(ompt_callback_work)(
4476         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4477         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4478   }
4479 #endif
4480 
4481   if (num_tasks_min == 0)
    // TODO: can we choose a better default heuristic?
4483     num_tasks_min =
4484         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4485 
4486   // compute num_tasks/grainsize based on the input provided
4487   switch (sched) {
4488   case 0: // no schedule clause specified, we can choose the default
4489     // let's try to schedule (team_size*10) tasks
4490     grainsize = thread->th.th_team_nproc * 10;
4491     KMP_FALLTHROUGH();
4492   case 2: // num_tasks provided
4493     if (grainsize > tc) {
4494       num_tasks = tc; // too big num_tasks requested, adjust values
4495       grainsize = 1;
4496       extras = 0;
4497     } else {
4498       num_tasks = grainsize;
4499       grainsize = tc / num_tasks;
4500       extras = tc % num_tasks;
4501     }
4502     break;
4503   case 1: // grainsize provided
4504     if (grainsize > tc) {
4505       num_tasks = 1; // too big grainsize requested, adjust values
4506       grainsize = tc;
4507       extras = 0;
4508     } else {
4509       num_tasks = tc / grainsize;
4510       // adjust grainsize for balanced distribution of iterations
4511       grainsize = tc / num_tasks;
4512       extras = tc % num_tasks;
4513     }
4514     break;
4515   default:
4516     KMP_ASSERT2(0, "unknown scheduling of taskloop");
4517   }
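  // Worked example (illustrative): a grainsize clause of 3 (sched == 1) with
  // tc=10 gives num_tasks = 10/3 = 3, rebalanced grainsize = 10/3 = 3 and
  // extras = 10%3 = 1, i.e. three tasks covering 4, 3 and 3 iterations.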
4518   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4519   KMP_DEBUG_ASSERT(num_tasks > extras);
4520   KMP_DEBUG_ASSERT(num_tasks > 0);
4521   // =========================================================================
4522 
  // check the if clause value first
4524   // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
4525   if (if_val == 0) { // if(0) specified, mark task as serial
4526     taskdata->td_flags.task_serial = 1;
4527     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4528     // always start serial tasks linearly
4529     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4530                           grainsize, extras, tc,
4531 #if OMPT_SUPPORT
4532                           OMPT_GET_RETURN_ADDRESS(0),
4533 #endif
4534                           task_dup);
4535     // !taskdata->td_flags.native => currently force linear spawning of tasks
4536     // for GOMP_taskloop
4537   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4538     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4539                   "(%lld), grain %llu, extras %llu\n",
4540                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4541     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4542                          grainsize, extras, tc, num_tasks_min,
4543 #if OMPT_SUPPORT
4544                          OMPT_GET_RETURN_ADDRESS(0),
4545 #endif
4546                          task_dup);
4547   } else {
4548     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4549                   "(%lld), grain %llu, extras %llu\n",
4550                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4551     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4552                           grainsize, extras, tc,
4553 #if OMPT_SUPPORT
4554                           OMPT_GET_RETURN_ADDRESS(0),
4555 #endif
4556                           task_dup);
4557   }
4558 
4559 #if OMPT_SUPPORT && OMPT_OPTIONAL
4560   if (ompt_enabled.ompt_callback_work) {
4561     ompt_callbacks.ompt_callback(ompt_callback_work)(
4562         ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4563         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4564   }
4565 #endif
4566 
4567   if (nogroup == 0) {
4568 #if OMPT_SUPPORT && OMPT_OPTIONAL
4569     OMPT_STORE_RETURN_ADDRESS(gtid);
4570 #endif
4571     __kmpc_end_taskgroup(loc, gtid);
4572   }
4573   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4574 }
4575