1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28                                  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30                                    kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32                                            kmp_task_team_t *task_team);
33 
34 #if OMP_45_ENABLED
35 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
36 #endif
37 
38 #ifdef BUILD_TIED_TASK_STACK
39 
40 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
41 //  from top to bottom
42 //
43 //  gtid: global thread identifier for thread containing stack
44 //  thread_data: thread data for task team thread containing stack
45 //  threshold: value above which the trace statement triggers
46 //  location: string identifying call site of this function (for trace)
47 static void __kmp_trace_task_stack(kmp_int32 gtid,
48                                    kmp_thread_data_t *thread_data,
49                                    int threshold, char *location) {
50   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
51   kmp_taskdata_t **stack_top = task_stack->ts_top;
52   kmp_int32 entries = task_stack->ts_entries;
53   kmp_taskdata_t *tied_task;
54 
55   KA_TRACE(
56       threshold,
57       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
58        "first_block = %p, stack_top = %p \n",
59        location, gtid, entries, task_stack->ts_first_block, stack_top));
60 
61   KMP_DEBUG_ASSERT(stack_top != NULL);
62   KMP_DEBUG_ASSERT(entries > 0);
63 
64   while (entries != 0) {
65     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
66     // fix up ts_top if we need to pop from previous block
67     if ((entries & TASK_STACK_INDEX_MASK) == 0) {
68       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
69 
70       stack_block = stack_block->sb_prev;
71       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
72     }
73 
74     // finish bookkeeping
75     stack_top--;
76     entries--;
77 
78     tied_task = *stack_top;
79 
80     KMP_DEBUG_ASSERT(tied_task != NULL);
81     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
82 
83     KA_TRACE(threshold,
84              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
85               "stack_top=%p, tied_task=%p\n",
86               location, gtid, entries, stack_top, tied_task));
87   }
88   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
89 
90   KA_TRACE(threshold,
91            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
92             location, gtid));
93 }
94 
95 //  __kmp_init_task_stack: initialize the task stack for the first time
96 //  after a thread_data structure is created.
97 //  It should not be necessary to do this again (assuming the stack works).
98 //
99 //  gtid: global thread identifier of calling thread
100 //  thread_data: thread data for task team thread containing stack
101 static void __kmp_init_task_stack(kmp_int32 gtid,
102                                   kmp_thread_data_t *thread_data) {
103   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
104   kmp_stack_block_t *first_block;
105 
106   // set up the first block of the stack
107   first_block = &task_stack->ts_first_block;
108   task_stack->ts_top = (kmp_taskdata_t **)first_block;
109   memset((void *)first_block, '\0',
110          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
111 
112   // initialize the stack to be empty
113   task_stack->ts_entries = TASK_STACK_EMPTY;
114   first_block->sb_next = NULL;
115   first_block->sb_prev = NULL;
116 }
117 
118 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
119 //
120 //  gtid: global thread identifier for calling thread
121 //  thread_data: thread info for thread containing stack
122 static void __kmp_free_task_stack(kmp_int32 gtid,
123                                   kmp_thread_data_t *thread_data) {
124   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
125   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
126 
127   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
128   // walk the block list, freeing every block except the embedded first one
129   while (stack_block != NULL) {
130     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
131 
132     stack_block->sb_next = NULL;
133     stack_block->sb_prev = NULL;
134     if (stack_block != &task_stack->ts_first_block) {
135       __kmp_thread_free(__kmp_threads[gtid],
136                         stack_block); // free the block, if not the first
137     }
138     stack_block = next_block;
139   }
140   // initialize the stack to be empty
141   task_stack->ts_entries = 0;
142   task_stack->ts_top = NULL;
143 }
144 
145 //  __kmp_push_task_stack: Push the tied task onto the task stack.
146 //     Grow the stack if necessary by allocating another block.
147 //
148 //  gtid: global thread identifier for calling thread
149 //  thread: thread info for thread containing stack
150 //  tied_task: the task to push on the stack
151 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
152                                   kmp_taskdata_t *tied_task) {
153   // GEH - need to consider what to do if tt_threads_data not allocated yet
154   kmp_thread_data_t *thread_data =
155       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
156   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
157 
158   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
159     return; // Don't push anything on stack if team or team tasks are serialized
160   }
161 
162   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
163   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
164 
165   KA_TRACE(20,
166            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
167             gtid, thread, tied_task));
168   // Store entry
169   *(task_stack->ts_top) = tied_task;
170 
171   // Do bookkeeping for next push
172   task_stack->ts_top++;
173   task_stack->ts_entries++;
174 
175   if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
176     // Find beginning of this task block
177     kmp_stack_block_t *stack_block =
178         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
179 
180     // Check if we already have a block
181     if (stack_block->sb_next !=
182         NULL) { // reset ts_top to beginning of next block
183       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
184     } else { // Alloc new block and link it up
185       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
186           thread, sizeof(kmp_stack_block_t));
187 
188       task_stack->ts_top = &new_block->sb_block[0];
189       stack_block->sb_next = new_block;
190       new_block->sb_prev = stack_block;
191       new_block->sb_next = NULL;
192 
193       KA_TRACE(
194           30,
195           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
196            gtid, tied_task, new_block));
197     }
198   }
199   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
200                 tied_task));
201 }
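
// Illustrative note on the block-linked stack used above (not compiled; the
// concrete constant is an assumption for the example, say
// TASK_STACK_BLOCK_SIZE == 16 so TASK_STACK_INDEX_MASK == 0xF):
//
//   ts_first_block.sb_block[0..15] <-sb_next/sb_prev-> new_block->sb_block[0..15]
//
//   push #16: after the increment, (ts_entries & TASK_STACK_INDEX_MASK) == 0,
//             so ts_top is moved to &new_block->sb_block[0], allocating and
//             linking the block on first use.
//   pop  #16: the matching check in __kmp_pop_task_stack follows sb_prev and
//             resets ts_top to &prev_block->sb_block[TASK_STACK_BLOCK_SIZE]
//             before the decrement.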
202 
203 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
204 //  the task, just check to make sure it matches the ending task passed in.
205 //
206 //  gtid: global thread identifier for the calling thread
207 //  thread: thread info structure containing stack
208 //  ending_task: the task that is ending (should match the task that is
209 //  popped off the stack)
210 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
211                                  kmp_taskdata_t *ending_task) {
212   // GEH - need to consider what to do if tt_threads_data not allocated yet
213   kmp_thread_data_t *thread_data =
214       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
215   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
216   kmp_taskdata_t *tied_task;
217 
218   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
219     // Don't pop anything from stack if team or team tasks are serialized
220     return;
221   }
222 
223   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
224   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
225 
226   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
227                 thread));
228 
229   // fix up ts_top if we need to pop from previous block
230   if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
231     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
232 
233     stack_block = stack_block->sb_prev;
234     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
235   }
236 
237   // finish bookkeeping
238   task_stack->ts_top--;
239   task_stack->ts_entries--;
240 
241   tied_task = *(task_stack->ts_top);
242 
243   KMP_DEBUG_ASSERT(tied_task != NULL);
244   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
245   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
246 
247   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
248                 tied_task));
249   return;
250 }
251 #endif /* BUILD_TIED_TASK_STACK */
252 
253 // returns true if the new task is allowed to execute, false otherwise
254 // checks Task Scheduling constraint (if requested) and
255 // mutexinoutset dependencies if any
256 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
257                                   const kmp_taskdata_t *tasknew,
258                                   const kmp_taskdata_t *taskcurr) {
259   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
260     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
261     // only descendant of all deferred tied tasks can be scheduled, checking
262     // the last one is enough, as it in turn is the descendant of all others
263     kmp_taskdata_t *current = taskcurr->td_last_tied;
264     KMP_DEBUG_ASSERT(current != NULL);
265     // check if the task is not suspended on barrier
266     if (current->td_flags.tasktype == TASK_EXPLICIT ||
267         current->td_taskwait_thread > 0) { // <= 0 on barrier
268       kmp_int32 level = current->td_level;
269       kmp_taskdata_t *parent = tasknew->td_parent;
270       while (parent != current && parent->td_level > level) {
271         // check generation up to the level of the current task
272         parent = parent->td_parent;
273         KMP_DEBUG_ASSERT(parent != NULL);
274       }
275       if (parent != current)
276         return false;
277     }
278   }
279   // Check mutexinoutset dependencies, acquire locks
280   kmp_depnode_t *node = tasknew->td_depnode;
281   if (node && (node->dn.mtx_num_locks > 0)) {
282     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
283       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
284       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
285         continue;
286       // could not get the lock, release previous locks
287       for (int j = i - 1; j >= 0; --j)
288         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
289       return false;
290     }
291     // negative num_locks means all locks acquired successfully
292     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
293   }
294   return true;
295 }
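
// Illustrative sketch of the Task Scheduling Constraint check above (values
// invented for the example): suppose taskcurr->td_last_tied is T1 with
// td_level == 1, and the candidate tasknew has the parent chain
// T3 (level 3) -> T2 (level 2) -> T1 (level 1) -> implicit task.
// The while loop walks tasknew->td_parent upward while td_level > 1 and stops
// at T1 == current, so the candidate is allowed; had the walk ended at some
// other level-1 task, the candidate would be rejected because it is not a
// descendant of the last deferred tied task.
//
// The mutexinoutset handling is all-or-nothing: either every lock in
// node->dn.mtx_locks is acquired (recorded by negating mtx_num_locks), or all
// locks taken so far are released and the task is reported as not allowed.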
296 
297 // __kmp_realloc_task_deque:
298 // Re-allocates a task deque for a particular thread, copies the content from
299 // the old deque and adjusts the necessary data structures relating to the
300 // deque. This operation must be done with the deque_lock held
301 static void __kmp_realloc_task_deque(kmp_info_t *thread,
302                                      kmp_thread_data_t *thread_data) {
303   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
304   kmp_int32 new_size = 2 * size;
305 
306   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
307                 "%d] for thread_data %p\n",
308                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
309 
310   kmp_taskdata_t **new_deque =
311       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
312 
313   int i, j;
314   for (i = thread_data->td.td_deque_head, j = 0; j < size;
315        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
316     new_deque[j] = thread_data->td.td_deque[i];
317 
318   __kmp_free(thread_data->td.td_deque);
319 
320   thread_data->td.td_deque_head = 0;
321   thread_data->td.td_deque_tail = size;
322   thread_data->td.td_deque = new_deque;
323   thread_data->td.td_deque_size = new_size;
324 }
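
// Worked example (values chosen for illustration): with an old deque of size
// 8 that is full and wrapped, say head == 5 and tail == 5, the copy loop
// above reads the entries in logical order old[5], old[6], old[7], old[0],
// ..., old[4] into new_deque[0..7]. Afterwards head == 0, tail == 8 and
// td_deque_size == 16, so the next push lands at index 8 and the
// TASK_DEQUE_MASK ring-buffer arithmetic keeps working with the new
// power-of-two size.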
325 
326 //  __kmp_push_task: Add a task to the thread's deque
327 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
328   kmp_info_t *thread = __kmp_threads[gtid];
329   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
330   kmp_task_team_t *task_team = thread->th.th_task_team;
331   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
332   kmp_thread_data_t *thread_data;
333 
334   KA_TRACE(20,
335            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
336 
337   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
338     // untied task needs to increment counter so that the task structure is not
339     // freed prematurely
340     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
341     KMP_DEBUG_USE_VAR(counter);
342     KA_TRACE(
343         20,
344         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
345          gtid, counter, taskdata));
346   }
347 
348   // The first check avoids building task_team thread data if serialized
349   if (taskdata->td_flags.task_serial) {
350     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
351                   "TASK_NOT_PUSHED for task %p\n",
352                   gtid, taskdata));
353     return TASK_NOT_PUSHED;
354   }
355 
356   // Now that serialized tasks have returned, we can assume that we are not in
357   // immediate exec mode
358   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
359   if (!KMP_TASKING_ENABLED(task_team)) {
360     __kmp_enable_tasking(task_team, thread);
361   }
362   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
363   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
364 
365   // Find tasking deque specific to encountering thread
366   thread_data = &task_team->tt.tt_threads_data[tid];
367 
368   // No lock needed since only owner can allocate
369   if (thread_data->td.td_deque == NULL) {
370     __kmp_alloc_task_deque(thread, thread_data);
371   }
372 
373   int locked = 0;
374   // Check if deque is full
375   if (TCR_4(thread_data->td.td_deque_ntasks) >=
376       TASK_DEQUE_SIZE(thread_data->td)) {
377     if (__kmp_enable_task_throttling &&
378         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
379                               thread->th.th_current_task)) {
380       KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
381                     "TASK_NOT_PUSHED for task %p\n",
382                     gtid, taskdata));
383       return TASK_NOT_PUSHED;
384     } else {
385       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
386       locked = 1;
387       // expand deque to push the task which is not allowed to execute
388       __kmp_realloc_task_deque(thread, thread_data);
389     }
390   }
391   // Lock the deque for the task push operation
392   if (!locked) {
393     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
394 #if OMP_45_ENABLED
395     // Need to recheck as we can get a proxy task from thread outside of OpenMP
396     if (TCR_4(thread_data->td.td_deque_ntasks) >=
397         TASK_DEQUE_SIZE(thread_data->td)) {
398       if (__kmp_enable_task_throttling &&
399           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
400                                 thread->th.th_current_task)) {
401         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
402         KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
403                       "returning TASK_NOT_PUSHED for task %p\n",
404                       gtid, taskdata));
405         return TASK_NOT_PUSHED;
406       } else {
407         // expand deque to push the task which is not allowed to execute
408         __kmp_realloc_task_deque(thread, thread_data);
409       }
410     }
411 #endif
412   }
413   // Must have room since no thread but the calling thread can add tasks
414   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
415                    TASK_DEQUE_SIZE(thread_data->td));
416 
417   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
418       taskdata; // Push taskdata
419   // Wrap index.
420   thread_data->td.td_deque_tail =
421       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
422   TCW_4(thread_data->td.td_deque_ntasks,
423         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
424 
425   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
426                 "task=%p ntasks=%d head=%u tail=%u\n",
427                 gtid, taskdata, thread_data->td.td_deque_ntasks,
428                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
429 
430   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
431 
432   return TASK_SUCCESSFULLY_PUSHED;
433 }
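
// Illustrative sketch (pseudocode in a comment, details elided) of the push
// path above, to make the ring-buffer bookkeeping explicit:
//
//   if (ntasks >= deque_size)
//     throttle_or_realloc();               // TASK_NOT_PUSHED or grow deque
//   lock(deque_lock);
//   deque[tail] = taskdata;                // publish the task
//   tail = (tail + 1) & (deque_size - 1);  // wrap with the power-of-two mask
//   ntasks++;
//   unlock(deque_lock);
//
// Only the owning thread pushes into its deque (hence the lock-free deque
// allocation above); stealing threads remove from the head under the same
// per-deque lock in the task-stealing code later in this file.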
434 
435 // __kmp_pop_current_task_from_thread: restore the current task of the given
436 // thread to its parent when a team ends
437 //
438 // this_thr: thread structure to set current_task in.
439 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
440   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
441                 "this_thread=%p, curtask=%p, "
442                 "curtask_parent=%p\n",
443                 0, this_thr, this_thr->th.th_current_task,
444                 this_thr->th.th_current_task->td_parent));
445 
446   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
447 
448   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
449                 "this_thread=%p, curtask=%p, "
450                 "curtask_parent=%p\n",
451                 0, this_thr, this_thr->th.th_current_task,
452                 this_thr->th.th_current_task->td_parent));
453 }
454 
455 // __kmp_push_current_task_to_thread: set up the current task in the given
456 // thread for a new team
457 //
458 // this_thr: thread structure to set up
459 // team: team for implicit task data
460 // tid: thread within team to set up
461 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
462                                        int tid) {
463   // the thread's current task is the parent of the newly created implicit
464   // tasks of the new team
465   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
466                 "curtask=%p "
467                 "parent_task=%p\n",
468                 tid, this_thr, this_thr->th.th_current_task,
469                 team->t.t_implicit_task_taskdata[tid].td_parent));
470 
471   KMP_DEBUG_ASSERT(this_thr != NULL);
472 
473   if (tid == 0) {
474     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
475       team->t.t_implicit_task_taskdata[0].td_parent =
476           this_thr->th.th_current_task;
477       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
478     }
479   } else {
480     team->t.t_implicit_task_taskdata[tid].td_parent =
481         team->t.t_implicit_task_taskdata[0].td_parent;
482     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
483   }
484 
485   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
486                 "curtask=%p "
487                 "parent_task=%p\n",
488                 tid, this_thr, this_thr->th.th_current_task,
489                 team->t.t_implicit_task_taskdata[tid].td_parent));
490 }
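
// Illustrative note (example only): for a team of four threads the linkage
// established above looks like
//
//   t_implicit_task_taskdata[0].td_parent    = the master's previous current
//                                              task
//   t_implicit_task_taskdata[1..3].td_parent = the same task, copied from [0]
//
// so all implicit tasks of the new team share one parent, and each thread
// then points its th_current_task at its own implicit task entry.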
491 
492 // __kmp_task_start: bookkeeping for a task starting execution
493 //
494 // GTID: global thread id of calling thread
495 // task: task starting execution
496 // current_task: task suspending
497 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
498                              kmp_taskdata_t *current_task) {
499   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
500   kmp_info_t *thread = __kmp_threads[gtid];
501 
502   KA_TRACE(10,
503            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
504             gtid, taskdata, current_task));
505 
506   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
507 
508   // mark currently executing task as suspended
509   // TODO: GEH - make sure root team implicit task is initialized properly.
510   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
511   current_task->td_flags.executing = 0;
512 
513 // Add task to stack if tied
514 #ifdef BUILD_TIED_TASK_STACK
515   if (taskdata->td_flags.tiedness == TASK_TIED) {
516     __kmp_push_task_stack(gtid, thread, taskdata);
517   }
518 #endif /* BUILD_TIED_TASK_STACK */
519 
520   // mark starting task as executing and as current task
521   thread->th.th_current_task = taskdata;
522 
523   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
524                    taskdata->td_flags.tiedness == TASK_UNTIED);
525   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
526                    taskdata->td_flags.tiedness == TASK_UNTIED);
527   taskdata->td_flags.started = 1;
528   taskdata->td_flags.executing = 1;
529   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
530   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
531 
532   // GEH TODO: shouldn't we pass some sort of location identifier here?
533   // APT: yes, we will pass location here.
534   // need to store current thread state (in a thread or taskdata structure)
535   // before setting work_state, otherwise wrong state is set after end of task
536 
537   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
538 
539   return;
540 }
541 
542 #if OMPT_SUPPORT
543 //------------------------------------------------------------------------------
544 // __ompt_task_init:
545 //   Initialize OMPT fields maintained by a task. This will only be called after
546 //   ompt_start_tool, so we already know whether ompt is enabled or not.
547 
548 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
549   // The calls to __ompt_task_init already have the ompt_enabled condition.
550   task->ompt_task_info.task_data.value = 0;
551   task->ompt_task_info.frame.exit_frame = ompt_data_none;
552   task->ompt_task_info.frame.enter_frame = ompt_data_none;
553   task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
554   task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
555 #if OMP_40_ENABLED
556   task->ompt_task_info.ndeps = 0;
557   task->ompt_task_info.deps = NULL;
558 #endif /* OMP_40_ENABLED */
559 }
560 
561 // __ompt_task_start:
562 //   Build and trigger task-begin event
563 static inline void __ompt_task_start(kmp_task_t *task,
564                                      kmp_taskdata_t *current_task,
565                                      kmp_int32 gtid) {
566   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
567   ompt_task_status_t status = ompt_task_switch;
568   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
569     status = ompt_task_yield;
570     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
571   }
572   /* let OMPT know that we're about to run this task */
573   if (ompt_enabled.ompt_callback_task_schedule) {
574     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
575         &(current_task->ompt_task_info.task_data), status,
576         &(taskdata->ompt_task_info.task_data));
577   }
578   taskdata->ompt_task_info.scheduling_parent = current_task;
579 }
580 
581 // __ompt_task_finish:
582 //   Build and trigger final task-schedule event
583 static inline void
584 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
585                    ompt_task_status_t status = ompt_task_complete) {
586   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
587   if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
588       taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
589     status = ompt_task_cancel;
590   }
591 
592   /* let OMPT know that we're returning to the callee task */
593   if (ompt_enabled.ompt_callback_task_schedule) {
594     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
595         &(taskdata->ompt_task_info.task_data), status,
596         &((resumed_task ? resumed_task
597                         : (taskdata->ompt_task_info.scheduling_parent
598                                ? taskdata->ompt_task_info.scheduling_parent
599                                : taskdata->td_parent))
600               ->ompt_task_info.task_data));
601   }
602 }
603 #endif
604 
605 template <bool ompt>
606 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
607                                                kmp_task_t *task,
608                                                void *frame_address,
609                                                void *return_address) {
610   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
611   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
612 
613   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
614                 "current_task=%p\n",
615                 gtid, loc_ref, taskdata, current_task));
616 
617   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
618     // untied task needs to increment counter so that the task structure is not
619     // freed prematurely
620     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
621     KMP_DEBUG_USE_VAR(counter);
622     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
623                   "incremented for task %p\n",
624                   gtid, counter, taskdata));
625   }
626 
627   taskdata->td_flags.task_serial =
628       1; // Execute this task immediately, not deferred.
629   __kmp_task_start(gtid, task, current_task);
630 
631 #if OMPT_SUPPORT
632   if (ompt) {
633     if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
634       current_task->ompt_task_info.frame.enter_frame.ptr =
635           taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
636       current_task->ompt_task_info.frame.enter_frame_flags =
637           taskdata->ompt_task_info.frame.exit_frame_flags = ompt_frame_application | ompt_frame_framepointer;
638     }
639     if (ompt_enabled.ompt_callback_task_create) {
640       ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
641       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
642           &(parent_info->task_data), &(parent_info->frame),
643           &(taskdata->ompt_task_info.task_data),
644           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
645           return_address);
646     }
647     __ompt_task_start(task, current_task, gtid);
648   }
649 #endif // OMPT_SUPPORT
650 
651   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
652                 loc_ref, taskdata));
653 }
654 
655 #if OMPT_SUPPORT
656 OMPT_NOINLINE
657 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
658                                            kmp_task_t *task,
659                                            void *frame_address,
660                                            void *return_address) {
661   __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
662                                            return_address);
663 }
664 #endif // OMPT_SUPPORT
665 
666 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
667 // execution
668 //
669 // loc_ref: source location information; points to beginning of task block.
670 // gtid: global thread number.
671 // task: task thunk for the started task.
672 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
673                                kmp_task_t *task) {
674 #if OMPT_SUPPORT
675   if (UNLIKELY(ompt_enabled.enabled)) {
676     OMPT_STORE_RETURN_ADDRESS(gtid);
677     __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
678                                    OMPT_GET_FRAME_ADDRESS(1),
679                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
680     return;
681   }
682 #endif
683   __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
684 }
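
// Illustrative sketch (an assumption about typical compiler-generated code,
// not something defined in this file): for "#pragma omp task if(0)" the
// compiler is expected to emit an undeferred sequence along the lines of
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, flags, sizeof_task,
//                                         sizeof_shareds, &task_entry);
//   /* ...copy firstprivate data into t... */
//   __kmpc_omp_task_begin_if0(loc, gtid, t);
//   t->routine(gtid, t); /* run the task body inline */
//   __kmpc_omp_task_complete_if0(loc, gtid, t);
//
// which is why __kmpc_omp_task_begin_if0 forces td_flags.task_serial = 1 and
// pairs with __kmpc_omp_task_complete_if0 below.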
685 
686 #ifdef TASK_UNUSED
687 // __kmpc_omp_task_begin: report that a given task has started execution
688 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
689 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
690   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
691 
692   KA_TRACE(
693       10,
694       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
695        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
696 
697   __kmp_task_start(gtid, task, current_task);
698 
699   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
700                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
701   return;
702 }
703 #endif // TASK_UNUSED
704 
705 // __kmp_free_task: free the current task space and the space for shareds
706 //
707 // gtid: Global thread ID of calling thread
708 // taskdata: task to free
709 // thread: thread data structure of caller
710 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
711                             kmp_info_t *thread) {
712   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
713                 taskdata));
714 
715   // Check to make sure all flags and counters have the correct values
716   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
717   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
718   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
719   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
720   KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
721                    taskdata->td_flags.task_serial == 1);
722   KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
723 
724   taskdata->td_flags.freed = 1;
725   ANNOTATE_HAPPENS_BEFORE(taskdata);
726 // deallocate the taskdata and shared variable blocks associated with this task
727 #if USE_FAST_MEMORY
728   __kmp_fast_free(thread, taskdata);
729 #else /* ! USE_FAST_MEMORY */
730   __kmp_thread_free(thread, taskdata);
731 #endif
732 
733   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
734 }
735 
736 // __kmp_free_task_and_ancestors: free the current task and ancestors without
737 // children
738 //
739 // gtid: Global thread ID of calling thread
740 // taskdata: task to free
741 // thread: thread data structure of caller
742 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
743                                           kmp_taskdata_t *taskdata,
744                                           kmp_info_t *thread) {
745 #if OMP_45_ENABLED
746   // Proxy tasks must always be allowed to free their parents
747   // because they can be run in background even in serial mode.
748   kmp_int32 team_serial =
749       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
750       !taskdata->td_flags.proxy;
751 #else
752   kmp_int32 team_serial =
753       taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
754 #endif
755   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
756 
757   kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
758   KMP_DEBUG_ASSERT(children >= 0);
759 
760   // Now, go up the ancestor tree to see if any ancestors can now be freed.
761   while (children == 0) {
762     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
763 
764     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
765                   "and freeing itself\n",
766                   gtid, taskdata));
767 
768     // --- Deallocate my ancestor task ---
769     __kmp_free_task(gtid, taskdata, thread);
770 
771     taskdata = parent_taskdata;
772 
773     if (team_serial)
774       return;
775     // Stop checking ancestors at implicit task instead of walking up ancestor
776     // tree to avoid premature deallocation of ancestors.
777     if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
778       if (taskdata->td_dephash) { // do we need to cleanup dephash?
779         int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
780         kmp_tasking_flags_t flags_old = taskdata->td_flags;
781         if (children == 0 && flags_old.complete == 1) {
782           kmp_tasking_flags_t flags_new = flags_old;
783           flags_new.complete = 0;
784           if (KMP_COMPARE_AND_STORE_ACQ32(
785                   RCAST(kmp_int32 *, &taskdata->td_flags),
786                   *RCAST(kmp_int32 *, &flags_old),
787                   *RCAST(kmp_int32 *, &flags_new))) {
788             KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
789                            "dephash of implicit task %p\n",
790                            gtid, taskdata));
791             // cleanup dephash of finished implicit task
792             __kmp_dephash_free_entries(thread, taskdata->td_dephash);
793           }
794         }
795       }
796       return;
797     }
798     // Predecrement simulated by "- 1" calculation
799     children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
800     KMP_DEBUG_ASSERT(children >= 0);
801   }
802 
803   KA_TRACE(
804       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
805            "not freeing it yet\n",
806            gtid, taskdata, children));
807 }
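
// Worked example (illustrative, assuming a parallel team with tasking
// enabled): td_allocated_child_tasks starts at 1 in __kmp_task_alloc because
// the count includes the task itself, and the parent's count is incremented
// once per allocated child. A task with two live children therefore carries
// the value 3. KMP_ATOMIC_DEC(...) returns the old value, so the "- 1" above
// yields the post-decrement count, and a task (or ancestor) is freed only
// when that count reaches 0, i.e. when it has completed and no child can
// still reference it.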
808 
809 // __kmp_task_finish: bookkeeping to do when a task finishes execution
810 //
811 // gtid: global thread ID for calling thread
812 // task: task to be finished
813 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
814 template <bool ompt>
815 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
816                               kmp_taskdata_t *resumed_task) {
817   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
818   kmp_info_t *thread = __kmp_threads[gtid];
819 #if OMP_45_ENABLED
820   kmp_task_team_t *task_team =
821       thread->th.th_task_team; // might be NULL for serial teams...
822 #endif // OMP_45_ENABLED
823   kmp_int32 children = 0;
824 
825   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
826                 "task %p\n",
827                 gtid, taskdata, resumed_task));
828 
829   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
830 
831 // Pop task from stack if tied
832 #ifdef BUILD_TIED_TASK_STACK
833   if (taskdata->td_flags.tiedness == TASK_TIED) {
834     __kmp_pop_task_stack(gtid, thread, taskdata);
835   }
836 #endif /* BUILD_TIED_TASK_STACK */
837 
838   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
839     // untied task needs to check the counter so that the task structure is not
840     // freed prematurely
841     kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
842     KA_TRACE(
843         20,
844         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
845          gtid, counter, taskdata));
846     if (counter > 0) {
847       // untied task is not done, to be continued possibly by other thread, do
848       // not free it now
849       if (resumed_task == NULL) {
850         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
851         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
852         // task is the parent
853       }
854       thread->th.th_current_task = resumed_task; // restore current_task
855       resumed_task->td_flags.executing = 1; // resume previous task
856       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
857                     "resuming task %p\n",
858                     gtid, taskdata, resumed_task));
859       return;
860     }
861   }
862 #if OMPT_SUPPORT
863   if (ompt)
864     __ompt_task_finish(task, resumed_task);
865 #endif
866 
867   // Check mutexinoutset dependencies, release locks
868   kmp_depnode_t *node = taskdata->td_depnode;
869   if (node && (node->dn.mtx_num_locks < 0)) {
870     // negative num_locks means all locks were acquired
871     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
872     for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
873       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
874       __kmp_release_lock(node->dn.mtx_locks[i], gtid);
875     }
876   }
877 
878   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
879   bool detach = false;
880 #if OMP_50_ENABLED
881   if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
882     if (taskdata->td_allow_completion_event.type ==
883         KMP_EVENT_ALLOW_COMPLETION) {
884       // event hasn't been fulfilled yet. Try to detach task.
885       __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
886       if (taskdata->td_allow_completion_event.type ==
887           KMP_EVENT_ALLOW_COMPLETION) {
888         taskdata->td_flags.proxy = TASK_PROXY; // proxify!
889         detach = true;
890       }
891       __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
892     }
893   }
894 #endif
895   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
896   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
897 
898   if (!detach) {
899     taskdata->td_flags.complete = 1; // mark the task as completed
900 
901     // Only need to keep track of count if team parallel and tasking not
902     // serialized
903     if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
904       // Predecrement simulated by "- 1" calculation
905       children =
906           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
907       KMP_DEBUG_ASSERT(children >= 0);
908 #if OMP_40_ENABLED
909       if (taskdata->td_taskgroup)
910         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
911       __kmp_release_deps(gtid, taskdata);
912 #if OMP_45_ENABLED
913     } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
914       // if we found proxy tasks there could exist a dependency chain
915       // with the proxy task as origin
916       __kmp_release_deps(gtid, taskdata);
917 #endif // OMP_45_ENABLED
918 #endif // OMP_40_ENABLED
919     }
920   }
921 
922   // td_flags.executing must be marked as 0 after __kmp_release_deps has been
923   // called. Otherwise, if a task is executed immediately from the release_deps
924   // code, the flag will be reset to 1 again by this same function
925   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
926   taskdata->td_flags.executing = 0; // suspend the finishing task
927 
928   KA_TRACE(
929       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
930            gtid, taskdata, children));
931 
932 #if OMP_40_ENABLED
933   /* If the task's destructor thunk flag has been set, we need to invoke the
934      destructor thunk generated by the compiler. The code is placed here,
935      since at this point other tasks might have been released, hence
936      overlapping the destructor invocations with some other work in the
937      released tasks. The OpenMP spec is not specific about when the
938      destructors are invoked, so we are free to choose. */
939   if (taskdata->td_flags.destructors_thunk) {
940     kmp_routine_entry_t destr_thunk = task->data1.destructors;
941     KMP_ASSERT(destr_thunk);
942     destr_thunk(gtid, task);
943   }
944 #endif // OMP_40_ENABLED
945 
946   // bookkeeping for resuming task:
947   // GEH - note tasking_ser => task_serial
948   KMP_DEBUG_ASSERT(
949       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
950       taskdata->td_flags.task_serial);
951   if (taskdata->td_flags.task_serial) {
952     if (resumed_task == NULL) {
953       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
954       // task is the parent
955     }
956   } else {
957     KMP_DEBUG_ASSERT(resumed_task !=
958                      NULL); // verify that resumed task is passed as argument
959   }
960 
961   // Free this task and then ancestor tasks if they have no children.
962   // Restore th_current_task first as suggested by John:
963   // johnmc: if an asynchronous inquiry peers into the runtime system
964   // it doesn't see the freed task as the current task.
965   thread->th.th_current_task = resumed_task;
966   if (!detach)
967     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
968 
969   // TODO: GEH - make sure root team implicit task is initialized properly.
970   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
971   resumed_task->td_flags.executing = 1; // resume previous task
972 
973   KA_TRACE(
974       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
975            gtid, taskdata, resumed_task));
976 
977   return;
978 }
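
// Illustrative summary of the untied-task reference counting used above (a
// restatement of the code, not extra semantics):
//
//   __kmp_push_task / __kmpc_omp_task_begin_if0:
//       KMP_ATOMIC_INC(&td_untied_count);      // task may be resumed later
//   __kmp_task_finish:
//       if (KMP_ATOMIC_DEC(&td_untied_count) - 1 > 0)
//           return;                            // suspended, not yet finished
//
// Only when the counter reaches zero does the task go through the completion
// bookkeeping above (dependence release, taskgroup/child counters, freeing).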
979 
980 template <bool ompt>
981 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
982                                                   kmp_int32 gtid,
983                                                   kmp_task_t *task) {
984   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
985                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
986   // this routine will provide task to resume
987   __kmp_task_finish<ompt>(gtid, task, NULL);
988 
989   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
990                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
991 
992 #if OMPT_SUPPORT
993   if (ompt) {
994     ompt_frame_t *ompt_frame;
995     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
996     ompt_frame->enter_frame = ompt_data_none;
997     ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
998   }
999 #endif
1000 
1001   return;
1002 }
1003 
1004 #if OMPT_SUPPORT
1005 OMPT_NOINLINE
1006 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1007                                        kmp_task_t *task) {
1008   __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1009 }
1010 #endif // OMPT_SUPPORT
1011 
1012 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1013 //
1014 // loc_ref: source location information; points to end of task block.
1015 // gtid: global thread number.
1016 // task: task thunk for the completed task.
1017 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1018                                   kmp_task_t *task) {
1019 #if OMPT_SUPPORT
1020   if (UNLIKELY(ompt_enabled.enabled)) {
1021     __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1022     return;
1023   }
1024 #endif
1025   __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1026 }
1027 
1028 #ifdef TASK_UNUSED
1029 // __kmpc_omp_task_complete: report that a task has completed execution
1030 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1031 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1032                               kmp_task_t *task) {
1033   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1034                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1035 
1036   __kmp_task_finish<false>(gtid, task,
1037                            NULL); // Not sure how to find task to resume
1038 
1039   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1040                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1041   return;
1042 }
1043 #endif // TASK_UNUSED
1044 
1045 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1046 // task for a given thread
1047 //
1048 // loc_ref:  reference to source location of parallel region
1049 // this_thr:  thread data structure corresponding to implicit task
1050 // team: team for this_thr
1051 // tid: thread id of given thread within team
1052 // set_curr_task: TRUE if need to push current task to thread
1053 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1054 // have already been done elsewhere.
1055 // TODO: Get better loc_ref.  Value passed in may be NULL
1056 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1057                               kmp_team_t *team, int tid, int set_curr_task) {
1058   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1059 
1060   KF_TRACE(
1061       10,
1062       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1063        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1064 
1065   task->td_task_id = KMP_GEN_TASK_ID();
1066   task->td_team = team;
1067   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1068   //    in debugger)
1069   task->td_ident = loc_ref;
1070   task->td_taskwait_ident = NULL;
1071   task->td_taskwait_counter = 0;
1072   task->td_taskwait_thread = 0;
1073 
1074   task->td_flags.tiedness = TASK_TIED;
1075   task->td_flags.tasktype = TASK_IMPLICIT;
1076 #if OMP_45_ENABLED
1077   task->td_flags.proxy = TASK_FULL;
1078 #endif
1079 
1080   // All implicit tasks are executed immediately, not deferred
1081   task->td_flags.task_serial = 1;
1082   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1083   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1084 
1085   task->td_flags.started = 1;
1086   task->td_flags.executing = 1;
1087   task->td_flags.complete = 0;
1088   task->td_flags.freed = 0;
1089 
1090 #if OMP_40_ENABLED
1091   task->td_depnode = NULL;
1092 #endif
1093   task->td_last_tied = task;
1094 #if OMP_50_ENABLED
1095   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1096 #endif
1097 
1098   if (set_curr_task) { // only do this init first time thread is created
1099     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1100     // Not used: don't need to deallocate implicit task
1101     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1102 #if OMP_40_ENABLED
1103     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1104     task->td_dephash = NULL;
1105 #endif
1106     __kmp_push_current_task_to_thread(this_thr, team, tid);
1107   } else {
1108     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1109     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1110   }
1111 
1112 #if OMPT_SUPPORT
1113   if (UNLIKELY(ompt_enabled.enabled))
1114     __ompt_task_init(task, tid);
1115 #endif
1116 
1117   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1118                 team, task));
1119 }
1120 
1121 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1122 // at the end of parallel regions. Some resources are kept for reuse in the next
1123 // parallel region.
1124 //
1125 // thread:  thread data structure corresponding to implicit task
1126 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1127   kmp_taskdata_t *task = thread->th.th_current_task;
1128   if (task->td_dephash) {
1129     int children;
1130     task->td_flags.complete = 1;
1131     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1132     kmp_tasking_flags_t flags_old = task->td_flags;
1133     if (children == 0 && flags_old.complete == 1) {
1134       kmp_tasking_flags_t flags_new = flags_old;
1135       flags_new.complete = 0;
1136       if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1137                                       *RCAST(kmp_int32 *, &flags_old),
1138                                       *RCAST(kmp_int32 *, &flags_new))) {
1139         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1140                        "dephash of implicit task %p\n",
1141                        thread->th.th_info.ds.ds_gtid, task));
1142         __kmp_dephash_free_entries(thread, task->td_dephash);
1143       }
1144     }
1145   }
1146 }
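
// Illustrative note on the compare-and-swap above: td_flags is a 32-bit
// bitfield, so reinterpreting it as kmp_int32 lets a single
// KMP_COMPARE_AND_STORE_ACQ32 atomically flip "complete" from 1 back to 0.
// The thread that wins the CAS is the only one that frees the dephash
// entries; a racing thread either fails the CAS or observes complete == 0 and
// skips the cleanup. The same pattern is used in
// __kmp_free_task_and_ancestors above.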
1147 
1148 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1149 // when these tasks are destroyed
1150 //
1151 // thread:  thread data structure corresponding to implicit task
1152 void __kmp_free_implicit_task(kmp_info_t *thread) {
1153   kmp_taskdata_t *task = thread->th.th_current_task;
1154   if (task && task->td_dephash) {
1155     __kmp_dephash_free(thread, task->td_dephash);
1156     task->td_dephash = NULL;
1157   }
1158 }
1159 
1160 // Round up a size to a multiple of val, where val is a power of two: Used to
1161 // insert padding between structures co-allocated using a single malloc() call
1162 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1163   if (size & (val - 1)) {
1164     size &= ~(val - 1);
1165     if (size <= KMP_SIZE_T_MAX - val) {
1166       size += val; // Round up if there is no overflow.
1167     }
1168   }
1169   return size;
1170 } // __kmp_round_up_to_val
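
// Worked example: __kmp_round_up_to_val(24, 8) == 24 (already a multiple),
// __kmp_round_up_to_val(28, 8) == 32, and a size within val - 1 bytes of
// KMP_SIZE_T_MAX is left at the rounded-down multiple instead of overflowing.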
1171 
1172 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1173 //
1174 // loc_ref: source location information
1175 // gtid: global thread number.
1176 // flags: include tiedness & task type (explicit vs. implicit) of the 'new'
1177 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1178 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
1179 // private vars accessed in task.
1180 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
1181 // in task.
1182 // task_entry: Pointer to task code entry point generated by compiler.
1183 // returns: a pointer to the allocated kmp_task_t structure (task).
1184 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1185                              kmp_tasking_flags_t *flags,
1186                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1187                              kmp_routine_entry_t task_entry) {
1188   kmp_task_t *task;
1189   kmp_taskdata_t *taskdata;
1190   kmp_info_t *thread = __kmp_threads[gtid];
1191   kmp_team_t *team = thread->th.th_team;
1192   kmp_taskdata_t *parent_task = thread->th.th_current_task;
1193   size_t shareds_offset;
1194 
1195   if (!TCR_4(__kmp_init_middle))
1196     __kmp_middle_initialize();
1197 
1198   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1199                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1200                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1201                 sizeof_shareds, task_entry));
1202 
1203   if (parent_task->td_flags.final) {
1204     if (flags->merged_if0) {
1205     }
1206     flags->final = 1;
1207   }
1208   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1209     // Untied task encountered causes the TSC algorithm to check entire deque of
1210     // the victim thread. If no untied task encountered, then checking the head
1211     // of the deque should be enough.
1212     KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1213   }
1214 
1215 #if OMP_50_ENABLED
1216   // Detachable tasks are not proxy tasks yet but could be in the future.
1217   // Doing the tasking setup when that happens is too late, so the setup is
1218   // done now for both proxy and detachable tasks.
1219   if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
1220 #endif
1221 #if OMP_45_ENABLED
1222     if (flags->proxy == TASK_PROXY) {
1223       flags->tiedness = TASK_UNTIED;
1224       flags->merged_if0 = 1;
1225 #if OMP_50_ENABLED
1226     }
1227 #endif
1228     /* are we running in a serialized parallel region or in
1229        tskm_immediate_exec mode... we need tasking support enabled */
1230     if ((thread->th.th_task_team) == NULL) {
1231       /* This should only happen if the team is serialized
1232           setup a task team and propagate it to the thread */
1233       KMP_DEBUG_ASSERT(team->t.t_serialized);
1234       KA_TRACE(30,
1235                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1236                 gtid));
1237       __kmp_task_team_setup(
1238           thread, team,
1239           1); // 1 indicates setup the current team regardless of nthreads
1240       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1241     }
1242     kmp_task_team_t *task_team = thread->th.th_task_team;
1243 
1244     /* tasking must be enabled now as the task might not be pushed */
1245     if (!KMP_TASKING_ENABLED(task_team)) {
1246       KA_TRACE(
1247           30,
1248           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1249       __kmp_enable_tasking(task_team, thread);
1250       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1251       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1252       // No lock needed since only owner can allocate
1253       if (thread_data->td.td_deque == NULL) {
1254         __kmp_alloc_task_deque(thread, thread_data);
1255       }
1256     }
1257 
1258     if (task_team->tt.tt_found_proxy_tasks == FALSE)
1259       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1260   }
1261 #endif
1262 
1263   // Calculate shared structure offset including padding after kmp_task_t struct
1264   // to align pointers in shared struct
1265   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1266   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1267 
1268   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1269   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1270                 shareds_offset));
1271   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1272                 sizeof_shareds));
1273 
1274 // Avoid double allocation here by combining shareds with taskdata
1275 #if USE_FAST_MEMORY
1276   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1277                                                                sizeof_shareds);
1278 #else /* ! USE_FAST_MEMORY */
1279   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1280                                                                sizeof_shareds);
1281 #endif /* USE_FAST_MEMORY */
1282   ANNOTATE_HAPPENS_AFTER(taskdata);
1283 
1284   task = KMP_TASKDATA_TO_TASK(taskdata);
1285 
1286 // Make sure task & taskdata are aligned appropriately
1287 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1288   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1289   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1290 #else
1291   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1292   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1293 #endif
1294   if (sizeof_shareds > 0) {
1295     // Avoid double allocation here by combining shareds with taskdata
1296     task->shareds = &((char *)taskdata)[shareds_offset];
1297     // Make sure shareds struct is aligned to pointer size
1298     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1299                      0);
1300   } else {
1301     task->shareds = NULL;
1302   }
1303   task->routine = task_entry;
1304   task->part_id = 0; // AC: Always start with 0 part id
1305 
1306   taskdata->td_task_id = KMP_GEN_TASK_ID();
1307   taskdata->td_team = team;
1308   taskdata->td_alloc_thread = thread;
1309   taskdata->td_parent = parent_task;
1310   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1311   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1312   taskdata->td_ident = loc_ref;
1313   taskdata->td_taskwait_ident = NULL;
1314   taskdata->td_taskwait_counter = 0;
1315   taskdata->td_taskwait_thread = 0;
1316   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1317 #if OMP_45_ENABLED
1318   // avoid copying icvs for proxy tasks
1319   if (flags->proxy == TASK_FULL)
1320 #endif
1321     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1322 
1323   taskdata->td_flags.tiedness = flags->tiedness;
1324   taskdata->td_flags.final = flags->final;
1325   taskdata->td_flags.merged_if0 = flags->merged_if0;
1326 #if OMP_40_ENABLED
1327   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1328 #endif // OMP_40_ENABLED
1329 #if OMP_45_ENABLED
1330   taskdata->td_flags.proxy = flags->proxy;
1331 #if OMP_50_ENABLED
1332   taskdata->td_flags.detachable = flags->detachable;
1333 #endif
1334   taskdata->td_task_team = thread->th.th_task_team;
1335   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1336 #endif
1337   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1338 
1339   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1340   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1341 
1342   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1343   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1344 
1345   // GEH - Note we serialize the task if the team is serialized to make sure
1346   // implicit parallel region tasks are not left until program termination to
1347   // execute. Also, it helps locality to execute immediately.
1348 
1349   taskdata->td_flags.task_serial =
1350       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1351        taskdata->td_flags.tasking_ser);
1352 
1353   taskdata->td_flags.started = 0;
1354   taskdata->td_flags.executing = 0;
1355   taskdata->td_flags.complete = 0;
1356   taskdata->td_flags.freed = 0;
1357 
1358   taskdata->td_flags.native = flags->native;
1359 
1360   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1361   // start at one because counts current task and children
1362   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1363 #if OMP_40_ENABLED
1364   taskdata->td_taskgroup =
1365       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1366   taskdata->td_dephash = NULL;
1367   taskdata->td_depnode = NULL;
1368 #endif
1369   if (flags->tiedness == TASK_UNTIED)
1370     taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1371   else
1372     taskdata->td_last_tied = taskdata;
1373 #if OMP_50_ENABLED
1374   taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1375 #endif
1376 #if OMPT_SUPPORT
1377   if (UNLIKELY(ompt_enabled.enabled))
1378     __ompt_task_init(taskdata, gtid);
1379 #endif
// Only need to keep track of child task counts if the team is parallel and
// tasking is not serialized, or if it is a proxy or detachable task
1382 #if OMP_45_ENABLED
1383   if (flags->proxy == TASK_PROXY ||
1384 #if OMP_50_ENABLED
1385       flags->detachable == TASK_DETACHABLE ||
1386 #endif
1387       !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1388 #else
1389   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1390 #endif
1391   {
1392     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1393 #if OMP_40_ENABLED
1394     if (parent_task->td_taskgroup)
1395       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1396 #endif
    // Only need to keep track of allocated child tasks for explicit tasks,
    // since implicit tasks are not deallocated
1399     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1400       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1401     }
1402   }
1403 
1404   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1405                 gtid, taskdata, taskdata->td_parent));
1406   ANNOTATE_HAPPENS_BEFORE(task);
1407 
1408   return task;
1409 }
1410 
1411 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1412                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1413                                   size_t sizeof_shareds,
1414                                   kmp_routine_entry_t task_entry) {
1415   kmp_task_t *retval;
1416   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1417 
1418   input_flags->native = FALSE;
1419 // __kmp_task_alloc() sets up all other runtime flags
1420 
1421 #if OMP_45_ENABLED
1422 #if OMP_50_ENABLED
1423   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1424                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1425                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1426                 input_flags->proxy ? "proxy" : "",
1427                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1428                 sizeof_shareds, task_entry));
1429 #else
1430   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1431                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1432                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1433                 input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1434                 sizeof_shareds, task_entry));
1435 #endif
1436 #else
1437   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1438                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1439                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1440                 sizeof_kmp_task_t, sizeof_shareds, task_entry));
1441 #endif
1442 
1443   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1444                             sizeof_shareds, task_entry);
1445 
1446   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1447 
1448   return retval;
1449 }
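
// Illustrative sketch (not part of this file): for a user construct such as
//
//   #pragma omp task shared(x)
//   x += compute();
//
// an OpenMP compiler typically allocates and then schedules the task roughly
// as follows. task_entry, loc, and the size arguments are hypothetical
// placeholders; flags == 1 requests a tied task.
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /* flags */ 1,
//                                         sizeof(kmp_task_t), // + private data
//                                         sizeof(void *),     // shareds struct
//                                         &task_entry);
//   ((void **)t->shareds)[0] = &x;  // publish the shared variable
//   __kmpc_omp_task(&loc, gtid, t); // push to the deque or run immediately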
1450 
1451 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1452                                          kmp_int32 flags,
1453                                          size_t sizeof_kmp_task_t,
1454                                          size_t sizeof_shareds,
1455                                          kmp_routine_entry_t task_entry,
1456                                          kmp_int64 device_id) {
1457   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1458                                sizeof_shareds, task_entry);
1459 }
1460 
1461 #if OMP_50_ENABLED
1462 /*!
1463 @ingroup TASKING
1464 @param loc_ref location of the original task directive
1465 @param gtid Global Thread ID of encountering thread
1466 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1467 task''
1468 @param naffins Number of affinity items
1469 @param affin_list List of affinity items
@return Returns non-zero if registering the affinity information was not
successful. Returns 0 if registration was successful.
1472 This entry registers the affinity information attached to a task with the task
1473 thunk structure kmp_taskdata_t.
1474 */
1475 kmp_int32
1476 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1477                                   kmp_task_t *new_task, kmp_int32 naffins,
1478                                   kmp_task_affinity_info_t *affin_list) {
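  // Affinity hints are currently ignored by this implementation; report
  // success unconditionally.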
1479   return 0;
1480 }
1481 #endif
1482 
1483 //  __kmp_invoke_task: invoke the specified task
1484 //
1485 // gtid: global thread ID of caller
1486 // task: the task to invoke
// current_task: the task to resume after task invocation
1488 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1489                               kmp_taskdata_t *current_task) {
1490   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1491   kmp_info_t *thread;
1492 #if OMP_40_ENABLED
1493   int discard = 0 /* false */;
1494 #endif
1495   KA_TRACE(
1496       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1497            gtid, taskdata, current_task));
1498   KMP_DEBUG_ASSERT(task);
1499 #if OMP_45_ENABLED
1500   if (taskdata->td_flags.proxy == TASK_PROXY &&
1501       taskdata->td_flags.complete == 1) {
1502     // This is a proxy task that was already completed but it needs to run
1503     // its bottom-half finish
1504     KA_TRACE(
1505         30,
1506         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1507          gtid, taskdata));
1508 
1509     __kmp_bottom_half_finish_proxy(gtid, task);
1510 
1511     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1512                   "proxy task %p, resuming task %p\n",
1513                   gtid, taskdata, current_task));
1514 
1515     return;
1516   }
1517 #endif
1518 
1519 #if OMPT_SUPPORT
1520   // For untied tasks, the first task executed only calls __kmpc_omp_task and
1521   // does not execute code.
1522   ompt_thread_info_t oldInfo;
1523   if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the thread's state and restore it after the task
1525     thread = __kmp_threads[gtid];
1526     oldInfo = thread->th.ompt_thread_info;
1527     thread->th.ompt_thread_info.wait_id = 0;
1528     thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1529                                             ? ompt_state_work_serial
1530                                             : ompt_state_work_parallel;
1531     taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1532   }
1533 #endif
1534 
1535 #if OMP_45_ENABLED
1536   // Proxy tasks are not handled by the runtime
1537   if (taskdata->td_flags.proxy != TASK_PROXY) {
1538 #endif
1539     ANNOTATE_HAPPENS_AFTER(task);
1540     __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1541 #if OMP_45_ENABLED
1542   }
1543 #endif
1544 
1545 #if OMP_40_ENABLED
1546   // TODO: cancel tasks if the parallel region has also been cancelled
1547   // TODO: check if this sequence can be hoisted above __kmp_task_start
1548   // if cancellation has been enabled for this run ...
1549   if (__kmp_omp_cancellation) {
1550     thread = __kmp_threads[gtid];
1551     kmp_team_t *this_team = thread->th.th_team;
1552     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1553     if ((taskgroup && taskgroup->cancel_request) ||
1554         (this_team->t.t_cancel_request == cancel_parallel)) {
1555 #if OMPT_SUPPORT && OMPT_OPTIONAL
1556       ompt_data_t *task_data;
1557       if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1558         __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1559         ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1560             task_data,
1561             ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1562                                                       : ompt_cancel_parallel) |
1563                 ompt_cancel_discarded_task,
1564             NULL);
1565       }
1566 #endif
1567       KMP_COUNT_BLOCK(TASK_cancelled);
      // the task's taskgroup or its enclosing parallel region has been
      // cancelled, so discard this task
1569       discard = 1 /* true */;
1570     }
1571   }
1572 
1573   // Invoke the task routine and pass in relevant data.
1574   // Thunks generated by gcc take a different argument list.
1575   if (!discard) {
1576     if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1577       taskdata->td_last_tied = current_task->td_last_tied;
1578       KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1579     }
1580 #if KMP_STATS_ENABLED
1581     KMP_COUNT_BLOCK(TASK_executed);
1582     switch (KMP_GET_THREAD_STATE()) {
1583     case FORK_JOIN_BARRIER:
1584       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1585       break;
1586     case PLAIN_BARRIER:
1587       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1588       break;
1589     case TASKYIELD:
1590       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1591       break;
1592     case TASKWAIT:
1593       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1594       break;
1595     case TASKGROUP:
1596       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1597       break;
1598     default:
1599       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1600       break;
1601     }
1602 #endif // KMP_STATS_ENABLED
1603 #endif // OMP_40_ENABLED
1604 
1605 // OMPT task begin
1606 #if OMPT_SUPPORT
1607     if (UNLIKELY(ompt_enabled.enabled))
1608       __ompt_task_start(task, current_task, gtid);
1609 #endif
1610 
1611 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1612     kmp_uint64 cur_time;
1613     kmp_int32 kmp_itt_count_task =
1614         __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1615         current_task->td_flags.tasktype == TASK_IMPLICIT;
1616     if (kmp_itt_count_task) {
1617       thread = __kmp_threads[gtid];
1618       // Time outer level explicit task on barrier for adjusting imbalance time
1619       if (thread->th.th_bar_arrive_time)
1620         cur_time = __itt_get_timestamp();
1621       else
1622         kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1623     }
1624 #endif
1625 
1626 #ifdef KMP_GOMP_COMPAT
1627     if (taskdata->td_flags.native) {
1628       ((void (*)(void *))(*(task->routine)))(task->shareds);
1629     } else
1630 #endif /* KMP_GOMP_COMPAT */
1631     {
1632       (*(task->routine))(gtid, task);
1633     }
1634     KMP_POP_PARTITIONED_TIMER();
1635 
1636 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1637     if (kmp_itt_count_task) {
1638       // Barrier imbalance - adjust arrive time with the task duration
1639       thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1640     }
1641 #endif
1642 
1643 #if OMP_40_ENABLED
1644   }
1645 #endif // OMP_40_ENABLED
1646 
1648 #if OMP_45_ENABLED
1649   // Proxy tasks are not handled by the runtime
1650   if (taskdata->td_flags.proxy != TASK_PROXY) {
1651 #endif
1652     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1653 #if OMPT_SUPPORT
1654     if (UNLIKELY(ompt_enabled.enabled)) {
1655       thread->th.ompt_thread_info = oldInfo;
1656       if (taskdata->td_flags.tiedness == TASK_TIED) {
1657         taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1658       }
1659       __kmp_task_finish<true>(gtid, task, current_task);
1660     } else
1661 #endif
1662       __kmp_task_finish<false>(gtid, task, current_task);
1663 #if OMP_45_ENABLED
1664   }
1665 #endif
1666 
1667   KA_TRACE(
1668       30,
1669       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1670        gtid, taskdata, current_task));
1671   return;
1672 }
1673 
1674 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1675 //
1676 // loc_ref: location of original task pragma (ignored)
1677 // gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new
// task''
1679 // Returns:
1680 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1681 //    be resumed later.
1682 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1683 //    resumed later.
1684 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1685                                 kmp_task_t *new_task) {
1686   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1687 
1688   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1689                 loc_ref, new_taskdata));
1690 
1691 #if OMPT_SUPPORT
1692   kmp_taskdata_t *parent;
1693   if (UNLIKELY(ompt_enabled.enabled)) {
1694     parent = new_taskdata->td_parent;
1695     if (ompt_enabled.ompt_callback_task_create) {
1696       ompt_data_t task_data = ompt_data_none;
1697       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1698           parent ? &(parent->ompt_task_info.task_data) : &task_data,
1699           parent ? &(parent->ompt_task_info.frame) : NULL,
1700           &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1701           OMPT_GET_RETURN_ADDRESS(0));
1702     }
1703   }
1704 #endif
1705 
1706   /* Should we execute the new task or queue it? For now, let's just always try
1707      to queue it.  If the queue fills up, then we'll execute it.  */
1708 
1709   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1710   { // Execute this task immediately
1711     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1712     new_taskdata->td_flags.task_serial = 1;
1713     __kmp_invoke_task(gtid, new_task, current_task);
1714   }
1715 
  KA_TRACE(
      10,
      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
       "loc=%p task=%p\n",
       gtid, loc_ref, new_taskdata));
1721 
1722   ANNOTATE_HAPPENS_BEFORE(new_task);
1723 #if OMPT_SUPPORT
1724   if (UNLIKELY(ompt_enabled.enabled)) {
1725     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1726   }
1727 #endif
1728   return TASK_CURRENT_NOT_QUEUED;
1729 }
1730 
1731 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1732 //
1733 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1735 // serialize_immediate: if TRUE then if the task is executed immediately its
1736 // execution will be serialized
1737 // Returns:
1738 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1739 //    be resumed later.
1740 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1741 //    resumed later.
1742 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1743                          bool serialize_immediate) {
1744   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1745 
1746 /* Should we execute the new task or queue it? For now, let's just always try to
1747    queue it.  If the queue fills up, then we'll execute it.  */
1748 #if OMP_45_ENABLED
1749   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1750       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1751 #else
1752   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1753 #endif
1754   { // Execute this task immediately
1755     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1756     if (serialize_immediate)
1757       new_taskdata->td_flags.task_serial = 1;
1758     __kmp_invoke_task(gtid, new_task, current_task);
1759   }
1760 
1761   ANNOTATE_HAPPENS_BEFORE(new_task);
1762   return TASK_CURRENT_NOT_QUEUED;
1763 }
1764 
1765 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1766 // non-thread-switchable task from the parent thread only!
1767 //
1768 // loc_ref: location of original task pragma (ignored)
1769 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1772 // Returns:
1773 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1774 //    be resumed later.
1775 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1776 //    resumed later.
1777 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1778                           kmp_task_t *new_task) {
1779   kmp_int32 res;
1780   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1781 
1782 #if KMP_DEBUG || OMPT_SUPPORT
1783   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1784 #endif
1785   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1786                 new_taskdata));
1787 
1788 #if OMPT_SUPPORT
1789   kmp_taskdata_t *parent = NULL;
1790   if (UNLIKELY(ompt_enabled.enabled)) {
1791     if (!new_taskdata->td_flags.started) {
1792       OMPT_STORE_RETURN_ADDRESS(gtid);
1793       parent = new_taskdata->td_parent;
1794       if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1795         parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1796       }
1797       if (ompt_enabled.ompt_callback_task_create) {
1798         ompt_data_t task_data = ompt_data_none;
1799         ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1800             parent ? &(parent->ompt_task_info.task_data) : &task_data,
1801             parent ? &(parent->ompt_task_info.frame) : NULL,
1802             &(new_taskdata->ompt_task_info.task_data),
1803             ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1804             OMPT_LOAD_RETURN_ADDRESS(gtid));
1805       }
1806     } else {
1807       // We are scheduling the continuation of an UNTIED task.
1808       // Scheduling back to the parent task.
1809       __ompt_task_finish(new_task,
1810                          new_taskdata->ompt_task_info.scheduling_parent,
1811                          ompt_task_switch);
1812       new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1813     }
1814   }
1815 #endif
1816 
1817   res = __kmp_omp_task(gtid, new_task, true);
1818 
1819   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1820                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1821                 gtid, loc_ref, new_taskdata));
1822 #if OMPT_SUPPORT
1823   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1824     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1825   }
1826 #endif
1827   return res;
1828 }
1829 
1830 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1831 // a taskloop task with the correct OMPT return address
1832 //
1833 // loc_ref: location of original task pragma (ignored)
1834 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1837 // codeptr_ra: return address for OMPT callback
1838 // Returns:
1839 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1840 //    be resumed later.
1841 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1842 //    resumed later.
1843 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1844                                   kmp_task_t *new_task, void *codeptr_ra) {
1845   kmp_int32 res;
1846   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1847 
1848 #if KMP_DEBUG || OMPT_SUPPORT
1849   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1850 #endif
  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));
1853 
1854 #if OMPT_SUPPORT
1855   kmp_taskdata_t *parent = NULL;
1856   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1857     parent = new_taskdata->td_parent;
1858     if (!parent->ompt_task_info.frame.enter_frame.ptr)
1859       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1860     if (ompt_enabled.ompt_callback_task_create) {
1861       ompt_data_t task_data = ompt_data_none;
1862       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1863           parent ? &(parent->ompt_task_info.task_data) : &task_data,
1864           parent ? &(parent->ompt_task_info.frame) : NULL,
1865           &(new_taskdata->ompt_task_info.task_data),
1866           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1867           codeptr_ra);
1868     }
1869   }
1870 #endif
1871 
1872   res = __kmp_omp_task(gtid, new_task, true);
1873 
  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
1877 #if OMPT_SUPPORT
1878   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1879     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1880   }
1881 #endif
1882   return res;
1883 }
1884 
1885 template <bool ompt>
1886 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1887                                               void *frame_address,
1888                                               void *return_address) {
1889   kmp_taskdata_t *taskdata;
1890   kmp_info_t *thread;
1891   int thread_finished = FALSE;
1892   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1893 
1894   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1895 
1896   if (__kmp_tasking_mode != tskm_immediate_exec) {
1897     thread = __kmp_threads[gtid];
1898     taskdata = thread->th.th_current_task;
1899 
1900 #if OMPT_SUPPORT && OMPT_OPTIONAL
1901     ompt_data_t *my_task_data;
1902     ompt_data_t *my_parallel_data;
1903 
1904     if (ompt) {
1905       my_task_data = &(taskdata->ompt_task_info.task_data);
1906       my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1907 
1908       taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1909 
1910       if (ompt_enabled.ompt_callback_sync_region) {
1911         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1912             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1913             my_task_data, return_address);
1914       }
1915 
1916       if (ompt_enabled.ompt_callback_sync_region_wait) {
1917         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1918             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1919             my_task_data, return_address);
1920       }
1921     }
1922 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1923 
// Debugger: The taskwait is active. Store the location and the thread that
// encountered the taskwait.
1926 #if USE_ITT_BUILD
1927 // Note: These values are used by ITT events as well.
1928 #endif /* USE_ITT_BUILD */
1929     taskdata->td_taskwait_counter += 1;
1930     taskdata->td_taskwait_ident = loc_ref;
1931     taskdata->td_taskwait_thread = gtid + 1;
1932 
1933 #if USE_ITT_BUILD
1934     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1935     if (itt_sync_obj != NULL)
1936       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1937 #endif /* USE_ITT_BUILD */
1938 
1939     bool must_wait =
1940         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1941 
1942 #if OMP_45_ENABLED
1943     must_wait = must_wait || (thread->th.th_task_team != NULL &&
1944                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
1945 #endif
1946     if (must_wait) {
1947       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1948                              &(taskdata->td_incomplete_child_tasks)),
1949                        0U);
1950       while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1951         flag.execute_tasks(thread, gtid, FALSE,
1952                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1953                            __kmp_task_stealing_constraint);
1954       }
1955     }
1956 #if USE_ITT_BUILD
1957     if (itt_sync_obj != NULL)
1958       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1959 #endif /* USE_ITT_BUILD */
1960 
1961     // Debugger:  The taskwait is completed. Location remains, but thread is
1962     // negated.
1963     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1964 
1965 #if OMPT_SUPPORT && OMPT_OPTIONAL
1966     if (ompt) {
1967       if (ompt_enabled.ompt_callback_sync_region_wait) {
1968         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1969             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1970             my_task_data, return_address);
1971       }
1972       if (ompt_enabled.ompt_callback_sync_region) {
1973         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1974             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1975             my_task_data, return_address);
1976       }
1977       taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1978     }
1979 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1980 
1981     ANNOTATE_HAPPENS_AFTER(taskdata);
1982   }
1983 
1984   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1985                 "returning TASK_CURRENT_NOT_QUEUED\n",
1986                 gtid, taskdata));
1987 
1988   return TASK_CURRENT_NOT_QUEUED;
1989 }
1990 
1991 #if OMPT_SUPPORT && OMPT_OPTIONAL
1992 OMPT_NOINLINE
1993 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1994                                           void *frame_address,
1995                                           void *return_address) {
1996   return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1997                                             return_address);
1998 }
1999 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2000 
2001 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2002 // complete
2003 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2004 #if OMPT_SUPPORT && OMPT_OPTIONAL
2005   if (UNLIKELY(ompt_enabled.enabled)) {
2006     OMPT_STORE_RETURN_ADDRESS(gtid);
2007     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2008                                     OMPT_LOAD_RETURN_ADDRESS(gtid));
2009   }
2010 #endif
2011   return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2012 }
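
// Illustrative sketch (hypothetical lowering, not emitted by this file): a
// compiler typically translates
//
//   #pragma omp taskwait
//
// into a single call of the entry point above, e.g.
//
//   __kmpc_omp_taskwait(&loc, gtid);
//
// where loc is the compiler-generated source location descriptor and gtid is
// the global thread id of the encountering thread.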
2013 
2014 // __kmpc_omp_taskyield: switch to a different task
2015 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2016   kmp_taskdata_t *taskdata;
2017   kmp_info_t *thread;
2018   int thread_finished = FALSE;
2019 
2020   KMP_COUNT_BLOCK(OMP_TASKYIELD);
2021   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2022 
2023   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2024                 gtid, loc_ref, end_part));
2025 
2026   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2027     thread = __kmp_threads[gtid];
2028     taskdata = thread->th.th_current_task;
2029 // Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store the location and the thread that
// encountered the taskwait.
2032 #if USE_ITT_BUILD
2033 // Note: These values are used by ITT events as well.
2034 #endif /* USE_ITT_BUILD */
2035     taskdata->td_taskwait_counter += 1;
2036     taskdata->td_taskwait_ident = loc_ref;
2037     taskdata->td_taskwait_thread = gtid + 1;
2038 
2039 #if USE_ITT_BUILD
2040     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2041     if (itt_sync_obj != NULL)
2042       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2043 #endif /* USE_ITT_BUILD */
2044     if (!taskdata->td_flags.team_serial) {
2045       kmp_task_team_t *task_team = thread->th.th_task_team;
2046       if (task_team != NULL) {
2047         if (KMP_TASKING_ENABLED(task_team)) {
2048 #if OMPT_SUPPORT
2049           if (UNLIKELY(ompt_enabled.enabled))
2050             thread->th.ompt_thread_info.ompt_task_yielded = 1;
2051 #endif
2052           __kmp_execute_tasks_32(
2053               thread, gtid, NULL, FALSE,
2054               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2055               __kmp_task_stealing_constraint);
2056 #if OMPT_SUPPORT
2057           if (UNLIKELY(ompt_enabled.enabled))
2058             thread->th.ompt_thread_info.ompt_task_yielded = 0;
2059 #endif
2060         }
2061       }
2062     }
2063 #if USE_ITT_BUILD
2064     if (itt_sync_obj != NULL)
2065       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2066 #endif /* USE_ITT_BUILD */
2067 
2068     // Debugger:  The taskwait is completed. Location remains, but thread is
2069     // negated.
2070     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2071   }
2072 
2073   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2074                 "returning TASK_CURRENT_NOT_QUEUED\n",
2075                 gtid, taskdata));
2076 
2077   return TASK_CURRENT_NOT_QUEUED;
2078 }
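
// Illustrative sketch (hypothetical lowering): `#pragma omp taskyield` is
// typically emitted as
//
//   __kmpc_omp_taskyield(&loc, gtid, /* end_part */ 0);
//
// where end_part is only reported in the trace output above.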
2079 
2080 #if OMP_50_ENABLED
2081 // Task Reduction implementation
2082 //
// Note: the initial implementation did not take into account the possibility
// of specifying omp_orig for the initializer of a UDR (user-defined
// reduction). The corrected implementation takes the omp_orig object into
// account. The compiler is free to use the old implementation if omp_orig is
// not specified.
2087 
2088 /*!
2089 @ingroup BASIC_TYPES
2090 @{
2091 */
2092 
2093 /*!
2094 Flags for special info per task reduction item.
2095 */
2096 typedef struct kmp_taskred_flags {
2097   /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
2098   unsigned lazy_priv : 1;
2099   unsigned reserved31 : 31;
2100 } kmp_taskred_flags_t;
2101 
2102 /*!
2103 Internal struct for reduction data item related info set up by compiler.
2104 */
2105 typedef struct kmp_task_red_input {
2106   void *reduce_shar; /**< shared between tasks item to reduce into */
2107   size_t reduce_size; /**< size of data item in bytes */
2108   // three compiler-generated routines (init, fini are optional):
2109   void *reduce_init; /**< data initialization routine (single parameter) */
2110   void *reduce_fini; /**< data finalization routine */
2111   void *reduce_comb; /**< data combiner routine */
2112   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2113 } kmp_task_red_input_t;
2114 
2115 /*!
2116 Internal struct for reduction data item related info saved by the library.
2117 */
2118 typedef struct kmp_taskred_data {
2119   void *reduce_shar; /**< shared between tasks item to reduce into */
2120   size_t reduce_size; /**< size of data item */
2121   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2122   void *reduce_priv; /**< array of thread specific items */
2123   void *reduce_pend; /**< end of private data for faster comparison op */
2124   // three compiler-generated routines (init, fini are optional):
2125   void *reduce_comb; /**< data combiner routine */
2126   void *reduce_init; /**< data initialization routine (two parameters) */
2127   void *reduce_fini; /**< data finalization routine */
2128   void *reduce_orig; /**< original item (can be used in UDR initializer) */
2129 } kmp_taskred_data_t;
2130 
2131 /*!
2132 Internal struct for reduction data item related info set up by compiler.
2133 
2134 New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2135 */
2136 typedef struct kmp_taskred_input {
2137   void *reduce_shar; /**< shared between tasks item to reduce into */
2138   void *reduce_orig; /**< original reduction item used for initialization */
2139   size_t reduce_size; /**< size of data item */
2140   // three compiler-generated routines (init, fini are optional):
2141   void *reduce_init; /**< data initialization routine (two parameters) */
2142   void *reduce_fini; /**< data finalization routine */
2143   void *reduce_comb; /**< data combiner routine */
2144   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2145 } kmp_taskred_input_t;
2146 /*!
2147 @}
2148 */
2149 
2150 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2151 template <>
2152 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2153                                              kmp_task_red_input_t &src) {
2154   item.reduce_orig = NULL;
2155 }
2156 template <>
2157 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2158                                             kmp_taskred_input_t &src) {
2159   if (src.reduce_orig != NULL) {
2160     item.reduce_orig = src.reduce_orig;
2161   } else {
2162     item.reduce_orig = src.reduce_shar;
2163   } // non-NULL reduce_orig means new interface used
2164 }
2165 
2166 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
2167 template <>
2168 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2169                                            int offset) {
2170   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2171 }
2172 template <>
2173 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2174                                           int offset) {
2175   ((void (*)(void *, void *))item.reduce_init)(
2176       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2177 }
2178 
2179 template <typename T>
2180 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2181   kmp_info_t *thread = __kmp_threads[gtid];
2182   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2183   kmp_int32 nth = thread->th.th_team_nproc;
2184   kmp_taskred_data_t *arr;
2185 
2186   // check input data just in case
2187   KMP_ASSERT(tg != NULL);
2188   KMP_ASSERT(data != NULL);
2189   KMP_ASSERT(num > 0);
2190   if (nth == 1) {
2191     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2192                   gtid, tg));
2193     return (void *)tg;
2194   }
2195   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2196                 gtid, tg, num));
2197   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2198       thread, num * sizeof(kmp_taskred_data_t));
2199   for (int i = 0; i < num; ++i) {
2200     size_t size = data[i].reduce_size - 1;
2201     // round the size up to cache line per thread-specific item
2202     size += CACHE_LINE - size % CACHE_LINE;
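    // For example (assuming CACHE_LINE == 64, which is typical but
    // configuration dependent): reduce_size == 100 gives size == 99 and then
    // 99 + (64 - 99 % 64) == 128, while reduce_size == 64 stays at 64.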
2203     KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2204     arr[i].reduce_shar = data[i].reduce_shar;
2205     arr[i].reduce_size = size;
2206     arr[i].flags = data[i].flags;
2207     arr[i].reduce_comb = data[i].reduce_comb;
2208     arr[i].reduce_init = data[i].reduce_init;
2209     arr[i].reduce_fini = data[i].reduce_fini;
2210     __kmp_assign_orig<T>(arr[i], data[i]);
2211     if (!arr[i].flags.lazy_priv) {
2212       // allocate cache-line aligned block and fill it with zeros
2213       arr[i].reduce_priv = __kmp_allocate(nth * size);
2214       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2215       if (arr[i].reduce_init != NULL) {
2216         // initialize all thread-specific items
2217         for (int j = 0; j < nth; ++j) {
2218           __kmp_call_init<T>(arr[i], j * size);
2219         }
2220       }
2221     } else {
2222       // only allocate space for pointers now,
2223       // objects will be lazily allocated/initialized if/when requested
2224       // note that __kmp_allocate zeroes the allocated memory
2225       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2226     }
2227   }
2228   tg->reduce_data = (void *)arr;
2229   tg->reduce_num_data = num;
2230   return (void *)tg;
2231 }
2232 
2233 /*!
2234 @ingroup TASKING
2235 @param gtid      Global thread ID
2236 @param num       Number of data items to reduce
2237 @param data      Array of data for reduction
2238 @return The taskgroup identifier
2239 
2240 Initialize task reduction for the taskgroup.
2241 
Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter, a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without the help of the runtime library.
2246 */
2247 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2248   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2249 }
2250 
2251 /*!
2252 @ingroup TASKING
2253 @param gtid      Global thread ID
2254 @param num       Number of data items to reduce
2255 @param data      Array of data for reduction
2256 @return The taskgroup identifier
2257 
2258 Initialize task reduction for the taskgroup.
2259 
Note: this entry assumes the optional compiler-generated initializer routine
has two parameters, a pointer to the object to be initialized and a pointer to
omp_orig.
2262 */
2263 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2264   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2265 }
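
// Illustrative sketch (hypothetical lowering): for
//
//   #pragma omp taskgroup task_reduction(+ : sum)
//
// a compiler might fill one kmp_taskred_input_t per reduction item and call
// the new-interface entry above; red_init and red_comb are compiler-generated
// placeholder routines:
//
//   kmp_taskred_input_t in = {};
//   in.reduce_shar = &sum;             // shared item to reduce into
//   in.reduce_orig = &sum;             // omp_orig for UDR initializers
//   in.reduce_size = sizeof(sum);
//   in.reduce_init = (void *)red_init; // red_init(void *priv, void *orig)
//   in.reduce_fini = NULL;             // no finalizer needed for plain '+'
//   in.reduce_comb = (void *)red_comb; // red_comb(void *shar, void *priv)
//   in.flags.lazy_priv = 0;
//   void *tg = __kmpc_taskred_init(gtid, 1, &in);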
2266 
2267 // Copy task reduction data (except for shared pointers).
2268 template <typename T>
2269 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2270                                     kmp_taskgroup_t *tg, void *reduce_data) {
2271   kmp_taskred_data_t *arr;
2272   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2273                 " from data %p\n",
2274                 thr, tg, reduce_data));
2275   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2276       thr, num * sizeof(kmp_taskred_data_t));
2277   // threads will share private copies, thunk routines, sizes, flags, etc.:
2278   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2279   for (int i = 0; i < num; ++i) {
2280     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2281   }
2282   tg->reduce_data = (void *)arr;
2283   tg->reduce_num_data = num;
2284 }
2285 
2286 /*!
2287 @ingroup TASKING
2288 @param gtid    Global thread ID
2289 @param tskgrp  The taskgroup ID (optional)
2290 @param data    Shared location of the item
2291 @return The pointer to per-thread data
2292 
2293 Get thread-specific location of data item
2294 */
2295 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2296   kmp_info_t *thread = __kmp_threads[gtid];
2297   kmp_int32 nth = thread->th.th_team_nproc;
2298   if (nth == 1)
2299     return data; // nothing to do
2300 
2301   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2302   if (tg == NULL)
2303     tg = thread->th.th_current_task->td_taskgroup;
2304   KMP_ASSERT(tg != NULL);
2305   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2306   kmp_int32 num = tg->reduce_num_data;
2307   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2308 
2309   KMP_ASSERT(data != NULL);
2310   while (tg != NULL) {
2311     for (int i = 0; i < num; ++i) {
2312       if (!arr[i].flags.lazy_priv) {
2313         if (data == arr[i].reduce_shar ||
2314             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2315           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2316       } else {
2317         // check shared location first
2318         void **p_priv = (void **)(arr[i].reduce_priv);
2319         if (data == arr[i].reduce_shar)
2320           goto found;
        // check whether a thread-specific location was passed as the parameter
2322         for (int j = 0; j < nth; ++j)
2323           if (data == p_priv[j])
2324             goto found;
2325         continue; // not found, continue search
2326       found:
2327         if (p_priv[tid] == NULL) {
2328           // allocate thread specific object lazily
2329           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2330           if (arr[i].reduce_init != NULL) {
2331             if (arr[i].reduce_orig != NULL) { // new interface
2332               ((void (*)(void *, void *))arr[i].reduce_init)(
2333                   p_priv[tid], arr[i].reduce_orig);
2334             } else { // old interface (single parameter)
2335               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2336             }
2337           }
2338         }
2339         return p_priv[tid];
2340       }
2341     }
2342     tg = tg->parent;
2343     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2344     num = tg->reduce_num_data;
2345   }
2346   KMP_ASSERT2(0, "Unknown task reduction item");
2347   return NULL; // ERROR, this line never executed
2348 }
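
// Illustrative sketch (hypothetical compiler-generated task body): inside each
// participating task, updates of the reduction variable are redirected to the
// thread-specific copy returned by the entry above. tg is the value returned
// by __kmpc_taskred_init() or __kmpc_task_reduction_init():
//
//   int *p = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &sum);
//   *p += local_contribution; // combined into sum when the taskgroup ends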
2349 
2350 // Finalize task reduction.
2351 // Called from __kmpc_end_taskgroup()
2352 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2353   kmp_int32 nth = th->th.th_team_nproc;
2354   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2355   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2356   kmp_int32 num = tg->reduce_num_data;
2357   for (int i = 0; i < num; ++i) {
2358     void *sh_data = arr[i].reduce_shar;
2359     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2360     void (*f_comb)(void *, void *) =
2361         (void (*)(void *, void *))(arr[i].reduce_comb);
2362     if (!arr[i].flags.lazy_priv) {
2363       void *pr_data = arr[i].reduce_priv;
2364       size_t size = arr[i].reduce_size;
2365       for (int j = 0; j < nth; ++j) {
2366         void *priv_data = (char *)pr_data + j * size;
2367         f_comb(sh_data, priv_data); // combine results
2368         if (f_fini)
2369           f_fini(priv_data); // finalize if needed
2370       }
2371     } else {
2372       void **pr_data = (void **)(arr[i].reduce_priv);
2373       for (int j = 0; j < nth; ++j) {
2374         if (pr_data[j] != NULL) {
2375           f_comb(sh_data, pr_data[j]); // combine results
2376           if (f_fini)
2377             f_fini(pr_data[j]); // finalize if needed
2378           __kmp_free(pr_data[j]);
2379         }
2380       }
2381     }
2382     __kmp_free(arr[i].reduce_priv);
2383   }
2384   __kmp_thread_free(th, arr);
2385   tg->reduce_data = NULL;
2386   tg->reduce_num_data = 0;
2387 }
2388 
// Clean up task reduction data for a parallel or worksharing construct;
// do not touch task-private data that other threads are still working with.
2391 // Called from __kmpc_end_taskgroup()
2392 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2393   __kmp_thread_free(th, tg->reduce_data);
2394   tg->reduce_data = NULL;
2395   tg->reduce_num_data = 0;
2396 }
2397 
2398 template <typename T>
2399 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2400                                          int num, T *data) {
2401   kmp_info_t *thr = __kmp_threads[gtid];
2402   kmp_int32 nth = thr->th.th_team_nproc;
2403   __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2404   if (nth == 1) {
2405     KA_TRACE(10,
2406              ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2407               gtid, thr->th.th_current_task->td_taskgroup));
2408     return (void *)thr->th.th_current_task->td_taskgroup;
2409   }
2410   kmp_team_t *team = thr->th.th_team;
2411   void *reduce_data;
2412   kmp_taskgroup_t *tg;
2413   reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2414   if (reduce_data == NULL &&
2415       __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2416                                  (void *)1)) {
2417     // single thread enters this block to initialize common reduction data
2418     KMP_DEBUG_ASSERT(reduce_data == NULL);
2419     // first initialize own data, then make a copy other threads can use
2420     tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2421     reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2422     KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2423     // fini counters should be 0 at this point
2424     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2425     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2426     KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2427   } else {
2428     while (
2429         (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2430         (void *)1) { // wait for task reduction initialization
2431       KMP_CPU_PAUSE();
2432     }
2433     KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2434     tg = thr->th.th_current_task->td_taskgroup;
2435     __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2436   }
2437   return tg;
2438 }
2439 
2440 /*!
2441 @ingroup TASKING
2442 @param loc       Source location info
2443 @param gtid      Global thread ID
2444 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2445 @param num       Number of data items to reduce
2446 @param data      Array of data for reduction
2447 @return The taskgroup identifier
2448 
Initialize task reduction for a parallel or worksharing construct.

Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter, a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without the help of the runtime library.
2455 */
2456 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2457                                           int num, void *data) {
2458   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2459                                             (kmp_task_red_input_t *)data);
2460 }
2461 
2462 /*!
2463 @ingroup TASKING
2464 @param loc       Source location info
2465 @param gtid      Global thread ID
2466 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2467 @param num       Number of data items to reduce
2468 @param data      Array of data for reduction
2469 @return The taskgroup identifier
2470 
Initialize task reduction for a parallel or worksharing construct.

Note: this entry assumes the optional compiler-generated initializer routine
has two parameters, a pointer to the object to be initialized and a pointer to
omp_orig.
2475 */
2476 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2477                                    void *data) {
2478   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2479                                             (kmp_taskred_input_t *)data);
2480 }
2481 
2482 /*!
2483 @ingroup TASKING
2484 @param loc       Source location info
2485 @param gtid      Global thread ID
2486 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2487 
Finalize task reduction for a parallel or worksharing construct.
2489 */
2490 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2491   __kmpc_end_taskgroup(loc, gtid);
2492 }
2493 #endif
2494 
2495 #if OMP_40_ENABLED
2496 // __kmpc_taskgroup: Start a new taskgroup
2497 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2498   kmp_info_t *thread = __kmp_threads[gtid];
2499   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2500   kmp_taskgroup_t *tg_new =
2501       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2502   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2503   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2504   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2505   tg_new->parent = taskdata->td_taskgroup;
2506 #if OMP_50_ENABLED
2507   tg_new->reduce_data = NULL;
2508   tg_new->reduce_num_data = 0;
2509 #endif
2510   taskdata->td_taskgroup = tg_new;
2511 
2512 #if OMPT_SUPPORT && OMPT_OPTIONAL
2513   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2514     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2515     if (!codeptr)
2516       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2517     kmp_team_t *team = thread->th.th_team;
2518     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2519     // FIXME: I think this is wrong for lwt!
2520     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2521 
2522     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2523         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2524         &(my_task_data), codeptr);
2525   }
2526 #endif
2527 }
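
// Illustrative sketch (hypothetical lowering): a taskgroup region
//
//   #pragma omp taskgroup
//   { /* ... child tasks ... */ }
//
// is typically bracketed by the two entry points:
//
//   __kmpc_taskgroup(&loc, gtid);
//   // body: tasks created here register with the new taskgroup
//   __kmpc_end_taskgroup(&loc, gtid); // waits for all descendant tasks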
2528 
2529 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2530 //                       and its descendants are complete
2531 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2532   kmp_info_t *thread = __kmp_threads[gtid];
2533   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2534   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2535   int thread_finished = FALSE;
2536 
2537 #if OMPT_SUPPORT && OMPT_OPTIONAL
2538   kmp_team_t *team;
2539   ompt_data_t my_task_data;
2540   ompt_data_t my_parallel_data;
2541   void *codeptr;
2542   if (UNLIKELY(ompt_enabled.enabled)) {
2543     team = thread->th.th_team;
2544     my_task_data = taskdata->ompt_task_info.task_data;
2545     // FIXME: I think this is wrong for lwt!
2546     my_parallel_data = team->t.ompt_team_info.parallel_data;
2547     codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2548     if (!codeptr)
2549       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2550   }
2551 #endif
2552 
2553   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2554   KMP_DEBUG_ASSERT(taskgroup != NULL);
2555   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2556 
2557   if (__kmp_tasking_mode != tskm_immediate_exec) {
2558     // mark task as waiting not on a barrier
2559     taskdata->td_taskwait_counter += 1;
2560     taskdata->td_taskwait_ident = loc;
2561     taskdata->td_taskwait_thread = gtid + 1;
2562 #if USE_ITT_BUILD
2563     // For ITT the taskgroup wait is similar to taskwait until we need to
2564     // distinguish them
2565     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2566     if (itt_sync_obj != NULL)
2567       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2568 #endif /* USE_ITT_BUILD */
2569 
2570 #if OMPT_SUPPORT && OMPT_OPTIONAL
2571     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2572       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2573           ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2574           &(my_task_data), codeptr);
2575     }
2576 #endif
2577 
2578 #if OMP_45_ENABLED
2579     if (!taskdata->td_flags.team_serial ||
2580         (thread->th.th_task_team != NULL &&
2581          thread->th.th_task_team->tt.tt_found_proxy_tasks))
2582 #else
2583     if (!taskdata->td_flags.team_serial)
2584 #endif
2585     {
2586       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2587                        0U);
2588       while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2589         flag.execute_tasks(thread, gtid, FALSE,
2590                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2591                            __kmp_task_stealing_constraint);
2592       }
2593     }
2594     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2595 
2596 #if OMPT_SUPPORT && OMPT_OPTIONAL
2597     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2598       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2599           ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2600           &(my_task_data), codeptr);
2601     }
2602 #endif
2603 
2604 #if USE_ITT_BUILD
2605     if (itt_sync_obj != NULL)
2606       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2607 #endif /* USE_ITT_BUILD */
2608   }
2609   KMP_DEBUG_ASSERT(taskgroup->count == 0);
2610 
2611 #if OMP_50_ENABLED
2612   if (taskgroup->reduce_data != NULL) { // need to reduce?
2613     int cnt;
2614     void *reduce_data;
2615     kmp_team_t *t = thread->th.th_team;
2616     kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check whether the <priv> data of the first reduction variable is
    // shared for the team
2618     void *priv0 = arr[0].reduce_priv;
2619     if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2620         ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2621       // finishing task reduction on parallel
2622       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2623       if (cnt == thread->th.th_team_nproc - 1) {
2624         // we are the last thread passing __kmpc_reduction_modifier_fini()
2625         // finalize task reduction:
2626         __kmp_task_reduction_fini(thread, taskgroup);
2627         // cleanup fields in the team structure:
2628         // TODO: is relaxed store enough here (whole barrier should follow)?
2629         __kmp_thread_free(thread, reduce_data);
2630         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2631         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2632       } else {
2633         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2634         // so do not finalize reduction, just clean own copy of the data
2635         __kmp_task_reduction_clean(thread, taskgroup);
2636       }
2637     } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2638                    NULL &&
2639                ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2640       // finishing task reduction on worksharing
2641       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2642       if (cnt == thread->th.th_team_nproc - 1) {
2643         // we are the last thread passing __kmpc_reduction_modifier_fini()
2644         __kmp_task_reduction_fini(thread, taskgroup);
2645         // cleanup fields in team structure:
2646         // TODO: is relaxed store enough here (whole barrier should follow)?
2647         __kmp_thread_free(thread, reduce_data);
2648         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2649         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2650       } else {
2651         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2652         // so do not finalize reduction, just clean own copy of the data
2653         __kmp_task_reduction_clean(thread, taskgroup);
2654       }
2655     } else {
2656       // finishing task reduction on taskgroup
2657       __kmp_task_reduction_fini(thread, taskgroup);
2658     }
2659   }
2660 #endif
2661   // Restore parent taskgroup for the current task
2662   taskdata->td_taskgroup = taskgroup->parent;
2663   __kmp_thread_free(thread, taskgroup);
2664 
2665   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2666                 gtid, taskdata));
2667   ANNOTATE_HAPPENS_AFTER(taskdata);
2668 
2669 #if OMPT_SUPPORT && OMPT_OPTIONAL
2670   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2671     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2672         ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2673         &(my_task_data), codeptr);
2674   }
2675 #endif
2676 }
2677 #endif
2678 
2679 // __kmp_remove_my_task: remove a task from my own deque
2680 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2681                                         kmp_task_team_t *task_team,
2682                                         kmp_int32 is_constrained) {
2683   kmp_task_t *task;
2684   kmp_taskdata_t *taskdata;
2685   kmp_thread_data_t *thread_data;
2686   kmp_uint32 tail;
2687 
2688   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2689   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2690                    NULL); // Caller should check this condition
2691 
2692   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2693 
2694   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2695                 gtid, thread_data->td.td_deque_ntasks,
2696                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2697 
2698   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2699     KA_TRACE(10,
2700              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2701               "ntasks=%d head=%u tail=%u\n",
2702               gtid, thread_data->td.td_deque_ntasks,
2703               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2704     return NULL;
2705   }
2706 
2707   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2708 
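  // Re-check the task count now that the lock is held; a thief may have
  // emptied the deque between the unlocked check above and the lock acquire.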
2709   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2710     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2711     KA_TRACE(10,
2712              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2713               "ntasks=%d head=%u tail=%u\n",
2714               gtid, thread_data->td.td_deque_ntasks,
2715               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2716     return NULL;
2717   }
2718 
2719   tail = (thread_data->td.td_deque_tail - 1) &
2720          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2721   taskdata = thread_data->td.td_deque[tail];
2722 
2723   if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2724                              thread->th.th_current_task)) {
2725     // The TSC does not allow stealing the victim task
2726     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2727     KA_TRACE(10,
2728              ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2729               "ntasks=%d head=%u tail=%u\n",
2730               gtid, thread_data->td.td_deque_ntasks,
2731               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2732     return NULL;
2733   }
2734 
2735   thread_data->td.td_deque_tail = tail;
2736   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2737 
2738   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2739 
2740   KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2741                 "ntasks=%d head=%u tail=%u\n",
2742                 gtid, taskdata, thread_data->td.td_deque_ntasks,
2743                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2744 
2745   task = KMP_TASKDATA_TO_TASK(taskdata);
2746   return task;
2747 }
2748 
2749 // __kmp_steal_task: remove a task from another thread's deque
2750 // Assume that the calling thread has already checked that the task team's
2751 // thread_data exists before calling this routine.
2752 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2753                                     kmp_task_team_t *task_team,
2754                                     std::atomic<kmp_int32> *unfinished_threads,
2755                                     int *thread_finished,
2756                                     kmp_int32 is_constrained) {
2757   kmp_task_t *task;
2758   kmp_taskdata_t *taskdata;
2759   kmp_taskdata_t *current;
2760   kmp_thread_data_t *victim_td, *threads_data;
2761   kmp_int32 target;
2762   kmp_int32 victim_tid;
2763 
2764   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2765 
2766   threads_data = task_team->tt.tt_threads_data;
2767   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2768 
2769   victim_tid = victim_thr->th.th_info.ds.ds_tid;
2770   victim_td = &threads_data[victim_tid];
2771 
2772   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2773                 "task_team=%p ntasks=%d head=%u tail=%u\n",
2774                 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2775                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2776                 victim_td->td.td_deque_tail));
2777 
2778   if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2779     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2780                   "task_team=%p ntasks=%d head=%u tail=%u\n",
2781                   gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2782                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2783                   victim_td->td.td_deque_tail));
2784     return NULL;
2785   }
2786 
2787   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2788 
2789   int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2790   // Check again after we acquire the lock
2791   if (ntasks == 0) {
2792     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2793     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2794                   "task_team=%p ntasks=%d head=%u tail=%u\n",
2795                   gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2796                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2797     return NULL;
2798   }
2799 
2800   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2801   current = __kmp_threads[gtid]->th.th_current_task;
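  // Prefer the task at the head of the victim's deque; only when the task
  // scheduling constraint rejects it and untied tasks have been encountered do
  // we scan the rest of the deque below.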
2802   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2803   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2804     // Bump head pointer and Wrap.
2805     victim_td->td.td_deque_head =
2806         (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2807   } else {
2808     if (!task_team->tt.tt_untied_task_encountered) {
2809       // The TSC does not allow stealing the victim task
2810       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2811       KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2812                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2813                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2814                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2815       return NULL;
2816     }
2817     int i;
2818     // walk through victim's deque trying to steal any task
2819     target = victim_td->td.td_deque_head;
2820     taskdata = NULL;
2821     for (i = 1; i < ntasks; ++i) {
2822       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2823       taskdata = victim_td->td.td_deque[target];
2824       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2825         break; // found victim task
2826       } else {
2827         taskdata = NULL;
2828       }
2829     }
2830     if (taskdata == NULL) {
2831       // No appropriate candidate to steal found
2832       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2833       KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2834                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2835                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2836                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2837       return NULL;
2838     }
2839     int prev = target;
2840     for (i = i + 1; i < ntasks; ++i) {
2841       // shift remaining tasks in the deque left by 1
2842       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2843       victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2844       prev = target;
2845     }
2846     KMP_DEBUG_ASSERT(
2847         victim_td->td.td_deque_tail ==
2848         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2849     victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2850   }
2851   if (*thread_finished) {
2852     // We need to un-mark this victim as a finished victim.  This must be done
2853     // before releasing the lock, or else other threads (starting with the
2854     // master victim) might be prematurely released from the barrier!!!
2855     kmp_int32 count;
2856 
2857     count = KMP_ATOMIC_INC(unfinished_threads);
2858 
2859     KA_TRACE(
2860         20,
2861         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2862          gtid, count + 1, task_team));
2863 
2864     *thread_finished = FALSE;
2865   }
2866   TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2867 
2868   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2869 
2870   KMP_COUNT_BLOCK(TASK_stolen);
2871   KA_TRACE(10,
2872            ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2873             "task_team=%p ntasks=%d head=%u tail=%u\n",
2874             gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2875             ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2876 
2877   task = KMP_TASKDATA_TO_TASK(taskdata);
2878   return task;
2879 }
2880 
2881 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2882 // condition is satisfied (return true) or there are none left (return false).
2883 //
2884 // final_spin is TRUE if this is the spin at the release barrier.
2885 // thread_finished indicates whether the thread is finished executing all
2886 // the tasks it has on its deque, and is at the release barrier.
2887 // spinner is the location on which to spin.
2888 // spinner == NULL means only execute a single task and return.
2889 // checker is the value to check to terminate the spin.
2890 template <class C>
2891 static inline int __kmp_execute_tasks_template(
2892     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2893     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2894     kmp_int32 is_constrained) {
2895   kmp_task_team_t *task_team = thread->th.th_task_team;
2896   kmp_thread_data_t *threads_data;
2897   kmp_task_t *task;
2898   kmp_info_t *other_thread;
2899   kmp_taskdata_t *current_task = thread->th.th_current_task;
2900   std::atomic<kmp_int32> *unfinished_threads;
2901   kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2902                       tid = thread->th.th_info.ds.ds_tid;
2903 
2904   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2905   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2906 
2907   if (task_team == NULL || current_task == NULL)
2908     return FALSE;
2909 
2910   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2911                 "*thread_finished=%d\n",
2912                 gtid, final_spin, *thread_finished));
2913 
2914   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2915   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2916   KMP_DEBUG_ASSERT(threads_data != NULL);
2917 
2918   nthreads = task_team->tt.tt_nproc;
2919   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2920 #if OMP_45_ENABLED
2921   KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2922 #else
2923   KMP_DEBUG_ASSERT(nthreads > 1);
2924 #endif
2925   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2926 
2927   while (1) { // Outer loop keeps trying to find tasks in case of single thread
2928     // getting tasks from target constructs
2929     while (1) { // Inner loop to find a task and execute it
2930       task = NULL;
2931       if (use_own_tasks) { // check on own queue first
2932         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2933       }
2934       if ((task == NULL) && (nthreads > 1)) { // Steal a task
2935         int asleep = 1;
2936         use_own_tasks = 0;
2937         // Try to steal from the last place I stole from successfully.
2938         if (victim_tid == -2) { // haven't stolen anything yet
2939           victim_tid = threads_data[tid].td.td_deque_last_stolen;
2940           if (victim_tid !=
2941               -1) // if we have a last stolen from victim, get the thread
2942             other_thread = threads_data[victim_tid].td.td_thr;
2943         }
2944         if (victim_tid != -1) { // found last victim
2945           asleep = 0;
2946         } else if (!new_victim) { // no recent steals and we haven't already
2947           // used a new victim; select a random thread
2948           do { // Find a different thread to steal work from.
2949             // Pick a random thread. Initial plan was to cycle through all the
2950             // threads, and only return if we tried to steal from every thread,
2951             // and failed.  Arch says that's not such a great idea.
2952             victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2953             if (victim_tid >= tid) {
2954               ++victim_tid; // Adjusts random distribution to exclude self
2955             }
2956             // Found a potential victim
2957             other_thread = threads_data[victim_tid].td.td_thr;
2958             // There is a slight chance that __kmp_enable_tasking() did not wake
2959             // up all threads waiting at the barrier.  If victim is sleeping,
2960             // then wake it up. Since we were going to pay the cache miss
2961             // penalty for referencing another thread's kmp_info_t struct
2962             // anyway, the check shouldn't cost too much performance at this
2963             // point. In extra barrier mode, threads do not sleep at the
2964             // separate tasking barrier, so this isn't a problem.
2966             asleep = 0;
2967             if ((__kmp_tasking_mode == tskm_task_teams) &&
2968                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2969                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2970                  NULL)) {
2971               asleep = 1;
2972               __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2973                                         other_thread->th.th_sleep_loc);
2974               // A sleeping thread should not have any tasks on its queue.
2975               // There is a slight possibility that it resumes, steals a task
2976               // from another thread, which spawns more tasks, all in the time
2977               // that it takes this thread to check => don't write an assertion
2978               // that the victim's queue is empty.  Try stealing from a
2979               // different thread.
2980             }
2981           } while (asleep);
2982         }
2983 
2984         if (!asleep) {
2985           // We have a victim to try to steal from
2986           task = __kmp_steal_task(other_thread, gtid, task_team,
2987                                   unfinished_threads, thread_finished,
2988                                   is_constrained);
2989         }
2990         if (task != NULL) { // set last stolen to victim
2991           if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2992             threads_data[tid].td.td_deque_last_stolen = victim_tid;
2993             // The pre-refactored code did not try more than 1 successful new
2994             // victim, unless the last one generated more local tasks;
2995             // new_victim keeps track of this
2996             new_victim = 1;
2997           }
2998         } else { // No tasks found; unset last_stolen
2999           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3000           victim_tid = -2; // no successful victim found
3001         }
3002       }
3003 
3004       if (task == NULL) // break out of tasking loop
3005         break;
3006 
3007 // Found a task; execute it
3008 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3009       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3010         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3011           // get the object reliably
3012           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3013         }
3014         __kmp_itt_task_starting(itt_sync_obj);
3015       }
3016 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3017       __kmp_invoke_task(gtid, task, current_task);
3018 #if USE_ITT_BUILD
3019       if (itt_sync_obj != NULL)
3020         __kmp_itt_task_finished(itt_sync_obj);
3021 #endif /* USE_ITT_BUILD */
3022       // If this thread is only partway through the barrier and the condition is
3023       // met, then return now, so that the barrier gather/release pattern can
3024       // proceed. If this thread is in the last spin loop in the barrier,
3025       // waiting to be released, we know that the termination condition will not
3026       // be satisfied, so don't waste any cycles checking it.
3027       if (flag == NULL || (!final_spin && flag->done_check())) {
3028         KA_TRACE(
3029             15,
3030             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3031              gtid));
3032         return TRUE;
3033       }
3034       if (thread->th.th_task_team == NULL) {
3035         break;
3036       }
3037       KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3038       // If execution of a stolen task results in more tasks being placed on our
3039       // run queue, reset use_own_tasks
3040       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3041         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3042                       "other tasks, restart\n",
3043                       gtid));
3044         use_own_tasks = 1;
3045         new_victim = 0;
3046       }
3047     }
3048 
3049 // The task source has been exhausted. If in final spin loop of barrier, check
3050 // if termination condition is satisfied.
3051 #if OMP_45_ENABLED
3052     // The work queue may be empty but there might be proxy tasks still
3053     // executing
3054     if (final_spin &&
3055         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
3056 #else
3057     if (final_spin)
3058 #endif
3059     {
3060       // First, decrement the #unfinished threads, if that has not already been
3061       // done.  This decrement might be to the spin location, and result in the
3062       // termination condition being satisfied.
3063       if (!*thread_finished) {
3064         kmp_int32 count;
3065 
3066         count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
3067         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3068                       "unfinished_threads to %d task_team=%p\n",
3069                       gtid, count, task_team));
3070         *thread_finished = TRUE;
3071       }
3072 
3073       // It is now unsafe to reference thread->th.th_team !!!
3074       // Decrementing task_team->tt.tt_unfinished_threads can allow the master
3075       // thread to pass through the barrier, where it might reset each thread's
3076       // th.th_team field for the next parallel region. If we can steal more
3077       // work, we know that this has not happened yet.
3078       if (flag != NULL && flag->done_check()) {
3079         KA_TRACE(
3080             15,
3081             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3082              gtid));
3083         return TRUE;
3084       }
3085     }
3086 
3087     // If this thread's task team is NULL, master has recognized that there are
3088     // no more tasks; bail out
3089     if (thread->th.th_task_team == NULL) {
3090       KA_TRACE(15,
3091                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3092       return FALSE;
3093     }
3094 
3095 #if OMP_45_ENABLED
3096     // We could be getting tasks from target constructs; if this is the only
3097     // thread, keep trying to execute tasks from own queue
3098     if (nthreads == 1)
3099       use_own_tasks = 1;
3100     else
3101 #endif
3102     {
3103       KA_TRACE(15,
3104                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3105       return FALSE;
3106     }
3107   }
3108 }
3109 
3110 int __kmp_execute_tasks_32(
3111     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
3112     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3113     kmp_int32 is_constrained) {
3114   return __kmp_execute_tasks_template(
3115       thread, gtid, flag, final_spin,
3116       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3117 }
3118 
3119 int __kmp_execute_tasks_64(
3120     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3121     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3122     kmp_int32 is_constrained) {
3123   return __kmp_execute_tasks_template(
3124       thread, gtid, flag, final_spin,
3125       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3126 }
3127 
3128 int __kmp_execute_tasks_oncore(
3129     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3130     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3131     kmp_int32 is_constrained) {
3132   return __kmp_execute_tasks_template(
3133       thread, gtid, flag, final_spin,
3134       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3135 }
3136 
3137 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3138 // next barrier so they can assist in executing enqueued tasks.
3139 // First thread in allocates the task team atomically.
3140 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3141                                  kmp_info_t *this_thr) {
3142   kmp_thread_data_t *threads_data;
3143   int nthreads, i, is_init_thread;
3144 
3145   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3146                 __kmp_gtid_from_thread(this_thr)));
3147 
3148   KMP_DEBUG_ASSERT(task_team != NULL);
3149   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3150 
3151   nthreads = task_team->tt.tt_nproc;
3152   KMP_DEBUG_ASSERT(nthreads > 0);
3153   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3154 
3155   // Allocate or increase the size of threads_data if necessary
3156   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3157 
3158   if (!is_init_thread) {
3159     // Some other thread already set up the array.
3160     KA_TRACE(
3161         20,
3162         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3163          __kmp_gtid_from_thread(this_thr)));
3164     return;
3165   }
3166   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3167   KMP_DEBUG_ASSERT(threads_data != NULL);
3168 
3169   if (__kmp_tasking_mode == tskm_task_teams &&
3170       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3171     // Release any threads sleeping at the barrier, so that they can steal
3172       // tasks and execute them.  In extra barrier mode, threads do not sleep
3173     // at the separate tasking barrier, so this isn't a problem.
3174     for (i = 0; i < nthreads; i++) {
3175       volatile void *sleep_loc;
3176       kmp_info_t *thread = threads_data[i].td.td_thr;
3177 
3178       if (i == this_thr->th.th_info.ds.ds_tid) {
3179         continue;
3180       }
3181       // Since we haven't locked the thread's suspend mutex lock at this
3182       // point, there is a small window where a thread might be putting
3183       // itself to sleep, but hasn't set the th_sleep_loc field yet.
3184       // To work around this, __kmp_execute_tasks_template() periodically
3185       // checks to see if other threads are sleeping (using the same random
3186       // mechanism that is used for task stealing) and awakens them if they are.
3187       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3188           NULL) {
3189         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3190                       __kmp_gtid_from_thread(this_thr),
3191                       __kmp_gtid_from_thread(thread)));
3192         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3193       } else {
3194         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3195                       __kmp_gtid_from_thread(this_thr),
3196                       __kmp_gtid_from_thread(thread)));
3197       }
3198     }
3199   }
3200 
3201   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3202                 __kmp_gtid_from_thread(this_thr)));
3203 }
3204 
3205 /* // TODO: Check the comment consistency
3206  * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
3207  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
3208  * After a child thread checks into a barrier and calls __kmp_release() from
3209  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3210  * longer assume that the kmp_team_t structure is intact (at any moment, the
3211  * master thread may exit the barrier code and free the team data structure,
3212  * and return the threads to the thread pool).
3213  *
3214  * This does not work with the tasking code, as the thread is still
3215  * expected to participate in the execution of any tasks that may have been
3216  * spawned by a member of the team, and the thread still needs access to
3217  * each of the other threads in the team, so that it can steal work from them.
3218  *
3219  * Enter the existence of the kmp_task_team_t struct.  It employs a reference
3220  * counting mechanism, and is allocated by the master thread before calling
3221  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3222  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
3223  * of the kmp_task_team_t structs for consecutive barriers can overlap
3224  * (and will, unless the master thread is the last thread to exit the barrier
3225  * release phase, which is not typical).
3226  *
3227  * The existence of such a struct is useful outside the context of tasking,
3228  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
3229  * so that any performance differences show up when comparing the 2.5 vs. 3.0
3230  * libraries.
3231  *
3232  * We currently use the existence of the threads array as an indicator that
3233  * tasks were spawned since the last barrier.  If the structure is to be
3234  * useful outside the context of tasking, then this will have to change, but
3235  * not setting the field minimizes the performance impact of tasking on
3236  * barriers, when no explicit tasks were spawned (pushed, actually).
3237  */
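/* Within this file that lifecycle is driven by three entry points:
 * __kmp_task_team_setup() (the master thread allocates or recycles the
 * structs), __kmp_task_team_sync() (each thread picks up the new task team
 * pointer after the barrier release phase), and __kmp_task_team_wait() (the
 * master thread waits for outstanding tasks and then deactivates the struct).
 */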
3238 
3239 static kmp_task_team_t *__kmp_free_task_teams =
3240     NULL; // Free list for task_team data structures
3241 // Lock for task team data structures
3242 kmp_bootstrap_lock_t __kmp_task_team_lock =
3243     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3244 
3245 // __kmp_alloc_task_deque:
3246 // Allocates a task deque for a particular thread and initializes the necessary
3247 // data structures relating to the deque.  This only happens once per thread
3248 // per task team since task teams are recycled. No lock is needed during
3249 // allocation since each thread allocates its own deque.
3250 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3251                                    kmp_thread_data_t *thread_data) {
3252   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3253   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3254 
3255   // Initialize last stolen task field to "none"
3256   thread_data->td.td_deque_last_stolen = -1;
3257 
3258   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3259   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3260   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3261 
3262   KE_TRACE(
3263       10,
3264       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3265        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3266   // Allocate space for task deque, and zero the deque
3267   // Cannot use __kmp_thread_calloc() because threads not around for
3268   // kmp_reap_task_team( ).
3269   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3270       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3271   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3272 }
3273 
3274 // __kmp_free_task_deque:
3275 // Deallocates a task deque for a particular thread. Happens at library
3276 // deallocation so don't need to reset all thread data fields.
3277 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3278   if (thread_data->td.td_deque != NULL) {
3279     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3280     TCW_4(thread_data->td.td_deque_ntasks, 0);
3281     __kmp_free(thread_data->td.td_deque);
3282     thread_data->td.td_deque = NULL;
3283     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3284   }
3285 
3286 #ifdef BUILD_TIED_TASK_STACK
3287   // GEH: Figure out what to do here for td_susp_tied_tasks
3288   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3289     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3290   }
3291 #endif // BUILD_TIED_TASK_STACK
3292 }
3293 
3294 // __kmp_realloc_task_threads_data:
3295 // Allocates a threads_data array for a task team, either by allocating an
3296 // initial array or enlarging an existing array.  Only the first thread to get
3297 // the lock allocates or enlarges the array and re-initializes its elements.
3298 // That thread returns "TRUE", the rest return "FALSE".
3299 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3300 // The current size is given by task_team -> tt.tt_max_threads.
3301 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3302                                            kmp_task_team_t *task_team) {
3303   kmp_thread_data_t **threads_data_p;
3304   kmp_int32 nthreads, maxthreads;
3305   int is_init_thread = FALSE;
3306 
3307   if (TCR_4(task_team->tt.tt_found_tasks)) {
3308     // Already reallocated and initialized.
3309     return FALSE;
3310   }
3311 
3312   threads_data_p = &task_team->tt.tt_threads_data;
3313   nthreads = task_team->tt.tt_nproc;
3314   maxthreads = task_team->tt.tt_max_threads;
3315 
3316   // All threads must lock when they encounter the first task of the implicit
3317   // task region to make sure threads_data fields are (re)initialized before
3318   // used.
3319   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3320 
3321   if (!TCR_4(task_team->tt.tt_found_tasks)) {
3322     // first thread to enable tasking
3323     kmp_team_t *team = thread->th.th_team;
3324     int i;
3325 
3326     is_init_thread = TRUE;
3327     if (maxthreads < nthreads) {
3328 
3329       if (*threads_data_p != NULL) {
3330         kmp_thread_data_t *old_data = *threads_data_p;
3331         kmp_thread_data_t *new_data = NULL;
3332 
3333         KE_TRACE(
3334             10,
3335             ("__kmp_realloc_task_threads_data: T#%d reallocating "
3336              "threads data for task_team %p, new_size = %d, old_size = %d\n",
3337              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3338         // Reallocate threads_data to have more elements than current array
3339         // Cannot use __kmp_thread_realloc() because threads not around for
3340         // kmp_reap_task_team( ).  Note all new array entries are initialized
3341         // to zero by __kmp_allocate().
3342         new_data = (kmp_thread_data_t *)__kmp_allocate(
3343             nthreads * sizeof(kmp_thread_data_t));
3344         // copy old data to new data
3345         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3346                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3347 
3348 #ifdef BUILD_TIED_TASK_STACK
3349         // GEH: Figure out if this is the right thing to do
3350         for (i = maxthreads; i < nthreads; i++) {
3351           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3352           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3353         }
3354 #endif // BUILD_TIED_TASK_STACK
3355         // Install the new data and free the old data
3356         (*threads_data_p) = new_data;
3357         __kmp_free(old_data);
3358       } else {
3359         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3360                       "threads data for task_team %p, size = %d\n",
3361                       __kmp_gtid_from_thread(thread), task_team, nthreads));
3362         // Make the initial allocate for threads_data array, and zero entries
3363         // Cannot use __kmp_thread_calloc() because threads not around for
3364         // kmp_reap_task_team( ).
3365         ANNOTATE_IGNORE_WRITES_BEGIN();
3366         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3367             nthreads * sizeof(kmp_thread_data_t));
3368         ANNOTATE_IGNORE_WRITES_END();
3369 #ifdef BUILD_TIED_TASK_STACK
3370         // GEH: Figure out if this is the right thing to do
3371         for (i = 0; i < nthreads; i++) {
3372           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3373           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3374         }
3375 #endif // BUILD_TIED_TASK_STACK
3376       }
3377       task_team->tt.tt_max_threads = nthreads;
3378     } else {
3379       // If array has (more than) enough elements, go ahead and use it
3380       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3381     }
3382 
3383     // initialize threads_data pointers back to thread_info structures
3384     for (i = 0; i < nthreads; i++) {
3385       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3386       thread_data->td.td_thr = team->t.t_threads[i];
3387 
3388       if (thread_data->td.td_deque_last_stolen >= nthreads) {
3389         // The last stolen field survives across teams / barrier, and the number
3390         // of threads may have changed.  It's possible (likely?) that a new
3391         // parallel region will exhibit the same behavior as the previous one.
3392         thread_data->td.td_deque_last_stolen = -1;
3393       }
3394     }
3395 
3396     KMP_MB();
3397     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3398   }
3399 
3400   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3401   return is_init_thread;
3402 }
3403 
3404 // __kmp_free_task_threads_data:
3405 // Deallocates a threads_data array for a task team, including any attached
3406 // tasking deques.  Only occurs at library shutdown.
3407 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3408   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3409   if (task_team->tt.tt_threads_data != NULL) {
3410     int i;
3411     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3412       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3413     }
3414     __kmp_free(task_team->tt.tt_threads_data);
3415     task_team->tt.tt_threads_data = NULL;
3416   }
3417   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3418 }
3419 
3420 // __kmp_allocate_task_team:
3421 // Allocates a task team associated with a specific team, taking it from
3422 // the global task team free list if possible.  Also initializes data
3423 // structures.
3424 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3425                                                  kmp_team_t *team) {
3426   kmp_task_team_t *task_team = NULL;
3427   int nthreads;
3428 
3429   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3430                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3431 
3432   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3433     // Take a task team from the task team pool
3434     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3435     if (__kmp_free_task_teams != NULL) {
3436       task_team = __kmp_free_task_teams;
3437       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3438       task_team->tt.tt_next = NULL;
3439     }
3440     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3441   }
3442 
3443   if (task_team == NULL) {
3444     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3445                   "task team for team %p\n",
3446                   __kmp_gtid_from_thread(thread), team));
3447     // Allocate a new task team if one is not available.
3448     // Cannot use __kmp_thread_malloc() because threads not around for
3449     // kmp_reap_task_team( ).
3450     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3451     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3452     // AC: __kmp_allocate zeroes returned memory
3453     // task_team -> tt.tt_threads_data = NULL;
3454     // task_team -> tt.tt_max_threads = 0;
3455     // task_team -> tt.tt_next = NULL;
3456   }
3457 
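  // These fields are (re)initialized whether the task team was freshly
  // allocated above or taken from the free list.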
3458   TCW_4(task_team->tt.tt_found_tasks, FALSE);
3459 #if OMP_45_ENABLED
3460   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3461 #endif
3462   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3463 
3464   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3465   TCW_4(task_team->tt.tt_active, TRUE);
3466 
3467   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3468                 "unfinished_threads init'd to %d\n",
3469                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3470                 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3471   return task_team;
3472 }
3473 
3474 // __kmp_free_task_team:
3475 // Frees the task team associated with a specific thread, and adds it
3476 // to the global task team free list.
3477 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3478   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3479                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3480 
3481   // Put task team back on free list
3482   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3483 
3484   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3485   task_team->tt.tt_next = __kmp_free_task_teams;
3486   TCW_PTR(__kmp_free_task_teams, task_team);
3487 
3488   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3489 }
3490 
3491 // __kmp_reap_task_teams:
3492 // Free all the task teams on the task team free list.
3493 // Should only be done during library shutdown.
3494 // Cannot do anything that needs a thread structure or gtid since they are
3495 // already gone.
3496 void __kmp_reap_task_teams(void) {
3497   kmp_task_team_t *task_team;
3498 
3499   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3500     // Free all task_teams on the free list
3501     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3502     while ((task_team = __kmp_free_task_teams) != NULL) {
3503       __kmp_free_task_teams = task_team->tt.tt_next;
3504       task_team->tt.tt_next = NULL;
3505 
3506       // Free threads_data if necessary
3507       if (task_team->tt.tt_threads_data != NULL) {
3508         __kmp_free_task_threads_data(task_team);
3509       }
3510       __kmp_free(task_team);
3511     }
3512     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3513   }
3514 }
3515 
3516 // __kmp_wait_to_unref_task_teams:
3517 // Some threads could still be in the fork barrier release code, possibly
3518 // trying to steal tasks.  Wait for each thread to unreference its task team.
3519 void __kmp_wait_to_unref_task_teams(void) {
3520   kmp_info_t *thread;
3521   kmp_uint32 spins;
3522   int done;
3523 
3524   KMP_INIT_YIELD(spins);
3525 
3526   for (;;) {
3527     done = TRUE;
3528 
3529     // TODO: GEH - this may be wrong because some sync would be necessary
3530     // in case threads are added to the pool during the traversal. Need to
3531     // verify that lock for thread pool is held when calling this routine.
3532     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3533          thread = thread->th.th_next_pool) {
3534 #if KMP_OS_WINDOWS
3535       DWORD exit_val;
3536 #endif
3537       if (TCR_PTR(thread->th.th_task_team) == NULL) {
3538         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3539                       __kmp_gtid_from_thread(thread)));
3540         continue;
3541       }
3542 #if KMP_OS_WINDOWS
3543       // TODO: GEH - add this check for Linux* OS / OS X* as well?
3544       if (!__kmp_is_thread_alive(thread, &exit_val)) {
3545         thread->th.th_task_team = NULL;
3546         continue;
3547       }
3548 #endif
3549 
3550       done = FALSE; // Because th_task_team pointer is not NULL for this thread
3551 
3552       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3553                     "unreference task_team\n",
3554                     __kmp_gtid_from_thread(thread)));
3555 
3556       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3557         volatile void *sleep_loc;
3558         // If the thread is sleeping, awaken it.
3559         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3560             NULL) {
3561           KA_TRACE(
3562               10,
3563               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3564                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3565           __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3566         }
3567       }
3568     }
3569     if (done) {
3570       break;
3571     }
3572 
3573     // If oversubscribed or have waited a bit, yield.
3574     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3575   }
3576 }
3577 
3578 // __kmp_task_team_setup:  Create a task_team for the current team, but use
3579 // an already created, unused one if it already exists.
3580 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3581   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3582 
3583   // If this task_team hasn't been created yet, allocate it. It will be used in
3584   // the region after the next.
3585   // If it exists, it is the current task team and shouldn't be touched yet as
3586   // it may still be in use.
3587   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3588       (always || team->t.t_nproc > 1)) {
3589     team->t.t_task_team[this_thr->th.th_task_state] =
3590         __kmp_allocate_task_team(this_thr, team);
3591     KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3592                   "for team %d at parity=%d\n",
3593                   __kmp_gtid_from_thread(this_thr),
3594                   team->t.t_task_team[this_thr->th.th_task_state],
3595                   ((team != NULL) ? team->t.t_id : -1),
3596                   this_thr->th.th_task_state));
3597   }
3598 
3599   // After threads exit the release, they will call sync, and then point to this
3600   // other task_team; make sure it is allocated and properly initialized. As
3601   // threads spin in the barrier release phase, they will continue to use the
3602   // previous task_team struct(above), until they receive the signal to stop
3603   // checking for tasks (they can't safely reference the kmp_team_t struct,
3604   // which could be reallocated by the master thread). No task teams are formed
3605   // for serialized teams.
3606   if (team->t.t_nproc > 1) {
3607     int other_team = 1 - this_thr->th.th_task_state;
3608     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3609       team->t.t_task_team[other_team] =
3610           __kmp_allocate_task_team(this_thr, team);
3611       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3612                     "task_team %p for team %d at parity=%d\n",
3613                     __kmp_gtid_from_thread(this_thr),
3614                     team->t.t_task_team[other_team],
3615                     ((team != NULL) ? team->t.t_id : -1), other_team));
3616     } else { // Leave the old task team struct in place for the upcoming region;
3617       // adjust as needed
3618       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3619       if (!task_team->tt.tt_active ||
3620           team->t.t_nproc != task_team->tt.tt_nproc) {
3621         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3622         TCW_4(task_team->tt.tt_found_tasks, FALSE);
3623 #if OMP_45_ENABLED
3624         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3625 #endif
3626         KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3627                           team->t.t_nproc);
3628         TCW_4(task_team->tt.tt_active, TRUE);
3629       }
3630       // if team size has changed, the first thread to enable tasking will
3631       // realloc threads_data if necessary
3632       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3633                     "%p for team %d at parity=%d\n",
3634                     __kmp_gtid_from_thread(this_thr),
3635                     team->t.t_task_team[other_team],
3636                     ((team != NULL) ? team->t.t_id : -1), other_team));
3637     }
3638   }
3639 }
3640 
3641 // __kmp_task_team_sync: Propagation of task team data from team to threads
3642 // which happens just after the release phase of a team barrier.  This may be
3643 // called by any thread, but only for teams with # threads > 1.
3644 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3645   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3646 
3647   // Toggle the th_task_state field, to switch which task_team this thread
3648   // refers to
3649   this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3650   // It is now safe to propagate the task team pointer from the team struct to
3651   // the current thread.
3652   TCW_PTR(this_thr->th.th_task_team,
3653           team->t.t_task_team[this_thr->th.th_task_state]);
3654   KA_TRACE(20,
3655            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3656             "%p from Team #%d (parity=%d)\n",
3657             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3658             ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3659 }
3660 
3661 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3662 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3663 // if proxy tasks were created.
3664 //
3665 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3666 // by passing in 0 optionally as the last argument. When wait is zero, master
3667 // thread does not wait for unfinished_threads to reach 0.
3668 void __kmp_task_team_wait(
3669     kmp_info_t *this_thr,
3670     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3671   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3672 
3673   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3674   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3675 
3676   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3677     if (wait) {
3678       KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3679                     "(for unfinished_threads to reach 0) on task_team = %p\n",
3680                     __kmp_gtid_from_thread(this_thr), task_team));
3681       // Worker threads may have dropped through to release phase, but could
3682       // still be executing tasks. Wait here for tasks to complete. To avoid
3683       // memory contention, only master thread checks termination condition.
3684       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3685                              &task_team->tt.tt_unfinished_threads),
3686                        0U);
3687       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3688     }
3689     // Deactivate the old task team, so that the worker threads will stop
3690     // referencing it while spinning.
3691     KA_TRACE(
3692         20,
3693         ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3694          "setting active to false, setting local and team's pointer to NULL\n",
3695          __kmp_gtid_from_thread(this_thr), task_team));
3696 #if OMP_45_ENABLED
3697     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3698                      task_team->tt.tt_found_proxy_tasks == TRUE);
3699     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3700 #else
3701     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
3702 #endif
3703     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3704     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3705     KMP_MB();
3706 
3707     TCW_PTR(this_thr->th.th_task_team, NULL);
3708   }
3709 }
3710 
3711 // __kmp_tasking_barrier:
3712 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
3713 // Internal function to execute all tasks prior to a regular barrier or a join
3714 // barrier. It is a full barrier itself, which unfortunately turns regular
3715 // barriers into double barriers and join barriers into 1 1/2 barriers.
3716 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3717   std::atomic<kmp_uint32> *spin = RCAST(
3718       std::atomic<kmp_uint32> *,
3719       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3720   int flag = FALSE;
3721   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3722 
3723 #if USE_ITT_BUILD
3724   KMP_FSYNC_SPIN_INIT(spin, NULL);
3725 #endif /* USE_ITT_BUILD */
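  // Execute tasks until the task team's unfinished_threads counter, which
  // 'spin' points at, reaches zero.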
3726   kmp_flag_32 spin_flag(spin, 0U);
3727   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3728                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3729 #if USE_ITT_BUILD
3730     // TODO: What about itt_sync_obj??
3731     KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3732 #endif /* USE_ITT_BUILD */
3733 
3734     if (TCR_4(__kmp_global.g.g_done)) {
3735       if (__kmp_global.g.g_abort)
3736         __kmp_abort_thread();
3737       break;
3738     }
3739     KMP_YIELD(TRUE);
3740   }
3741 #if USE_ITT_BUILD
3742   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3743 #endif /* USE_ITT_BUILD */
3744 }
3745 
3746 #if OMP_45_ENABLED
3747 
3748 // __kmp_give_task puts a task into a given thread's queue if:
3749 //  - the queue for that thread was created
3750 //  - there's space in that queue
3751 // Because of this, __kmp_push_task needs to check if there's space after
3752 // getting the lock
3753 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3754                             kmp_int32 pass) {
3755   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3756   kmp_task_team_t *task_team = taskdata->td_task_team;
3757 
3758   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3759                 taskdata, tid));
3760 
3761   // If task_team is NULL something went really bad...
3762   KMP_DEBUG_ASSERT(task_team != NULL);
3763 
3764   bool result = false;
3765   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3766 
3767   if (thread_data->td.td_deque == NULL) {
3768     // There's no queue in this thread, go find another one
3769     // We're guaranteed that at least one thread has a queue
3770     KA_TRACE(30,
3771              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3772               tid, taskdata));
3773     return result;
3774   }
3775 
3776   if (TCR_4(thread_data->td.td_deque_ntasks) >=
3777       TASK_DEQUE_SIZE(thread_data->td)) {
3778     KA_TRACE(
3779         30,
3780         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3781          taskdata, tid));
3782 
3783     // if this deque is bigger than the pass ratio give a chance to another
3784     // thread
3785     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3786       return result;
3787 
3788     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3789     __kmp_realloc_task_deque(thread, thread_data);
3790 
3791   } else {
3792 
3793     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3794 
3795     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3796         TASK_DEQUE_SIZE(thread_data->td)) {
3797       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3798                     "thread %d.\n",
3799                     taskdata, tid));
3800 
3801       // if this deque is bigger than the pass ratio give a chance to another
3802       // thread
3803       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3804         goto release_and_exit;
3805 
3806       __kmp_realloc_task_deque(thread, thread_data);
3807     }
3808   }
3809 
3810   // lock is held here, and there is space in the deque
3811 
3812   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3813   // Wrap index.
3814   thread_data->td.td_deque_tail =
3815       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3816   TCW_4(thread_data->td.td_deque_ntasks,
3817         TCR_4(thread_data->td.td_deque_ntasks) + 1);
3818 
3819   result = true;
3820   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3821                 taskdata, tid));
3822 
3823 release_and_exit:
3824   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3825 
3826   return result;
3827 }
3828 
3829 /* The finish of the proxy tasks is divided in two pieces:
3830     - the top half is the one that can be done from a thread outside the team
3831     - the bottom half must be run from a thread within the team
3832 
3833    In order to run the bottom half the task gets queued back into one of the
3834    threads of the team. Once the td_incomplete_child_tasks counter of the parent
3835    is decremented the threads can leave the barriers. So, the bottom half needs
3836    to be queued before the counter is decremented. The top half is therefore
3837    divided in two parts:
3838     - things that can be run before queuing the bottom half
3839     - things that must be run after queuing the bottom half
3840 
3841    This creates a second race as the bottom half can free the task before the
3842    second top half is executed. To avoid this we use the
3843    td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
3844    half. */
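// The entry points below follow this scheme: __kmpc_proxy_task_completed()
// runs both top halves and the bottom half directly on a thread of the team,
// while __kmpc_proxy_task_completed_ooo() runs the first top half, queues the
// bottom half to a thread of the team, and only then runs the second top half.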
3845 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3846   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3847   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3848   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3849   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3850 
3851   taskdata->td_flags.complete = 1; // mark the task as completed
3852 
3853   if (taskdata->td_taskgroup)
3854     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3855 
3856   // Create an imaginary child for this task so the bottom half cannot
3857   // release the task before we have completed the second top half
3858   KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3859 }
3860 
3861 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3862   kmp_int32 children = 0;
3863 
3864   // Predecrement simulated by "- 1" calculation
3865   children =
3866       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3867   KMP_DEBUG_ASSERT(children >= 0);
3868 
3869   // Remove the imaginary child
3870   KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3871 }
3872 
3873 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3874   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3875   kmp_info_t *thread = __kmp_threads[gtid];
3876 
3877   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3878   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3879                    1); // top half must run before bottom half
3880 
3881   // We need to wait to make sure the top half is finished
3882   // Spinning here should be ok as this should happen quickly
3883   while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3884     ;
3885 
3886   __kmp_release_deps(gtid, taskdata);
3887   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3888 }
3889 
3890 /*!
3891 @ingroup TASKING
3892 @param gtid Global Thread ID of encountering thread
3893 @param ptask Task whose execution is completed
3894 
3895 Execute the completion of a proxy task from a thread that is part of the
3896 team. Run the top and bottom halves directly.
3897 */
3898 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3899   KMP_DEBUG_ASSERT(ptask != NULL);
3900   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3901   KA_TRACE(
3902       10, ("__kmpc_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3903            gtid, taskdata));
3904 
3905   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3906 
3907   __kmp_first_top_half_finish_proxy(taskdata);
3908   __kmp_second_top_half_finish_proxy(taskdata);
3909   __kmp_bottom_half_finish_proxy(gtid, ptask);
3910 
3911   KA_TRACE(10,
3912            ("__kmpc_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3913             gtid, taskdata));
3914 }
3915 
3916 /*!
3917 @ingroup TASKING
3918 @param ptask Task whose execution is completed
3919 
3920 Execute the completion of a proxy task from a thread that might not belong to
3921 the team.
3922 */
3923 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3924   KMP_DEBUG_ASSERT(ptask != NULL);
3925   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3926 
3927   KA_TRACE(
3928       10,
3929       ("__kmpc_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3930        taskdata));
3931 
3932   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3933 
3934   __kmp_first_top_half_finish_proxy(taskdata);
3935 
3936   // Enqueue the task so that its bottom half is completed from a thread
3937   // within the corresponding team
3938   kmp_team_t *team = taskdata->td_team;
3939   kmp_int32 nthreads = team->t.t_nproc;
3940   kmp_info_t *thread;
3941 
3942   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3943   // but we cannot use __kmp_get_random here
3944   kmp_int32 start_k = 0;
3945   kmp_int32 pass = 1;
3946   kmp_int32 k = start_k;
3947 
3948   do {
3949     // For now we're just linearly trying to find a thread
3950     thread = team->t.t_threads[k];
3951     k = (k + 1) % nthreads;
3952 
3953     // we did a full pass through all the threads
3954     if (k == start_k)
3955       pass = pass << 1;
3956 
3957   } while (!__kmp_give_task(thread, k, ptask, pass));
3958 
3959   __kmp_second_top_half_finish_proxy(taskdata);
3960 
3961   KA_TRACE(
3962       10,
3963       ("__kmpc_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3964        taskdata));
3965 }
3966 
3967 #if OMP_50_ENABLED
3968 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3969                                                 kmp_task_t *task) {
3970   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3971   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3972     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
3973     td->td_allow_completion_event.ed.task = task;
3974     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
3975   }
3976   return &td->td_allow_completion_event;
3977 }
3978 
3979 void __kmp_fulfill_event(kmp_event_t *event) {
3980   if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
3981     kmp_task_t *ptask = event->ed.task;
3982     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3983     bool detached = false;
3984     int gtid = __kmp_get_gtid();
3985 
3986     if (taskdata->td_flags.proxy == TASK_PROXY) {
3987       // The associated task code completed before this call and detached.
3988       detached = true;
3989       event->type = KMP_EVENT_UNINITIALIZED;
3990     } else {
3991       // The associated task has not completed but could be completing at this
3992       // point.
3993       // We need to take the lock to avoid races
3994       __kmp_acquire_tas_lock(&event->lock, gtid);
3995       if (taskdata->td_flags.proxy == TASK_PROXY)
3996         detached = true;
3997       event->type = KMP_EVENT_UNINITIALIZED;
3998       __kmp_release_tas_lock(&event->lock, gtid);
3999     }
4000 
4001     if (detached) {
4002       // If the task detached, complete the proxy task
4003       if (gtid >= 0) {
4004         kmp_team_t *team = taskdata->td_team;
4005         kmp_info_t *thread = __kmp_get_thread();
4006         if (thread->th.th_team == team) {
4007           __kmpc_proxy_task_completed(gtid, ptask);
4008           return;
4009         }
4010       }
4011 
4012       // fallback
4013       __kmpc_proxy_task_completed_ooo(ptask);
4014     }
4015   }
4016 }
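
// Illustrative user-level sketch (not part of the runtime): with a compiler
// that lowers the OpenMP 5.0 detach clause through
// __kmpc_task_allow_completion_event and maps omp_fulfill_event to
// __kmp_fulfill_event, the code above is reached roughly like this
// (start_async_work is a hypothetical helper):
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   { start_async_work(ev); }  // the task body returns without fulfilling ev
//   ...
//   omp_fulfill_event(ev);     // possibly from a thread outside the team; if
//                              // the task body has already finished (proxy),
//                              // its completion is executed right here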
4017 #endif
4018 
4019 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4020 // for taskloop
4021 //
4022 // thread:   allocating thread
4023 // task_src: pointer to source task to be duplicated
4024 // returns:  a pointer to the allocated kmp_task_t structure (task).
4025 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4026   kmp_task_t *task;
4027   kmp_taskdata_t *taskdata;
4028   kmp_taskdata_t *taskdata_src;
4029   kmp_taskdata_t *parent_task = thread->th.th_current_task;
4030   size_t shareds_offset;
4031   size_t task_size;
4032 
4033   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4034                 task_src));
4035   taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4036   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4037                    TASK_FULL); // it should not be a proxy task
4038   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4039   task_size = taskdata_src->td_size_alloc;
4040 
4041   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4042   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4043                 task_size));
4044 #if USE_FAST_MEMORY
4045   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4046 #else
4047   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4048 #endif /* USE_FAST_MEMORY */
4049   KMP_MEMCPY(taskdata, taskdata_src, task_size);
4050 
4051   task = KMP_TASKDATA_TO_TASK(taskdata);
4052 
4053   // Initialize new task (only the fields that must differ from the source task)
4054   taskdata->td_task_id = KMP_GEN_TASK_ID();
4055   if (task->shareds != NULL) { // need to set up the shareds pointer
4056     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4057     task->shareds = &((char *)taskdata)[shareds_offset];
4058     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4059                      0);
4060   }
4061   taskdata->td_alloc_thread = thread;
4062   taskdata->td_parent = parent_task;
4063   taskdata->td_taskgroup =
4064       parent_task
4065           ->td_taskgroup; // task inherits the taskgroup from the parent task
4066 
4067   // Only need to keep track of child task counts if team parallel and tasking
4068   // not serialized
4069   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4070     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4071     if (parent_task->td_taskgroup)
4072       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4073     // Only need to keep track of allocated child tasks for explicit tasks,
4074     // since implicit tasks are not deallocated
4075     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4076       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4077   }
4078 
4079   KA_TRACE(20,
4080            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4081             thread, taskdata, taskdata->td_parent));
4082 #if OMPT_SUPPORT
4083   if (UNLIKELY(ompt_enabled.enabled))
4084     __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4085 #endif
4086   return task;
4087 }
4088 
4089 // Routine optionally generated by the compiler for setting the lastprivate flag
4090 // and calling needed constructors for private/firstprivate objects
4091 // (used to form taskloop tasks from the pattern task)
4092 // Parameters: dest task, src task, lastprivate flag.
4093 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4094 
4095 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4096 
4097 // Class to encapsulate manipulating loop bounds in a taskloop task.
4098 // This abstracts away the Intel vs GOMP taskloop interface for setting/getting
4099 // the loop bound variables.
4100 class kmp_taskloop_bounds_t {
4101   kmp_task_t *task;
4102   const kmp_taskdata_t *taskdata;
4103   size_t lower_offset;
4104   size_t upper_offset;
4105 
4106 public:
4107   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4108       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4109         lower_offset((char *)lb - (char *)task),
4110         upper_offset((char *)ub - (char *)task) {
4111     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4112     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4113   }
4114   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4115       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4116         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4117   size_t get_lower_offset() const { return lower_offset; }
4118   size_t get_upper_offset() const { return upper_offset; }
4119   kmp_uint64 get_lb() const {
4120     kmp_int64 retval;
4121 #if defined(KMP_GOMP_COMPAT)
4122     // Intel task just returns the lower bound normally
4123     if (!taskdata->td_flags.native) {
4124       retval = *(kmp_int64 *)((char *)task + lower_offset);
4125     } else {
4126       // GOMP task has to take into account the sizeof(long)
4127       if (taskdata->td_size_loop_bounds == 4) {
4128         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4129         retval = (kmp_int64)*lb;
4130       } else {
4131         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4132         retval = (kmp_int64)*lb;
4133       }
4134     }
4135 #else
4136     retval = *(kmp_int64 *)((char *)task + lower_offset);
4137 #endif // defined(KMP_GOMP_COMPAT)
4138     return retval;
4139   }
4140   kmp_uint64 get_ub() const {
4141     kmp_int64 retval;
4142 #if defined(KMP_GOMP_COMPAT)
4143     // Intel task just returns the upper bound normally
4144     if (!taskdata->td_flags.native) {
4145       retval = *(kmp_int64 *)((char *)task + upper_offset);
4146     } else {
4147       // GOMP task has to take into account the sizeof(long)
4148       if (taskdata->td_size_loop_bounds == 4) {
4149         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4150         retval = (kmp_int64)*ub;
4151       } else {
4152         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4153         retval = (kmp_int64)*ub;
4154       }
4155     }
4156 #else
4157     retval = *(kmp_int64 *)((char *)task + upper_offset);
4158 #endif // defined(KMP_GOMP_COMPAT)
4159     return retval;
4160   }
4161   void set_lb(kmp_uint64 lb) {
4162 #if defined(KMP_GOMP_COMPAT)
4163     // Intel task just sets the lower bound normally
4164     if (!taskdata->td_flags.native) {
4165       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4166     } else {
4167       // GOMP task has to take into account the sizeof(long)
4168       if (taskdata->td_size_loop_bounds == 4) {
4169         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4170         *lower = (kmp_uint32)lb;
4171       } else {
4172         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4173         *lower = (kmp_uint64)lb;
4174       }
4175     }
4176 #else
4177     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4178 #endif // defined(KMP_GOMP_COMPAT)
4179   }
4180   void set_ub(kmp_uint64 ub) {
4181 #if defined(KMP_GOMP_COMPAT)
4182     // Intel task just sets the upper bound normally
4183     if (!taskdata->td_flags.native) {
4184       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4185     } else {
4186       // GOMP task has to take into account the sizeof(long)
4187       if (taskdata->td_size_loop_bounds == 4) {
4188         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4189         *upper = (kmp_uint32)ub;
4190       } else {
4191         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4192         *upper = (kmp_uint64)ub;
4193       }
4194     }
4195 #else
4196     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4197 #endif // defined(KMP_GOMP_COMPAT)
4198   }
4199 };
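
// Example of the two layouts handled by kmp_taskloop_bounds_t above
// (illustrative; LP64 assumed for the GOMP case):
//   Intel-style task: the bounds live at fixed offsets inside the kmp_task_t
//     captured by the constructor, e.g.
//       lb = *(kmp_uint64 *)((char *)task + lower_offset);
//   GOMP (td_flags.native) task with td_size_loop_bounds == 8: the bounds are
//     the first two long-sized slots of task->shareds, e.g.
//       lb = ((kmp_int64 *)task->shareds)[0];
//       ub = ((kmp_int64 *)task->shareds)[1];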
4200 
4201 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4202 //
4203 // loc        Source location information
4204 // gtid       Global thread ID
4205 // task       Pattern task, exposes the loop iteration range
4206 // lb         Pointer to loop lower bound in task structure
4207 // ub         Pointer to loop upper bound in task structure
4208 // st         Loop stride
4209 // ub_glob    Global upper bound (used for lastprivate check)
4210 // num_tasks  Number of tasks to execute
4211 // grainsize  Number of loop iterations per task
4212 // extras     Number of chunks with grainsize+1 iterations
4213 // tc         Iteration count
4214 // task_dup   Tasks duplication routine
4215 // codeptr_ra Return address for OMPT events
4216 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4217                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4218                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4219                            kmp_uint64 grainsize, kmp_uint64 extras,
4220                            kmp_uint64 tc,
4221 #if OMPT_SUPPORT
4222                            void *codeptr_ra,
4223 #endif
4224                            void *task_dup) {
4225   KMP_COUNT_BLOCK(OMP_TASKLOOP);
4226   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4227   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4228   // compiler provides global bounds here
4229   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4230   kmp_uint64 lower = task_bounds.get_lb();
4231   kmp_uint64 upper = task_bounds.get_ub();
4232   kmp_uint64 i;
4233   kmp_info_t *thread = __kmp_threads[gtid];
4234   kmp_taskdata_t *current_task = thread->th.th_current_task;
4235   kmp_task_t *next_task;
4236   kmp_int32 lastpriv = 0;
4237 
4238   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4239   KMP_DEBUG_ASSERT(num_tasks > extras);
4240   KMP_DEBUG_ASSERT(num_tasks > 0);
4241   KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4242                 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4243                 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
4244                 task_dup));
4245 
4246   // Launch num_tasks tasks, assign grainsize iterations each task
4247   for (i = 0; i < num_tasks; ++i) {
4248     kmp_uint64 chunk_minus_1;
4249     if (extras == 0) {
4250       chunk_minus_1 = grainsize - 1;
4251     } else {
4252       chunk_minus_1 = grainsize;
4253       --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1)
4254     }
4255     upper = lower + st * chunk_minus_1;
4256     if (i == num_tasks - 1) {
4257       // schedule the last task, set lastprivate flag if needed
4258       if (st == 1) { // most common case
4259         KMP_DEBUG_ASSERT(upper == *ub);
4260         if (upper == ub_glob)
4261           lastpriv = 1;
4262       } else if (st > 0) { // positive loop stride
4263         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4264         if ((kmp_uint64)st > ub_glob - upper)
4265           lastpriv = 1;
4266       } else { // negative loop stride
4267         KMP_DEBUG_ASSERT(upper + st < *ub);
4268         if (upper - ub_glob < (kmp_uint64)(-st))
4269           lastpriv = 1;
4270       }
4271     }
4272     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4273     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4274     kmp_taskloop_bounds_t next_task_bounds =
4275         kmp_taskloop_bounds_t(next_task, task_bounds);
4276 
4277     // adjust task-specific bounds
4278     next_task_bounds.set_lb(lower);
4279     if (next_taskdata->td_flags.native) {
4280       next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4281     } else {
4282       next_task_bounds.set_ub(upper);
4283     }
4284     if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
4285       ptask_dup(next_task, task, lastpriv);
4286     KA_TRACE(40,
4287              ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4288               "upper %lld stride %lld, (offsets %p %p)\n",
4289               gtid, i, next_task, lower, upper, st,
4290               next_task_bounds.get_lower_offset(),
4291               next_task_bounds.get_upper_offset()));
4292 #if OMPT_SUPPORT
4293     __kmp_omp_taskloop_task(NULL, gtid, next_task,
4294                            codeptr_ra); // schedule new task
4295 #else
4296     __kmp_omp_task(gtid, next_task, true); // schedule new task
4297 #endif
4298     lower = upper + st; // adjust lower bound for the next iteration
4299   }
4300   // free the pattern task and exit
4301   __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4302   // do not execute the pattern task, just do internal bookkeeping
4303   __kmp_task_finish<false>(gtid, task, current_task);
4304 }
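
// Worked example of the chunking in __kmp_taskloop_linear (illustrative):
// tc = 10 and num_tasks = 3 give grainsize = 3 and extras = 1 (3*3 + 1 == 10).
// With lower = 0 and st = 1 the loop above creates:
//   task 0: iterations [0, 3]  (grainsize+1 iterations, consumes the one extra)
//   task 1: iterations [4, 6]
//   task 2: iterations [7, 9]  (last task; lastpriv is set if 9 == ub_glob)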
4305 
4306 // Structure to keep taskloop parameters for auxiliary task
4307 // kept in the shareds of the task structure.
4308 typedef struct __taskloop_params {
4309   kmp_task_t *task;
4310   kmp_uint64 *lb;
4311   kmp_uint64 *ub;
4312   void *task_dup;
4313   kmp_int64 st;
4314   kmp_uint64 ub_glob;
4315   kmp_uint64 num_tasks;
4316   kmp_uint64 grainsize;
4317   kmp_uint64 extras;
4318   kmp_uint64 tc;
4319   kmp_uint64 num_t_min;
4320 #if OMPT_SUPPORT
4321   void *codeptr_ra;
4322 #endif
4323 } __taskloop_params_t;
4324 
4325 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4326                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4327                           kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
4328 #if OMPT_SUPPORT
4329                           void *,
4330 #endif
4331                           void *);
4332 
4333 // Execute part of the taskloop submitted as a task.
4334 int __kmp_taskloop_task(int gtid, void *ptask) {
4335   __taskloop_params_t *p =
4336       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4337   kmp_task_t *task = p->task;
4338   kmp_uint64 *lb = p->lb;
4339   kmp_uint64 *ub = p->ub;
4340   void *task_dup = p->task_dup;
4341   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4342   kmp_int64 st = p->st;
4343   kmp_uint64 ub_glob = p->ub_glob;
4344   kmp_uint64 num_tasks = p->num_tasks;
4345   kmp_uint64 grainsize = p->grainsize;
4346   kmp_uint64 extras = p->extras;
4347   kmp_uint64 tc = p->tc;
4348   kmp_uint64 num_t_min = p->num_t_min;
4349 #if OMPT_SUPPORT
4350   void *codeptr_ra = p->codeptr_ra;
4351 #endif
4352 #if KMP_DEBUG
4353   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4354   KMP_DEBUG_ASSERT(task != NULL);
4355   KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4356                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4357                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4358                 task_dup));
4359 #endif
4360   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4361   if (num_tasks > num_t_min)
4362     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4363                          grainsize, extras, tc, num_t_min,
4364 #if OMPT_SUPPORT
4365                          codeptr_ra,
4366 #endif
4367                          task_dup);
4368   else
4369     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4370                           grainsize, extras, tc,
4371 #if OMPT_SUPPORT
4372                           codeptr_ra,
4373 #endif
4374                           task_dup);
4375 
4376   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4377   return 0;
4378 }
4379 
4380 // Schedule part of the taskloop as a task,
4381 // execute the rest of the taskloop.
4382 //
4383 // loc        Source location information
4384 // gtid       Global thread ID
4385 // task       Pattern task, exposes the loop iteration range
4386 // lb         Pointer to loop lower bound in task structure
4387 // ub         Pointer to loop upper bound in task structure
4388 // st         Loop stride
4389 // ub_glob    Global upper bound (used for lastprivate check)
4390 // num_tasks  Number of tasks to execute
4391 // grainsize  Number of loop iterations per task
4392 // extras     Number of chunks with grainsize+1 iterations
4393 // tc         Iteration count
4394 // num_t_min  Threshold to launch tasks recursively
4395 // task_dup   Tasks duplication routine
4396 // codeptr_ra Return address for OMPT events
4397 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4398                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4399                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4400                           kmp_uint64 grainsize, kmp_uint64 extras,
4401                           kmp_uint64 tc, kmp_uint64 num_t_min,
4402 #if OMPT_SUPPORT
4403                           void *codeptr_ra,
4404 #endif
4405                           void *task_dup) {
4406 #if KMP_DEBUG
4407   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4408   KMP_DEBUG_ASSERT(task != NULL);
4409   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4410   KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4411                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4412                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4413                 task_dup));
4414 #endif
4415   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4416   kmp_uint64 lower = *lb;
4417   kmp_info_t *thread = __kmp_threads[gtid];
4418   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
4419   kmp_task_t *next_task;
4420   size_t lower_offset =
4421       (char *)lb - (char *)task; // remember offset of lb in the task structure
4422   size_t upper_offset =
4423       (char *)ub - (char *)task; // remember offset of ub in the task structure
4424 
4425   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4426   KMP_DEBUG_ASSERT(num_tasks > extras);
4427   KMP_DEBUG_ASSERT(num_tasks > 0);
4428 
4429   // split the loop into two halves
4430   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4431   kmp_uint64 gr_size0 = grainsize;
4432   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4433   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4434   if (n_tsk0 <= extras) {
4435     gr_size0++; // integrate extras into grainsize
4436     ext0 = 0; // no extra iters in 1st half
4437     ext1 = extras - n_tsk0; // remaining extras
4438     tc0 = gr_size0 * n_tsk0;
4439     tc1 = tc - tc0;
4440   } else { // n_tsk0 > extras
4441     ext1 = 0; // no extra iters in 2nd half
4442     ext0 = extras;
4443     tc1 = grainsize * n_tsk1;
4444     tc0 = tc - tc1;
4445   }
4446   ub0 = lower + st * (tc0 - 1);
4447   lb1 = ub0 + st;
4448 
4449   // create pattern task for 2nd half of the loop
4450   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4451   // adjust lower bound (upper bound is not changed) for the 2nd half
4452   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4453   if (ptask_dup != NULL) // construct firstprivates, etc.
4454     ptask_dup(next_task, task, 0);
4455   *ub = ub0; // adjust upper bound for the 1st half
4456 
4457   // create auxiliary task for 2nd half of the loop
4458   kmp_task_t *new_task =
4459       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4460                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4461   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4462   p->task = next_task;
4463   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4464   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4465   p->task_dup = task_dup;
4466   p->st = st;
4467   p->ub_glob = ub_glob;
4468   p->num_tasks = n_tsk1;
4469   p->grainsize = grainsize;
4470   p->extras = ext1;
4471   p->tc = tc1;
4472   p->num_t_min = num_t_min;
4473 #if OMPT_SUPPORT
4474   p->codeptr_ra = codeptr_ra;
4475 #endif
4476 
4477 #if OMPT_SUPPORT
4478   // schedule new task with correct return address for OMPT events
4479   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4480 #else
4481   __kmp_omp_task(gtid, new_task, true); // schedule new task
4482 #endif
4483 
4484   // execute the 1st half of current subrange
4485   if (n_tsk0 > num_t_min)
4486     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4487                          ext0, tc0, num_t_min,
4488 #if OMPT_SUPPORT
4489                          codeptr_ra,
4490 #endif
4491                          task_dup);
4492   else
4493     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4494                           gr_size0, ext0, tc0,
4495 #if OMPT_SUPPORT
4496                           codeptr_ra,
4497 #endif
4498                           task_dup);
4499 
4500   KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4501 }
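
// Worked example of the split in __kmp_taskloop_recur (illustrative):
// num_tasks = 7, grainsize = 3, extras = 2, tc = 23 (7*3 + 2). Then
// n_tsk0 = 3, n_tsk1 = 4, and since n_tsk0 > extras: ext0 = 2, ext1 = 0,
// tc1 = 3*4 = 12, tc0 = 23 - 12 = 11. With lower = 0 and st = 1 this gives
// ub0 = 10 and lb1 = 11, i.e. iterations [0, 10] are handled here (3 tasks,
// two of them with an extra iteration) and iterations [11, 22] go into the
// auxiliary task scheduled above (4 tasks).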
4502 
4503 /*!
4504 @ingroup TASKING
4505 @param loc       Source location information
4506 @param gtid      Global thread ID
4507 @param task      Task structure
4508 @param if_val    Value of the if clause
4509 @param lb        Pointer to loop lower bound in task structure
4510 @param ub        Pointer to loop upper bound in task structure
4511 @param st        Loop stride
4512 @param nogroup   Flag, 1 if no taskgroup needs to be added, 0 otherwise
4513 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4514 @param grainsize Schedule value if specified
4515 @param task_dup  Tasks duplication routine
4516 
4517 Execute the taskloop construct.
4518 */
4519 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4520                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4521                      int sched, kmp_uint64 grainsize, void *task_dup) {
4522   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4523   KMP_DEBUG_ASSERT(task != NULL);
4524 
4525   if (nogroup == 0) {
4526 #if OMPT_SUPPORT && OMPT_OPTIONAL
4527     OMPT_STORE_RETURN_ADDRESS(gtid);
4528 #endif
4529     __kmpc_taskgroup(loc, gtid);
4530   }
4531 
4532   // =========================================================================
4533   // calculate loop parameters
4534   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4535   kmp_uint64 tc;
4536   // compiler provides global bounds here
4537   kmp_uint64 lower = task_bounds.get_lb();
4538   kmp_uint64 upper = task_bounds.get_ub();
4539   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4540   kmp_uint64 num_tasks = 0, extras = 0;
4541   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4542   kmp_info_t *thread = __kmp_threads[gtid];
4543   kmp_taskdata_t *current_task = thread->th.th_current_task;
4544 
4545   KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4546                 "grain %llu(%d), dup %p\n",
4547                 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4548 
4549   // compute trip count
4550   if (st == 1) { // most common case
4551     tc = upper - lower + 1;
4552   } else if (st < 0) {
4553     tc = (lower - upper) / (-st) + 1;
4554   } else { // st > 0
4555     tc = (upper - lower) / st + 1;
4556   }
4557   if (tc == 0) {
4558     KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4559     // free the pattern task and exit
4560     __kmp_task_start(gtid, task, current_task);
4561     // do not execute anything for zero-trip loop
4562     __kmp_task_finish<false>(gtid, task, current_task);
4563     return;
4564   }
4565 
4566 #if OMPT_SUPPORT && OMPT_OPTIONAL
4567   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4568   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4569   if (ompt_enabled.ompt_callback_work) {
4570     ompt_callbacks.ompt_callback(ompt_callback_work)(
4571         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4572         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4573   }
4574 #endif
4575 
4576   if (num_tasks_min == 0)
4577     // TODO: can we choose a better default heuristic?
4578     num_tasks_min =
4579         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4580 
4581   // compute num_tasks/grainsize based on the input provided
4582   switch (sched) {
4583   case 0: // no schedule clause specified, we can choose the default
4584     // let's try to schedule (team_size*10) tasks
4585     grainsize = thread->th.th_team_nproc * 10;
4586     KMP_FALLTHROUGH();
4587   case 2: // num_tasks provided
4588     if (grainsize > tc) {
4589       num_tasks = tc; // too big num_tasks requested, adjust values
4590       grainsize = 1;
4591       extras = 0;
4592     } else {
4593       num_tasks = grainsize;
4594       grainsize = tc / num_tasks;
4595       extras = tc % num_tasks;
4596     }
4597     break;
4598   case 1: // grainsize provided
4599     if (grainsize > tc) {
4600       num_tasks = 1; // too big grainsize requested, adjust values
4601       grainsize = tc;
4602       extras = 0;
4603     } else {
4604       num_tasks = tc / grainsize;
4605       // adjust grainsize for balanced distribution of iterations
4606       grainsize = tc / num_tasks;
4607       extras = tc % num_tasks;
4608     }
4609     break;
4610   default:
4611     KMP_ASSERT2(0, "unknown scheduling of taskloop");
4612   }
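  // Worked examples of the arithmetic above (illustrative):
  //   grainsize(300) with tc = 1000: num_tasks = 1000/300 = 3, then grainsize
  //     is rebalanced to 1000/3 = 333 with extras = 1 (3*333 + 1 == 1000).
  //   num_tasks(300) with tc = 1000: grainsize = 1000/300 = 3 with
  //     extras = 100 (300*3 + 100 == 1000).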
4613   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4614   KMP_DEBUG_ASSERT(num_tasks > extras);
4615   KMP_DEBUG_ASSERT(num_tasks > 0);
4616   // =========================================================================
4617 
4618   // check the if clause value first
4619   // GOMP taskloops (taskdata->td_flags.native) are also forced down the linear path
4620   if (if_val == 0) { // if(0) specified, mark task as serial
4621     taskdata->td_flags.task_serial = 1;
4622     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4623     // always start serial tasks linearly
4624     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4625                           grainsize, extras, tc,
4626 #if OMPT_SUPPORT
4627                           OMPT_GET_RETURN_ADDRESS(0),
4628 #endif
4629                           task_dup);
4630     // !taskdata->td_flags.native => currently force linear spawning of tasks
4631     // for GOMP_taskloop
4632   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4633     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4634                   "(%lld), grain %llu, extras %llu\n",
4635                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4636     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4637                          grainsize, extras, tc, num_tasks_min,
4638 #if OMPT_SUPPORT
4639                          OMPT_GET_RETURN_ADDRESS(0),
4640 #endif
4641                          task_dup);
4642   } else {
4643     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4644                   "(%lld), grain %llu, extras %llu\n",
4645                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4646     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4647                           grainsize, extras, tc,
4648 #if OMPT_SUPPORT
4649                           OMPT_GET_RETURN_ADDRESS(0),
4650 #endif
4651                           task_dup);
4652   }
4653 
4654 #if OMPT_SUPPORT && OMPT_OPTIONAL
4655   if (ompt_enabled.ompt_callback_work) {
4656     ompt_callbacks.ompt_callback(ompt_callback_work)(
4657         ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4658         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4659   }
4660 #endif
4661 
4662   if (nogroup == 0) {
4663 #if OMPT_SUPPORT && OMPT_OPTIONAL
4664     OMPT_STORE_RETURN_ADDRESS(gtid);
4665 #endif
4666     __kmpc_end_taskgroup(loc, gtid);
4667   }
4668   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4669 }
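
// Illustrative source-level mapping (sketch): a directive such as
//
//   #pragma omp taskloop grainsize(100)
//   for (int i = 0; i < n; ++i)
//     body(i);
//
// is outlined by the compiler into a pattern task that reaches this
// __kmpc_taskloop entry point with sched = 1 and grainsize = 100 (a num_tasks
// clause maps to sched = 2, and no schedule clause to sched = 0).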
4670 
4671 #endif
4672