1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28                                  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30                                    kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32                                            kmp_task_team_t *task_team);
33 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
34 
35 #ifdef BUILD_TIED_TASK_STACK
36 
37 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
//  from top to bottom
39 //
40 //  gtid: global thread identifier for thread containing stack
41 //  thread_data: thread data for task team thread containing stack
42 //  threshold: value above which the trace statement triggers
43 //  location: string identifying call site of this function (for trace)
44 static void __kmp_trace_task_stack(kmp_int32 gtid,
45                                    kmp_thread_data_t *thread_data,
46                                    int threshold, char *location) {
47   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
48   kmp_taskdata_t **stack_top = task_stack->ts_top;
49   kmp_int32 entries = task_stack->ts_entries;
50   kmp_taskdata_t *tied_task;
51 
52   KA_TRACE(
53       threshold,
54       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
55        "first_block = %p, stack_top = %p \n",
       location, gtid, entries, &task_stack->ts_first_block, stack_top));
57 
58   KMP_DEBUG_ASSERT(stack_top != NULL);
59   KMP_DEBUG_ASSERT(entries > 0);
60 
61   while (entries != 0) {
62     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
63     // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
65       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
66 
67       stack_block = stack_block->sb_prev;
68       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
69     }
70 
71     // finish bookkeeping
72     stack_top--;
73     entries--;
74 
75     tied_task = *stack_top;
76 
77     KMP_DEBUG_ASSERT(tied_task != NULL);
78     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
79 
80     KA_TRACE(threshold,
81              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
82               "stack_top=%p, tied_task=%p\n",
83               location, gtid, entries, stack_top, tied_task));
84   }
85   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
86 
87   KA_TRACE(threshold,
88            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
89             location, gtid));
90 }
91 
92 //  __kmp_init_task_stack: initialize the task stack for the first time
93 //  after a thread_data structure is created.
94 //  It should not be necessary to do this again (assuming the stack works).
95 //
96 //  gtid: global thread identifier of calling thread
97 //  thread_data: thread data for task team thread containing stack
98 static void __kmp_init_task_stack(kmp_int32 gtid,
99                                   kmp_thread_data_t *thread_data) {
100   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
101   kmp_stack_block_t *first_block;
102 
103   // set up the first block of the stack
104   first_block = &task_stack->ts_first_block;
105   task_stack->ts_top = (kmp_taskdata_t **)first_block;
106   memset((void *)first_block, '\0',
107          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
108 
109   // initialize the stack to be empty
110   task_stack->ts_entries = TASK_STACK_EMPTY;
111   first_block->sb_next = NULL;
112   first_block->sb_prev = NULL;
113 }
114 
115 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
116 //
117 //  gtid: global thread identifier for calling thread
118 //  thread_data: thread info for thread containing stack
119 static void __kmp_free_task_stack(kmp_int32 gtid,
120                                   kmp_thread_data_t *thread_data) {
121   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
122   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
123 
124   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
125   // free from the second block of the stack
126   while (stack_block != NULL) {
127     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
128 
129     stack_block->sb_next = NULL;
130     stack_block->sb_prev = NULL;
131     if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(__kmp_threads[gtid],
                        stack_block); // free the block, if not the first
134     }
135     stack_block = next_block;
136   }
137   // initialize the stack to be empty
138   task_stack->ts_entries = 0;
139   task_stack->ts_top = NULL;
140 }
141 
142 //  __kmp_push_task_stack: Push the tied task onto the task stack.
143 //     Grow the stack if necessary by allocating another block.
144 //
145 //  gtid: global thread identifier for calling thread
146 //  thread: thread info for thread containing stack
147 //  tied_task: the task to push on the stack
148 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
149                                   kmp_taskdata_t *tied_task) {
150   // GEH - need to consider what to do if tt_threads_data not allocated yet
151   kmp_thread_data_t *thread_data =
152       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
153   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
154 
155   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
156     return; // Don't push anything on stack if team or team tasks are serialized
157   }
158 
159   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
160   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
161 
162   KA_TRACE(20,
163            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
164             gtid, thread, tied_task));
165   // Store entry
166   *(task_stack->ts_top) = tied_task;
167 
168   // Do bookkeeping for next push
169   task_stack->ts_top++;
170   task_stack->ts_entries++;
171 
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
173     // Find beginning of this task block
174     kmp_stack_block_t *stack_block =
175         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
176 
177     // Check if we already have a block
178     if (stack_block->sb_next !=
179         NULL) { // reset ts_top to beginning of next block
180       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
181     } else { // Alloc new block and link it up
182       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
183           thread, sizeof(kmp_stack_block_t));
184 
185       task_stack->ts_top = &new_block->sb_block[0];
186       stack_block->sb_next = new_block;
187       new_block->sb_prev = stack_block;
188       new_block->sb_next = NULL;
189 
190       KA_TRACE(
191           30,
192           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
193            gtid, tied_task, new_block));
194     }
195   }
196   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
197                 tied_task));
198 }
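
// A note on the layout used by the push/pop/trace routines above: the
// tied-task stack is a chain of kmp_stack_block_t blocks, each holding
// TASK_STACK_BLOCK_SIZE kmp_taskdata_t * entries and linked through
// sb_prev/sb_next. ts_top always points at the next free slot: a push stores
// the task and then advances ts_top and ts_entries, and whenever the low bits
// of ts_entries (masked by TASK_STACK_INDEX_MASK) become zero the code has
// crossed a block boundary, allocating the next block on demand.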
199 
200 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
201 //  the task, just check to make sure it matches the ending task passed in.
202 //
203 //  gtid: global thread identifier for the calling thread
204 //  thread: thread info structure containing stack
//  ending_task: the task that is ending (should match the task popped off the
//  stack)
207 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
208                                  kmp_taskdata_t *ending_task) {
209   // GEH - need to consider what to do if tt_threads_data not allocated yet
210   kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
212   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
213   kmp_taskdata_t *tied_task;
214 
215   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
216     // Don't pop anything from stack if team or team tasks are serialized
217     return;
218   }
219 
220   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
221   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
222 
223   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
224                 thread));
225 
226   // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
228     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
229 
230     stack_block = stack_block->sb_prev;
231     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
232   }
233 
234   // finish bookkeeping
235   task_stack->ts_top--;
236   task_stack->ts_entries--;
237 
238   tied_task = *(task_stack->ts_top);
239 
240   KMP_DEBUG_ASSERT(tied_task != NULL);
241   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
242   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
243 
244   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
245                 tied_task));
246   return;
247 }
248 #endif /* BUILD_TIED_TASK_STACK */
249 
250 // returns 1 if new task is allowed to execute, 0 otherwise
251 // checks Task Scheduling constraint (if requested) and
252 // mutexinoutset dependencies if any
253 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
254                                   const kmp_taskdata_t *tasknew,
255                                   const kmp_taskdata_t *taskcurr) {
256   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
257     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only a descendant of all deferred tied tasks can be scheduled; checking
    // the last one is enough, as it in turn is a descendant of all the others
260     kmp_taskdata_t *current = taskcurr->td_last_tied;
261     KMP_DEBUG_ASSERT(current != NULL);
262     // check if the task is not suspended on barrier
263     if (current->td_flags.tasktype == TASK_EXPLICIT ||
264         current->td_taskwait_thread > 0) { // <= 0 on barrier
265       kmp_int32 level = current->td_level;
266       kmp_taskdata_t *parent = tasknew->td_parent;
267       while (parent != current && parent->td_level > level) {
268         // check generation up to the level of the current task
269         parent = parent->td_parent;
270         KMP_DEBUG_ASSERT(parent != NULL);
271       }
272       if (parent != current)
273         return false;
274     }
275   }
276   // Check mutexinoutset dependencies, acquire locks
277   kmp_depnode_t *node = tasknew->td_depnode;
278   if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
279     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
280       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
281       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
282         continue;
283       // could not get the lock, release previous locks
284       for (int j = i - 1; j >= 0; --j)
285         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
286       return false;
287     }
288     // negative num_locks means all locks acquired successfully
289     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
290   }
291   return true;
292 }
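
// An illustrative walk-through of the TSC check above (hypothetical levels,
// not taken from any test): let C = taskcurr->td_last_tied with td_level == 2
// and let the candidate tasknew have td_level == 4. The loop follows
// tasknew->td_parent upward while the parent's td_level is still above 2, and
// tasknew is allowed only if that walk reaches C itself, i.e. tasknew is a
// descendant of C. The sign flip of mtx_num_locks at the end records that all
// mutexinoutset locks are now held (the count stays recoverable as the
// absolute value), so later code can tell the acquisition succeeded.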
293 
294 // __kmp_realloc_task_deque:
295 // Re-allocates a task deque for a particular thread, copies the content from
296 // the old deque and adjusts the necessary data structures relating to the
297 // deque. This operation must be done with the deque_lock being held
298 static void __kmp_realloc_task_deque(kmp_info_t *thread,
299                                      kmp_thread_data_t *thread_data) {
300   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
301   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
302   kmp_int32 new_size = 2 * size;
303 
304   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
305                 "%d] for thread_data %p\n",
306                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
307 
308   kmp_taskdata_t **new_deque =
309       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
310 
311   int i, j;
312   for (i = thread_data->td.td_deque_head, j = 0; j < size;
313        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
314     new_deque[j] = thread_data->td.td_deque[i];
315 
316   __kmp_free(thread_data->td.td_deque);
317 
318   thread_data->td.td_deque_head = 0;
319   thread_data->td.td_deque_tail = size;
320   thread_data->td.td_deque = new_deque;
321   thread_data->td.td_deque_size = new_size;
322 }
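
// A small worked example of the resize above (illustrative values): the deque
// capacity is kept a power of two, so TASK_DEQUE_MASK(td) == size - 1 and
// indices wrap with "& mask". The copy starts at the old head, so the tasks
// land in slots [0, size) of the new buffer in order; hence head is reset to 0
// and tail to the old size. With size == 4, head == tail == 3 and the deque
// full, old slots 3,0,1,2 are copied to new slots 0..3, and the new deque of
// capacity 8 continues with head == 0, tail == 4.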
323 
324 //  __kmp_push_task: Add a task to the thread's deque
325 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
326   kmp_info_t *thread = __kmp_threads[gtid];
327   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
328 
  // No need to map to a shadow gtid if this is already a hidden helper thread
330   if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
331     gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
332     thread = __kmp_threads[gtid];
333   }
334 
335   kmp_task_team_t *task_team = thread->th.th_task_team;
336   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
337   kmp_thread_data_t *thread_data;
338 
339   KA_TRACE(20,
340            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
341 
342   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
343     // untied task needs to increment counter so that the task structure is not
344     // freed prematurely
345     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
346     KMP_DEBUG_USE_VAR(counter);
347     KA_TRACE(
348         20,
349         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
350          gtid, counter, taskdata));
351   }
352 
353   // The first check avoids building task_team thread data if serialized
354   if (UNLIKELY(taskdata->td_flags.task_serial)) {
355     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
356                   "TASK_NOT_PUSHED for task %p\n",
357                   gtid, taskdata));
358     return TASK_NOT_PUSHED;
359   }
360 
361   // Now that serialized tasks have returned, we can assume that we are not in
362   // immediate exec mode
363   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
364   if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
365     __kmp_enable_tasking(task_team, thread);
366   }
367   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
368   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
369 
370   // Find tasking deque specific to encountering thread
371   thread_data = &task_team->tt.tt_threads_data[tid];
372 
  // No lock needed since only the owner can allocate. If the task is
  // hidden_helper, we don't need the lock either because the deque for the
  // hidden helper thread data has already been initialized.
376   if (UNLIKELY(thread_data->td.td_deque == NULL)) {
377     __kmp_alloc_task_deque(thread, thread_data);
378   }
379 
380   int locked = 0;
381   // Check if deque is full
382   if (TCR_4(thread_data->td.td_deque_ntasks) >=
383       TASK_DEQUE_SIZE(thread_data->td)) {
384     if (__kmp_enable_task_throttling &&
385         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
386                               thread->th.th_current_task)) {
387       KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
388                     "TASK_NOT_PUSHED for task %p\n",
389                     gtid, taskdata));
390       return TASK_NOT_PUSHED;
391     } else {
392       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
393       locked = 1;
394       if (TCR_4(thread_data->td.td_deque_ntasks) >=
395           TASK_DEQUE_SIZE(thread_data->td)) {
396         // expand deque to push the task which is not allowed to execute
397         __kmp_realloc_task_deque(thread, thread_data);
398       }
399     }
400   }
401   // Lock the deque for the task push operation
402   if (!locked) {
403     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from a thread outside OpenMP
405     if (TCR_4(thread_data->td.td_deque_ntasks) >=
406         TASK_DEQUE_SIZE(thread_data->td)) {
407       if (__kmp_enable_task_throttling &&
408           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
409                                 thread->th.th_current_task)) {
410         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
411         KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
412                       "returning TASK_NOT_PUSHED for task %p\n",
413                       gtid, taskdata));
414         return TASK_NOT_PUSHED;
415       } else {
416         // expand deque to push the task which is not allowed to execute
417         __kmp_realloc_task_deque(thread, thread_data);
418       }
419     }
420   }
  // Must have room since only the calling thread can add tasks to this deque
422   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
423                    TASK_DEQUE_SIZE(thread_data->td));
424 
425   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
426       taskdata; // Push taskdata
427   // Wrap index.
428   thread_data->td.td_deque_tail =
429       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
430   TCW_4(thread_data->td.td_deque_ntasks,
431         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
432   KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
433   KMP_FSYNC_RELEASING(taskdata); // releasing child
434   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
435                 "task=%p ntasks=%d head=%u tail=%u\n",
436                 gtid, taskdata, thread_data->td.td_deque_ntasks,
437                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
438 
439   auto hidden_helper = taskdata->td_flags.hidden_helper;
440 
441   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
442 
443   // Signal one worker thread to execute the task
444   if (UNLIKELY(hidden_helper)) {
445     // Wake hidden helper threads up if they're sleeping
446     __kmp_hidden_helper_worker_thread_signal();
447   }
448 
449   return TASK_SUCCESSFULLY_PUSHED;
450 }
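
// Note: TASK_NOT_PUSHED does not mean the task is lost; the caller is expected
// to execute the task immediately on the encountering thread instead of
// deferring it. That is why the throttling branches above only decline the
// push when __kmp_task_is_allowed says immediate execution is legal, and why
// a serialized team returns TASK_NOT_PUSHED without touching the deque.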
451 
// __kmp_pop_current_task_from_thread: restore the current task of the given
// thread to its parent when the team ends
454 //
455 // this_thr: thread structure to set current_task in.
456 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
457   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
458                 "this_thread=%p, curtask=%p, "
459                 "curtask_parent=%p\n",
460                 0, this_thr, this_thr->th.th_current_task,
461                 this_thr->th.th_current_task->td_parent));
462 
463   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
464 
465   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
466                 "this_thread=%p, curtask=%p, "
467                 "curtask_parent=%p\n",
468                 0, this_thr, this_thr->th.th_current_task,
469                 this_thr->th.th_current_task->td_parent));
470 }
471 
// __kmp_push_current_task_to_thread: set up the current task in the given
// thread for a new team
474 //
475 // this_thr: thread structure to set up
476 // team: team for implicit task data
477 // tid: thread within team to set up
478 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
479                                        int tid) {
  // the current task of the thread is the parent of the newly created implicit
  // tasks of the new team
482   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
483                 "curtask=%p "
484                 "parent_task=%p\n",
485                 tid, this_thr, this_thr->th.th_current_task,
486                 team->t.t_implicit_task_taskdata[tid].td_parent));
487 
488   KMP_DEBUG_ASSERT(this_thr != NULL);
489 
490   if (tid == 0) {
491     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
492       team->t.t_implicit_task_taskdata[0].td_parent =
493           this_thr->th.th_current_task;
494       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
495     }
496   } else {
497     team->t.t_implicit_task_taskdata[tid].td_parent =
498         team->t.t_implicit_task_taskdata[0].td_parent;
499     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
500   }
501 
502   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
503                 "curtask=%p "
504                 "parent_task=%p\n",
505                 tid, this_thr, this_thr->th.th_current_task,
506                 team->t.t_implicit_task_taskdata[tid].td_parent));
507 }
508 
509 // __kmp_task_start: bookkeeping for a task starting execution
510 //
511 // GTID: global thread id of calling thread
512 // task: task starting execution
513 // current_task: task suspending
514 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
515                              kmp_taskdata_t *current_task) {
516   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
517   kmp_info_t *thread = __kmp_threads[gtid];
518 
519   KA_TRACE(10,
520            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
521             gtid, taskdata, current_task));
522 
523   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
524 
525   // mark currently executing task as suspended
526   // TODO: GEH - make sure root team implicit task is initialized properly.
527   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
528   current_task->td_flags.executing = 0;
529 
530 // Add task to stack if tied
531 #ifdef BUILD_TIED_TASK_STACK
532   if (taskdata->td_flags.tiedness == TASK_TIED) {
533     __kmp_push_task_stack(gtid, thread, taskdata);
534   }
535 #endif /* BUILD_TIED_TASK_STACK */
536 
537   // mark starting task as executing and as current task
538   thread->th.th_current_task = taskdata;
539 
540   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
541                    taskdata->td_flags.tiedness == TASK_UNTIED);
542   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
543                    taskdata->td_flags.tiedness == TASK_UNTIED);
544   taskdata->td_flags.started = 1;
545   taskdata->td_flags.executing = 1;
546   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
547   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
548 
549   // GEH TODO: shouldn't we pass some sort of location identifier here?
550   // APT: yes, we will pass location here.
551   // need to store current thread state (in a thread or taskdata structure)
552   // before setting work_state, otherwise wrong state is set after end of task
553 
554   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
555 
556   return;
557 }
558 
559 #if OMPT_SUPPORT
560 //------------------------------------------------------------------------------
561 // __ompt_task_init:
562 //   Initialize OMPT fields maintained by a task. This will only be called after
563 //   ompt_start_tool, so we already know whether ompt is enabled or not.
564 
565 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
566   // The calls to __ompt_task_init already have the ompt_enabled condition.
567   task->ompt_task_info.task_data.value = 0;
568   task->ompt_task_info.frame.exit_frame = ompt_data_none;
569   task->ompt_task_info.frame.enter_frame = ompt_data_none;
570   task->ompt_task_info.frame.exit_frame_flags =
571       ompt_frame_runtime | ompt_frame_framepointer;
572   task->ompt_task_info.frame.enter_frame_flags =
573       ompt_frame_runtime | ompt_frame_framepointer;
574 }
575 
576 // __ompt_task_start:
577 //   Build and trigger task-begin event
578 static inline void __ompt_task_start(kmp_task_t *task,
579                                      kmp_taskdata_t *current_task,
580                                      kmp_int32 gtid) {
581   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
582   ompt_task_status_t status = ompt_task_switch;
583   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
584     status = ompt_task_yield;
585     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
586   }
587   /* let OMPT know that we're about to run this task */
588   if (ompt_enabled.ompt_callback_task_schedule) {
589     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
590         &(current_task->ompt_task_info.task_data), status,
591         &(taskdata->ompt_task_info.task_data));
592   }
593   taskdata->ompt_task_info.scheduling_parent = current_task;
594 }
595 
596 // __ompt_task_finish:
597 //   Build and trigger final task-schedule event
598 static inline void __ompt_task_finish(kmp_task_t *task,
599                                       kmp_taskdata_t *resumed_task,
600                                       ompt_task_status_t status) {
601   if (ompt_enabled.ompt_callback_task_schedule) {
602     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
603     if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
604         taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
605       status = ompt_task_cancel;
606     }
607 
608     /* let OMPT know that we're returning to the callee task */
609     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
610         &(taskdata->ompt_task_info.task_data), status,
611         (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
612   }
613 }
614 #endif
615 
616 template <bool ompt>
617 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
618                                                kmp_task_t *task,
619                                                void *frame_address,
620                                                void *return_address) {
621   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
622   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
623 
624   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
625                 "current_task=%p\n",
626                 gtid, loc_ref, taskdata, current_task));
627 
628   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
629     // untied task needs to increment counter so that the task structure is not
630     // freed prematurely
631     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
632     KMP_DEBUG_USE_VAR(counter);
633     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
634                   "incremented for task %p\n",
635                   gtid, counter, taskdata));
636   }
637 
638   taskdata->td_flags.task_serial =
639       1; // Execute this task immediately, not deferred.
640   __kmp_task_start(gtid, task, current_task);
641 
642 #if OMPT_SUPPORT
643   if (ompt) {
644     if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
645       current_task->ompt_task_info.frame.enter_frame.ptr =
646           taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
647       current_task->ompt_task_info.frame.enter_frame_flags =
648           taskdata->ompt_task_info.frame.exit_frame_flags =
649               ompt_frame_application | ompt_frame_framepointer;
650     }
651     if (ompt_enabled.ompt_callback_task_create) {
652       ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
653       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
654           &(parent_info->task_data), &(parent_info->frame),
655           &(taskdata->ompt_task_info.task_data),
656           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
657           return_address);
658     }
659     __ompt_task_start(task, current_task, gtid);
660   }
661 #endif // OMPT_SUPPORT
662 
663   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
664                 loc_ref, taskdata));
665 }
666 
667 #if OMPT_SUPPORT
668 OMPT_NOINLINE
669 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
670                                            kmp_task_t *task,
671                                            void *frame_address,
672                                            void *return_address) {
673   __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
674                                            return_address);
675 }
676 #endif // OMPT_SUPPORT
677 
678 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
679 // execution
680 //
681 // loc_ref: source location information; points to beginning of task block.
682 // gtid: global thread number.
683 // task: task thunk for the started task.
684 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
685                                kmp_task_t *task) {
686 #if OMPT_SUPPORT
687   if (UNLIKELY(ompt_enabled.enabled)) {
688     OMPT_STORE_RETURN_ADDRESS(gtid);
689     __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
690                                    OMPT_GET_FRAME_ADDRESS(1),
691                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
692     return;
693   }
694 #endif
695   __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
696 }
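
// Rough sketch of how a compiler is expected to use the if0 entry points for
// an undeferred task such as "#pragma omp task if(0)" (the outlined function
// and task variable names are illustrative only):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, flags, sizeof_task,
//                                         sizeof_shareds, &task_entry);
//   __kmpc_omp_task_begin_if0(loc, gtid, t);    // start bookkeeping only
//   task_entry(gtid, t);                        // run the task body inline
//   __kmpc_omp_task_complete_if0(loc, gtid, t); // finish bookkeeping
//
// i.e. the task body executes immediately on the encountering thread and only
// the begin/complete bookkeeping goes through the runtime.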
697 
698 #ifdef TASK_UNUSED
699 // __kmpc_omp_task_begin: report that a given task has started execution
700 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
701 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
702   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
703 
704   KA_TRACE(
705       10,
706       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
707        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
708 
709   __kmp_task_start(gtid, task, current_task);
710 
711   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
712                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
713   return;
714 }
715 #endif // TASK_UNUSED
716 
717 // __kmp_free_task: free the current task space and the space for shareds
718 //
719 // gtid: Global thread ID of calling thread
720 // taskdata: task to free
721 // thread: thread data structure of caller
722 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
723                             kmp_info_t *thread) {
724   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
725                 taskdata));
726 
727   // Check to make sure all flags and counters have the correct values
728   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
729   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
730   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
731   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
732   KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
733                    taskdata->td_flags.task_serial == 1);
734   KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
735 
736   taskdata->td_flags.freed = 1;
737   ANNOTATE_HAPPENS_BEFORE(taskdata);
738 // deallocate the taskdata and shared variable blocks associated with this task
739 #if USE_FAST_MEMORY
740   __kmp_fast_free(thread, taskdata);
741 #else /* ! USE_FAST_MEMORY */
742   __kmp_thread_free(thread, taskdata);
743 #endif
744   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
745 }
746 
// __kmp_free_task_and_ancestors: free the current task and any ancestors that
// no longer have children
749 //
750 // gtid: Global thread ID of calling thread
751 // taskdata: task to free
752 // thread: thread data structure of caller
753 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
754                                           kmp_taskdata_t *taskdata,
755                                           kmp_info_t *thread) {
756   // Proxy tasks must always be allowed to free their parents
757   // because they can be run in background even in serial mode.
758   kmp_int32 team_serial =
759       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
760       !taskdata->td_flags.proxy;
761   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
762 
763   kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
764   KMP_DEBUG_ASSERT(children >= 0);
765 
766   // Now, go up the ancestor tree to see if any ancestors can now be freed.
767   while (children == 0) {
768     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
769 
770     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
771                   "and freeing itself\n",
772                   gtid, taskdata));
773 
774     // --- Deallocate my ancestor task ---
775     __kmp_free_task(gtid, taskdata, thread);
776 
777     taskdata = parent_taskdata;
778 
779     if (team_serial)
780       return;
781     // Stop checking ancestors at implicit task instead of walking up ancestor
782     // tree to avoid premature deallocation of ancestors.
783     if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
784       if (taskdata->td_dephash) { // do we need to cleanup dephash?
785         int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
786         kmp_tasking_flags_t flags_old = taskdata->td_flags;
787         if (children == 0 && flags_old.complete == 1) {
788           kmp_tasking_flags_t flags_new = flags_old;
789           flags_new.complete = 0;
790           if (KMP_COMPARE_AND_STORE_ACQ32(
791                   RCAST(kmp_int32 *, &taskdata->td_flags),
792                   *RCAST(kmp_int32 *, &flags_old),
793                   *RCAST(kmp_int32 *, &flags_new))) {
794             KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
795                            "dephash of implicit task %p\n",
796                            gtid, taskdata));
797             // cleanup dephash of finished implicit task
798             __kmp_dephash_free_entries(thread, taskdata->td_dephash);
799           }
800         }
801       }
802       return;
803     }
804     // Predecrement simulated by "- 1" calculation
805     children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
806     KMP_DEBUG_ASSERT(children >= 0);
807   }
808 
809   KA_TRACE(
810       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
811            "not freeing it yet\n",
812            gtid, taskdata, children));
813 }
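
// The td_allocated_child_tasks counter used above acts as a reference count:
// KMP_ATOMIC_DEC returns the previous value, so the "- 1" yields the new
// value, and "children == 0" means no allocated children remain and the
// task's storage can be released before repeating the same test on its parent.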
814 
815 // __kmp_task_finish: bookkeeping to do when a task finishes execution
816 //
817 // gtid: global thread ID for calling thread
818 // task: task to be finished
819 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
820 //
821 // template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing all OMPT code to be
// optimized away in that case
824 template <bool ompt>
825 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
826                               kmp_taskdata_t *resumed_task) {
827   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
828   kmp_info_t *thread = __kmp_threads[gtid];
829   kmp_task_team_t *task_team =
830       thread->th.th_task_team; // might be NULL for serial teams...
831   kmp_int32 children = 0;
832 
833   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
834                 "task %p\n",
835                 gtid, taskdata, resumed_task));
836 
837   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
838 
839 // Pop task from stack if tied
840 #ifdef BUILD_TIED_TASK_STACK
841   if (taskdata->td_flags.tiedness == TASK_TIED) {
842     __kmp_pop_task_stack(gtid, thread, taskdata);
843   }
844 #endif /* BUILD_TIED_TASK_STACK */
845 
846   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
847     // untied task needs to check the counter so that the task structure is not
848     // freed prematurely
849     kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
850     KA_TRACE(
851         20,
852         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
853          gtid, counter, taskdata));
854     if (counter > 0) {
855       // untied task is not done, to be continued possibly by other thread, do
856       // not free it now
857       if (resumed_task == NULL) {
858         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
859         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
860         // task is the parent
861       }
862       thread->th.th_current_task = resumed_task; // restore current_task
863       resumed_task->td_flags.executing = 1; // resume previous task
864       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
865                     "resuming task %p\n",
866                     gtid, taskdata, resumed_task));
867       return;
868     }
869   }
870 
871   // bookkeeping for resuming task:
872   // GEH - note tasking_ser => task_serial
873   KMP_DEBUG_ASSERT(
874       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
875       taskdata->td_flags.task_serial);
876   if (taskdata->td_flags.task_serial) {
877     if (resumed_task == NULL) {
878       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
879       // task is the parent
880     }
881   } else {
882     KMP_DEBUG_ASSERT(resumed_task !=
883                      NULL); // verify that resumed task is passed as argument
884   }
885 
  /* If the task's destructor thunk flag has been set, we need to invoke the
887      destructor thunk that has been generated by the compiler. The code is
888      placed here, since at this point other tasks might have been released
889      hence overlapping the destructor invocations with some other work in the
890      released tasks.  The OpenMP spec is not specific on when the destructors
891      are invoked, so we should be free to choose. */
892   if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
893     kmp_routine_entry_t destr_thunk = task->data1.destructors;
894     KMP_ASSERT(destr_thunk);
895     destr_thunk(gtid, task);
896   }
897 
898   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
899   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
900   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
901 
902   bool detach = false;
903   if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
904     if (taskdata->td_allow_completion_event.type ==
905         KMP_EVENT_ALLOW_COMPLETION) {
906       // event hasn't been fulfilled yet. Try to detach task.
907       __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
908       if (taskdata->td_allow_completion_event.type ==
909           KMP_EVENT_ALLOW_COMPLETION) {
910         // task finished execution
911         KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
912         taskdata->td_flags.executing = 0; // suspend the finishing task
913 
914 #if OMPT_SUPPORT
        // For a detached task that is not yet complete, report the switch
        // away from it here; omp_fulfill_event will later signal completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
918         if (ompt)
919           __ompt_task_finish(task, resumed_task, ompt_task_detach);
920 #endif
921 
922         // no access to taskdata after this point!
923         // __kmp_fulfill_event might free taskdata at any time from now
924 
925         taskdata->td_flags.proxy = TASK_PROXY; // proxify!
926         detach = true;
927       }
928       __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
929     }
930   }
931 
932   if (!detach) {
933     taskdata->td_flags.complete = 1; // mark the task as completed
934 
935 #if OMPT_SUPPORT
936     // This is not a detached task, we are done here
937     if (ompt)
938       __ompt_task_finish(task, resumed_task, ompt_task_complete);
939 #endif
940 
    // Only need to keep track of the count if the team is parallel and tasking
    // is not serialized, or the task is detachable and its event has already
    // been fulfilled, or the task is a hidden helper task
943     if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
944         taskdata->td_flags.detachable == TASK_DETACHABLE ||
945         taskdata->td_flags.hidden_helper) {
946       // Predecrement simulated by "- 1" calculation
947       children =
948           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
949       KMP_DEBUG_ASSERT(children >= 0);
950       if (taskdata->td_taskgroup)
951         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
952       __kmp_release_deps(gtid, taskdata);
953     } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
954       // if we found proxy tasks there could exist a dependency chain
955       // with the proxy task as origin
956       __kmp_release_deps(gtid, taskdata);
957     }
958     // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
960     // release_deps code, the flag will be reset to 1 again by this same
961     // function
962     KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
963     taskdata->td_flags.executing = 0; // suspend the finishing task
964   }
965 
966   KA_TRACE(
967       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
968            gtid, taskdata, children));
969 
970   // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first, as suggested by John (johnmc): if an
  // asynchronous inquiry peers into the runtime system it should not see the
  // freed task as the current task.
974   thread->th.th_current_task = resumed_task;
975   if (!detach)
976     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
977 
978   // TODO: GEH - make sure root team implicit task is initialized properly.
979   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
980   resumed_task->td_flags.executing = 1; // resume previous task
981 
982   KA_TRACE(
983       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
984            gtid, taskdata, resumed_task));
985 
986   return;
987 }
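
// When "detach" is set above, the task has been turned into a proxy task and
// neither the completion flag nor the child/taskgroup counters are updated
// here; that bookkeeping, and the eventual freeing of the taskdata, is
// deferred until the allow-completion event is fulfilled (see
// __kmp_fulfill_event), which is why __kmp_free_task_and_ancestors is skipped
// for the detached case.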
988 
989 template <bool ompt>
990 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
991                                                   kmp_int32 gtid,
992                                                   kmp_task_t *task) {
993   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
994                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
995   KMP_DEBUG_ASSERT(gtid >= 0);
996   // this routine will provide task to resume
997   __kmp_task_finish<ompt>(gtid, task, NULL);
998 
999   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1000                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1001 
1002 #if OMPT_SUPPORT
1003   if (ompt) {
1004     ompt_frame_t *ompt_frame;
1005     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1006     ompt_frame->enter_frame = ompt_data_none;
1007     ompt_frame->enter_frame_flags =
1008         ompt_frame_runtime | ompt_frame_framepointer;
1009   }
1010 #endif
1011 
1012   return;
1013 }
1014 
1015 #if OMPT_SUPPORT
1016 OMPT_NOINLINE
1017 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1018                                        kmp_task_t *task) {
1019   __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1020 }
1021 #endif // OMPT_SUPPORT
1022 
1023 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1024 //
1025 // loc_ref: source location information; points to end of task block.
1026 // gtid: global thread number.
1027 // task: task thunk for the completed task.
1028 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1029                                   kmp_task_t *task) {
1030 #if OMPT_SUPPORT
1031   if (UNLIKELY(ompt_enabled.enabled)) {
1032     __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1033     return;
1034   }
1035 #endif
1036   __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1037 }
1038 
1039 #ifdef TASK_UNUSED
1040 // __kmpc_omp_task_complete: report that a task has completed execution
1041 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1042 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1043                               kmp_task_t *task) {
1044   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1045                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1046 
1047   __kmp_task_finish<false>(gtid, task,
1048                            NULL); // Not sure how to find task to resume
1049 
1050   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1051                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1052   return;
1053 }
1054 #endif // TASK_UNUSED
1055 
1056 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1057 // task for a given thread
1058 //
1059 // loc_ref:  reference to source location of parallel region
1060 // this_thr:  thread data structure corresponding to implicit task
1061 // team: team for this_thr
1062 // tid: thread id of given thread within team
1063 // set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs.  This is assumed to
1065 // have already been done elsewhere.
1066 // TODO: Get better loc_ref.  Value passed in may be NULL
1067 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1068                               kmp_team_t *team, int tid, int set_curr_task) {
1069   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1070 
1071   KF_TRACE(
1072       10,
1073       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1074        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1075 
1076   task->td_task_id = KMP_GEN_TASK_ID();
1077   task->td_team = team;
1078   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1079   //    in debugger)
1080   task->td_ident = loc_ref;
1081   task->td_taskwait_ident = NULL;
1082   task->td_taskwait_counter = 0;
1083   task->td_taskwait_thread = 0;
1084 
1085   task->td_flags.tiedness = TASK_TIED;
1086   task->td_flags.tasktype = TASK_IMPLICIT;
1087   task->td_flags.proxy = TASK_FULL;
1088 
1089   // All implicit tasks are executed immediately, not deferred
1090   task->td_flags.task_serial = 1;
1091   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1092   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1093 
1094   task->td_flags.started = 1;
1095   task->td_flags.executing = 1;
1096   task->td_flags.complete = 0;
1097   task->td_flags.freed = 0;
1098 
1099   task->td_depnode = NULL;
1100   task->td_last_tied = task;
1101   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1102 
1103   if (set_curr_task) { // only do this init first time thread is created
1104     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1105     // Not used: don't need to deallocate implicit task
1106     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1107     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1108     task->td_dephash = NULL;
1109     __kmp_push_current_task_to_thread(this_thr, team, tid);
1110   } else {
1111     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1112     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1113   }
1114 
1115 #if OMPT_SUPPORT
1116   if (UNLIKELY(ompt_enabled.enabled))
1117     __ompt_task_init(task, tid);
1118 #endif
1119 
1120   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1121                 team, task));
1122 }
1123 
// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the
// next parallel region.
1127 //
1128 // thread:  thread data structure corresponding to implicit task
1129 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1130   kmp_taskdata_t *task = thread->th.th_current_task;
1131   if (task->td_dephash) {
1132     int children;
1133     task->td_flags.complete = 1;
1134     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1135     kmp_tasking_flags_t flags_old = task->td_flags;
1136     if (children == 0 && flags_old.complete == 1) {
1137       kmp_tasking_flags_t flags_new = flags_old;
1138       flags_new.complete = 0;
1139       if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1140                                       *RCAST(kmp_int32 *, &flags_old),
1141                                       *RCAST(kmp_int32 *, &flags_new))) {
1142         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1143                        "dephash of implicit task %p\n",
1144                        thread->th.th_info.ds.ds_gtid, task));
1145         __kmp_dephash_free_entries(thread, task->td_dephash);
1146       }
1147     }
1148   }
1149 }
1150 
// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these tasks are destroyed
1153 //
1154 // thread:  thread data structure corresponding to implicit task
1155 void __kmp_free_implicit_task(kmp_info_t *thread) {
1156   kmp_taskdata_t *task = thread->th.th_current_task;
1157   if (task && task->td_dephash) {
1158     __kmp_dephash_free(thread, task->td_dephash);
1159     task->td_dephash = NULL;
1160   }
1161 }
1162 
// Round up a size to a multiple of val (which must be a power of two): used to
// insert padding between structures co-allocated using a single malloc() call
1165 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1166   if (size & (val - 1)) {
1167     size &= ~(val - 1);
1168     if (size <= KMP_SIZE_T_MAX - val) {
1169       size += val; // Round up if there is no overflow.
1170     }
1171   }
1172   return size;
} // __kmp_round_up_to_val
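
// Worked example of the rounding above, as used for the shareds offset in
// __kmp_task_alloc below: with val == sizeof(void *) == 8 and an unaligned
// size of, say, 45 bytes, the low bits are cleared (45 & ~7 == 40) and one
// val is added, giving 48, the next multiple of 8; a size that is already a
// multiple of 8 is returned unchanged.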
1174 
1175 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1176 //
1177 // loc_ref: source location information
1178 // gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the 'new'
1180 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1181 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
1182 // private vars accessed in task.
1183 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
1184 // in task.
1185 // task_entry: Pointer to task code entry point generated by compiler.
1186 // returns: a pointer to the allocated kmp_task_t structure (task).
1187 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1188                              kmp_tasking_flags_t *flags,
1189                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1190                              kmp_routine_entry_t task_entry) {
1191   kmp_task_t *task;
1192   kmp_taskdata_t *taskdata;
1193   kmp_info_t *thread = __kmp_threads[gtid];
1194   kmp_info_t *encountering_thread = thread;
1195   kmp_team_t *team = thread->th.th_team;
1196   kmp_taskdata_t *parent_task = thread->th.th_current_task;
1197   size_t shareds_offset;
1198 
1199   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1200     __kmp_middle_initialize();
1201 
1202   if (flags->hidden_helper) {
1203     if (__kmp_enable_hidden_helper) {
1204       if (!TCR_4(__kmp_init_hidden_helper))
1205         __kmp_hidden_helper_initialize();
1206 
1207       // For a hidden helper task encountered by a regular thread, we will push
1208       // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper
1209       // thread.
1210       if (!KMP_HIDDEN_HELPER_THREAD(gtid)) {
1211         thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1212         // We don't change the parent-child relation for hidden helper task as
1213         // we need that to do per-task-region synchronization.
1214       }
1215     } else {
1216       // If the hidden helper task is not enabled, reset the flag to FALSE.
1217       flags->hidden_helper = FALSE;
1218     }
1219   }
1220 
1221   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1222                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1223                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1224                 sizeof_shareds, task_entry));
1225 
1226   KMP_DEBUG_ASSERT(parent_task);
1227   if (parent_task->td_flags.final) {
1228     if (flags->merged_if0) {
1229     }
1230     flags->final = 1;
1231   }
1232 
1233   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // An untied task encountered here causes the TSC algorithm to check the
    // entire deque of the victim thread. If no untied task was encountered,
    // checking the head of the deque is enough.
1237     KMP_CHECK_UPDATE(
1238         encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1239   }
1240 
  // Detachable tasks are not proxy tasks yet but could become proxy tasks in
  // the future. Doing the tasking setup when that happens is too late.
1244   if (UNLIKELY(flags->proxy == TASK_PROXY ||
1245                flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1246     if (flags->proxy == TASK_PROXY) {
1247       flags->tiedness = TASK_UNTIED;
1248       flags->merged_if0 = 1;
1249     }
    /* are we running in a serialized parallel region or in tskm_immediate_exec
       mode... we need tasking support enabled */
1252     if ((encountering_thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized:
         set up a task team and propagate it to the thread */
1255       KMP_DEBUG_ASSERT(team->t.t_serialized);
1256       KA_TRACE(30,
1257                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1258                 gtid));
1259       __kmp_task_team_setup(
1260           encountering_thread, team,
1261           1); // 1 indicates setup the current team regardless of nthreads
1262       encountering_thread->th.th_task_team =
1263           team->t.t_task_team[encountering_thread->th.th_task_state];
1264     }
1265     kmp_task_team_t *task_team = encountering_thread->th.th_task_team;
1266 
1267     /* tasking must be enabled now as the task might not be pushed */
1268     if (!KMP_TASKING_ENABLED(task_team)) {
1269       KA_TRACE(
1270           30,
1271           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1272       __kmp_enable_tasking(task_team, encountering_thread);
1273       kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid;
1274       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1275       // No lock needed since only owner can allocate
1276       if (thread_data->td.td_deque == NULL) {
1277         __kmp_alloc_task_deque(encountering_thread, thread_data);
1278       }
1279     }
1280 
1281     if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1282         task_team->tt.tt_found_proxy_tasks == FALSE)
1283       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1284     if (flags->hidden_helper &&
1285         task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1286       TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1287   }
1288 
1289   // Calculate shared structure offset including padding after kmp_task_t struct
1290   // to align pointers in shared struct
1291   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1292   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
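  // Resulting single-block layout (conceptually):
  //   [ kmp_taskdata_t | kmp_task_t + task private data | pad | shareds ]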
1293 
1294   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1295   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1296                 shareds_offset));
1297   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1298                 sizeof_shareds));
1299 
1300   // Avoid double allocation here by combining shareds with taskdata
1301 #if USE_FAST_MEMORY
1302   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
1303       encountering_thread, shareds_offset + sizeof_shareds);
1304 #else /* ! USE_FAST_MEMORY */
1305   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
1306       encountering_thread, shareds_offset + sizeof_shareds);
1307 #endif /* USE_FAST_MEMORY */
1308   ANNOTATE_HAPPENS_AFTER(taskdata);
1309 
1310   task = KMP_TASKDATA_TO_TASK(taskdata);
1311 
1312 // Make sure task & taskdata are aligned appropriately
1313 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1314   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1315   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1316 #else
1317   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1318   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1319 #endif
1320   if (sizeof_shareds > 0) {
1321     // Avoid double allocation here by combining shareds with taskdata
1322     task->shareds = &((char *)taskdata)[shareds_offset];
1323     // Make sure shareds struct is aligned to pointer size
1324     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1325                      0);
1326   } else {
1327     task->shareds = NULL;
1328   }
1329   task->routine = task_entry;
1330   task->part_id = 0; // AC: Always start with 0 part id
1331 
1332   taskdata->td_task_id = KMP_GEN_TASK_ID();
1333   taskdata->td_team = thread->th.th_team;
1334   taskdata->td_alloc_thread = encountering_thread;
1335   taskdata->td_parent = parent_task;
1336   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1337   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1338   taskdata->td_ident = loc_ref;
1339   taskdata->td_taskwait_ident = NULL;
1340   taskdata->td_taskwait_counter = 0;
1341   taskdata->td_taskwait_thread = 0;
1342   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1343   // avoid copying icvs for proxy tasks
1344   if (flags->proxy == TASK_FULL)
1345     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1346 
1347   taskdata->td_flags.tiedness = flags->tiedness;
1348   taskdata->td_flags.final = flags->final;
1349   taskdata->td_flags.merged_if0 = flags->merged_if0;
1350   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1351   taskdata->td_flags.proxy = flags->proxy;
1352   taskdata->td_flags.detachable = flags->detachable;
1353   taskdata->td_flags.hidden_helper = flags->hidden_helper;
1354   taskdata->encountering_gtid = gtid;
1355   taskdata->td_task_team = thread->th.th_task_team;
1356   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1357   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1358 
1359   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1360   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1361 
1362   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1363   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1364 
1365   // GEH - Note we serialize the task if the team is serialized to make sure
1366   // implicit parallel region tasks are not left until program termination to
1367   // execute. Also, it helps locality to execute immediately.
1368 
1369   taskdata->td_flags.task_serial =
1370       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1371        taskdata->td_flags.tasking_ser || flags->merged_if0);
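  // I.e. the task is executed immediately if its parent is final, the team or
  // tasking is serialized, or the if(0) clause was merged into the task.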
1372 
1373   taskdata->td_flags.started = 0;
1374   taskdata->td_flags.executing = 0;
1375   taskdata->td_flags.complete = 0;
1376   taskdata->td_flags.freed = 0;
1377 
1378   taskdata->td_flags.native = flags->native;
1379 
1380   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because the count includes the current task and its children
1382   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1383   taskdata->td_taskgroup =
1384       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1385   taskdata->td_dephash = NULL;
1386   taskdata->td_depnode = NULL;
1387   if (flags->tiedness == TASK_UNTIED)
1388     taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1389   else
1390     taskdata->td_last_tied = taskdata;
1391   taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1392 #if OMPT_SUPPORT
1393   if (UNLIKELY(ompt_enabled.enabled))
1394     __ompt_task_init(taskdata, gtid);
1395 #endif
  // Child task counts only need to be tracked if the team is parallel and
  // tasking is not serialized, or if the task is a proxy, detachable, or
  // hidden helper task
1398   if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
1399       flags->hidden_helper ||
1400       !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
1401     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1402     if (parent_task->td_taskgroup)
1403       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Allocated child tasks only need to be tracked for explicit tasks, since
    // implicit tasks are never deallocated
1406     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1407       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1408     }
1409   }
1410 
1411   if (flags->hidden_helper) {
1412     taskdata->td_flags.task_serial = FALSE;
1413     // Increment the number of hidden helper tasks to be executed
1414     KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1415   }
1416 
1417   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1418                 gtid, taskdata, taskdata->td_parent));
1419   ANNOTATE_HAPPENS_BEFORE(task);
1420 
1421   return task;
1422 }
1423 
1424 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1425                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1426                                   size_t sizeof_shareds,
1427                                   kmp_routine_entry_t task_entry) {
1428   kmp_task_t *retval;
1429   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1430   __kmp_assert_valid_gtid(gtid);
1431   input_flags->native = FALSE;
1432   // __kmp_task_alloc() sets up all other runtime flags
1433   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1434                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1435                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1436                 input_flags->proxy ? "proxy" : "",
1437                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1438                 sizeof_shareds, task_entry));
1439 
1440   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1441                             sizeof_shareds, task_entry);
1442 
1443   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1444 
1445   return retval;
1446 }
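
// Illustrative sketch (not part of this file's code paths): for a plain
// '#pragma omp task', a compiler typically pairs the allocation with a
// submission along these lines (flag value 1 selects a tied task; the helper
// names are hypothetical):
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1,
//                                         sizeof_task_with_privates,
//                                         sizeof_shareds, &task_entry);
//   /* copy firstprivate data into t->shareds / private part here */
//   __kmpc_omp_task(&loc, gtid, t);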
1447 
1448 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1449                                          kmp_int32 flags,
1450                                          size_t sizeof_kmp_task_t,
1451                                          size_t sizeof_shareds,
1452                                          kmp_routine_entry_t task_entry,
1453                                          kmp_int64 device_id) {
1454   if (__kmp_enable_hidden_helper) {
1455     auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1456     input_flags.hidden_helper = TRUE;
1457   }
1458 
1459   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1460                                sizeof_shareds, task_entry);
1461 }
1462 
1463 /*!
1464 @ingroup TASKING
1465 @param loc_ref location of the original task directive
1466 @param gtid Global Thread ID of encountering thread
1467 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1468 task''
1469 @param naffins Number of affinity items
1470 @param affin_list List of affinity items
1471 @return Returns non-zero if registering affinity information was not successful.
1472  Returns 0 if registration was successful
1473 This entry registers the affinity information attached to a task with the task
1474 thunk structure kmp_taskdata_t.
1475 */
1476 kmp_int32
1477 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1478                                   kmp_task_t *new_task, kmp_int32 naffins,
1479                                   kmp_task_affinity_info_t *affin_list) {
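  // Currently a stub: the affinity hints are accepted but not used by the
  // runtime; registration is always reported as successful.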
1480   return 0;
1481 }
1482 
1483 //  __kmp_invoke_task: invoke the specified task
1484 //
1485 // gtid: global thread ID of caller
1486 // task: the task to invoke
1487 // current_task: the task to resume after task invocation
1488 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1489                               kmp_taskdata_t *current_task) {
1490   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1491   kmp_info_t *thread;
1492   int discard = 0 /* false */;
1493   KA_TRACE(
1494       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1495            gtid, taskdata, current_task));
1496   KMP_DEBUG_ASSERT(task);
1497   if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1498                taskdata->td_flags.complete == 1)) {
1499     // This is a proxy task that was already completed but it needs to run
1500     // its bottom-half finish
1501     KA_TRACE(
1502         30,
1503         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1504          gtid, taskdata));
1505 
1506     __kmp_bottom_half_finish_proxy(gtid, task);
1507 
1508     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1509                   "proxy task %p, resuming task %p\n",
1510                   gtid, taskdata, current_task));
1511 
1512     return;
1513   }
1514 
1515 #if OMPT_SUPPORT
1516   // For untied tasks, the first task executed only calls __kmpc_omp_task and
1517   // does not execute code.
1518   ompt_thread_info_t oldInfo;
1519   if (UNLIKELY(ompt_enabled.enabled)) {
1520     // Store the threads states and restore them after the task
1521     thread = __kmp_threads[gtid];
1522     oldInfo = thread->th.ompt_thread_info;
1523     thread->th.ompt_thread_info.wait_id = 0;
1524     thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1525                                             ? ompt_state_work_serial
1526                                             : ompt_state_work_parallel;
1527     taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1528   }
1529 #endif
1530 
  // Decrement the counter of hidden helper tasks still to be executed
1532   if (taskdata->td_flags.hidden_helper) {
1533     // Hidden helper tasks can only be executed by hidden helper threads
1534     KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1535     KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1536   }
1537 
1538   // Proxy tasks are not handled by the runtime
1539   if (taskdata->td_flags.proxy != TASK_PROXY) {
1540     ANNOTATE_HAPPENS_AFTER(task);
1541     __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1542   }
1543 
1544   // TODO: cancel tasks if the parallel region has also been cancelled
1545   // TODO: check if this sequence can be hoisted above __kmp_task_start
1546   // if cancellation has been enabled for this run ...
1547   if (UNLIKELY(__kmp_omp_cancellation)) {
1548     thread = __kmp_threads[gtid];
1549     kmp_team_t *this_team = thread->th.th_team;
1550     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1551     if ((taskgroup && taskgroup->cancel_request) ||
1552         (this_team->t.t_cancel_request == cancel_parallel)) {
1553 #if OMPT_SUPPORT && OMPT_OPTIONAL
1554       ompt_data_t *task_data;
1555       if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1556         __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1557         ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1558             task_data,
1559             ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1560                                                       : ompt_cancel_parallel) |
1561                 ompt_cancel_discarded_task,
1562             NULL);
1563       }
1564 #endif
1565       KMP_COUNT_BLOCK(TASK_cancelled);
      // this task has been cancelled (via its taskgroup or the enclosing
      // parallel region), so discard it
1567       discard = 1 /* true */;
1568     }
1569   }
1570 
1571   // Invoke the task routine and pass in relevant data.
1572   // Thunks generated by gcc take a different argument list.
1573   if (!discard) {
1574     if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1575       taskdata->td_last_tied = current_task->td_last_tied;
1576       KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1577     }
1578 #if KMP_STATS_ENABLED
1579     KMP_COUNT_BLOCK(TASK_executed);
1580     switch (KMP_GET_THREAD_STATE()) {
1581     case FORK_JOIN_BARRIER:
1582       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1583       break;
1584     case PLAIN_BARRIER:
1585       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1586       break;
1587     case TASKYIELD:
1588       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1589       break;
1590     case TASKWAIT:
1591       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1592       break;
1593     case TASKGROUP:
1594       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1595       break;
1596     default:
1597       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1598       break;
1599     }
1600 #endif // KMP_STATS_ENABLED
1601 
1602 // OMPT task begin
1603 #if OMPT_SUPPORT
1604     if (UNLIKELY(ompt_enabled.enabled))
1605       __ompt_task_start(task, current_task, gtid);
1606 #endif
1607 
1608 #if OMPD_SUPPORT
1609     if (ompd_state & OMPD_ENABLE_BP)
1610       ompd_bp_task_begin();
1611 #endif
1612 
1613 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1614     kmp_uint64 cur_time;
1615     kmp_int32 kmp_itt_count_task =
1616         __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1617         current_task->td_flags.tasktype == TASK_IMPLICIT;
1618     if (kmp_itt_count_task) {
1619       thread = __kmp_threads[gtid];
1620       // Time outer level explicit task on barrier for adjusting imbalance time
1621       if (thread->th.th_bar_arrive_time)
1622         cur_time = __itt_get_timestamp();
1623       else
1624         kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1625     }
1626     KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1627 #endif
1628 
1629 #ifdef KMP_GOMP_COMPAT
1630     if (taskdata->td_flags.native) {
1631       ((void (*)(void *))(*(task->routine)))(task->shareds);
1632     } else
1633 #endif /* KMP_GOMP_COMPAT */
1634     {
1635       (*(task->routine))(gtid, task);
1636     }
1637     KMP_POP_PARTITIONED_TIMER();
1638 
1639 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1640     if (kmp_itt_count_task) {
1641       // Barrier imbalance - adjust arrive time with the task duration
1642       thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1643     }
1644     KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1645     KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1646 #endif
1647   }
1648 
1649 #if OMPD_SUPPORT
1650   if (ompd_state & OMPD_ENABLE_BP)
1651     ompd_bp_task_end();
1652 #endif
1653 
1654   // Proxy tasks are not handled by the runtime
1655   if (taskdata->td_flags.proxy != TASK_PROXY) {
1656     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1657 #if OMPT_SUPPORT
1658     if (UNLIKELY(ompt_enabled.enabled)) {
1659       thread->th.ompt_thread_info = oldInfo;
1660       if (taskdata->td_flags.tiedness == TASK_TIED) {
1661         taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1662       }
1663       __kmp_task_finish<true>(gtid, task, current_task);
1664     } else
1665 #endif
1666       __kmp_task_finish<false>(gtid, task, current_task);
1667   }
1668 
1669   KA_TRACE(
1670       30,
1671       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1672        gtid, taskdata, current_task));
1673   return;
1674 }
1675 
1676 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1677 //
1678 // loc_ref: location of original task pragma (ignored)
1679 // gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new
// task''
1681 // Returns:
1682 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1683 //    be resumed later.
1684 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1685 //    resumed later.
1686 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1687                                 kmp_task_t *new_task) {
1688   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1689 
1690   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1691                 loc_ref, new_taskdata));
1692 
1693 #if OMPT_SUPPORT
1694   kmp_taskdata_t *parent;
1695   if (UNLIKELY(ompt_enabled.enabled)) {
1696     parent = new_taskdata->td_parent;
1697     if (ompt_enabled.ompt_callback_task_create) {
1698       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1699           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1700           &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1701           OMPT_GET_RETURN_ADDRESS(0));
1702     }
1703   }
1704 #endif
1705 
1706   /* Should we execute the new task or queue it? For now, let's just always try
1707      to queue it.  If the queue fills up, then we'll execute it.  */
1708 
1709   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1710   { // Execute this task immediately
1711     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1712     new_taskdata->td_flags.task_serial = 1;
1713     __kmp_invoke_task(gtid, new_task, current_task);
1714   }
1715 
  KA_TRACE(10,
           ("__kmpc_omp_task_parts(exit): T#%d returning "
            "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
            gtid, loc_ref, new_taskdata));
1721 
1722   ANNOTATE_HAPPENS_BEFORE(new_task);
1723 #if OMPT_SUPPORT
1724   if (UNLIKELY(ompt_enabled.enabled)) {
1725     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1726   }
1727 #endif
1728   return TASK_CURRENT_NOT_QUEUED;
1729 }
1730 
1731 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1732 //
1733 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1735 // serialize_immediate: if TRUE then if the task is executed immediately its
1736 // execution will be serialized
1737 // Returns:
1738 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1739 //    be resumed later.
1740 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1741 //    resumed later.
1742 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1743                          bool serialize_immediate) {
1744   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1745 
1746   /* Should we execute the new task or queue it? For now, let's just always try
1747      to queue it.  If the queue fills up, then we'll execute it.  */
1748   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1749       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1750   { // Execute this task immediately
1751     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1752     if (serialize_immediate)
1753       new_taskdata->td_flags.task_serial = 1;
1754     __kmp_invoke_task(gtid, new_task, current_task);
1755   }
1756 
1757   ANNOTATE_HAPPENS_BEFORE(new_task);
1758   return TASK_CURRENT_NOT_QUEUED;
1759 }
1760 
1761 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1762 // non-thread-switchable task from the parent thread only!
1763 //
1764 // loc_ref: location of original task pragma (ignored)
1765 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1768 // Returns:
1769 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1770 //    be resumed later.
1771 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1772 //    resumed later.
1773 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1774                           kmp_task_t *new_task) {
1775   kmp_int32 res;
1776   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1777 
1778 #if KMP_DEBUG || OMPT_SUPPORT
1779   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1780 #endif
1781   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1782                 new_taskdata));
1783   __kmp_assert_valid_gtid(gtid);
1784 
1785 #if OMPT_SUPPORT
1786   kmp_taskdata_t *parent = NULL;
1787   if (UNLIKELY(ompt_enabled.enabled)) {
1788     if (!new_taskdata->td_flags.started) {
1789       OMPT_STORE_RETURN_ADDRESS(gtid);
1790       parent = new_taskdata->td_parent;
1791       if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1792         parent->ompt_task_info.frame.enter_frame.ptr =
1793             OMPT_GET_FRAME_ADDRESS(0);
1794       }
1795       if (ompt_enabled.ompt_callback_task_create) {
1796         ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1797             &(parent->ompt_task_info.task_data),
1798             &(parent->ompt_task_info.frame),
1799             &(new_taskdata->ompt_task_info.task_data),
1800             ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1801             OMPT_LOAD_RETURN_ADDRESS(gtid));
1802       }
1803     } else {
1804       // We are scheduling the continuation of an UNTIED task.
1805       // Scheduling back to the parent task.
1806       __ompt_task_finish(new_task,
1807                          new_taskdata->ompt_task_info.scheduling_parent,
1808                          ompt_task_switch);
1809       new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1810     }
1811   }
1812 #endif
1813 
1814   res = __kmp_omp_task(gtid, new_task, true);
1815 
1816   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1817                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1818                 gtid, loc_ref, new_taskdata));
1819 #if OMPT_SUPPORT
1820   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1821     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1822   }
1823 #endif
1824   return res;
1825 }
1826 
1827 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1828 // a taskloop task with the correct OMPT return address
1829 //
1830 // loc_ref: location of original task pragma (ignored)
1831 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1834 // codeptr_ra: return address for OMPT callback
1835 // Returns:
1836 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1837 //    be resumed later.
1838 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1839 //    resumed later.
1840 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1841                                   kmp_task_t *new_task, void *codeptr_ra) {
1842   kmp_int32 res;
1843   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1844 
1845 #if KMP_DEBUG || OMPT_SUPPORT
1846   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1847 #endif
  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));
1850 
1851 #if OMPT_SUPPORT
1852   kmp_taskdata_t *parent = NULL;
1853   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1854     parent = new_taskdata->td_parent;
1855     if (!parent->ompt_task_info.frame.enter_frame.ptr)
1856       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1857     if (ompt_enabled.ompt_callback_task_create) {
1858       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1859           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1860           &(new_taskdata->ompt_task_info.task_data),
1861           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1862           codeptr_ra);
1863     }
1864   }
1865 #endif
1866 
1867   res = __kmp_omp_task(gtid, new_task, true);
1868 
  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
1872 #if OMPT_SUPPORT
1873   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1874     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1875   }
1876 #endif
1877   return res;
1878 }
1879 
1880 template <bool ompt>
1881 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1882                                               void *frame_address,
1883                                               void *return_address) {
1884   kmp_taskdata_t *taskdata = nullptr;
1885   kmp_info_t *thread;
1886   int thread_finished = FALSE;
1887   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1888 
1889   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1890   KMP_DEBUG_ASSERT(gtid >= 0);
1891 
1892   if (__kmp_tasking_mode != tskm_immediate_exec) {
1893     thread = __kmp_threads[gtid];
1894     taskdata = thread->th.th_current_task;
1895 
1896 #if OMPT_SUPPORT && OMPT_OPTIONAL
1897     ompt_data_t *my_task_data;
1898     ompt_data_t *my_parallel_data;
1899 
1900     if (ompt) {
1901       my_task_data = &(taskdata->ompt_task_info.task_data);
1902       my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1903 
1904       taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1905 
1906       if (ompt_enabled.ompt_callback_sync_region) {
1907         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1908             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1909             my_task_data, return_address);
1910       }
1911 
1912       if (ompt_enabled.ompt_callback_sync_region_wait) {
1913         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1914             ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1915             my_task_data, return_address);
1916       }
1917     }
1918 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1919 
// Debugger: the taskwait is active. Store the location and the thread that
// encountered the taskwait.
1922 #if USE_ITT_BUILD
1923 // Note: These values are used by ITT events as well.
1924 #endif /* USE_ITT_BUILD */
1925     taskdata->td_taskwait_counter += 1;
1926     taskdata->td_taskwait_ident = loc_ref;
1927     taskdata->td_taskwait_thread = gtid + 1;
1928 
1929 #if USE_ITT_BUILD
1930     void *itt_sync_obj = NULL;
1931 #if USE_ITT_NOTIFY
1932     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
1933 #endif /* USE_ITT_NOTIFY */
1934 #endif /* USE_ITT_BUILD */
1935 
1936     bool must_wait =
1937         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1938 
1939     must_wait = must_wait || (thread->th.th_task_team != NULL &&
1940                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
    // If a hidden helper task has been encountered, we must wait here as well.
1942     must_wait =
1943         must_wait ||
1944         (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
1945          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
1946 
1947     if (must_wait) {
1948       kmp_flag_32<false, false> flag(
1949           RCAST(std::atomic<kmp_uint32> *,
1950                 &(taskdata->td_incomplete_child_tasks)),
1951           0U);
1952       while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1953         flag.execute_tasks(thread, gtid, FALSE,
1954                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1955                            __kmp_task_stealing_constraint);
1956       }
1957     }
1958 #if USE_ITT_BUILD
1959     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
1960     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
1961 #endif /* USE_ITT_BUILD */
1962 
1963     // Debugger:  The taskwait is completed. Location remains, but thread is
1964     // negated.
1965     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1966 
1967 #if OMPT_SUPPORT && OMPT_OPTIONAL
1968     if (ompt) {
1969       if (ompt_enabled.ompt_callback_sync_region_wait) {
1970         ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1971             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1972             my_task_data, return_address);
1973       }
1974       if (ompt_enabled.ompt_callback_sync_region) {
1975         ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1976             ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1977             my_task_data, return_address);
1978       }
1979       taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1980     }
1981 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1982 
1983     ANNOTATE_HAPPENS_AFTER(taskdata);
1984   }
1985 
1986   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1987                 "returning TASK_CURRENT_NOT_QUEUED\n",
1988                 gtid, taskdata));
1989 
1990   return TASK_CURRENT_NOT_QUEUED;
1991 }
1992 
1993 #if OMPT_SUPPORT && OMPT_OPTIONAL
1994 OMPT_NOINLINE
1995 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1996                                           void *frame_address,
1997                                           void *return_address) {
1998   return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1999                                             return_address);
2000 }
2001 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2002 
2003 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2004 // complete
2005 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2006 #if OMPT_SUPPORT && OMPT_OPTIONAL
2007   if (UNLIKELY(ompt_enabled.enabled)) {
2008     OMPT_STORE_RETURN_ADDRESS(gtid);
2009     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2010                                     OMPT_LOAD_RETURN_ADDRESS(gtid));
2011   }
2012 #endif
2013   return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2014 }
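
// Illustrative sketch: for '#pragma omp taskwait' a compiler emits a call
// along the lines of
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
// where loc describes the source location of the directive.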
2015 
2016 // __kmpc_omp_taskyield: switch to a different task
2017 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2018   kmp_taskdata_t *taskdata = NULL;
2019   kmp_info_t *thread;
2020   int thread_finished = FALSE;
2021 
2022   KMP_COUNT_BLOCK(OMP_TASKYIELD);
2023   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2024 
2025   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2026                 gtid, loc_ref, end_part));
2027   __kmp_assert_valid_gtid(gtid);
2028 
2029   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2030     thread = __kmp_threads[gtid];
2031     taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: the taskwait is active. Store the location and the thread that
// encountered the taskwait.
2035 #if USE_ITT_BUILD
2036 // Note: These values are used by ITT events as well.
2037 #endif /* USE_ITT_BUILD */
2038     taskdata->td_taskwait_counter += 1;
2039     taskdata->td_taskwait_ident = loc_ref;
2040     taskdata->td_taskwait_thread = gtid + 1;
2041 
2042 #if USE_ITT_BUILD
2043     void *itt_sync_obj = NULL;
2044 #if USE_ITT_NOTIFY
2045     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2046 #endif /* USE_ITT_NOTIFY */
2047 #endif /* USE_ITT_BUILD */
2048     if (!taskdata->td_flags.team_serial) {
2049       kmp_task_team_t *task_team = thread->th.th_task_team;
2050       if (task_team != NULL) {
2051         if (KMP_TASKING_ENABLED(task_team)) {
2052 #if OMPT_SUPPORT
2053           if (UNLIKELY(ompt_enabled.enabled))
2054             thread->th.ompt_thread_info.ompt_task_yielded = 1;
2055 #endif
2056           __kmp_execute_tasks_32(
2057               thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2058               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2059               __kmp_task_stealing_constraint);
2060 #if OMPT_SUPPORT
2061           if (UNLIKELY(ompt_enabled.enabled))
2062             thread->th.ompt_thread_info.ompt_task_yielded = 0;
2063 #endif
2064         }
2065       }
2066     }
2067 #if USE_ITT_BUILD
2068     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2069 #endif /* USE_ITT_BUILD */
2070 
2071     // Debugger:  The taskwait is completed. Location remains, but thread is
2072     // negated.
2073     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2074   }
2075 
2076   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2077                 "returning TASK_CURRENT_NOT_QUEUED\n",
2078                 gtid, taskdata));
2079 
2080   return TASK_CURRENT_NOT_QUEUED;
2081 }
2082 
2083 // Task Reduction implementation
2084 //
// Note: the initial implementation did not account for the possibility of
// specifying omp_orig for the initializer of a UDR (user-defined reduction).
// The corrected implementation takes the omp_orig object into account.
// The compiler is free to use the old implementation if omp_orig is not
// specified.
2089 
2090 /*!
2091 @ingroup BASIC_TYPES
2092 @{
2093 */
2094 
2095 /*!
2096 Flags for special info per task reduction item.
2097 */
2098 typedef struct kmp_taskred_flags {
2099   /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
2100   unsigned lazy_priv : 1;
2101   unsigned reserved31 : 31;
2102 } kmp_taskred_flags_t;
2103 
2104 /*!
2105 Internal struct for reduction data item related info set up by compiler.
2106 */
2107 typedef struct kmp_task_red_input {
  void *reduce_shar; /**< item shared between tasks to reduce into */
2109   size_t reduce_size; /**< size of data item in bytes */
2110   // three compiler-generated routines (init, fini are optional):
2111   void *reduce_init; /**< data initialization routine (single parameter) */
2112   void *reduce_fini; /**< data finalization routine */
2113   void *reduce_comb; /**< data combiner routine */
2114   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2115 } kmp_task_red_input_t;
2116 
2117 /*!
2118 Internal struct for reduction data item related info saved by the library.
2119 */
2120 typedef struct kmp_taskred_data {
  void *reduce_shar; /**< item shared between tasks to reduce into */
2122   size_t reduce_size; /**< size of data item */
2123   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2124   void *reduce_priv; /**< array of thread specific items */
2125   void *reduce_pend; /**< end of private data for faster comparison op */
2126   // three compiler-generated routines (init, fini are optional):
2127   void *reduce_comb; /**< data combiner routine */
2128   void *reduce_init; /**< data initialization routine (two parameters) */
2129   void *reduce_fini; /**< data finalization routine */
2130   void *reduce_orig; /**< original item (can be used in UDR initializer) */
2131 } kmp_taskred_data_t;
2132 
2133 /*!
2134 Internal struct for reduction data item related info set up by compiler.
2135 
2136 New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2137 */
2138 typedef struct kmp_taskred_input {
  void *reduce_shar; /**< item shared between tasks to reduce into */
2140   void *reduce_orig; /**< original reduction item used for initialization */
2141   size_t reduce_size; /**< size of data item */
2142   // three compiler-generated routines (init, fini are optional):
2143   void *reduce_init; /**< data initialization routine (two parameters) */
2144   void *reduce_fini; /**< data finalization routine */
2145   void *reduce_comb; /**< data combiner routine */
2146   kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2147 } kmp_taskred_input_t;
2148 /*!
2149 @}
2150 */
2151 
2152 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2153 template <>
2154 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2155                                              kmp_task_red_input_t &src) {
2156   item.reduce_orig = NULL;
2157 }
2158 template <>
2159 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2160                                             kmp_taskred_input_t &src) {
2161   if (src.reduce_orig != NULL) {
2162     item.reduce_orig = src.reduce_orig;
2163   } else {
2164     item.reduce_orig = src.reduce_shar;
2165   } // non-NULL reduce_orig means new interface used
2166 }
2167 
2168 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2169 template <>
2170 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2171                                            size_t offset) {
2172   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2173 }
2174 template <>
2175 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2176                                           size_t offset) {
2177   ((void (*)(void *, void *))item.reduce_init)(
2178       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2179 }
2180 
2181 template <typename T>
2182 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2183   __kmp_assert_valid_gtid(gtid);
2184   kmp_info_t *thread = __kmp_threads[gtid];
2185   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2186   kmp_uint32 nth = thread->th.th_team_nproc;
2187   kmp_taskred_data_t *arr;
2188 
2189   // check input data just in case
2190   KMP_ASSERT(tg != NULL);
2191   KMP_ASSERT(data != NULL);
2192   KMP_ASSERT(num > 0);
2193   if (nth == 1) {
2194     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2195                   gtid, tg));
2196     return (void *)tg;
2197   }
2198   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2199                 gtid, tg, num));
2200   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2201       thread, num * sizeof(kmp_taskred_data_t));
2202   for (int i = 0; i < num; ++i) {
2203     size_t size = data[i].reduce_size - 1;
2204     // round the size up to cache line per thread-specific item
2205     size += CACHE_LINE - size % CACHE_LINE;
2206     KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2207     arr[i].reduce_shar = data[i].reduce_shar;
2208     arr[i].reduce_size = size;
2209     arr[i].flags = data[i].flags;
2210     arr[i].reduce_comb = data[i].reduce_comb;
2211     arr[i].reduce_init = data[i].reduce_init;
2212     arr[i].reduce_fini = data[i].reduce_fini;
2213     __kmp_assign_orig<T>(arr[i], data[i]);
2214     if (!arr[i].flags.lazy_priv) {
2215       // allocate cache-line aligned block and fill it with zeros
2216       arr[i].reduce_priv = __kmp_allocate(nth * size);
2217       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2218       if (arr[i].reduce_init != NULL) {
2219         // initialize all thread-specific items
2220         for (size_t j = 0; j < nth; ++j) {
2221           __kmp_call_init<T>(arr[i], j * size);
2222         }
2223       }
2224     } else {
2225       // only allocate space for pointers now,
2226       // objects will be lazily allocated/initialized if/when requested
2227       // note that __kmp_allocate zeroes the allocated memory
2228       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2229     }
2230   }
2231   tg->reduce_data = (void *)arr;
2232   tg->reduce_num_data = num;
2233   return (void *)tg;
2234 }
2235 
2236 /*!
2237 @ingroup TASKING
2238 @param gtid      Global thread ID
2239 @param num       Number of data items to reduce
2240 @param data      Array of data for reduction
2241 @return The taskgroup identifier
2242 
2243 Initialize task reduction for the taskgroup.
2244 
Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter, a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
2249 */
2250 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2251   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2252 }
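
// Illustrative sketch (identifiers are hypothetical): for
//   #pragma omp taskgroup task_reduction(+ : x)
// a compiler may fill one kmp_task_red_input_t per reduction item and call:
//   kmp_task_red_input_t item;
//   item.reduce_shar = &x;              // shared item to reduce into
//   item.reduce_size = sizeof(x);
//   item.reduce_init = &x_priv_init;    // optional: zero the private copy
//   item.reduce_fini = NULL;            // optional finalizer
//   item.reduce_comb = &x_priv_comb;    // mandatory combiner routine
//   item.flags.lazy_priv = 0;           // eager per-thread storage
//   void *tg = __kmpc_task_reduction_init(gtid, /*num=*/1, &item);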
2253 
2254 /*!
2255 @ingroup TASKING
2256 @param gtid      Global thread ID
2257 @param num       Number of data items to reduce
2258 @param data      Array of data for reduction
2259 @return The taskgroup identifier
2260 
2261 Initialize task reduction for the taskgroup.
2262 
Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer to
omp_orig.
2265 */
2266 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2267   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2268 }
2269 
2270 // Copy task reduction data (except for shared pointers).
2271 template <typename T>
2272 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2273                                     kmp_taskgroup_t *tg, void *reduce_data) {
2274   kmp_taskred_data_t *arr;
2275   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2276                 " from data %p\n",
2277                 thr, tg, reduce_data));
2278   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2279       thr, num * sizeof(kmp_taskred_data_t));
2280   // threads will share private copies, thunk routines, sizes, flags, etc.:
2281   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2282   for (int i = 0; i < num; ++i) {
2283     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2284   }
2285   tg->reduce_data = (void *)arr;
2286   tg->reduce_num_data = num;
2287 }
2288 
2289 /*!
2290 @ingroup TASKING
2291 @param gtid    Global thread ID
2292 @param tskgrp  The taskgroup ID (optional)
2293 @param data    Shared location of the item
2294 @return The pointer to per-thread data
2295 
2296 Get thread-specific location of data item
2297 */
2298 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2299   __kmp_assert_valid_gtid(gtid);
2300   kmp_info_t *thread = __kmp_threads[gtid];
2301   kmp_int32 nth = thread->th.th_team_nproc;
2302   if (nth == 1)
2303     return data; // nothing to do
2304 
2305   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2306   if (tg == NULL)
2307     tg = thread->th.th_current_task->td_taskgroup;
2308   KMP_ASSERT(tg != NULL);
2309   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2310   kmp_int32 num = tg->reduce_num_data;
2311   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2312 
2313   KMP_ASSERT(data != NULL);
2314   while (tg != NULL) {
2315     for (int i = 0; i < num; ++i) {
2316       if (!arr[i].flags.lazy_priv) {
2317         if (data == arr[i].reduce_shar ||
2318             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2319           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2320       } else {
2321         // check shared location first
2322         void **p_priv = (void **)(arr[i].reduce_priv);
2323         if (data == arr[i].reduce_shar)
2324           goto found;
2325         // check if we get some thread specific location as parameter
2326         for (int j = 0; j < nth; ++j)
2327           if (data == p_priv[j])
2328             goto found;
2329         continue; // not found, continue search
2330       found:
2331         if (p_priv[tid] == NULL) {
2332           // allocate thread specific object lazily
2333           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2334           if (arr[i].reduce_init != NULL) {
2335             if (arr[i].reduce_orig != NULL) { // new interface
2336               ((void (*)(void *, void *))arr[i].reduce_init)(
2337                   p_priv[tid], arr[i].reduce_orig);
2338             } else { // old interface (single parameter)
2339               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2340             }
2341           }
2342         }
2343         return p_priv[tid];
2344       }
2345     }
2346     tg = tg->parent;
2347     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2348     num = tg->reduce_num_data;
2349   }
2350   KMP_ASSERT2(0, "Unknown task reduction item");
2351   return NULL; // ERROR, this line never executed
2352 }
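
// Illustrative sketch: inside a participating task, the generated code looks
// up its thread-specific copy before updating it (types are hypothetical):
//   int *p = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &x);
//   *p += local_contribution;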
2353 
2354 // Finalize task reduction.
2355 // Called from __kmpc_end_taskgroup()
2356 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2357   kmp_int32 nth = th->th.th_team_nproc;
2358   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2359   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2360   kmp_int32 num = tg->reduce_num_data;
2361   for (int i = 0; i < num; ++i) {
2362     void *sh_data = arr[i].reduce_shar;
2363     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2364     void (*f_comb)(void *, void *) =
2365         (void (*)(void *, void *))(arr[i].reduce_comb);
2366     if (!arr[i].flags.lazy_priv) {
2367       void *pr_data = arr[i].reduce_priv;
2368       size_t size = arr[i].reduce_size;
2369       for (int j = 0; j < nth; ++j) {
2370         void *priv_data = (char *)pr_data + j * size;
2371         f_comb(sh_data, priv_data); // combine results
2372         if (f_fini)
2373           f_fini(priv_data); // finalize if needed
2374       }
2375     } else {
2376       void **pr_data = (void **)(arr[i].reduce_priv);
2377       for (int j = 0; j < nth; ++j) {
2378         if (pr_data[j] != NULL) {
2379           f_comb(sh_data, pr_data[j]); // combine results
2380           if (f_fini)
2381             f_fini(pr_data[j]); // finalize if needed
2382           __kmp_free(pr_data[j]);
2383         }
2384       }
2385     }
2386     __kmp_free(arr[i].reduce_priv);
2387   }
2388   __kmp_thread_free(th, arr);
2389   tg->reduce_data = NULL;
2390   tg->reduce_num_data = 0;
2391 }
2392 
// Clean up task reduction data for a parallel or worksharing construct;
// do not touch task-private data other threads may still be working with.
// Called from __kmpc_end_taskgroup()
2396 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2397   __kmp_thread_free(th, tg->reduce_data);
2398   tg->reduce_data = NULL;
2399   tg->reduce_num_data = 0;
2400 }
2401 
2402 template <typename T>
2403 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2404                                          int num, T *data) {
2405   __kmp_assert_valid_gtid(gtid);
2406   kmp_info_t *thr = __kmp_threads[gtid];
2407   kmp_int32 nth = thr->th.th_team_nproc;
2408   __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2409   if (nth == 1) {
2410     KA_TRACE(10,
2411              ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2412               gtid, thr->th.th_current_task->td_taskgroup));
2413     return (void *)thr->th.th_current_task->td_taskgroup;
2414   }
2415   kmp_team_t *team = thr->th.th_team;
2416   void *reduce_data;
2417   kmp_taskgroup_t *tg;
2418   reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2419   if (reduce_data == NULL &&
2420       __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2421                                  (void *)1)) {
2422     // single thread enters this block to initialize common reduction data
2423     KMP_DEBUG_ASSERT(reduce_data == NULL);
2424     // first initialize own data, then make a copy other threads can use
2425     tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2426     reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2427     KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2428     // fini counters should be 0 at this point
2429     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2430     KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2431     KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2432   } else {
2433     while (
2434         (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2435         (void *)1) { // wait for task reduction initialization
2436       KMP_CPU_PAUSE();
2437     }
2438     KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2439     tg = thr->th.th_current_task->td_taskgroup;
2440     __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2441   }
2442   return tg;
2443 }
2444 
2445 /*!
2446 @ingroup TASKING
2447 @param loc       Source location info
2448 @param gtid      Global thread ID
2449 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2450 @param num       Number of data items to reduce
2451 @param data      Array of data for reduction
2452 @return The taskgroup identifier
2453 
2454 Initialize task reduction for a parallel or worksharing.
2455 
Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter, a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
2460 */
2461 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2462                                           int num, void *data) {
2463   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2464                                             (kmp_task_red_input_t *)data);
2465 }
2466 
2467 /*!
2468 @ingroup TASKING
2469 @param loc       Source location info
2470 @param gtid      Global thread ID
2471 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2472 @param num       Number of data items to reduce
2473 @param data      Array of data for reduction
2474 @return The taskgroup identifier
2475 
2476 Initialize task reduction for a parallel or worksharing.
2477 
Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer to
omp_orig.
2480 */
2481 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2482                                    void *data) {
2483   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2484                                             (kmp_taskred_input_t *)data);
2485 }
2486 
2487 /*!
2488 @ingroup TASKING
2489 @param loc       Source location info
2490 @param gtid      Global thread ID
2491 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2492 
2493 Finalize task reduction for a parallel or worksharing.
2494 */
2495 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2496   __kmpc_end_taskgroup(loc, gtid);
2497 }
2498 
2499 // __kmpc_taskgroup: Start a new taskgroup
2500 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2501   __kmp_assert_valid_gtid(gtid);
2502   kmp_info_t *thread = __kmp_threads[gtid];
2503   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2504   kmp_taskgroup_t *tg_new =
2505       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2506   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2507   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2508   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2509   tg_new->parent = taskdata->td_taskgroup;
2510   tg_new->reduce_data = NULL;
2511   tg_new->reduce_num_data = 0;
2512   tg_new->gomp_data = NULL;
2513   taskdata->td_taskgroup = tg_new;
2514 
2515 #if OMPT_SUPPORT && OMPT_OPTIONAL
2516   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2517     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2518     if (!codeptr)
2519       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2520     kmp_team_t *team = thread->th.th_team;
2521     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2522     // FIXME: I think this is wrong for lwt!
2523     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2524 
2525     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2526         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2527         &(my_task_data), codeptr);
2528   }
2529 #endif
2530 }
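
// Illustrative sketch: for '#pragma omp taskgroup' a compiler brackets the
// structured block with the pair
//   __kmpc_taskgroup(&loc, gtid);
//   /* ... tasks created here are counted in the new taskgroup ... */
//   __kmpc_end_taskgroup(&loc, gtid);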
2531 
2532 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2533 //                       and its descendants are complete
2534 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2535   __kmp_assert_valid_gtid(gtid);
2536   kmp_info_t *thread = __kmp_threads[gtid];
2537   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2538   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2539   int thread_finished = FALSE;
2540 
2541 #if OMPT_SUPPORT && OMPT_OPTIONAL
2542   kmp_team_t *team;
2543   ompt_data_t my_task_data;
2544   ompt_data_t my_parallel_data;
2545   void *codeptr = nullptr;
2546   if (UNLIKELY(ompt_enabled.enabled)) {
2547     team = thread->th.th_team;
2548     my_task_data = taskdata->ompt_task_info.task_data;
2549     // FIXME: I think this is wrong for lwt!
2550     my_parallel_data = team->t.ompt_team_info.parallel_data;
2551     codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2552     if (!codeptr)
2553       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2554   }
2555 #endif
2556 
2557   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2558   KMP_DEBUG_ASSERT(taskgroup != NULL);
2559   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2560 
2561   if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark the task as waiting (not on a barrier)
2563     taskdata->td_taskwait_counter += 1;
2564     taskdata->td_taskwait_ident = loc;
2565     taskdata->td_taskwait_thread = gtid + 1;
2566 #if USE_ITT_BUILD
2567     // For ITT the taskgroup wait is similar to taskwait until we need to
2568     // distinguish them
2569     void *itt_sync_obj = NULL;
2570 #if USE_ITT_NOTIFY
2571     KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2572 #endif /* USE_ITT_NOTIFY */
2573 #endif /* USE_ITT_BUILD */
2574 
2575 #if OMPT_SUPPORT && OMPT_OPTIONAL
2576     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2577       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2578           ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2579           &(my_task_data), codeptr);
2580     }
2581 #endif
2582 
2583     if (!taskdata->td_flags.team_serial ||
2584         (thread->th.th_task_team != NULL &&
2585          thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
2586       kmp_flag_32<false, false> flag(
2587           RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2588       while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2589         flag.execute_tasks(thread, gtid, FALSE,
2590                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2591                            __kmp_task_stealing_constraint);
2592       }
2593     }
2594     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2595 
2596 #if OMPT_SUPPORT && OMPT_OPTIONAL
2597     if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2598       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2599           ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2600           &(my_task_data), codeptr);
2601     }
2602 #endif
2603 
2604 #if USE_ITT_BUILD
2605     KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2606     KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2607 #endif /* USE_ITT_BUILD */
2608   }
2609   KMP_DEBUG_ASSERT(taskgroup->count == 0);
2610 
2611   if (taskgroup->reduce_data != NULL &&
2612       !taskgroup->gomp_data) { // need to reduce?
2613     int cnt;
2614     void *reduce_data;
2615     kmp_team_t *t = thread->th.th_team;
2616     kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check if the <priv> data of the first reduction variable is shared
    // for the team
2618     void *priv0 = arr[0].reduce_priv;
2619     if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2620         ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2621       // finishing task reduction on parallel
2622       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2623       if (cnt == thread->th.th_team_nproc - 1) {
2624         // we are the last thread passing __kmpc_reduction_modifier_fini()
2625         // finalize task reduction:
2626         __kmp_task_reduction_fini(thread, taskgroup);
2627         // cleanup fields in the team structure:
2628         // TODO: is relaxed store enough here (whole barrier should follow)?
2629         __kmp_thread_free(thread, reduce_data);
2630         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2631         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2632       } else {
2633         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2634         // so do not finalize reduction, just clean own copy of the data
2635         __kmp_task_reduction_clean(thread, taskgroup);
2636       }
2637     } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2638                    NULL &&
2639                ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2640       // finishing task reduction on worksharing
2641       cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2642       if (cnt == thread->th.th_team_nproc - 1) {
2643         // we are the last thread passing __kmpc_reduction_modifier_fini()
2644         __kmp_task_reduction_fini(thread, taskgroup);
2645         // cleanup fields in team structure:
2646         // TODO: is relaxed store enough here (whole barrier should follow)?
2647         __kmp_thread_free(thread, reduce_data);
2648         KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2649         KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2650       } else {
2651         // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2652         // so do not finalize reduction, just clean own copy of the data
2653         __kmp_task_reduction_clean(thread, taskgroup);
2654       }
2655     } else {
2656       // finishing task reduction on taskgroup
2657       __kmp_task_reduction_fini(thread, taskgroup);
2658     }
2659   }
2660   // Restore parent taskgroup for the current task
2661   taskdata->td_taskgroup = taskgroup->parent;
2662   __kmp_thread_free(thread, taskgroup);
2663 
2664   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2665                 gtid, taskdata));
2666   ANNOTATE_HAPPENS_AFTER(taskdata);
2667 
2668 #if OMPT_SUPPORT && OMPT_OPTIONAL
2669   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2670     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2671         ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2672         &(my_task_data), codeptr);
2673   }
2674 #endif
2675 }
2676 
2677 // __kmp_remove_my_task: remove a task from my own deque
2678 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2679                                         kmp_task_team_t *task_team,
2680                                         kmp_int32 is_constrained) {
2681   kmp_task_t *task;
2682   kmp_taskdata_t *taskdata;
2683   kmp_thread_data_t *thread_data;
2684   kmp_uint32 tail;
2685 
2686   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2687   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2688                    NULL); // Caller should check this condition
2689 
2690   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2691 
2692   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2693                 gtid, thread_data->td.td_deque_ntasks,
2694                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2695 
2696   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2697     KA_TRACE(10,
2698              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2699               "ntasks=%d head=%u tail=%u\n",
2700               gtid, thread_data->td.td_deque_ntasks,
2701               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2702     return NULL;
2703   }
2704 
2705   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2706 
2707   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2708     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2709     KA_TRACE(10,
2710              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2711               "ntasks=%d head=%u tail=%u\n",
2712               gtid, thread_data->td.td_deque_ntasks,
2713               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2714     return NULL;
2715   }
2716 
2717   tail = (thread_data->td.td_deque_tail - 1) &
2718          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2719   taskdata = thread_data->td.td_deque[tail];
2720 
2721   if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2722                              thread->th.th_current_task)) {
    // The task scheduling constraint (TSC) does not allow taking the tail task
2724     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2725     KA_TRACE(10,
2726              ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2727               "ntasks=%d head=%u tail=%u\n",
2728               gtid, thread_data->td.td_deque_ntasks,
2729               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2730     return NULL;
2731   }
2732 
2733   thread_data->td.td_deque_tail = tail;
2734   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2735 
2736   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2737 
2738   KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2739                 "ntasks=%d head=%u tail=%u\n",
2740                 gtid, taskdata, thread_data->td.td_deque_ntasks,
2741                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2742 
2743   task = KMP_TASKDATA_TO_TASK(taskdata);
2744   return task;
2745 }
2746 
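// Note on deque discipline: the owning thread pops from the tail of its own
// deque in __kmp_remove_my_task() above (LIFO), while thieves take from the
// head in __kmp_steal_task() below (FIFO). Both paths wrap their index with
// TASK_DEQUE_MASK() and re-check the task count under td_deque_lock before
// committing.
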
2747 // __kmp_steal_task: remove a task from another thread's deque
// Assumes that the calling thread has already checked the existence of the
// task_team's thread_data before calling this routine.
2750 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2751                                     kmp_task_team_t *task_team,
2752                                     std::atomic<kmp_int32> *unfinished_threads,
2753                                     int *thread_finished,
2754                                     kmp_int32 is_constrained) {
2755   kmp_task_t *task;
2756   kmp_taskdata_t *taskdata;
2757   kmp_taskdata_t *current;
2758   kmp_thread_data_t *victim_td, *threads_data;
2759   kmp_int32 target;
2760   kmp_int32 victim_tid;
2761 
2762   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2763 
2764   threads_data = task_team->tt.tt_threads_data;
2765   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2766 
2767   victim_tid = victim_thr->th.th_info.ds.ds_tid;
2768   victim_td = &threads_data[victim_tid];
2769 
2770   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2771                 "task_team=%p ntasks=%d head=%u tail=%u\n",
2772                 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2773                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2774                 victim_td->td.td_deque_tail));
2775 
2776   if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2777     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2778                   "task_team=%p ntasks=%d head=%u tail=%u\n",
2779                   gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2780                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2781                   victim_td->td.td_deque_tail));
2782     return NULL;
2783   }
2784 
2785   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2786 
2787   int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2788   // Check again after we acquire the lock
2789   if (ntasks == 0) {
2790     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2791     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2792                   "task_team=%p ntasks=%d head=%u tail=%u\n",
2793                   gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2794                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2795     return NULL;
2796   }
2797 
2798   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2799   current = __kmp_threads[gtid]->th.th_current_task;
2800   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2801   if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Bump the head pointer and wrap.
2803     victim_td->td.td_deque_head =
2804         (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2805   } else {
2806     if (!task_team->tt.tt_untied_task_encountered) {
      // The task scheduling constraint (TSC) does not allow stealing this task
2808       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2809       KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2810                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2811                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2812                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2813       return NULL;
2814     }
2815     int i;
2816     // walk through victim's deque trying to steal any task
2817     target = victim_td->td.td_deque_head;
2818     taskdata = NULL;
2819     for (i = 1; i < ntasks; ++i) {
2820       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2821       taskdata = victim_td->td.td_deque[target];
2822       if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2823         break; // found victim task
2824       } else {
2825         taskdata = NULL;
2826       }
2827     }
2828     if (taskdata == NULL) {
2829       // No appropriate candidate to steal found
2830       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2831       KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2832                     "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2833                     gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2834                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2835       return NULL;
2836     }
2837     int prev = target;
2838     for (i = i + 1; i < ntasks; ++i) {
2839       // shift remaining tasks in the deque left by 1
2840       target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2841       victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2842       prev = target;
2843     }
2844     KMP_DEBUG_ASSERT(
2845         victim_td->td.td_deque_tail ==
2846         (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2848   }
2849   if (*thread_finished) {
2850     // We need to un-mark this victim as a finished victim.  This must be done
2851     // before releasing the lock, or else other threads (starting with the
2852     // primary thread victim) might be prematurely released from the barrier!!!
2853     kmp_int32 count;
2854 
2855     count = KMP_ATOMIC_INC(unfinished_threads);
2856 
2857     KA_TRACE(
2858         20,
2859         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2860          gtid, count + 1, task_team));
2861 
2862     *thread_finished = FALSE;
2863   }
2864   TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2865 
2866   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2867 
2868   KMP_COUNT_BLOCK(TASK_stolen);
2869   KA_TRACE(10,
2870            ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2871             "task_team=%p ntasks=%d head=%u tail=%u\n",
2872             gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2873             ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2874 
2875   task = KMP_TASKDATA_TO_TASK(taskdata);
2876   return task;
2877 }
2878 
2879 // __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
2881 //
2882 // final_spin is TRUE if this is the spin at the release barrier.
2883 // thread_finished indicates whether the thread is finished executing all
2884 // the tasks it has on its deque, and is at the release barrier.
2885 // spinner is the location on which to spin.
2886 // spinner == NULL means only execute a single task and return.
2887 // checker is the value to check to terminate the spin.
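//
// Typical caller pattern (see, e.g., __kmpc_end_taskgroup above): a wait on a
// kmp_flag_* object repeatedly dispatches here via flag.execute_tasks() until
// the flag's done_check() condition holds, roughly:
//
//   kmp_flag_32<false, false> flag(&some_counter, 0U);
//   while (KMP_ATOMIC_LD_ACQ(&some_counter) != 0)
//     flag.execute_tasks(thread, gtid, FALSE,
//                        &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                        __kmp_task_stealing_constraint);
//
// (illustrative sketch; "some_counter" stands for whatever completion counter
// the caller is waiting on)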
2888 template <class C>
2889 static inline int __kmp_execute_tasks_template(
2890     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2891     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2892     kmp_int32 is_constrained) {
2893   kmp_task_team_t *task_team = thread->th.th_task_team;
2894   kmp_thread_data_t *threads_data;
2895   kmp_task_t *task;
2896   kmp_info_t *other_thread;
2897   kmp_taskdata_t *current_task = thread->th.th_current_task;
2898   std::atomic<kmp_int32> *unfinished_threads;
2899   kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2900                       tid = thread->th.th_info.ds.ds_tid;
2901 
2902   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2903   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2904 
2905   if (task_team == NULL || current_task == NULL)
2906     return FALSE;
2907 
2908   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2909                 "*thread_finished=%d\n",
2910                 gtid, final_spin, *thread_finished));
2911 
2912   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2913   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2914 
2915   KMP_DEBUG_ASSERT(threads_data != NULL);
2916 
2917   nthreads = task_team->tt.tt_nproc;
2918   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2919   KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
2920                    task_team->tt.tt_hidden_helper_task_encountered);
2921   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2922 
2923   while (1) { // Outer loop keeps trying to find tasks in case of single thread
2924     // getting tasks from target constructs
2925     while (1) { // Inner loop to find a task and execute it
2926       task = NULL;
2927       if (use_own_tasks) { // check on own queue first
2928         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2929       }
2930       if ((task == NULL) && (nthreads > 1)) { // Steal a task
2931         int asleep = 1;
2932         use_own_tasks = 0;
2933         // Try to steal from the last place I stole from successfully.
2934         if (victim_tid == -2) { // haven't stolen anything yet
2935           victim_tid = threads_data[tid].td.td_deque_last_stolen;
2936           if (victim_tid !=
2937               -1) // if we have a last stolen from victim, get the thread
2938             other_thread = threads_data[victim_tid].td.td_thr;
2939         }
2940         if (victim_tid != -1) { // found last victim
2941           asleep = 0;
2942         } else if (!new_victim) { // no recent steals and we haven't already
2943           // used a new victim; select a random thread
2944           do { // Find a different thread to steal work from.
2945             // Pick a random thread. Initial plan was to cycle through all the
2946             // threads, and only return if we tried to steal from every thread,
2947             // and failed.  Arch says that's not such a great idea.
2948             victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2949             if (victim_tid >= tid) {
2950               ++victim_tid; // Adjusts random distribution to exclude self
2951             }
2952             // Found a potential victim
2953             other_thread = threads_data[victim_tid].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not
            // wake up all threads waiting at the barrier.  If the victim is
            // sleeping, then wake it up.  Since we were going to pay the cache
            // miss penalty for referencing another thread's kmp_info_t struct
            // anyway, the check shouldn't cost too much performance at this
            // point.  In extra barrier mode, threads do not sleep at the
            // separate tasking barrier, so this isn't a problem.
2962             asleep = 0;
2963             if ((__kmp_tasking_mode == tskm_task_teams) &&
2964                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2965                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2966                  NULL)) {
2967               asleep = 1;
2968               __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2969                                         other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
2971               // There is a slight possibility that it resumes, steals a task
2972               // from another thread, which spawns more tasks, all in the time
2973               // that it takes this thread to check => don't write an assertion
2974               // that the victim's queue is empty.  Try stealing from a
2975               // different thread.
2976             }
2977           } while (asleep);
2978         }
2979 
2980         if (!asleep) {
2981           // We have a victim to try to steal from
2982           task = __kmp_steal_task(other_thread, gtid, task_team,
2983                                   unfinished_threads, thread_finished,
2984                                   is_constrained);
2985         }
2986         if (task != NULL) { // set last stolen to victim
2987           if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2988             threads_data[tid].td.td_deque_last_stolen = victim_tid;
2989             // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
2991             // new_victim keeps track of this
2992             new_victim = 1;
2993           }
2994         } else { // No tasks found; unset last_stolen
2995           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2996           victim_tid = -2; // no successful victim found
2997         }
2998       }
2999 
3000       if (task == NULL)
3001         break; // break out of tasking loop
3002 
3003 // Found a task; execute it
3004 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3005       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3006         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3007           // get the object reliably
3008           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3009         }
3010         __kmp_itt_task_starting(itt_sync_obj);
3011       }
3012 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3013       __kmp_invoke_task(gtid, task, current_task);
3014 #if USE_ITT_BUILD
3015       if (itt_sync_obj != NULL)
3016         __kmp_itt_task_finished(itt_sync_obj);
3017 #endif /* USE_ITT_BUILD */
3018       // If this thread is only partway through the barrier and the condition is
3019       // met, then return now, so that the barrier gather/release pattern can
3020       // proceed. If this thread is in the last spin loop in the barrier,
3021       // waiting to be released, we know that the termination condition will not
3022       // be satisfied, so don't waste any cycles checking it.
3023       if (flag == NULL || (!final_spin && flag->done_check())) {
3024         KA_TRACE(
3025             15,
3026             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3027              gtid));
3028         return TRUE;
3029       }
3030       if (thread->th.th_task_team == NULL) {
3031         break;
3032       }
3033       KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3034       // If execution of a stolen task results in more tasks being placed on our
3035       // run queue, reset use_own_tasks
3036       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3037         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3038                       "other tasks, restart\n",
3039                       gtid));
3040         use_own_tasks = 1;
3041         new_victim = 0;
3042       }
3043     }
3044 
3045     // The task source has been exhausted. If in final spin loop of barrier,
3046     // check if termination condition is satisfied. The work queue may be empty
3047     // but there might be proxy tasks still executing.
3048     if (final_spin &&
3049         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3050       // First, decrement the #unfinished threads, if that has not already been
3051       // done.  This decrement might be to the spin location, and result in the
3052       // termination condition being satisfied.
3053       if (!*thread_finished) {
3054         kmp_int32 count;
3055 
3056         count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
3057         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3058                       "unfinished_threads to %d task_team=%p\n",
3059                       gtid, count, task_team));
3060         *thread_finished = TRUE;
3061       }
3062 
3063       // It is now unsafe to reference thread->th.th_team !!!
3064       // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3065       // thread to pass through the barrier, where it might reset each thread's
3066       // th.th_team field for the next parallel region. If we can steal more
3067       // work, we know that this has not happened yet.
3068       if (flag != NULL && flag->done_check()) {
3069         KA_TRACE(
3070             15,
3071             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3072              gtid));
3073         return TRUE;
3074       }
3075     }
3076 
3077     // If this thread's task team is NULL, primary thread has recognized that
3078     // there are no more tasks; bail out
3079     if (thread->th.th_task_team == NULL) {
3080       KA_TRACE(15,
3081                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3082       return FALSE;
3083     }
3084 
3085     // We could be getting tasks from target constructs; if this is the only
3086     // thread, keep trying to execute tasks from own queue
3087     if (nthreads == 1 &&
3088         KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3089       use_own_tasks = 1;
3090     else {
3091       KA_TRACE(15,
3092                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3093       return FALSE;
3094     }
3095   }
3096 }
3097 
3098 template <bool C, bool S>
3099 int __kmp_execute_tasks_32(
3100     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3101     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3102     kmp_int32 is_constrained) {
3103   return __kmp_execute_tasks_template(
3104       thread, gtid, flag, final_spin,
3105       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3106 }
3107 
3108 template <bool C, bool S>
3109 int __kmp_execute_tasks_64(
3110     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3111     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3112     kmp_int32 is_constrained) {
3113   return __kmp_execute_tasks_template(
3114       thread, gtid, flag, final_spin,
3115       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3116 }
3117 
3118 int __kmp_execute_tasks_oncore(
3119     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3120     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3121     kmp_int32 is_constrained) {
3122   return __kmp_execute_tasks_template(
3123       thread, gtid, flag, final_spin,
3124       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3125 }
3126 
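// Explicit instantiations of the wrappers above for the flag template
// parameter combinations used by the wait/release machinery (see
// kmp_wait_release.h for the flag types).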
3127 template int
3128 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3129                                      kmp_flag_32<false, false> *, int,
3130                                      int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3131 
3132 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3133                                                  kmp_flag_64<false, true> *,
3134                                                  int,
3135                                                  int *USE_ITT_BUILD_ARG(void *),
3136                                                  kmp_int32);
3137 
3138 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3139                                                  kmp_flag_64<true, false> *,
3140                                                  int,
3141                                                  int *USE_ITT_BUILD_ARG(void *),
3142                                                  kmp_int32);
3143 
3144 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3145 // next barrier so they can assist in executing enqueued tasks.
// The first thread in allocates the task team's threads_data array.
3147 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3148                                  kmp_info_t *this_thr) {
3149   kmp_thread_data_t *threads_data;
3150   int nthreads, i, is_init_thread;
3151 
3152   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3153                 __kmp_gtid_from_thread(this_thr)));
3154 
3155   KMP_DEBUG_ASSERT(task_team != NULL);
3156   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3157 
3158   nthreads = task_team->tt.tt_nproc;
3159   KMP_DEBUG_ASSERT(nthreads > 0);
3160   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3161 
3162   // Allocate or increase the size of threads_data if necessary
3163   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3164 
3165   if (!is_init_thread) {
3166     // Some other thread already set up the array.
3167     KA_TRACE(
3168         20,
3169         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3170          __kmp_gtid_from_thread(this_thr)));
3171     return;
3172   }
3173   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3174   KMP_DEBUG_ASSERT(threads_data != NULL);
3175 
3176   if (__kmp_tasking_mode == tskm_task_teams &&
3177       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3178     // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them.  In extra barrier mode, threads do not sleep
3180     // at the separate tasking barrier, so this isn't a problem.
3181     for (i = 0; i < nthreads; i++) {
3182       volatile void *sleep_loc;
3183       kmp_info_t *thread = threads_data[i].td.td_thr;
3184 
3185       if (i == this_thr->th.th_info.ds.ds_tid) {
3186         continue;
3187       }
3188       // Since we haven't locked the thread's suspend mutex lock at this
3189       // point, there is a small window where a thread might be putting
3190       // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically
      // checks to see if other threads are sleeping (using the same random
      // mechanism that is used for task stealing) and awakens them if they
      // are.
3194       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3195           NULL) {
3196         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3197                       __kmp_gtid_from_thread(this_thr),
3198                       __kmp_gtid_from_thread(thread)));
3199         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3200       } else {
3201         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3202                       __kmp_gtid_from_thread(this_thr),
3203                       __kmp_gtid_from_thread(thread)));
3204       }
3205     }
3206   }
3207 
3208   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3209                 __kmp_gtid_from_thread(this_thr)));
3210 }
3211 
/* Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
 * of like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to each
 * of the other threads in the team, so that it can steal work from them.
 *
 * Enter the kmp_task_team_t struct.  It employs a reference counting
 * mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). Such a struct could also be useful
 * outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
3241 
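// In practice, the overlapping lifetimes are handled with two task teams per
// kmp_team_t: team->t.t_task_team[0] and [1] are selected by the thread's
// th_task_state parity, which __kmp_task_team_sync() toggles after each
// barrier release phase (see __kmp_task_team_setup() and
// __kmp_task_team_sync() below).
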
3242 static kmp_task_team_t *__kmp_free_task_teams =
3243     NULL; // Free list for task_team data structures
3244 // Lock for task team data structures
3245 kmp_bootstrap_lock_t __kmp_task_team_lock =
3246     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3247 
3248 // __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the necessary
3250 // data structures relating to the deque.  This only happens once per thread
3251 // per task team since task teams are recycled. No lock is needed during
3252 // allocation since each thread allocates its own deque.
3253 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3254                                    kmp_thread_data_t *thread_data) {
3255   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3256   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3257 
3258   // Initialize last stolen task field to "none"
3259   thread_data->td.td_deque_last_stolen = -1;
3260 
3261   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3262   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3263   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3264 
3265   KE_TRACE(
3266       10,
3267       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3268        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3269   // Allocate space for task deque, and zero the deque
3270   // Cannot use __kmp_thread_calloc() because threads not around for
3271   // kmp_reap_task_team( ).
3272   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3273       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3274   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
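  // The deque is used as a ring buffer: head and tail indices wrap with
  // TASK_DEQUE_MASK(), which relies on td_deque_size staying a power of two
  // (see __kmp_remove_my_task() and __kmp_steal_task()).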
3275 }
3276 
3277 // __kmp_free_task_deque:
3278 // Deallocates a task deque for a particular thread. Happens at library
// deallocation, so there is no need to reset all thread data fields.
3280 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3281   if (thread_data->td.td_deque != NULL) {
3282     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3283     TCW_4(thread_data->td.td_deque_ntasks, 0);
3284     __kmp_free(thread_data->td.td_deque);
3285     thread_data->td.td_deque = NULL;
3286     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3287   }
3288 
3289 #ifdef BUILD_TIED_TASK_STACK
3290   // GEH: Figure out what to do here for td_susp_tied_tasks
3291   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3292     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3293   }
3294 #endif // BUILD_TIED_TASK_STACK
3295 }
3296 
3297 // __kmp_realloc_task_threads_data:
3298 // Allocates a threads_data array for a task team, either by allocating an
3299 // initial array or enlarging an existing array.  Only the first thread to get
// the lock allocates or enlarges the array and re-initializes its elements.
3301 // That thread returns "TRUE", the rest return "FALSE".
3302 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3303 // The current size is given by task_team -> tt.tt_max_threads.
3304 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3305                                            kmp_task_team_t *task_team) {
3306   kmp_thread_data_t **threads_data_p;
3307   kmp_int32 nthreads, maxthreads;
3308   int is_init_thread = FALSE;
3309 
3310   if (TCR_4(task_team->tt.tt_found_tasks)) {
3311     // Already reallocated and initialized.
3312     return FALSE;
3313   }
3314 
3315   threads_data_p = &task_team->tt.tt_threads_data;
3316   nthreads = task_team->tt.tt_nproc;
3317   maxthreads = task_team->tt.tt_max_threads;
3318 
3319   // All threads must lock when they encounter the first task of the implicit
3320   // task region to make sure threads_data fields are (re)initialized before
3321   // used.
3322   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3323 
3324   if (!TCR_4(task_team->tt.tt_found_tasks)) {
3325     // first thread to enable tasking
3326     kmp_team_t *team = thread->th.th_team;
3327     int i;
3328 
3329     is_init_thread = TRUE;
3330     if (maxthreads < nthreads) {
3331 
3332       if (*threads_data_p != NULL) {
3333         kmp_thread_data_t *old_data = *threads_data_p;
3334         kmp_thread_data_t *new_data = NULL;
3335 
3336         KE_TRACE(
3337             10,
3338             ("__kmp_realloc_task_threads_data: T#%d reallocating "
3339              "threads data for task_team %p, new_size = %d, old_size = %d\n",
3340              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3341         // Reallocate threads_data to have more elements than current array
3342         // Cannot use __kmp_thread_realloc() because threads not around for
3343         // kmp_reap_task_team( ).  Note all new array entries are initialized
3344         // to zero by __kmp_allocate().
3345         new_data = (kmp_thread_data_t *)__kmp_allocate(
3346             nthreads * sizeof(kmp_thread_data_t));
3347         // copy old data to new data
3348         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3349                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3350 
3351 #ifdef BUILD_TIED_TASK_STACK
3352         // GEH: Figure out if this is the right thing to do
3353         for (i = maxthreads; i < nthreads; i++) {
3354           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3355           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3356         }
3357 #endif // BUILD_TIED_TASK_STACK
        // Install the new data and free the old data
3359         (*threads_data_p) = new_data;
3360         __kmp_free(old_data);
3361       } else {
3362         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3363                       "threads data for task_team %p, size = %d\n",
3364                       __kmp_gtid_from_thread(thread), task_team, nthreads));
3365         // Make the initial allocate for threads_data array, and zero entries
3366         // Cannot use __kmp_thread_calloc() because threads not around for
3367         // kmp_reap_task_team( ).
3368         ANNOTATE_IGNORE_WRITES_BEGIN();
3369         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3370             nthreads * sizeof(kmp_thread_data_t));
3371         ANNOTATE_IGNORE_WRITES_END();
3372 #ifdef BUILD_TIED_TASK_STACK
3373         // GEH: Figure out if this is the right thing to do
3374         for (i = 0; i < nthreads; i++) {
3375           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3376           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3377         }
3378 #endif // BUILD_TIED_TASK_STACK
3379       }
3380       task_team->tt.tt_max_threads = nthreads;
3381     } else {
3382       // If array has (more than) enough elements, go ahead and use it
3383       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3384     }
3385 
3386     // initialize threads_data pointers back to thread_info structures
3387     for (i = 0; i < nthreads; i++) {
3388       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3389       thread_data->td.td_thr = team->t.t_threads[i];
3390 
3391       if (thread_data->td.td_deque_last_stolen >= nthreads) {
3392         // The last stolen field survives across teams / barrier, and the number
3393         // of threads may have changed.  It's possible (likely?) that a new
        // parallel region will exhibit the same behavior as the previous one.
3395         thread_data->td.td_deque_last_stolen = -1;
3396       }
3397     }
3398 
3399     KMP_MB();
3400     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3401   }
3402 
3403   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3404   return is_init_thread;
3405 }
3406 
3407 // __kmp_free_task_threads_data:
3408 // Deallocates a threads_data array for a task team, including any attached
3409 // tasking deques.  Only occurs at library shutdown.
3410 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3411   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3412   if (task_team->tt.tt_threads_data != NULL) {
3413     int i;
3414     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3415       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3416     }
3417     __kmp_free(task_team->tt.tt_threads_data);
3418     task_team->tt.tt_threads_data = NULL;
3419   }
3420   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3421 }
3422 
3423 // __kmp_allocate_task_team:
3424 // Allocates a task team associated with a specific team, taking it from
3425 // the global task team free list if possible.  Also initializes data
3426 // structures.
3427 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3428                                                  kmp_team_t *team) {
3429   kmp_task_team_t *task_team = NULL;
3430   int nthreads;
3431 
3432   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3433                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3434 
3435   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3436     // Take a task team from the task team pool
3437     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3438     if (__kmp_free_task_teams != NULL) {
3439       task_team = __kmp_free_task_teams;
3440       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3441       task_team->tt.tt_next = NULL;
3442     }
3443     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3444   }
3445 
3446   if (task_team == NULL) {
3447     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3448                   "task team for team %p\n",
3449                   __kmp_gtid_from_thread(thread), team));
3450     // Allocate a new task team if one is not available. Cannot use
3451     // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3452     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3453     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3454 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3455     // suppress race conditions detection on synchronization flags in debug mode
3456     // this helps to analyze library internals eliminating false positives
3457     __itt_suppress_mark_range(
3458         __itt_suppress_range, __itt_suppress_threading_errors,
3459         &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3460     __itt_suppress_mark_range(__itt_suppress_range,
3461                               __itt_suppress_threading_errors,
3462                               CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3463                               sizeof(task_team->tt.tt_active));
3464 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
    // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3466     // task_team->tt.tt_threads_data = NULL;
3467     // task_team->tt.tt_max_threads = 0;
3468     // task_team->tt.tt_next = NULL;
3469   }
3470 
3471   TCW_4(task_team->tt.tt_found_tasks, FALSE);
3472   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3473   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3474 
3475   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3476   TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3477   TCW_4(task_team->tt.tt_active, TRUE);
3478 
3479   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3480                 "unfinished_threads init'd to %d\n",
3481                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3482                 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3483   return task_team;
3484 }
3485 
3486 // __kmp_free_task_team:
3487 // Frees the task team associated with a specific thread, and adds it
3488 // to the global task team free list.
3489 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3490   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3491                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3492 
3493   // Put task team back on free list
3494   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3495 
3496   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3497   task_team->tt.tt_next = __kmp_free_task_teams;
3498   TCW_PTR(__kmp_free_task_teams, task_team);
3499 
3500   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3501 }
3502 
3503 // __kmp_reap_task_teams:
3504 // Free all the task teams on the task team free list.
3505 // Should only be done during library shutdown.
3506 // Cannot do anything that needs a thread structure or gtid since they are
3507 // already gone.
3508 void __kmp_reap_task_teams(void) {
3509   kmp_task_team_t *task_team;
3510 
3511   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3512     // Free all task_teams on the free list
3513     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3514     while ((task_team = __kmp_free_task_teams) != NULL) {
3515       __kmp_free_task_teams = task_team->tt.tt_next;
3516       task_team->tt.tt_next = NULL;
3517 
3518       // Free threads_data if necessary
3519       if (task_team->tt.tt_threads_data != NULL) {
3520         __kmp_free_task_threads_data(task_team);
3521       }
3522       __kmp_free(task_team);
3523     }
3524     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3525   }
3526 }
3527 
3528 // __kmp_wait_to_unref_task_teams:
3529 // Some threads could still be in the fork barrier release code, possibly
3530 // trying to steal tasks.  Wait for each thread to unreference its task team.
3531 void __kmp_wait_to_unref_task_teams(void) {
3532   kmp_info_t *thread;
3533   kmp_uint32 spins;
3534   int done;
3535 
3536   KMP_INIT_YIELD(spins);
3537 
3538   for (;;) {
3539     done = TRUE;
3540 
    // TODO: GEH - this may be wrong because some sync would be necessary
3542     // in case threads are added to the pool during the traversal. Need to
3543     // verify that lock for thread pool is held when calling this routine.
3544     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3545          thread = thread->th.th_next_pool) {
3546 #if KMP_OS_WINDOWS
3547       DWORD exit_val;
3548 #endif
3549       if (TCR_PTR(thread->th.th_task_team) == NULL) {
3550         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3551                       __kmp_gtid_from_thread(thread)));
3552         continue;
3553       }
3554 #if KMP_OS_WINDOWS
3555       // TODO: GEH - add this check for Linux* OS / OS X* as well?
3556       if (!__kmp_is_thread_alive(thread, &exit_val)) {
3557         thread->th.th_task_team = NULL;
3558         continue;
3559       }
3560 #endif
3561 
3562       done = FALSE; // Because th_task_team pointer is not NULL for this thread
3563 
3564       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3565                     "unreference task_team\n",
3566                     __kmp_gtid_from_thread(thread)));
3567 
3568       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3569         volatile void *sleep_loc;
3570         // If the thread is sleeping, awaken it.
3571         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3572             NULL) {
3573           KA_TRACE(
3574               10,
3575               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3576                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3577           __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3578         }
3579       }
3580     }
3581     if (done) {
3582       break;
3583     }
3584 
3585     // If oversubscribed or have waited a bit, yield.
3586     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3587   }
3588 }
3589 
3590 // __kmp_task_team_setup:  Create a task_team for the current team, but use
3591 // an already created, unused one if it already exists.
3592 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3593   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3594 
3595   // If this task_team hasn't been created yet, allocate it. It will be used in
3596   // the region after the next.
3597   // If it exists, it is the current task team and shouldn't be touched yet as
3598   // it may still be in use.
3599   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3600       (always || team->t.t_nproc > 1)) {
3601     team->t.t_task_team[this_thr->th.th_task_state] =
3602         __kmp_allocate_task_team(this_thr, team);
3603     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3604                   " for team %d at parity=%d\n",
3605                   __kmp_gtid_from_thread(this_thr),
3606                   team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3607                   this_thr->th.th_task_state));
3608   }
3609 
3610   // After threads exit the release, they will call sync, and then point to this
3611   // other task_team; make sure it is allocated and properly initialized. As
3612   // threads spin in the barrier release phase, they will continue to use the
3613   // previous task_team struct(above), until they receive the signal to stop
3614   // checking for tasks (they can't safely reference the kmp_team_t struct,
3615   // which could be reallocated by the primary thread). No task teams are formed
3616   // for serialized teams.
3617   if (team->t.t_nproc > 1) {
3618     int other_team = 1 - this_thr->th.th_task_state;
3619     KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3620     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3621       team->t.t_task_team[other_team] =
3622           __kmp_allocate_task_team(this_thr, team);
3623       KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3624                     "task_team %p for team %d at parity=%d\n",
3625                     __kmp_gtid_from_thread(this_thr),
3626                     team->t.t_task_team[other_team], team->t.t_id, other_team));
3627     } else { // Leave the old task team struct in place for the upcoming region;
3628       // adjust as needed
3629       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3630       if (!task_team->tt.tt_active ||
3631           team->t.t_nproc != task_team->tt.tt_nproc) {
3632         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3633         TCW_4(task_team->tt.tt_found_tasks, FALSE);
3634         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3635         KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3636                           team->t.t_nproc);
3637         TCW_4(task_team->tt.tt_active, TRUE);
3638       }
3639       // if team size has changed, the first thread to enable tasking will
3640       // realloc threads_data if necessary
3641       KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3642                     "%p for team %d at parity=%d\n",
3643                     __kmp_gtid_from_thread(this_thr),
3644                     team->t.t_task_team[other_team], team->t.t_id, other_team));
3645     }
3646   }
3647 
  // For regular threads, task enabling should be called when the task is going
  // to be pushed to a deque. However, for the hidden helper thread, we need
  // it ahead of time so that some operations can be performed without race
  // conditions.
3652   if (this_thr == __kmp_hidden_helper_main_thread) {
3653     for (int i = 0; i < 2; ++i) {
3654       kmp_task_team_t *task_team = team->t.t_task_team[i];
3655       if (KMP_TASKING_ENABLED(task_team)) {
3656         continue;
3657       }
3658       __kmp_enable_tasking(task_team, this_thr);
3659       for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3660         kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3661         if (thread_data->td.td_deque == NULL) {
3662           __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3663         }
3664       }
3665     }
3666   }
3667 }
3668 
3669 // __kmp_task_team_sync: Propagation of task team data from team to threads
3670 // which happens just after the release phase of a team barrier.  This may be
3671 // called by any thread, but only for teams with # threads > 1.
3672 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3673   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3674 
3675   // Toggle the th_task_state field, to switch which task_team this thread
3676   // refers to
3677   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3678 
3679   // It is now safe to propagate the task team pointer from the team struct to
3680   // the current thread.
3681   TCW_PTR(this_thr->th.th_task_team,
3682           team->t.t_task_team[this_thr->th.th_task_state]);
3683   KA_TRACE(20,
3684            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3685             "%p from Team #%d (parity=%d)\n",
3686             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3687             team->t.t_id, this_thr->th.th_task_state));
3688 }
3689 
3690 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
3691 // barrier gather phase. Only called by primary thread if #threads in team > 1
3692 // or if proxy tasks were created.
3693 //
3694 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3695 // by passing in 0 optionally as the last argument. When wait is zero, primary
3696 // thread does not wait for unfinished_threads to reach 0.
3697 void __kmp_task_team_wait(
3698     kmp_info_t *this_thr,
3699     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3700   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3701 
3702   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3703   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3704 
3705   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3706     if (wait) {
3707       KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
3708                     "(for unfinished_threads to reach 0) on task_team = %p\n",
3709                     __kmp_gtid_from_thread(this_thr), task_team));
3710       // Worker threads may have dropped through to release phase, but could
3711       // still be executing tasks. Wait here for tasks to complete. To avoid
3712       // memory contention, only primary thread checks termination condition.
3713       kmp_flag_32<false, false> flag(
3714           RCAST(std::atomic<kmp_uint32> *,
3715                 &task_team->tt.tt_unfinished_threads),
3716           0U);
3717       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3718     }
3719     // Deactivate the old task team, so that the worker threads will stop
3720     // referencing it while spinning.
3721     KA_TRACE(
3722         20,
3723         ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
3724          "setting active to false, setting local and team's pointer to NULL\n",
3725          __kmp_gtid_from_thread(this_thr), task_team));
3726     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3727                      task_team->tt.tt_found_proxy_tasks == TRUE);
3728     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3729     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3730     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3731     KMP_MB();
3732 
3733     TCW_PTR(this_thr->th.th_task_team, NULL);
3734   }
3735 }
3736 
3737 // __kmp_tasking_barrier:
3738 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
3739 // Internal function to execute all tasks prior to a regular barrier or a join
3740 // barrier. It is a full barrier itself, which unfortunately turns regular
3741 // barriers into double barriers and join barriers into 1 1/2 barriers.
3742 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3743   std::atomic<kmp_uint32> *spin = RCAST(
3744       std::atomic<kmp_uint32> *,
3745       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3746   int flag = FALSE;
3747   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3748 
3749 #if USE_ITT_BUILD
3750   KMP_FSYNC_SPIN_INIT(spin, NULL);
3751 #endif /* USE_ITT_BUILD */
3752   kmp_flag_32<false, false> spin_flag(spin, 0U);
3753   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3754                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3755 #if USE_ITT_BUILD
3756     // TODO: What about itt_sync_obj??
3757     KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3758 #endif /* USE_ITT_BUILD */
3759 
3760     if (TCR_4(__kmp_global.g.g_done)) {
3761       if (__kmp_global.g.g_abort)
3762         __kmp_abort_thread();
3763       break;
3764     }
3765     KMP_YIELD(TRUE);
3766   }
3767 #if USE_ITT_BUILD
3768   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3769 #endif /* USE_ITT_BUILD */
3770 }
3771 
// __kmp_give_task puts a task into a given thread's queue if:
3773 //  - the queue for that thread was created
3774 //  - there's space in that queue
3775 // Because of this, __kmp_push_task needs to check if there's space after
3776 // getting the lock
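//
// The 'pass' argument bounds how far this thread's deque may grow before we
// give up and try another thread: once TASK_DEQUE_SIZE(td) has reached
// pass * INITIAL_TASK_DEQUE_SIZE, the task is not enqueued here (see the
// checks against the pass ratio below).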
3777 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3778                             kmp_int32 pass) {
3779   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3780   kmp_task_team_t *task_team = taskdata->td_task_team;
3781 
3782   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3783                 taskdata, tid));
3784 
3785   // If task_team is NULL something went really bad...
3786   KMP_DEBUG_ASSERT(task_team != NULL);
3787 
3788   bool result = false;
3789   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3790 
3791   if (thread_data->td.td_deque == NULL) {
3792     // There's no queue in this thread, go find another one
3793     // We're guaranteed that at least one thread has a queue
3794     KA_TRACE(30,
3795              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3796               tid, taskdata));
3797     return result;
3798   }
3799 
3800   if (TCR_4(thread_data->td.td_deque_ntasks) >=
3801       TASK_DEQUE_SIZE(thread_data->td)) {
3802     KA_TRACE(
3803         30,
3804         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3805          taskdata, tid));
3806 
3807     // if this deque has already grown beyond the pass ratio, give another
3808     // thread a chance
3809     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3810       return result;
3811 
3812     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3813     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3814         TASK_DEQUE_SIZE(thread_data->td)) {
3815       // expand the deque to push the task, which is not allowed to execute
3816       __kmp_realloc_task_deque(thread, thread_data);
3817     }
3818 
3819   } else {
3820 
3821     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3822 
3823     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3824         TASK_DEQUE_SIZE(thread_data->td)) {
3825       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3826                     "thread %d.\n",
3827                     taskdata, tid));
3828 
3829       // if this deque has already grown beyond the pass ratio, give another
3830       // thread a chance
3831       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3832         goto release_and_exit;
3833 
3834       __kmp_realloc_task_deque(thread, thread_data);
3835     }
3836   }
3837 
3838   // lock is held here, and there is space in the deque
3839 
3840   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3841   // Wrap index.
3842   thread_data->td.td_deque_tail =
3843       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3844   TCW_4(thread_data->td.td_deque_ntasks,
3845         TCR_4(thread_data->td.td_deque_ntasks) + 1);
3846 
3847   result = true;
3848   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3849                 taskdata, tid));
3850 
3851 release_and_exit:
3852   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3853 
3854   return result;
3855 }
3856 
3857 /* The finish of a proxy task is divided in two pieces:
3858     - the top half, which can be done from a thread outside the team
3859     - the bottom half, which must be run from a thread within the team
3860 
3861    In order to run the bottom half the task gets queued back into one of the
3862    threads of the team. Once the td_incomplete_child_tasks counter of the
3863    parent is decremented, the threads can leave the barriers. So, the bottom
3864    half needs to be queued before the counter is decremented. The top half is
3865    therefore divided in two parts:
3866     - things that can be run before queuing the bottom half
3867     - things that must be run after queuing the bottom half
3868 
3869    This creates a second race as the bottom half can free the task before the
3870    second top half is executed. To avoid this we use the
3871    td_incomplete_child_tasks counter of the proxy task itself to synchronize
3872    the top and bottom halves. */
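
// Illustrative ordering of the three pieces (a sketch; this mirrors the
// sequence used by __kmpc_proxy_task_completed_ooo below):
//   __kmp_first_top_half_finish_proxy(taskdata);  // mark complete, add an
//                                                 // imaginary child
//   __kmp_give_task(...);                         // queue the bottom half to
//                                                 // a team thread
//   __kmp_second_top_half_finish_proxy(taskdata); // decrement the parent's
//                                                 // counter, drop the child
//   // bottom half (runs on a team thread): spins until the imaginary child
//   // is gone, then releases dependences and frees the task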
3873 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3874   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3875   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3876   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3877   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3878 
3879   taskdata->td_flags.complete = 1; // mark the task as completed
3880 
3881   if (taskdata->td_taskgroup)
3882     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3883 
3884   // Create an imaginary child for this task so the bottom half cannot
3885   // release the task before we have completed the second top half
3886   KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3887 }
3888 
3889 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3890   kmp_int32 children = 0;
3891 
3892   // Predecrement simulated by "- 1" calculation
3893   children =
3894       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3895   KMP_DEBUG_ASSERT(children >= 0);
3896 
3897   // Remove the imaginary children
3898   KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3899 }
3900 
3901 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3902   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3903   kmp_info_t *thread = __kmp_threads[gtid];
3904 
3905   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3906   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3907                    1); // top half must run before bottom half
3908 
3909   // We need to wait to make sure the top half is finished
3910   // Spinning here should be ok as this should happen quickly
3911   while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3912     ;
3913 
3914   __kmp_release_deps(gtid, taskdata);
3915   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3916 }
3917 
3918 /*!
3919 @ingroup TASKING
3920 @param gtid Global Thread ID of encountering thread
3921 @param ptask Task whose execution is completed
3922 
3923 Execute the completion of a proxy task from a thread that is part of the
3924 team. Runs the top and bottom halves directly.
3925 */
3926 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3927   KMP_DEBUG_ASSERT(ptask != NULL);
3928   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3929   KA_TRACE(
3930       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3931            gtid, taskdata));
3932   __kmp_assert_valid_gtid(gtid);
3933   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3934 
3935   __kmp_first_top_half_finish_proxy(taskdata);
3936   __kmp_second_top_half_finish_proxy(taskdata);
3937   __kmp_bottom_half_finish_proxy(gtid, ptask);
3938 
3939   KA_TRACE(10,
3940            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3941             gtid, taskdata));
3942 }
3943 
3944 /*!
3945 @ingroup TASKING
3946 @param ptask Task whose execution is completed
3947 
3948 Execute the completion of a proxy task from a thread that may not belong to
3949 the team.
3950 */
3951 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3952   KMP_DEBUG_ASSERT(ptask != NULL);
3953   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3954 
3955   KA_TRACE(
3956       10,
3957       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3958        taskdata));
3959 
3960   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3961 
3962   __kmp_first_top_half_finish_proxy(taskdata);
3963 
3964   // Enqueue task to complete bottom half completion from a thread within the
3965   // corresponding team
3966   kmp_team_t *team = taskdata->td_team;
3967   kmp_int32 nthreads = team->t.t_nproc;
3968   kmp_info_t *thread;
3969 
3970   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3971   // but we cannot use __kmp_get_random here
3972   kmp_int32 start_k = 0;
3973   kmp_int32 pass = 1;
3974   kmp_int32 k = start_k;
3975 
3976   do {
3977     // For now we're just linearly trying to find a thread
3978     thread = team->t.t_threads[k];
3979     k = (k + 1) % nthreads;
3980 
3981     // we did a full pass through all the threads
3982     if (k == start_k)
3983       pass = pass << 1;
3984 
3985   } while (!__kmp_give_task(thread, k, ptask, pass));
3986 
3987   __kmp_second_top_half_finish_proxy(taskdata);
3988 
3989   KA_TRACE(
3990       10,
3991       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3992        taskdata));
3993 }
3994 
3995 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3996                                                 kmp_task_t *task) {
3997   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3998   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3999     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4000     td->td_allow_completion_event.ed.task = task;
4001     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4002   }
4003   return &td->td_allow_completion_event;
4004 }
4005 
4006 void __kmp_fulfill_event(kmp_event_t *event) {
4007   if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4008     kmp_task_t *ptask = event->ed.task;
4009     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4010     bool detached = false;
4011     int gtid = __kmp_get_gtid();
4012 
4013     // The associated task might have completed or could be completing at this
4014     // point.
4015     // We need to take the lock to avoid races
4016     __kmp_acquire_tas_lock(&event->lock, gtid);
4017     if (taskdata->td_flags.proxy == TASK_PROXY) {
4018       detached = true;
4019     } else {
4020 #if OMPT_SUPPORT
4021       // The OMPT event must occur under mutual exclusion,
4022       // otherwise the tool might access ptask after free
4023       if (UNLIKELY(ompt_enabled.enabled))
4024         __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4025 #endif
4026     }
4027     event->type = KMP_EVENT_UNINITIALIZED;
4028     __kmp_release_tas_lock(&event->lock, gtid);
4029 
4030     if (detached) {
4031 #if OMPT_SUPPORT
4032       // We free ptask afterwards and know the task is finished,
4033       // so locking is not necessary
4034       if (UNLIKELY(ompt_enabled.enabled))
4035         __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4036 #endif
4037       // If the task detached, complete the proxy task
4038       if (gtid >= 0) {
4039         kmp_team_t *team = taskdata->td_team;
4040         kmp_info_t *thread = __kmp_get_thread();
4041         if (thread->th.th_team == team) {
4042           __kmpc_proxy_task_completed(gtid, ptask);
4043           return;
4044         }
4045       }
4046 
4047       // fallback
4048       __kmpc_proxy_task_completed_ooo(ptask);
4049     }
4050   }
4051 }
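
// Hedged usage sketch (assumed lowering of the OpenMP `detach` clause; the
// exact compiler-generated code is not shown): the event returned by
// __kmpc_task_allow_completion_event is handed to the program as an
// omp_event_handle_t, and omp_fulfill_event() eventually reaches
// __kmp_fulfill_event above.
//
//   omp_event_handle_t evt;
//   #pragma omp task detach(evt)
//   { start_async_work(evt); } // start_async_work is a hypothetical user
//                              // routine that later calls
//                              // omp_fulfill_event(evt)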
4052 
4053 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4054 // for taskloop
4055 //
4056 // thread:   allocating thread
4057 // task_src: pointer to source task to be duplicated
4058 // returns:  a pointer to the allocated kmp_task_t structure (task).
4059 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4060   kmp_task_t *task;
4061   kmp_taskdata_t *taskdata;
4062   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4063   kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4064   size_t shareds_offset;
4065   size_t task_size;
4066 
4067   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4068                 task_src));
4069   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4070                    TASK_FULL); // it should not be a proxy task
4071   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4072   task_size = taskdata_src->td_size_alloc;
4073 
4074   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4075   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4076                 task_size));
4077 #if USE_FAST_MEMORY
4078   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4079 #else
4080   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4081 #endif /* USE_FAST_MEMORY */
4082   KMP_MEMCPY(taskdata, taskdata_src, task_size);
4083 
4084   task = KMP_TASKDATA_TO_TASK(taskdata);
4085 
4086   // Initialize the new task (only the fields not already set by the memcpy)
4087   taskdata->td_task_id = KMP_GEN_TASK_ID();
4088   if (task->shareds != NULL) { // need to set up the shareds pointer
4089     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4090     task->shareds = &((char *)taskdata)[shareds_offset];
4091     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4092                      0);
4093   }
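  // (illustrative arithmetic) the shareds block keeps the same byte offset in
  // the copy as in the source: if shareds sat 128 bytes into the source
  // allocation, task->shareds now points 128 bytes into the new allocation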
4094   taskdata->td_alloc_thread = thread;
4095   taskdata->td_parent = parent_task;
4096   // task inherits the taskgroup from the parent task
4097   taskdata->td_taskgroup = parent_task->td_taskgroup;
4098   // a tied task initializes td_last_tied at creation time; an untied task
4099   // does this when it is scheduled for execution
4100   if (taskdata->td_flags.tiedness == TASK_TIED)
4101     taskdata->td_last_tied = taskdata;
4102 
4103   // Only need to keep track of child task counts if the team is parallel and
4104   // tasking is not serialized
4105   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4106     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4107     if (parent_task->td_taskgroup)
4108       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4109     // Only need to keep track of allocated child tasks for explicit tasks
4110     // since implicit tasks are not deallocated
4111     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4112       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4113   }
4114 
4115   KA_TRACE(20,
4116            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4117             thread, taskdata, taskdata->td_parent));
4118 #if OMPT_SUPPORT
4119   if (UNLIKELY(ompt_enabled.enabled))
4120     __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4121 #endif
4122   return task;
4123 }
4124 
4125 // Routine optionally generated by the compiler for setting the lastprivate flag
4126 // and calling needed constructors for private/firstprivate objects
4127 // (used to form taskloop tasks from pattern task)
4128 // Parameters: dest task, src task, lastprivate flag.
4129 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
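
// Hedged sketch of a routine matching p_task_dup_t (what a compiler might
// emit; the name example_task_dup and the body are purely illustrative, not
// actual generated code):
//   static void example_task_dup(kmp_task_t *dst, kmp_task_t *src,
//                                kmp_int32 lastpriv) {
//     // copy-construct firstprivate objects from src's private storage into
//     // dst's, and record lastpriv so the chunk that runs last writes the
//     // lastprivate values back
//   }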
4130 
4131 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4132 
4133 // class to encapsulate manipulating loop bounds in a taskloop task.
4134 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4135 // the loop bound variables.
4136 class kmp_taskloop_bounds_t {
4137   kmp_task_t *task;
4138   const kmp_taskdata_t *taskdata;
4139   size_t lower_offset;
4140   size_t upper_offset;
4141 
4142 public:
4143   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4144       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4145         lower_offset((char *)lb - (char *)task),
4146         upper_offset((char *)ub - (char *)task) {
4147     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4148     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4149   }
4150   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4151       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4152         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4153   size_t get_lower_offset() const { return lower_offset; }
4154   size_t get_upper_offset() const { return upper_offset; }
4155   kmp_uint64 get_lb() const {
4156     kmp_int64 retval;
4157 #if defined(KMP_GOMP_COMPAT)
4158     // Intel task just returns the lower bound normally
4159     if (!taskdata->td_flags.native) {
4160       retval = *(kmp_int64 *)((char *)task + lower_offset);
4161     } else {
4162       // GOMP task has to take into account the sizeof(long)
4163       if (taskdata->td_size_loop_bounds == 4) {
4164         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4165         retval = (kmp_int64)*lb;
4166       } else {
4167         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4168         retval = (kmp_int64)*lb;
4169       }
4170     }
4171 #else
4172     (void)taskdata;
4173     retval = *(kmp_int64 *)((char *)task + lower_offset);
4174 #endif // defined(KMP_GOMP_COMPAT)
4175     return retval;
4176   }
4177   kmp_uint64 get_ub() const {
4178     kmp_int64 retval;
4179 #if defined(KMP_GOMP_COMPAT)
4180     // Intel task just returns the upper bound normally
4181     if (!taskdata->td_flags.native) {
4182       retval = *(kmp_int64 *)((char *)task + upper_offset);
4183     } else {
4184       // GOMP task has to take into account the sizeof(long)
4185       if (taskdata->td_size_loop_bounds == 4) {
4186         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4187         retval = (kmp_int64)*ub;
4188       } else {
4189         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4190         retval = (kmp_int64)*ub;
4191       }
4192     }
4193 #else
4194     retval = *(kmp_int64 *)((char *)task + upper_offset);
4195 #endif // defined(KMP_GOMP_COMPAT)
4196     return retval;
4197   }
4198   void set_lb(kmp_uint64 lb) {
4199 #if defined(KMP_GOMP_COMPAT)
4200     // Intel task just sets the lower bound normally
4201     if (!taskdata->td_flags.native) {
4202       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4203     } else {
4204       // GOMP task has to take into account the sizeof(long)
4205       if (taskdata->td_size_loop_bounds == 4) {
4206         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4207         *lower = (kmp_uint32)lb;
4208       } else {
4209         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4210         *lower = (kmp_uint64)lb;
4211       }
4212     }
4213 #else
4214     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4215 #endif // defined(KMP_GOMP_COMPAT)
4216   }
4217   void set_ub(kmp_uint64 ub) {
4218 #if defined(KMP_GOMP_COMPAT)
4219     // Intel task just sets the upper bound normally
4220     if (!taskdata->td_flags.native) {
4221       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4222     } else {
4223       // GOMP task has to take into account the sizeof(long)
4224       if (taskdata->td_size_loop_bounds == 4) {
4225         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4226         *upper = (kmp_uint32)ub;
4227       } else {
4228         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4229         *upper = (kmp_uint64)ub;
4230       }
4231     }
4232 #else
4233     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4234 #endif // defined(KMP_GOMP_COMPAT)
4235   }
4236 };
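
// Illustrative use of kmp_taskloop_bounds_t (a sketch mirroring
// __kmp_taskloop_linear below):
//   kmp_taskloop_bounds_t bounds(task, lb, ub); // record lb/ub offsets
//   kmp_uint64 lower = bounds.get_lb();         // read pattern-task bound
//   bounds.set_ub(upper);                       // write per-chunk bound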
4237 
4238 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4239 //
4240 // loc        Source location information
4241 // gtid       Global thread ID
4242 // task       Pattern task, exposes the loop iteration range
4243 // lb         Pointer to loop lower bound in task structure
4244 // ub         Pointer to loop upper bound in task structure
4245 // st         Loop stride
4246 // ub_glob    Global upper bound (used for lastprivate check)
4247 // num_tasks  Number of tasks to execute
4248 // grainsize  Number of loop iterations per task
4249 // extras     Number of chunks with grainsize+1 iterations
4250 // last_chunk Reduction of grainsize for last task
4251 // tc         Iterations count
4252 // task_dup   Tasks duplication routine
4253 // codeptr_ra Return address for OMPT events
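//
// Worked example of the chunking (illustrative numbers): with tc = 10,
// num_tasks = 3, grainsize = 3, extras = 1 and last_chunk = 0, the first
// `extras` tasks get grainsize+1 iterations, i.e. chunks of 4, 3 and 3,
// and tc == num_tasks * grainsize + extras == 3 * 3 + 1 holds as asserted
// below.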
4254 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4255                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4256                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4257                            kmp_uint64 grainsize, kmp_uint64 extras,
4258                            kmp_int64 last_chunk, kmp_uint64 tc,
4259 #if OMPT_SUPPORT
4260                            void *codeptr_ra,
4261 #endif
4262                            void *task_dup) {
4263   KMP_COUNT_BLOCK(OMP_TASKLOOP);
4264   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4265   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4266   // compiler provides global bounds here
4267   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4268   kmp_uint64 lower = task_bounds.get_lb();
4269   kmp_uint64 upper = task_bounds.get_ub();
4270   kmp_uint64 i;
4271   kmp_info_t *thread = __kmp_threads[gtid];
4272   kmp_taskdata_t *current_task = thread->th.th_current_task;
4273   kmp_task_t *next_task;
4274   kmp_int32 lastpriv = 0;
4275 
4276   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4277                              (last_chunk < 0 ? last_chunk : extras));
4278   KMP_DEBUG_ASSERT(num_tasks > extras);
4279   KMP_DEBUG_ASSERT(num_tasks > 0);
4280   KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4281                 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4282                 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4283                 ub_glob, st, task_dup));
4284 
4285   // Launch num_tasks tasks, assign grainsize iterations each task
4286   for (i = 0; i < num_tasks; ++i) {
4287     kmp_uint64 chunk_minus_1;
4288     if (extras == 0) {
4289       chunk_minus_1 = grainsize - 1;
4290     } else {
4291       chunk_minus_1 = grainsize;
4292       --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1)
4293     }
4294     upper = lower + st * chunk_minus_1;
4295     if (upper > *ub) {
4296       upper = *ub;
4297     }
4298     if (i == num_tasks - 1) {
4299       // schedule the last task, set lastprivate flag if needed
4300       if (st == 1) { // most common case
4301         KMP_DEBUG_ASSERT(upper == *ub);
4302         if (upper == ub_glob)
4303           lastpriv = 1;
4304       } else if (st > 0) { // positive loop stride
4305         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4306         if ((kmp_uint64)st > ub_glob - upper)
4307           lastpriv = 1;
4308       } else { // negative loop stride
4309         KMP_DEBUG_ASSERT(upper + st < *ub);
4310         if (upper - ub_glob < (kmp_uint64)(-st))
4311           lastpriv = 1;
4312       }
4313     }
4314     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4315     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4316     kmp_taskloop_bounds_t next_task_bounds =
4317         kmp_taskloop_bounds_t(next_task, task_bounds);
4318 
4319     // adjust task-specific bounds
4320     next_task_bounds.set_lb(lower);
4321     if (next_taskdata->td_flags.native) {
4322       next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4323     } else {
4324       next_task_bounds.set_ub(upper);
4325     }
4326     if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4327                            // etc.
4328       ptask_dup(next_task, task, lastpriv);
4329     KA_TRACE(40,
4330              ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4331               "upper %lld stride %lld, (offsets %p %p)\n",
4332               gtid, i, next_task, lower, upper, st,
4333               next_task_bounds.get_lower_offset(),
4334               next_task_bounds.get_upper_offset()));
4335 #if OMPT_SUPPORT
4336     __kmp_omp_taskloop_task(NULL, gtid, next_task,
4337                             codeptr_ra); // schedule new task
4338 #else
4339     __kmp_omp_task(gtid, next_task, true); // schedule new task
4340 #endif
4341     lower = upper + st; // adjust lower bound for the next iteration
4342   }
4343   // free the pattern task and exit
4344   __kmp_task_start(gtid, task, current_task); // internal bookkeeping only
4345   // do not execute the pattern task, just do internal bookkeeping
4346   __kmp_task_finish<false>(gtid, task, current_task);
4347 }
4348 
4349 // Structure to keep taskloop parameters for auxiliary task
4350 // kept in the shareds of the task structure.
4351 typedef struct __taskloop_params {
4352   kmp_task_t *task;
4353   kmp_uint64 *lb;
4354   kmp_uint64 *ub;
4355   void *task_dup;
4356   kmp_int64 st;
4357   kmp_uint64 ub_glob;
4358   kmp_uint64 num_tasks;
4359   kmp_uint64 grainsize;
4360   kmp_uint64 extras;
4361   kmp_int64 last_chunk;
4362   kmp_uint64 tc;
4363   kmp_uint64 num_t_min;
4364 #if OMPT_SUPPORT
4365   void *codeptr_ra;
4366 #endif
4367 } __taskloop_params_t;
4368 
4369 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4370                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4371                           kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4372                           kmp_uint64,
4373 #if OMPT_SUPPORT
4374                           void *,
4375 #endif
4376                           void *);
4377 
4378 // Execute part of the taskloop submitted as a task.
4379 int __kmp_taskloop_task(int gtid, void *ptask) {
4380   __taskloop_params_t *p =
4381       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4382   kmp_task_t *task = p->task;
4383   kmp_uint64 *lb = p->lb;
4384   kmp_uint64 *ub = p->ub;
4385   void *task_dup = p->task_dup;
4386   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4387   kmp_int64 st = p->st;
4388   kmp_uint64 ub_glob = p->ub_glob;
4389   kmp_uint64 num_tasks = p->num_tasks;
4390   kmp_uint64 grainsize = p->grainsize;
4391   kmp_uint64 extras = p->extras;
4392   kmp_int64 last_chunk = p->last_chunk;
4393   kmp_uint64 tc = p->tc;
4394   kmp_uint64 num_t_min = p->num_t_min;
4395 #if OMPT_SUPPORT
4396   void *codeptr_ra = p->codeptr_ra;
4397 #endif
4398 #if KMP_DEBUG
4399   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4400   KMP_DEBUG_ASSERT(task != NULL);
4401   KA_TRACE(20,
4402            ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4403             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4404             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4405             st, task_dup));
4406 #endif
4407   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4408   if (num_tasks > num_t_min)
4409     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4410                          grainsize, extras, last_chunk, tc, num_t_min,
4411 #if OMPT_SUPPORT
4412                          codeptr_ra,
4413 #endif
4414                          task_dup);
4415   else
4416     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4417                           grainsize, extras, last_chunk, tc,
4418 #if OMPT_SUPPORT
4419                           codeptr_ra,
4420 #endif
4421                           task_dup);
4422 
4423   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4424   return 0;
4425 }
4426 
4427 // Schedule part of the taskloop as a task,
4428 // execute the rest of the taskloop.
4429 //
4430 // loc        Source location information
4431 // gtid       Global thread ID
4432 // task       Pattern task, exposes the loop iteration range
4433 // lb         Pointer to loop lower bound in task structure
4434 // ub         Pointer to loop upper bound in task structure
4435 // st         Loop stride
4436 // ub_glob    Global upper bound (used for lastprivate check)
4437 // num_tasks  Number of tasks to execute
4438 // grainsize  Number of loop iterations per task
4439 // extras     Number of chunks with grainsize+1 iterations
4440 // last_chunk Reduction of grainsize for last task
4441 // tc         Iterations count
4442 // num_t_min  Threshold to launch tasks recursively
4443 // task_dup   Tasks duplication routine
4444 // codeptr_ra Return address for OMPT events
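//
// Worked example of the split (illustrative numbers): with num_tasks = 100
// and num_t_min = 40, n_tsk0 = 50 chunks are executed here (recursing again,
// since 50 > 40) while the other n_tsk1 = 50 chunks are packed into a
// __taskloop_params_t and scheduled as an auxiliary task that runs
// __kmp_taskloop_task.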
4445 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4446                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4447                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4448                           kmp_uint64 grainsize, kmp_uint64 extras,
4449                           kmp_int64 last_chunk, kmp_uint64 tc,
4450                           kmp_uint64 num_t_min,
4451 #if OMPT_SUPPORT
4452                           void *codeptr_ra,
4453 #endif
4454                           void *task_dup) {
4455   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4456   KMP_DEBUG_ASSERT(task != NULL);
4457   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4458   KA_TRACE(20,
4459            ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4460             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4461             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4462             st, task_dup));
4463   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4464   kmp_uint64 lower = *lb;
4465   kmp_info_t *thread = __kmp_threads[gtid];
4466   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
4467   kmp_task_t *next_task;
4468   size_t lower_offset =
4469       (char *)lb - (char *)task; // remember offset of lb in the task structure
4470   size_t upper_offset =
4471       (char *)ub - (char *)task; // remember offset of ub in the task structure
4472 
4473   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4474                              (last_chunk < 0 ? last_chunk : extras));
4475   KMP_DEBUG_ASSERT(num_tasks > extras);
4476   KMP_DEBUG_ASSERT(num_tasks > 0);
4477 
4478   // split the loop in two halves
4479   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4480   kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4481   kmp_uint64 gr_size0 = grainsize;
4482   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4483   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4484   if (last_chunk < 0) {
4485     ext0 = ext1 = 0;
4486     last_chunk1 = last_chunk;
4487     tc0 = grainsize * n_tsk0;
4488     tc1 = tc - tc0;
4489   } else if (n_tsk0 <= extras) {
4490     gr_size0++; // integrate extras into grainsize
4491     ext0 = 0; // no extra iters in 1st half
4492     ext1 = extras - n_tsk0; // remaining extras
4493     tc0 = gr_size0 * n_tsk0;
4494     tc1 = tc - tc0;
4495   } else { // n_tsk0 > extras
4496     ext1 = 0; // no extra iters in 2nd half
4497     ext0 = extras;
4498     tc1 = grainsize * n_tsk1;
4499     tc0 = tc - tc1;
4500   }
4501   ub0 = lower + st * (tc0 - 1);
4502   lb1 = ub0 + st;
4503 
4504   // create pattern task for 2nd half of the loop
4505   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4506   // adjust lower bound (upper bound is not changed) for the 2nd half
4507   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4508   if (ptask_dup != NULL) // construct firstprivates, etc.
4509     ptask_dup(next_task, task, 0);
4510   *ub = ub0; // adjust upper bound for the 1st half
4511 
4512   // create auxiliary task for 2nd half of the loop
4513   // make sure new task has same parent task as the pattern task
4514   kmp_taskdata_t *current_task = thread->th.th_current_task;
4515   thread->th.th_current_task = taskdata->td_parent;
4516   kmp_task_t *new_task =
4517       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4518                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4519   // restore current task
4520   thread->th.th_current_task = current_task;
4521   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4522   p->task = next_task;
4523   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4524   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4525   p->task_dup = task_dup;
4526   p->st = st;
4527   p->ub_glob = ub_glob;
4528   p->num_tasks = n_tsk1;
4529   p->grainsize = grainsize;
4530   p->extras = ext1;
4531   p->last_chunk = last_chunk1;
4532   p->tc = tc1;
4533   p->num_t_min = num_t_min;
4534 #if OMPT_SUPPORT
4535   p->codeptr_ra = codeptr_ra;
4536 #endif
4537 
4538 #if OMPT_SUPPORT
4539   // schedule new task with correct return address for OMPT events
4540   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4541 #else
4542   __kmp_omp_task(gtid, new_task, true); // schedule new task
4543 #endif
4544 
4545   // execute the 1st half of current subrange
4546   if (n_tsk0 > num_t_min)
4547     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4548                          ext0, last_chunk0, tc0, num_t_min,
4549 #if OMPT_SUPPORT
4550                          codeptr_ra,
4551 #endif
4552                          task_dup);
4553   else
4554     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4555                           gr_size0, ext0, last_chunk0, tc0,
4556 #if OMPT_SUPPORT
4557                           codeptr_ra,
4558 #endif
4559                           task_dup);
4560 
4561   KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4562 }
4563 
4564 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4565                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4566                            int nogroup, int sched, kmp_uint64 grainsize,
4567                            int modifier, void *task_dup) {
4568   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4569   KMP_DEBUG_ASSERT(task != NULL);
4570   if (nogroup == 0) {
4571 #if OMPT_SUPPORT && OMPT_OPTIONAL
4572     OMPT_STORE_RETURN_ADDRESS(gtid);
4573 #endif
4574     __kmpc_taskgroup(loc, gtid);
4575   }
4576 
4577   // =========================================================================
4578   // calculate loop parameters
4579   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4580   kmp_uint64 tc;
4581   // compiler provides global bounds here
4582   kmp_uint64 lower = task_bounds.get_lb();
4583   kmp_uint64 upper = task_bounds.get_ub();
4584   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4585   kmp_uint64 num_tasks = 0, extras = 0;
4586   kmp_int64 last_chunk =
4587       0; // reduce grainsize of last task by last_chunk in strict mode
4588   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4589   kmp_info_t *thread = __kmp_threads[gtid];
4590   kmp_taskdata_t *current_task = thread->th.th_current_task;
4591 
4592   KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4593                 "grain %llu(%d, %d), dup %p\n",
4594                 gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4595                 task_dup));
4596 
4597   // compute trip count
4598   if (st == 1) { // most common case
4599     tc = upper - lower + 1;
4600   } else if (st < 0) {
4601     tc = (lower - upper) / (-st) + 1;
4602   } else { // st > 0
4603     tc = (upper - lower) / st + 1;
4604   }
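  // Worked example (illustrative): lb = 0, ub = 9, st = 2 gives
  // tc = (9 - 0) / 2 + 1 = 5 iterations (0, 2, 4, 6, 8); a negative stride,
  // e.g. lb = 9, ub = 0, st = -2, likewise gives tc = (9 - 0) / 2 + 1 = 5.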
4605   if (tc == 0) {
4606     KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4607     // free the pattern task and exit
4608     __kmp_task_start(gtid, task, current_task);
4609     // do not execute anything for zero-trip loop
4610     __kmp_task_finish<false>(gtid, task, current_task);
4611     return;
4612   }
4613 
4614 #if OMPT_SUPPORT && OMPT_OPTIONAL
4615   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4616   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4617   if (ompt_enabled.ompt_callback_work) {
4618     ompt_callbacks.ompt_callback(ompt_callback_work)(
4619         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4620         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4621   }
4622 #endif
4623 
4624   if (num_tasks_min == 0)
4625     // TODO: can we choose a better default heuristic?
4626     num_tasks_min =
4627         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4628 
4629   // compute num_tasks/grainsize based on the input provided
4630   switch (sched) {
4631   case 0: // no schedule clause specified, we can choose the default
4632     // let's try to schedule (team_size*10) tasks
4633     grainsize = thread->th.th_team_nproc * 10;
4634     KMP_FALLTHROUGH();
4635   case 2: // num_tasks provided
4636     if (grainsize > tc) {
4637       num_tasks = tc; // too big num_tasks requested, adjust values
4638       grainsize = 1;
4639       extras = 0;
4640     } else {
4641       num_tasks = grainsize;
4642       grainsize = tc / num_tasks;
4643       extras = tc % num_tasks;
4644     }
4645     break;
4646   case 1: // grainsize provided
4647     if (grainsize > tc) {
4648       num_tasks = 1;
4649       grainsize = tc; // too big grainsize requested, adjust values
4650       extras = 0;
4651     } else {
4652       if (modifier) {
4653         num_tasks = (tc + grainsize - 1) / grainsize;
4654         last_chunk = tc - (num_tasks * grainsize);
4655         extras = 0;
4656       } else {
4657         num_tasks = tc / grainsize;
4658         // adjust grainsize for balanced distribution of iterations
4659         grainsize = tc / num_tasks;
4660         extras = tc % num_tasks;
4661       }
4662     }
4663     break;
4664   default:
4665     KMP_ASSERT2(0, "unknown scheduling of taskloop");
4666   }
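
  // Worked example (illustrative): sched == 1 with grainsize = 4 and tc = 10
  // gives, without the strict modifier, num_tasks = 10 / 4 = 2, then
  // grainsize = 10 / 2 = 5 and extras = 0; with the strict modifier it gives
  // num_tasks = (10 + 3) / 4 = 3, last_chunk = 10 - 12 = -2 and extras = 0,
  // satisfying the assertion below: 3 * 4 + (-2) == 10.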
4667 
4668   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4669                              (last_chunk < 0 ? last_chunk : extras));
4670   KMP_DEBUG_ASSERT(num_tasks > extras);
4671   KMP_DEBUG_ASSERT(num_tasks > 0);
4672   // =========================================================================
4673 
4674   // check the if clause value first
4675   // Also force GOMP taskloops (taskdata->td_flags.native) down the linear path
4676   if (if_val == 0) { // if(0) specified, mark task as serial
4677     taskdata->td_flags.task_serial = 1;
4678     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4679     // always start serial tasks linearly
4680     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4681                           grainsize, extras, last_chunk, tc,
4682 #if OMPT_SUPPORT
4683                           OMPT_GET_RETURN_ADDRESS(0),
4684 #endif
4685                           task_dup);
4686     // !taskdata->td_flags.native => currently force linear spawning of tasks
4687     // for GOMP_taskloop
4688   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4689     KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4690                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4691                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
4692                   last_chunk));
4693     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4694                          grainsize, extras, last_chunk, tc, num_tasks_min,
4695 #if OMPT_SUPPORT
4696                          OMPT_GET_RETURN_ADDRESS(0),
4697 #endif
4698                          task_dup);
4699   } else {
4700     KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4701                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4702                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
4703                   last_chunk));
4704     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4705                           grainsize, extras, last_chunk, tc,
4706 #if OMPT_SUPPORT
4707                           OMPT_GET_RETURN_ADDRESS(0),
4708 #endif
4709                           task_dup);
4710   }
4711 
4712 #if OMPT_SUPPORT && OMPT_OPTIONAL
4713   if (ompt_enabled.ompt_callback_work) {
4714     ompt_callbacks.ompt_callback(ompt_callback_work)(
4715         ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4716         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4717   }
4718 #endif
4719 
4720   if (nogroup == 0) {
4721 #if OMPT_SUPPORT && OMPT_OPTIONAL
4722     OMPT_STORE_RETURN_ADDRESS(gtid);
4723 #endif
4724     __kmpc_end_taskgroup(loc, gtid);
4725   }
4726   KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
4727 }
4728 
4729 /*!
4730 @ingroup TASKING
4731 @param loc       Source location information
4732 @param gtid      Global thread ID
4733 @param task      Task structure
4734 @param if_val    Value of the if clause
4735 @param lb        Pointer to loop lower bound in task structure
4736 @param ub        Pointer to loop upper bound in task structure
4737 @param st        Loop stride
4738 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4739 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4740 @param grainsize Schedule value if specified
4741 @param task_dup  Tasks duplication routine
4742 
4743 Execute the taskloop construct.
4744 */
4745 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4746                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4747                      int sched, kmp_uint64 grainsize, void *task_dup) {
4748   __kmp_assert_valid_gtid(gtid);
4749   KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
4750   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4751                  0, task_dup);
4752   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4753 }
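
// Hedged lowering sketch (assumed; the compiler-generated pattern task and
// task_dup routine are not shown): a user-level
//   #pragma omp taskloop grainsize(4)
//   for (long i = 0; i < n; ++i) body(i);
// is compiled into a pattern task whose loop bounds live in the task
// structure, followed by a call equivalent to
//   __kmpc_taskloop(loc, gtid, pattern_task, /*if_val=*/1, &lb, &ub,
//                   /*st=*/1, /*nogroup=*/0, /*sched=*/1, /*grainsize=*/4,
//                   task_dup);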
4754 
4755 /*!
4756 @ingroup TASKING
4757 @param loc       Source location information
4758 @param gtid      Global thread ID
4759 @param task      Task structure
4760 @param if_val    Value of the if clause
4761 @param lb        Pointer to loop lower bound in task structure
4762 @param ub        Pointer to loop upper bound in task structure
4763 @param st        Loop stride
4764 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4765 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4766 @param grainsize Schedule value if specified
4767 @param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
4768 @param task_dup  Tasks duplication routine
4769 
4770 Execute the taskloop construct.
4771 */
4772 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4773                        kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4774                        int nogroup, int sched, kmp_uint64 grainsize,
4775                        int modifier, void *task_dup) {
4776   __kmp_assert_valid_gtid(gtid);
4777   KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
4778   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4779                  modifier, task_dup);
4780   KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
4781 }
4782