1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 /* forward declaration */
25 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
26                                  kmp_info_t *this_thr);
27 static void __kmp_alloc_task_deque(kmp_info_t *thread,
28                                    kmp_thread_data_t *thread_data);
29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
30                                            kmp_task_team_t *task_team);
31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
32 
33 #ifdef BUILD_TIED_TASK_STACK
34 
35 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
//  from top to bottom
37 //
38 //  gtid: global thread identifier for thread containing stack
39 //  thread_data: thread data for task team thread containing stack
40 //  threshold: value above which the trace statement triggers
41 //  location: string identifying call site of this function (for trace)
42 static void __kmp_trace_task_stack(kmp_int32 gtid,
43                                    kmp_thread_data_t *thread_data,
44                                    int threshold, char *location) {
45   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
46   kmp_taskdata_t **stack_top = task_stack->ts_top;
47   kmp_int32 entries = task_stack->ts_entries;
48   kmp_taskdata_t *tied_task;
49 
50   KA_TRACE(
51       threshold,
52       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
53        "first_block = %p, stack_top = %p \n",
54        location, gtid, entries, task_stack->ts_first_block, stack_top));
55 
56   KMP_DEBUG_ASSERT(stack_top != NULL);
57   KMP_DEBUG_ASSERT(entries > 0);
58 
59   while (entries != 0) {
60     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
61     // fix up ts_top if we need to pop from previous block
62     if (entries & TASK_STACK_INDEX_MASK == 0) {
63       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
64 
65       stack_block = stack_block->sb_prev;
66       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
67     }
68 
69     // finish bookkeeping
70     stack_top--;
71     entries--;
72 
73     tied_task = *stack_top;
74 
75     KMP_DEBUG_ASSERT(tied_task != NULL);
76     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
77 
78     KA_TRACE(threshold,
79              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
80               "stack_top=%p, tied_task=%p\n",
81               location, gtid, entries, stack_top, tied_task));
82   }
83   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
84 
85   KA_TRACE(threshold,
86            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
87             location, gtid));
88 }
89 
90 //  __kmp_init_task_stack: initialize the task stack for the first time
91 //  after a thread_data structure is created.
92 //  It should not be necessary to do this again (assuming the stack works).
93 //
94 //  gtid: global thread identifier of calling thread
95 //  thread_data: thread data for task team thread containing stack
96 static void __kmp_init_task_stack(kmp_int32 gtid,
97                                   kmp_thread_data_t *thread_data) {
98   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
99   kmp_stack_block_t *first_block;
100 
101   // set up the first block of the stack
102   first_block = &task_stack->ts_first_block;
103   task_stack->ts_top = (kmp_taskdata_t **)first_block;
104   memset((void *)first_block, '\0',
105          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
106 
107   // initialize the stack to be empty
108   task_stack->ts_entries = TASK_STACK_EMPTY;
109   first_block->sb_next = NULL;
110   first_block->sb_prev = NULL;
111 }
112 
113 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
114 //
115 //  gtid: global thread identifier for calling thread
116 //  thread_data: thread info for thread containing stack
117 static void __kmp_free_task_stack(kmp_int32 gtid,
118                                   kmp_thread_data_t *thread_data) {
119   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
120   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
121 
122   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
123   // free from the second block of the stack
124   while (stack_block != NULL) {
125     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
126 
127     stack_block->sb_next = NULL;
128     stack_block->sb_prev = NULL;
129     if (stack_block != &task_stack->ts_first_block) {
130       __kmp_thread_free(thread,
131                         stack_block); // free the block, if not the first
132     }
133     stack_block = next_block;
134   }
135   // initialize the stack to be empty
136   task_stack->ts_entries = 0;
137   task_stack->ts_top = NULL;
138 }
139 
140 //  __kmp_push_task_stack: Push the tied task onto the task stack.
141 //     Grow the stack if necessary by allocating another block.
142 //
143 //  gtid: global thread identifier for calling thread
144 //  thread: thread info for thread containing stack
145 //  tied_task: the task to push on the stack
146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
147                                   kmp_taskdata_t *tied_task) {
148   // GEH - need to consider what to do if tt_threads_data not allocated yet
149   kmp_thread_data_t *thread_data =
150       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
151   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
152 
153   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
154     return; // Don't push anything on stack if team or team tasks are serialized
155   }
156 
157   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
158   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
159 
160   KA_TRACE(20,
161            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
162             gtid, thread, tied_task));
163   // Store entry
164   *(task_stack->ts_top) = tied_task;
165 
166   // Do bookkeeping for next push
167   task_stack->ts_top++;
168   task_stack->ts_entries++;
169 
170   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
171     // Find beginning of this task block
172     kmp_stack_block_t *stack_block =
173         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
174 
175     // Check if we already have a block
176     if (stack_block->sb_next !=
177         NULL) { // reset ts_top to beginning of next block
178       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
179     } else { // Alloc new block and link it up
180       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
181           thread, sizeof(kmp_stack_block_t));
182 
183       task_stack->ts_top = &new_block->sb_block[0];
184       stack_block->sb_next = new_block;
185       new_block->sb_prev = stack_block;
186       new_block->sb_next = NULL;
187 
188       KA_TRACE(
189           30,
190           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
191            gtid, tied_task, new_block));
192     }
193   }
194   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
195                 tied_task));
196 }
197 
198 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
199 //  the task, just check to make sure it matches the ending task passed in.
200 //
201 //  gtid: global thread identifier for the calling thread
202 //  thread: thread info structure containing stack
//  (the task popped off the stack is a local; it is only validated against
//   ending_task, not returned)
204 //  ending_task: the task that is ending (should match popped task)
205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
206                                  kmp_taskdata_t *ending_task) {
207   // GEH - need to consider what to do if tt_threads_data not allocated yet
208   kmp_thread_data_t *thread_data =
209       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
210   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
211   kmp_taskdata_t *tied_task;
212 
213   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
214     // Don't pop anything from stack if team or team tasks are serialized
215     return;
216   }
217 
218   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
219   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
220 
221   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
222                 thread));
223 
224   // fix up ts_top if we need to pop from previous block
225   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
226     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
227 
228     stack_block = stack_block->sb_prev;
229     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
230   }
231 
232   // finish bookkeeping
233   task_stack->ts_top--;
234   task_stack->ts_entries--;
235 
236   tied_task = *(task_stack->ts_top);
237 
238   KMP_DEBUG_ASSERT(tied_task != NULL);
239   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
240   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
241 
242   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
243                 tied_task));
244   return;
245 }
246 #endif /* BUILD_TIED_TASK_STACK */
247 
248 // returns 1 if new task is allowed to execute, 0 otherwise
249 // checks Task Scheduling constraint (if requested) and
250 // mutexinoutset dependencies if any
251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
252                                   const kmp_taskdata_t *tasknew,
253                                   const kmp_taskdata_t *taskcurr) {
254   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
255     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
256     // only descendant of all deferred tied tasks can be scheduled, checking
257     // the last one is enough, as it in turn is the descendant of all others
258     kmp_taskdata_t *current = taskcurr->td_last_tied;
259     KMP_DEBUG_ASSERT(current != NULL);
260     // check if the task is not suspended on barrier
261     if (current->td_flags.tasktype == TASK_EXPLICIT ||
262         current->td_taskwait_thread > 0) { // <= 0 on barrier
263       kmp_int32 level = current->td_level;
264       kmp_taskdata_t *parent = tasknew->td_parent;
265       while (parent != current && parent->td_level > level) {
266         // check generation up to the level of the current task
267         parent = parent->td_parent;
268         KMP_DEBUG_ASSERT(parent != NULL);
269       }
270       if (parent != current)
271         return false;
272     }
273   }
274   // Check mutexinoutset dependencies, acquire locks
275   kmp_depnode_t *node = tasknew->td_depnode;
276   if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
277     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
278       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
279       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
280         continue;
281       // could not get the lock, release previous locks
282       for (int j = i - 1; j >= 0; --j)
283         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
284       return false;
285     }
286     // negative num_locks means all locks acquired successfully
287     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
288   }
289   return true;
290 }
291 
292 // __kmp_realloc_task_deque:
293 // Re-allocates a task deque for a particular thread, copies the content from
294 // the old deque and adjusts the necessary data structures relating to the
295 // deque. This operation must be done with the deque_lock being held
296 static void __kmp_realloc_task_deque(kmp_info_t *thread,
297                                      kmp_thread_data_t *thread_data) {
298   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
299   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
300   kmp_int32 new_size = 2 * size;
301 
302   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
303                 "%d] for thread_data %p\n",
304                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
305 
306   kmp_taskdata_t **new_deque =
307       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
308 
309   int i, j;
310   for (i = thread_data->td.td_deque_head, j = 0; j < size;
311        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
312     new_deque[j] = thread_data->td.td_deque[i];
313 
314   __kmp_free(thread_data->td.td_deque);
315 
316   thread_data->td.td_deque_head = 0;
317   thread_data->td.td_deque_tail = size;
318   thread_data->td.td_deque = new_deque;
319   thread_data->td.td_deque_size = new_size;
320 }
321 
322 //  __kmp_push_task: Add a task to the thread's deque
// Push a deferred task onto the encountering thread's deque.
// Returns TASK_SUCCESSFULLY_PUSHED, or TASK_NOT_PUSHED when the task must be
// executed immediately by the caller (serialized team, or full deque with
// throttling enabled and TSC satisfied).
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // We don't need to map to shadow gtid if it is already hidden helper thread
  if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
    gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    thread = __kmp_threads[gtid];
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate. If the task is hidden_helper,
  // we don't need it either because we have initialized the dequeue for hidden
  // helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    // Deque appears full: with throttling on, a task that may legally execute
    // now (per TSC / mutexinoutset) is returned for immediate execution
    // rather than growing the deque.
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      // Re-test under the lock: stealers may have drained the deque meanwhile.
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Read the flag before dropping the lock: taskdata may be executed and
  // freed by another thread as soon as the lock is released.
  auto hidden_helper = taskdata->td_flags.hidden_helper;

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Signal one worker thread to execute the task
  if (UNLIKELY(hidden_helper)) {
    // Wake hidden helper threads up if they're sleeping
    __kmp_hidden_helper_worker_thread_signal();
  }

  return TASK_SUCCESSFULLY_PUSHED;
}
449 
450 // __kmp_pop_current_task_from_thread: set up current task from called thread
451 // when team ends
452 //
453 // this_thr: thread structure to set current_task in.
454 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
455   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
456                 "this_thread=%p, curtask=%p, "
457                 "curtask_parent=%p\n",
458                 0, this_thr, this_thr->th.th_current_task,
459                 this_thr->th.th_current_task->td_parent));
460 
461   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
462 
463   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
464                 "this_thread=%p, curtask=%p, "
465                 "curtask_parent=%p\n",
466                 0, this_thr, this_thr->th.th_current_task,
467                 this_thr->th.th_current_task->td_parent));
468 }
469 
470 // __kmp_push_current_task_to_thread: set up current task in called thread for a
471 // new team
472 //
473 // this_thr: thread structure to set up
474 // team: team for implicit task data
475 // tid: thread within team to set up
476 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
477                                        int tid) {
478   // current task of the thread is a parent of the new just created implicit
479   // tasks of new team
480   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
481                 "curtask=%p "
482                 "parent_task=%p\n",
483                 tid, this_thr, this_thr->th.th_current_task,
484                 team->t.t_implicit_task_taskdata[tid].td_parent));
485 
486   KMP_DEBUG_ASSERT(this_thr != NULL);
487 
488   if (tid == 0) {
489     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
490       team->t.t_implicit_task_taskdata[0].td_parent =
491           this_thr->th.th_current_task;
492       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
493     }
494   } else {
495     team->t.t_implicit_task_taskdata[tid].td_parent =
496         team->t.t_implicit_task_taskdata[0].td_parent;
497     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
498   }
499 
500   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
501                 "curtask=%p "
502                 "parent_task=%p\n",
503                 tid, this_thr, this_thr->th.th_current_task,
504                 team->t.t_implicit_task_taskdata[tid].td_parent));
505 }
506 
507 // __kmp_task_start: bookkeeping for a task starting execution
508 //
509 // GTID: global thread id of calling thread
510 // task: task starting execution
511 // current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  // Only explicit tasks pass through here; implicit tasks are started by
  // the team machinery.
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may be re-started after suspension, so started/executing
  // may already be set for it; for tied tasks both must still be 0.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}
556 
557 #if OMPT_SUPPORT
558 //------------------------------------------------------------------------------
559 // __ompt_task_init:
560 //   Initialize OMPT fields maintained by a task. This will only be called after
561 //   ompt_start_tool, so we already know whether ompt is enabled or not.
562 
563 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
564   // The calls to __ompt_task_init already have the ompt_enabled condition.
565   task->ompt_task_info.task_data.value = 0;
566   task->ompt_task_info.frame.exit_frame = ompt_data_none;
567   task->ompt_task_info.frame.enter_frame = ompt_data_none;
568   task->ompt_task_info.frame.exit_frame_flags =
569       ompt_frame_runtime | ompt_frame_framepointer;
570   task->ompt_task_info.frame.enter_frame_flags =
571       ompt_frame_runtime | ompt_frame_framepointer;
572 }
573 
574 // __ompt_task_start:
575 //   Build and trigger task-begin event
576 static inline void __ompt_task_start(kmp_task_t *task,
577                                      kmp_taskdata_t *current_task,
578                                      kmp_int32 gtid) {
579   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
580   ompt_task_status_t status = ompt_task_switch;
581   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
582     status = ompt_task_yield;
583     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
584   }
585   /* let OMPT know that we're about to run this task */
586   if (ompt_enabled.ompt_callback_task_schedule) {
587     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
588         &(current_task->ompt_task_info.task_data), status,
589         &(taskdata->ompt_task_info.task_data));
590   }
591   taskdata->ompt_task_info.scheduling_parent = current_task;
592 }
593 
594 // __ompt_task_finish:
595 //   Build and trigger final task-schedule event
596 static inline void __ompt_task_finish(kmp_task_t *task,
597                                       kmp_taskdata_t *resumed_task,
598                                       ompt_task_status_t status) {
599   if (ompt_enabled.ompt_callback_task_schedule) {
600     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
601     if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
602         taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
603       status = ompt_task_cancel;
604     }
605 
606     /* let OMPT know that we're returning to the callee task */
607     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
608         &(taskdata->ompt_task_info.task_data), status,
609         (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
610   }
611 }
612 #endif
613 
// Shared implementation for __kmpc_omp_task_begin_if0: start a serialized
// ("if(0)") task on the calling thread. The 'ompt' template parameter selects
// whether OMPT frame/callback bookkeeping is compiled in; frame_address and
// return_address are only used when ompt is true.
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    // Record application frames only once; the parent's enter frame and the
    // task's exit frame both point at the caller's frame.
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}
664 
665 #if OMPT_SUPPORT
666 OMPT_NOINLINE
// OMPT-enabled wrapper: kept OMPT_NOINLINE so the tool sees a stable frame
// for frame_address/return_address bookkeeping.
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
674 #endif // OMPT_SUPPORT
675 
676 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
677 // execution
678 //
679 // loc_ref: source location information; points to beginning of task block.
680 // gtid: global thread number.
681 // task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  // Dispatch to the OMPT-instrumented path only when a tool is active,
  // capturing the caller's frame and return addresses for it.
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  // No tool attached: take the non-OMPT template instantiation.
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
695 
696 #ifdef TASK_UNUSED
697 // __kmpc_omp_task_begin: report that a given task has started execution
698 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
699 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
700   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
701 
702   KA_TRACE(
703       10,
704       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
705        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
706 
707   __kmp_task_start(gtid, task, current_task);
708 
709   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
710                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
711   return;
712 }
713 #endif // TASK_UNUSED
714 
715 // __kmp_free_task: free the current task space and the space for shareds
716 //
717 // gtid: Global thread ID of calling thread
718 // taskdata: task to free
719 // thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values:
  // the task must be an explicit task that has completed, not yet been
  // freed, and has no outstanding children.
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  // Mark freed before releasing memory so stale references can be detected.
  taskdata->td_flags.freed = 1;
// deallocate the taskdata and shared variable blocks associated with this task
// (the shareds block was allocated together with the taskdata)
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif
  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
743 
// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Predecrement simulated by "- 1": result is the remaining allocated-child
  // count after dropping this task's own self-reference.
  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    // Save the parent pointer before the current taskdata is deallocated.
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // In serial (non-proxy) mode ancestor accounting is not maintained, so
    // stop after freeing the task itself.
    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          // Claim the cleanup via CAS on the whole flags word: flipping
          // 'complete' back to 0 ensures only one thread frees the entries.
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}
811 
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if KMP_DEBUG
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task that is not yet completed, report the switch
        // back to the resuming task here; omp_fulfill_event signals the
        // actual completion later. Locking is necessary to avoid a race with
        // ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif

    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE ||
        taskdata->td_flags.hidden_helper) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}
989 
// Common implementation for reporting completion of an if0 (undeferred) task;
// instantiated with ompt=true/false to compile the OMPT hooks in or out.
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    // Clear the enter frame that was recorded when the if0 task began.
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}
1015 
1016 #if OMPT_SUPPORT
// OMPT-enabled variant of __kmpc_omp_task_complete_if0; kept out of line so
// the common (OMPT-off) path stays small.
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
1022 #endif // OMPT_SUPPORT
1023 
// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Dispatch to the out-of-line OMPT variant so tool callbacks fire.
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}
1039 
1040 #ifdef TASK_UNUSED
1041 // __kmpc_omp_task_complete: report that a task has completed execution
1042 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  // NULL resumed_task: __kmp_task_finish falls back to the parent task.
  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
1055 #endif // TASK_UNUSED
1056 
1057 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1058 // task for a given thread
1059 //
1060 // loc_ref:  reference to source location of parallel region
1061 // this_thr:  thread data structure corresponding to implicit task
1062 // team: team for this_thr
1063 // tid: thread id of given thread within team
1064 // set_curr_task: TRUE if need to push current task to thread
1065 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
1066 // have already been done elsewhere.
1067 // TODO: Get better loc_ref.  Value passed in may be NULL
1068 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1069                               kmp_team_t *team, int tid, int set_curr_task) {
1070   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1071 
1072   KF_TRACE(
1073       10,
1074       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1075        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1076 
1077   task->td_task_id = KMP_GEN_TASK_ID();
1078   task->td_team = team;
1079   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1080   //    in debugger)
1081   task->td_ident = loc_ref;
1082   task->td_taskwait_ident = NULL;
1083   task->td_taskwait_counter = 0;
1084   task->td_taskwait_thread = 0;
1085 
1086   task->td_flags.tiedness = TASK_TIED;
1087   task->td_flags.tasktype = TASK_IMPLICIT;
1088   task->td_flags.proxy = TASK_FULL;
1089 
1090   // All implicit tasks are executed immediately, not deferred
1091   task->td_flags.task_serial = 1;
1092   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1093   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1094 
1095   task->td_flags.started = 1;
1096   task->td_flags.executing = 1;
1097   task->td_flags.complete = 0;
1098   task->td_flags.freed = 0;
1099 
1100   task->td_depnode = NULL;
1101   task->td_last_tied = task;
1102   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1103 
1104   if (set_curr_task) { // only do this init first time thread is created
1105     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1106     // Not used: don't need to deallocate implicit task
1107     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1108     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1109     task->td_dephash = NULL;
1110     __kmp_push_current_task_to_thread(this_thr, team, tid);
1111   } else {
1112     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1113     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1114   }
1115 
1116 #if OMPT_SUPPORT
1117   if (UNLIKELY(ompt_enabled.enabled))
1118     __ompt_task_init(task, tid);
1119 #endif
1120 
1121   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1122                 team, task));
1123 }
1124 
// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread:  thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      // Claim the dephash cleanup via CAS on the whole flags word: flipping
      // 'complete' back to 0 ensures only one thread performs the cleanup.
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        // Only the entries are freed; the dephash itself is kept for reuse.
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}
1151 
// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these tasks are destroyed
//
// thread:  thread data structure corresponding to implicit task
1156 void __kmp_free_implicit_task(kmp_info_t *thread) {
1157   kmp_taskdata_t *task = thread->th.th_current_task;
1158   if (task && task->td_dephash) {
1159     __kmp_dephash_free(thread, task->td_dephash);
1160     task->td_dephash = NULL;
1161   }
1162 }
1163 
1164 // Round up a size to a power of two specified by val: Used to insert padding
1165 // between structures co-allocated using a single malloc() call
1166 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1167   if (size & (val - 1)) {
1168     size &= ~(val - 1);
1169     if (size <= KMP_SIZE_T_MAX - val) {
1170       size += val; // Round up if there is no overflow.
1171     }
1172   }
1173   return size;
1174 } // __kmp_round_up_to_va
1175 
1176 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1177 //
1178 // loc_ref: source location information
1179 // gtid: global thread number.
1180 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1181 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1182 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
1183 // private vars accessed in task.
1184 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
1185 // in task.
1186 // task_entry: Pointer to task code entry point generated by compiler.
1187 // returns: a pointer to the allocated kmp_task_t structure (task).
1188 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1189                              kmp_tasking_flags_t *flags,
1190                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1191                              kmp_routine_entry_t task_entry) {
1192   kmp_task_t *task;
1193   kmp_taskdata_t *taskdata;
1194   kmp_info_t *thread = __kmp_threads[gtid];
1195   kmp_team_t *team = thread->th.th_team;
1196   kmp_taskdata_t *parent_task = thread->th.th_current_task;
1197   size_t shareds_offset;
1198 
1199   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1200     __kmp_middle_initialize();
1201 
1202   if (flags->hidden_helper) {
1203     if (__kmp_enable_hidden_helper) {
1204       if (!TCR_4(__kmp_init_hidden_helper))
1205         __kmp_hidden_helper_initialize();
1206     } else {
1207       // If the hidden helper task is not enabled, reset the flag to FALSE.
1208       flags->hidden_helper = FALSE;
1209     }
1210   }
1211 
1212   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1213                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1214                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1215                 sizeof_shareds, task_entry));
1216 
1217   KMP_DEBUG_ASSERT(parent_task);
1218   if (parent_task->td_flags.final) {
1219     if (flags->merged_if0) {
1220     }
1221     flags->final = 1;
1222   }
1223 
1224   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1225     // Untied task encountered causes the TSC algorithm to check entire deque of
1226     // the victim thread. If no untied task encountered, then checking the head
1227     // of the deque should be enough.
1228     KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1229   }
1230 
1231   // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1232   // the tasking setup
1233   // when that happens is too late.
1234   if (UNLIKELY(flags->proxy == TASK_PROXY ||
1235                flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1236     if (flags->proxy == TASK_PROXY) {
1237       flags->tiedness = TASK_UNTIED;
1238       flags->merged_if0 = 1;
1239     }
1240     /* are we running in a sequential parallel or tskm_immediate_exec... we need
1241        tasking support enabled */
1242     if ((thread->th.th_task_team) == NULL) {
1243       /* This should only happen if the team is serialized
1244           setup a task team and propagate it to the thread */
1245       KMP_DEBUG_ASSERT(team->t.t_serialized);
1246       KA_TRACE(30,
1247                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1248                 gtid));
1249       // 1 indicates setup the current team regardless of nthreads
1250       __kmp_task_team_setup(thread, team, 1);
1251       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1252     }
1253     kmp_task_team_t *task_team = thread->th.th_task_team;
1254 
1255     /* tasking must be enabled now as the task might not be pushed */
1256     if (!KMP_TASKING_ENABLED(task_team)) {
1257       KA_TRACE(
1258           30,
1259           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1260       __kmp_enable_tasking(task_team, thread);
1261       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1262       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1263       // No lock needed since only owner can allocate
1264       if (thread_data->td.td_deque == NULL) {
1265         __kmp_alloc_task_deque(thread, thread_data);
1266       }
1267     }
1268 
1269     if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1270         task_team->tt.tt_found_proxy_tasks == FALSE)
1271       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1272     if (flags->hidden_helper &&
1273         task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1274       TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1275   }
1276 
1277   // Calculate shared structure offset including padding after kmp_task_t struct
1278   // to align pointers in shared struct
1279   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1280   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1281 
1282   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1283   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1284                 shareds_offset));
1285   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1286                 sizeof_shareds));
1287 
1288   // Avoid double allocation here by combining shareds with taskdata
1289 #if USE_FAST_MEMORY
1290   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1291                                                                sizeof_shareds);
1292 #else /* ! USE_FAST_MEMORY */
1293   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1294                                                                sizeof_shareds);
1295 #endif /* USE_FAST_MEMORY */
1296 
1297   task = KMP_TASKDATA_TO_TASK(taskdata);
1298 
1299 // Make sure task & taskdata are aligned appropriately
1300 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1301   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1302   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1303 #else
1304   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1305   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1306 #endif
1307   if (sizeof_shareds > 0) {
1308     // Avoid double allocation here by combining shareds with taskdata
1309     task->shareds = &((char *)taskdata)[shareds_offset];
1310     // Make sure shareds struct is aligned to pointer size
1311     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1312                      0);
1313   } else {
1314     task->shareds = NULL;
1315   }
1316   task->routine = task_entry;
1317   task->part_id = 0; // AC: Always start with 0 part id
1318 
1319   taskdata->td_task_id = KMP_GEN_TASK_ID();
1320   taskdata->td_team = thread->th.th_team;
1321   taskdata->td_alloc_thread = thread;
1322   taskdata->td_parent = parent_task;
1323   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1324   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1325   taskdata->td_ident = loc_ref;
1326   taskdata->td_taskwait_ident = NULL;
1327   taskdata->td_taskwait_counter = 0;
1328   taskdata->td_taskwait_thread = 0;
1329   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1330   // avoid copying icvs for proxy tasks
1331   if (flags->proxy == TASK_FULL)
1332     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1333 
1334   taskdata->td_flags = *flags;
1335   taskdata->td_task_team = thread->th.th_task_team;
1336   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1337   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1338   // If it is hidden helper task, we need to set the team and task team
1339   // correspondingly.
1340   if (flags->hidden_helper) {
1341     kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1342     taskdata->td_team = shadow_thread->th.th_team;
1343     taskdata->td_task_team = shadow_thread->th.th_task_team;
1344   }
1345 
1346   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1347   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1348 
1349   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1350   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1351 
1352   // GEH - Note we serialize the task if the team is serialized to make sure
1353   // implicit parallel region tasks are not left until program termination to
1354   // execute. Also, it helps locality to execute immediately.
1355 
1356   taskdata->td_flags.task_serial =
1357       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1358        taskdata->td_flags.tasking_ser || flags->merged_if0);
1359 
1360   taskdata->td_flags.started = 0;
1361   taskdata->td_flags.executing = 0;
1362   taskdata->td_flags.complete = 0;
1363   taskdata->td_flags.freed = 0;
1364 
1365   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1366   // start at one because counts current task and children
1367   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1368   taskdata->td_taskgroup =
1369       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1370   taskdata->td_dephash = NULL;
1371   taskdata->td_depnode = NULL;
1372   if (flags->tiedness == TASK_UNTIED)
1373     taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1374   else
1375     taskdata->td_last_tied = taskdata;
1376   taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1377 #if OMPT_SUPPORT
1378   if (UNLIKELY(ompt_enabled.enabled))
1379     __ompt_task_init(taskdata, gtid);
1380 #endif
1381   // Only need to keep track of child task counts if team parallel and tasking
1382   // not serialized or if it is a proxy or detachable or hidden helper task
1383   if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
1384       flags->hidden_helper ||
1385       !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
1386     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1387     if (parent_task->td_taskgroup)
1388       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1389     // Only need to keep track of allocated child tasks for explicit tasks since
1390     // implicit not deallocated
1391     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1392       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1393     }
1394     if (flags->hidden_helper) {
1395       taskdata->td_flags.task_serial = FALSE;
1396       // Increment the number of hidden helper tasks to be executed
1397       KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1398     }
1399   }
1400 
1401   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1402                 gtid, taskdata, taskdata->td_parent));
1403 
1404   return task;
1405 }
1406 
1407 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1408                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1409                                   size_t sizeof_shareds,
1410                                   kmp_routine_entry_t task_entry) {
1411   kmp_task_t *retval;
1412   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1413   __kmp_assert_valid_gtid(gtid);
1414   input_flags->native = FALSE;
1415   // __kmp_task_alloc() sets up all other runtime flags
1416   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1417                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1418                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1419                 input_flags->proxy ? "proxy" : "",
1420                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1421                 sizeof_shareds, task_entry));
1422 
1423   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1424                             sizeof_shareds, task_entry);
1425 
1426   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1427 
1428   return retval;
1429 }
1430 
1431 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1432                                          kmp_int32 flags,
1433                                          size_t sizeof_kmp_task_t,
1434                                          size_t sizeof_shareds,
1435                                          kmp_routine_entry_t task_entry,
1436                                          kmp_int64 device_id) {
1437   auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1438   // target task is untied defined in the specification
1439   input_flags.tiedness = TASK_UNTIED;
1440 
1441   if (__kmp_enable_hidden_helper)
1442     input_flags.hidden_helper = TRUE;
1443 
1444   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1445                                sizeof_shareds, task_entry);
1446 }
1447 
1448 /*!
1449 @ingroup TASKING
1450 @param loc_ref location of the original task directive
1451 @param gtid Global Thread ID of encountering thread
1452 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1453 task''
1454 @param naffins Number of affinity items
1455 @param affin_list List of affinity items
1456 @return Returns non-zero if registering affinity information was not successful.
1457  Returns 0 if registration was successful
1458 This entry registers the affinity information attached to a task with the task
1459 thunk structure kmp_taskdata_t.
1460 */
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  // Stub: the affinity list is currently ignored; always report success (0).
  return 0;
}
1467 
1468 //  __kmp_invoke_task: invoke the specified task
1469 //
1470 // gtid: global thread ID of caller
1471 // task: the task to invoke
1472 // current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
               taskdata->td_flags.complete == 1)) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Decrement the counter of hidden helper tasks to be executed
  if (taskdata->td_flags.hidden_helper) {
    // Hidden helper tasks can only be executed by hidden helper threads
    KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
    KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
  }

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (UNLIKELY(__kmp_omp_cancellation)) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      // Untied task: inherit the last-tied task pointer from the resumed task.
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Attribute task execution time to the context the thread was in when it
    // picked up the task.
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_task_begin();
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      // GOMP-style thunk: single void* argument, no gtid.
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_task_end();
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      // 'thread' was cached above when the OMPT thread state was saved.
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}
1658 
1659 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1660 //
1661 // loc_ref: location of original task pragma (ignored)
1662 // gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new task''
1664 // Returns:
1665 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1666 //    be resumed later.
1667 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1668 //    resumed later.
1669 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1670                                 kmp_task_t *new_task) {
1671   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1672 
1673   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1674                 loc_ref, new_taskdata));
1675 
1676 #if OMPT_SUPPORT
1677   kmp_taskdata_t *parent;
1678   if (UNLIKELY(ompt_enabled.enabled)) {
1679     parent = new_taskdata->td_parent;
1680     if (ompt_enabled.ompt_callback_task_create) {
1681       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1682           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1683           &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1684           OMPT_GET_RETURN_ADDRESS(0));
1685     }
1686   }
1687 #endif
1688 
1689   /* Should we execute the new task or queue it? For now, let's just always try
1690      to queue it.  If the queue fills up, then we'll execute it.  */
1691 
1692   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1693   { // Execute this task immediately
1694     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1695     new_taskdata->td_flags.task_serial = 1;
1696     __kmp_invoke_task(gtid, new_task, current_task);
1697   }
1698 
1699   KA_TRACE(
1700       10,
1701       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1702        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1703        gtid, loc_ref, new_taskdata));
1704 
1705 #if OMPT_SUPPORT
1706   if (UNLIKELY(ompt_enabled.enabled)) {
1707     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1708   }
1709 #endif
1710   return TASK_CURRENT_NOT_QUEUED;
1711 }
1712 
1713 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1714 //
1715 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1717 // serialize_immediate: if TRUE then if the task is executed immediately its
1718 // execution will be serialized
1719 // Returns:
1720 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1721 //    be resumed later.
1722 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1723 //    resumed later.
1724 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1725                          bool serialize_immediate) {
1726   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1727 
1728   /* Should we execute the new task or queue it? For now, let's just always try
1729      to queue it.  If the queue fills up, then we'll execute it.  */
1730   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1731       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1732   { // Execute this task immediately
1733     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1734     if (serialize_immediate)
1735       new_taskdata->td_flags.task_serial = 1;
1736     __kmp_invoke_task(gtid, new_task, current_task);
1737   }
1738 
1739   return TASK_CURRENT_NOT_QUEUED;
1740 }
1741 
1742 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1743 // non-thread-switchable task from the parent thread only!
1744 //
1745 // loc_ref: location of original task pragma (ignored)
1746 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1749 // Returns:
1750 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1751 //    be resumed later.
1752 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1753 //    resumed later.
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));
  __kmp_assert_valid_gtid(gtid);

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (!new_taskdata->td_flags.started) {
      // First scheduling of this task: publish the parent's enter frame and
      // report task creation to the tool.
      OMPT_STORE_RETURN_ADDRESS(gtid);
      parent = new_taskdata->td_parent;
      if (!parent->ompt_task_info.frame.enter_frame.ptr) {
        parent->ompt_task_info.frame.enter_frame.ptr =
            OMPT_GET_FRAME_ADDRESS(0);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            &(parent->ompt_task_info.task_data),
            &(parent->ompt_task_info.frame),
            &(new_taskdata->ompt_task_info.task_data),
            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    } else {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_switch);
      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  // 'parent' is only non-NULL on the first-scheduling path above; clear its
  // enter frame once the task has been queued or executed.
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
1807 
1808 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1809 // a taskloop task with the correct OMPT return address
1810 //
1811 // loc_ref: location of original task pragma (ignored)
1812 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1815 // codeptr_ra: return address for OMPT callback
1816 // Returns:
1817 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1818 //    be resumed later.
1819 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1820 //    resumed later.
1821 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1822                                   kmp_task_t *new_task, void *codeptr_ra) {
1823   kmp_int32 res;
1824   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1825 
1826 #if KMP_DEBUG || OMPT_SUPPORT
1827   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1828 #endif
1829   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1830                 new_taskdata));
1831 
1832 #if OMPT_SUPPORT
1833   kmp_taskdata_t *parent = NULL;
1834   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1835     parent = new_taskdata->td_parent;
1836     if (!parent->ompt_task_info.frame.enter_frame.ptr)
1837       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1838     if (ompt_enabled.ompt_callback_task_create) {
1839       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1840           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1841           &(new_taskdata->ompt_task_info.task_data),
1842           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1843           codeptr_ra);
1844     }
1845   }
1846 #endif
1847 
1848   res = __kmp_omp_task(gtid, new_task, true);
1849 
1850   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1851                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1852                 gtid, loc_ref, new_taskdata));
1853 #if OMPT_SUPPORT
1854   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1855     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1856   }
1857 #endif
1858   return res;
1859 }
1860 
// Core "taskwait" implementation: block until all child tasks generated by
// the current task are complete. The 'ompt' template parameter selects
// whether OMPT sync-region callbacks are emitted around the wait.
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  kmp_taskdata_t *taskdata = nullptr;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

      // Emit scope-begin callbacks for both the sync region and the wait.
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

    // No wait is needed when the team is serialized and the task is final,
    // unless proxy tasks or hidden helper tasks (which complete on other
    // threads) have been encountered by this task team.
    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
    // If hidden helper thread is encountered, we must enable wait here.
    must_wait =
        must_wait ||
        (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
         thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);

    if (must_wait) {
      // Spin on the incomplete-child-task counter, executing other tasks
      // while waiting so the thread stays productive.
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *,
                &(taskdata->td_incomplete_child_tasks)),
          0U);
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      // Emit scope-end callbacks in reverse order of the begins above.
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
1972 
1973 #if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  // Instantiates the taskwait template with OMPT callbacks enabled.
  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
                                            return_address);
}
1981 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1982 
1983 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1984 // complete
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // With a tool attached, capture frame/return addresses and take the
  // OMPT-instrumented path.
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}
1995 
1996 // __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata = NULL;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
          // Yield by executing other available tasks; the NULL flag means
          // there is no completion condition to wait on.
          __kmp_execute_tasks_32(
              thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
        }
      }
    }
#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
2062 
2063 // Task Reduction implementation
2064 //
2065 // Note: initial implementation didn't take into account the possibility
2066 // to specify omp_orig for initializer of the UDR (user defined reduction).
2067 // Corrected implementation takes into account the omp_orig object.
2068 // Compiler is free to use old implementation if omp_orig is not specified.
2069 
2070 /*!
2071 @ingroup BASIC_TYPES
2072 @{
2073 */
2074 
2075 /*!
2076 Flags for special info per task reduction item.
2077 */
typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
  unsigned lazy_priv : 1;
  unsigned reserved31 : 31; /**< unused; pads the flag word to 32 bits */
} kmp_taskred_flags_t;
2083 
2084 /*!
2085 Internal struct for reduction data item related info set up by compiler.
2086 */
typedef struct kmp_task_red_input {
  // Old interface: no reduce_orig field, so the UDR initializer cannot be
  // given omp_orig by the runtime (see kmp_taskred_input_t for the new one).
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item in bytes */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (single parameter) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;
2096 
2097 /*!
2098 Internal struct for reduction data item related info saved by the library.
2099 */
typedef struct kmp_taskred_data {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item (rounded up to cache line) */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
  void *reduce_priv; /**< array of thread specific items (one per thread) */
  void *reduce_pend; /**< end of private data for faster comparison op */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_comb; /**< data combiner routine */
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_orig; /**< original item (can be used in UDR initializer) */
} kmp_taskred_data_t;
2112 
2113 /*!
2114 Internal struct for reduction data item related info set up by compiler.
2115 
2116 New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2117 */
typedef struct kmp_taskred_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  void *reduce_orig; /**< original reduction item used for initialization */
  size_t reduce_size; /**< size of data item */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_taskred_input_t;
2128 /*!
2129 @}
2130 */
2131 
2132 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2133 template <>
2134 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2135                                              kmp_task_red_input_t &src) {
2136   item.reduce_orig = NULL;
2137 }
2138 template <>
2139 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2140                                             kmp_taskred_input_t &src) {
2141   if (src.reduce_orig != NULL) {
2142     item.reduce_orig = src.reduce_orig;
2143   } else {
2144     item.reduce_orig = src.reduce_shar;
2145   } // non-NULL reduce_orig means new interface used
2146 }
2147 
2148 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2149 template <>
2150 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2151                                            size_t offset) {
2152   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2153 }
2154 template <>
2155 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2156                                           size_t offset) {
2157   ((void (*)(void *, void *))item.reduce_init)(
2158       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2159 }
2160 
// Common implementation of task reduction initialization for both the old
// (kmp_task_red_input_t) and new (kmp_taskred_input_t) compiler interfaces.
// Allocates per-thread private copies for each reduction item, records them
// in the current taskgroup, and returns the taskgroup pointer as the
// reduction identifier.
template <typename T>
void *__kmp_task_reduction_init(int gtid, int num, T *data) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
  kmp_uint32 nth = thread->th.th_team_nproc;
  kmp_taskred_data_t *arr;

  // check input data just in case
  KMP_ASSERT(tg != NULL);
  KMP_ASSERT(data != NULL);
  KMP_ASSERT(num > 0);
  // Single-thread team: no private copies needed, reduce directly.
  if (nth == 1) {
    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
                  gtid, tg));
    return (void *)tg;
  }
  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                gtid, tg, num));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thread, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    size_t size = data[i].reduce_size - 1;
    // round the size up to cache line per thread-specific item
    // (the -1 above keeps sizes already a multiple of CACHE_LINE unchanged)
    size += CACHE_LINE - size % CACHE_LINE;
    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
    arr[i].reduce_shar = data[i].reduce_shar;
    arr[i].reduce_size = size;
    arr[i].flags = data[i].flags;
    arr[i].reduce_comb = data[i].reduce_comb;
    arr[i].reduce_init = data[i].reduce_init;
    arr[i].reduce_fini = data[i].reduce_fini;
    __kmp_assign_orig<T>(arr[i], data[i]);
    if (!arr[i].flags.lazy_priv) {
      // allocate cache-line aligned block and fill it with zeros
      arr[i].reduce_priv = __kmp_allocate(nth * size);
      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
      if (arr[i].reduce_init != NULL) {
        // initialize all thread-specific items
        for (size_t j = 0; j < nth; ++j) {
          __kmp_call_init<T>(arr[i], j * size);
        }
      }
    } else {
      // only allocate space for pointers now,
      // objects will be lazily allocated/initialized if/when requested
      // note that __kmp_allocate zeroes the allocated memory
      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
    }
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
  return (void *)tg;
}
2215 
2216 /*!
2217 @ingroup TASKING
2218 @param gtid      Global thread ID
2219 @param num       Number of data items to reduce
2220 @param data      Array of data for reduction
2221 @return The taskgroup identifier
2222 
2223 Initialize task reduction for the taskgroup.
2224 
2225 Note: this entry supposes the optional compiler-generated initializer routine
2226 has single parameter - pointer to object to be initialized. That means
2227 the reduction either does not use omp_orig object, or the omp_orig is accessible
2228 without help of the runtime library.
2229 */
2230 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2231   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2232 }
2233 
2234 /*!
2235 @ingroup TASKING
2236 @param gtid      Global thread ID
2237 @param num       Number of data items to reduce
2238 @param data      Array of data for reduction
2239 @return The taskgroup identifier
2240 
2241 Initialize task reduction for the taskgroup.
2242 
2243 Note: this entry supposes the optional compiler-generated initializer routine
2244 has two parameters, pointer to object to be initialized and pointer to omp_orig
2245 */
2246 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2247   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2248 }
2249 
2250 // Copy task reduction data (except for shared pointers).
2251 template <typename T>
2252 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2253                                     kmp_taskgroup_t *tg, void *reduce_data) {
2254   kmp_taskred_data_t *arr;
2255   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2256                 " from data %p\n",
2257                 thr, tg, reduce_data));
2258   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2259       thr, num * sizeof(kmp_taskred_data_t));
2260   // threads will share private copies, thunk routines, sizes, flags, etc.:
2261   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2262   for (int i = 0; i < num; ++i) {
2263     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2264   }
2265   tg->reduce_data = (void *)arr;
2266   tg->reduce_num_data = num;
2267 }
2268 
2269 /*!
2270 @ingroup TASKING
2271 @param gtid    Global thread ID
2272 @param tskgrp  The taskgroup ID (optional)
2273 @param data    Shared location of the item
2274 @return The pointer to per-thread data
2275 
2276 Get thread-specific location of data item
2277 */
2278 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2279   __kmp_assert_valid_gtid(gtid);
2280   kmp_info_t *thread = __kmp_threads[gtid];
2281   kmp_int32 nth = thread->th.th_team_nproc;
2282   if (nth == 1)
2283     return data; // nothing to do
2284 
2285   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2286   if (tg == NULL)
2287     tg = thread->th.th_current_task->td_taskgroup;
2288   KMP_ASSERT(tg != NULL);
2289   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2290   kmp_int32 num = tg->reduce_num_data;
2291   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2292 
2293   KMP_ASSERT(data != NULL);
2294   while (tg != NULL) {
2295     for (int i = 0; i < num; ++i) {
2296       if (!arr[i].flags.lazy_priv) {
2297         if (data == arr[i].reduce_shar ||
2298             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2299           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2300       } else {
2301         // check shared location first
2302         void **p_priv = (void **)(arr[i].reduce_priv);
2303         if (data == arr[i].reduce_shar)
2304           goto found;
2305         // check if we get some thread specific location as parameter
2306         for (int j = 0; j < nth; ++j)
2307           if (data == p_priv[j])
2308             goto found;
2309         continue; // not found, continue search
2310       found:
2311         if (p_priv[tid] == NULL) {
2312           // allocate thread specific object lazily
2313           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2314           if (arr[i].reduce_init != NULL) {
2315             if (arr[i].reduce_orig != NULL) { // new interface
2316               ((void (*)(void *, void *))arr[i].reduce_init)(
2317                   p_priv[tid], arr[i].reduce_orig);
2318             } else { // old interface (single parameter)
2319               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2320             }
2321           }
2322         }
2323         return p_priv[tid];
2324       }
2325     }
2326     tg = tg->parent;
2327     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2328     num = tg->reduce_num_data;
2329   }
2330   KMP_ASSERT2(0, "Unknown task reduction item");
2331   return NULL; // ERROR, this line never executed
2332 }
2333 
2334 // Finalize task reduction.
2335 // Called from __kmpc_end_taskgroup()
2336 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2337   kmp_int32 nth = th->th.th_team_nproc;
2338   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2339   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2340   kmp_int32 num = tg->reduce_num_data;
2341   for (int i = 0; i < num; ++i) {
2342     void *sh_data = arr[i].reduce_shar;
2343     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2344     void (*f_comb)(void *, void *) =
2345         (void (*)(void *, void *))(arr[i].reduce_comb);
2346     if (!arr[i].flags.lazy_priv) {
2347       void *pr_data = arr[i].reduce_priv;
2348       size_t size = arr[i].reduce_size;
2349       for (int j = 0; j < nth; ++j) {
2350         void *priv_data = (char *)pr_data + j * size;
2351         f_comb(sh_data, priv_data); // combine results
2352         if (f_fini)
2353           f_fini(priv_data); // finalize if needed
2354       }
2355     } else {
2356       void **pr_data = (void **)(arr[i].reduce_priv);
2357       for (int j = 0; j < nth; ++j) {
2358         if (pr_data[j] != NULL) {
2359           f_comb(sh_data, pr_data[j]); // combine results
2360           if (f_fini)
2361             f_fini(pr_data[j]); // finalize if needed
2362           __kmp_free(pr_data[j]);
2363         }
2364       }
2365     }
2366     __kmp_free(arr[i].reduce_priv);
2367   }
2368   __kmp_thread_free(th, arr);
2369   tg->reduce_data = NULL;
2370   tg->reduce_num_data = 0;
2371 }
2372 
2373 // Cleanup task reduction data for parallel or worksharing,
2374 // do not touch task private data other threads still working with.
2375 // Called from __kmpc_end_taskgroup()
2376 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2377   __kmp_thread_free(th, tg->reduce_data);
2378   tg->reduce_data = NULL;
2379   tg->reduce_num_data = 0;
2380 }
2381 
// Initialize task reduction for a parallel/worksharing region (reduction
// with the 'task' modifier). Forms a new implicit taskgroup, then exactly
// one thread of the team publishes a common copy of the reduction
// descriptors through team->t.t_tg_reduce_data[is_ws]; the remaining
// threads wait for it and clone it via __kmp_task_reduction_init_copy.
// Returns the taskgroup pointer used as the reduction identifier.
template <typename T>
void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                         int num, T *data) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_int32 nth = thr->th.th_team_nproc;
  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
  if (nth == 1) {
    // serial team: no common data needed, the taskgroup alone suffices
    KA_TRACE(10,
             ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
              gtid, thr->th.th_current_task->td_taskgroup));
    return (void *)thr->th.th_current_task->td_taskgroup;
  }
  kmp_team_t *team = thr->th.th_team;
  void *reduce_data;
  kmp_taskgroup_t *tg;
  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
  if (reduce_data == NULL &&
      __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
                                 (void *)1)) {
    // single thread enters this block to initialize common reduction data
    // ((void *)1 acts as an "initialization in progress" sentinel that the
    // other threads spin on below)
    KMP_DEBUG_ASSERT(reduce_data == NULL);
    // first initialize own data, then make a copy other threads can use
    tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
    reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
    KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
    // fini counters should be 0 at this point
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
    KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
  } else {
    while (
        (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
        (void *)1) { // wait for task reduction initialization
      KMP_CPU_PAUSE();
    }
    KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
    tg = thr->th.th_current_task->td_taskgroup;
    __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
  }
  return tg;
}
2424 
2425 /*!
2426 @ingroup TASKING
2427 @param loc       Source location info
2428 @param gtid      Global thread ID
2429 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2430 @param num       Number of data items to reduce
2431 @param data      Array of data for reduction
2432 @return The taskgroup identifier
2433 
2434 Initialize task reduction for a parallel or worksharing.
2435 
2436 Note: this entry supposes the optional compiler-generated initializer routine
2437 has single parameter - pointer to object to be initialized. That means
2438 the reduction either does not use omp_orig object, or the omp_orig is accessible
2439 without help of the runtime library.
2440 */
2441 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2442                                           int num, void *data) {
2443   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2444                                             (kmp_task_red_input_t *)data);
2445 }
2446 
2447 /*!
2448 @ingroup TASKING
2449 @param loc       Source location info
2450 @param gtid      Global thread ID
2451 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2452 @param num       Number of data items to reduce
2453 @param data      Array of data for reduction
2454 @return The taskgroup identifier
2455 
2456 Initialize task reduction for a parallel or worksharing.
2457 
2458 Note: this entry supposes the optional compiler-generated initializer routine
2459 has two parameters, pointer to object to be initialized and pointer to omp_orig
2460 */
2461 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2462                                    void *data) {
2463   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2464                                             (kmp_taskred_input_t *)data);
2465 }
2466 
2467 /*!
2468 @ingroup TASKING
2469 @param loc       Source location info
2470 @param gtid      Global thread ID
2471 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2472 
2473 Finalize task reduction for a parallel or worksharing.
2474 */
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  // The reduction is finalized as part of ending the implicit taskgroup
  // formed by the matching *_modifier_init call; __kmpc_end_taskgroup
  // distinguishes the parallel/worksharing cases itself by matching the
  // taskgroup's reduce_data against the team's common data, so is_ws is
  // not needed here.
  __kmpc_end_taskgroup(loc, gtid);
}
2478 
2479 // __kmpc_taskgroup: Start a new taskgroup
2480 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2481   __kmp_assert_valid_gtid(gtid);
2482   kmp_info_t *thread = __kmp_threads[gtid];
2483   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2484   kmp_taskgroup_t *tg_new =
2485       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2486   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2487   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2488   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2489   tg_new->parent = taskdata->td_taskgroup;
2490   tg_new->reduce_data = NULL;
2491   tg_new->reduce_num_data = 0;
2492   tg_new->gomp_data = NULL;
2493   taskdata->td_taskgroup = tg_new;
2494 
2495 #if OMPT_SUPPORT && OMPT_OPTIONAL
2496   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2497     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2498     if (!codeptr)
2499       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2500     kmp_team_t *team = thread->th.th_team;
2501     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2502     // FIXME: I think this is wrong for lwt!
2503     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2504 
2505     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2506         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2507         &(my_task_data), codeptr);
2508   }
2509 #endif
2510 }
2511 
2512 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2513 //                       and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_data_t my_task_data;
  ompt_data_t my_parallel_data;
  void *codeptr = nullptr;
  if (UNLIKELY(ompt_enabled.enabled)) {
    team = thread->th.th_team;
    my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    my_parallel_data = team->t.ompt_team_info.parallel_data;
    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
  }
#endif

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  // With tasking enabled, block until all tasks registered with this
  // taskgroup (and their descendants) complete, executing other tasks
  // while waiting.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark task as waiting not on a barrier
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc;
    taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

    // A serial team only needs to wait if proxy or hidden-helper tasks may
    // still complete asynchronously; otherwise every task already executed.
    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

  // Task reduction finalization. Three cases, distinguished by whether this
  // taskgroup's reduce_data matches the team's common descriptor array for
  // parallel ([0]) or worksharing ([1]) reductions: the last thread to get
  // here finalizes and frees the common data, earlier threads only clean
  // their own copy; if neither slot matches, this is a plain taskgroup
  // reduction that this thread finalizes alone.
  if (taskgroup->reduce_data != NULL &&
      !taskgroup->gomp_data) { // need to reduce?
    int cnt;
    void *reduce_data;
    kmp_team_t *t = thread->th.th_team;
    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check if <priv> data of the first reduction variable shared for the team
    void *priv0 = arr[0].reduce_priv;
    if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
        ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
      // finishing task reduction on parallel
      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        // finalize task reduction:
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in the team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
    } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
                   NULL &&
               ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
      // finishing task reduction on worksharing
      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
    } else {
      // finishing task reduction on taskgroup
      __kmp_task_reduction_fini(thread, taskgroup);
    }
  }
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}
2656 
// __kmp_remove_my_task: remove a task from my own deque
//
// Pops from the TAIL of the calling thread's own deque (thieves take from
// the head — see __kmp_steal_task). Returns NULL if the deque is empty or
// the task scheduling constraint (TSC) forbids executing the tail task.
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *thread_data;
  kmp_uint32 tail;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
                   NULL); // Caller should check this condition

  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];

  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Fast path: unsynchronized emptiness check to avoid taking the lock;
  // re-checked under the lock below.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Re-check under the lock: the deque may have been emptied meanwhile.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // Peek at the tail task (circular buffer, mask wraps the index).
  tail = (thread_data->td.td_deque_tail - 1) &
         TASK_DEQUE_MASK(thread_data->td); // Wrap index.
  taskdata = thread_data->td.td_deque[tail];

  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
                             thread->th.th_current_task)) {
    // The TSC does not allow to steal victim task
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // Commit the pop: shrink the deque by one at the tail.
  thread_data->td.td_deque_tail = tail;
  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}
2726 
// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
//
// Steals from the HEAD of the victim's deque. If the head task is blocked
// by the task scheduling constraint and untied tasks were encountered, the
// whole deque is scanned for any allowed task, which is then removed by
// shifting the remaining entries. Returns NULL if nothing can be stolen.
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *current;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 target;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim_thr->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Fast path: unsynchronized emptiness check; re-checked under the lock.
  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
  // Check again after we acquire the lock
  if (ntasks == 0) {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
  current = __kmp_threads[gtid]->th.th_current_task;
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Common case: take the head task.
    // Bump head pointer and Wrap.
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow to steal victim task
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int i;
    // walk through victim's deque trying to steal any task
    target = victim_td->td.td_deque_head;
    taskdata = NULL;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      taskdata = victim_td->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
        break; // found victim task
      } else {
        taskdata = NULL;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    // Close the hole left at 'target' by shifting every later entry left
    // by one, then retract the tail.
    int prev = target;
    for (i = i + 1; i < ntasks; ++i) {
      // shift remaining tasks in the deque left by 1
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // primary thread victim) might be prematurely released from the barrier!!!
#if KMP_DEBUG
    kmp_int32 count =
#endif
        KMP_ATOMIC_INC(unfinished_threads);
    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));
    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}
2857 
2858 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2859 // condition is statisfied (return true) or there are none left (return false).
2860 //
2861 // final_spin is TRUE if this is the spin at the release barrier.
2862 // thread_finished indicates whether the thread is finished executing all
2863 // the tasks it has on its deque, and is at the release barrier.
2864 // spinner is the location on which to spin.
2865 // spinner == NULL means only execute a single task and return.
2866 // checker is the value to check to terminate the spin.
// C is the barrier-flag class (kmp_flag_32/64, kmp_atomic_flag_64, or
// kmp_flag_oncore); flag->done_check() tests the spin-termination condition.
// Returns TRUE if the spin condition was satisfied while executing tasks,
// FALSE if all task sources were exhausted (or the task team disappeared)
// without satisfying it.
// NOTE(review): the 'spinner'/'checker' wording in the comment above does not
// match the current flag-based parameters -- likely predates a refactor.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  std::atomic<kmp_int32> *unfinished_threads;
  // victim_tid: -2 = nothing stolen yet in this call, -1 = no recorded victim.
  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
                      tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  // Nothing to do if tasking was never set up for this thread.
  if (task_team == NULL || current_task == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  // While executing tasks this thread must not be reaped.
  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);

  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
                   task_team->tt.tt_hidden_helper_task_encountered);
  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim_tid == -2) { // haven't stolen anything yet
          victim_tid = threads_data[tid].td.td_deque_last_stolen;
          if (victim_tid !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim_tid].td.td_thr;
        }
        if (victim_tid != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed.  Arch says that's not such a great idea.
            victim_tid = __kmp_get_random(thread) % (nthreads - 1);
            if (victim_tid >= tid) {
              ++victim_tid; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim_tid].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier.  If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(other_thread);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty.  Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
            threads_data[tid].td.td_deque_last_stolen = victim_tid;
            // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim_tid = -2; // no successful victim found
        }
      }

      if (task == NULL)
        break; // break out of tasking loop

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

    // The task source has been exhausted. If in final spin loop of barrier,
    // check if termination condition is satisfied. The work queue may be empty
    // but there might be proxy tasks still executing.
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
      // First, decrement the #unfinished threads, if that has not already been
      // done.  This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
// 'count' exists only in debug builds; KA_TRACE compiles away otherwise.
#if KMP_DEBUG
        kmp_int32 count = -1 +
#endif
            KMP_ATOMIC_DEC(unfinished_threads);
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, primary thread has recognized that
    // there are no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1 &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
      use_own_tasks = 1;
    else {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}
3076 
// __kmp_execute_tasks_32: Task-execution entry point for a 32-bit barrier
// flag (kmp_flag_32<C, S>); thin forwarding wrapper around
// __kmp_execute_tasks_template. Returns its result unchanged.
template <bool C, bool S>
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
3086 
// __kmp_execute_tasks_64: Task-execution entry point for a 64-bit barrier
// flag (kmp_flag_64<C, S>); thin forwarding wrapper around
// __kmp_execute_tasks_template. Returns its result unchanged.
template <bool C, bool S>
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
3096 
// __kmp_atomic_execute_tasks_64: Task-execution entry point for an atomic
// 64-bit barrier flag (kmp_atomic_flag_64<C, S>); thin forwarding wrapper
// around __kmp_execute_tasks_template. Returns its result unchanged.
template <bool C, bool S>
int __kmp_atomic_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
    int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
3106 
// __kmp_execute_tasks_oncore: Task-execution entry point for the hierarchical
// (oncore) barrier flag; thin forwarding wrapper around
// __kmp_execute_tasks_template. Returns its result unchanged.
int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
3115 
// Explicit instantiations of the templated entry points above, for the
// <C, S> flag-parameter combinations referenced elsewhere in the runtime.
template int
__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
                                     kmp_flag_32<false, false> *, int,
                                     int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<false, true> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<true, false> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_atomic_execute_tasks_64<false, true>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_atomic_execute_tasks_64<true, false>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3140 
3141 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3142 // next barrier so they can assist in executing enqueued tasks.
3143 // First thread in allocates the task team atomically.
3144 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3145                                  kmp_info_t *this_thr) {
3146   kmp_thread_data_t *threads_data;
3147   int nthreads, i, is_init_thread;
3148 
3149   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3150                 __kmp_gtid_from_thread(this_thr)));
3151 
3152   KMP_DEBUG_ASSERT(task_team != NULL);
3153   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3154 
3155   nthreads = task_team->tt.tt_nproc;
3156   KMP_DEBUG_ASSERT(nthreads > 0);
3157   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3158 
3159   // Allocate or increase the size of threads_data if necessary
3160   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3161 
3162   if (!is_init_thread) {
3163     // Some other thread already set up the array.
3164     KA_TRACE(
3165         20,
3166         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3167          __kmp_gtid_from_thread(this_thr)));
3168     return;
3169   }
3170   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3171   KMP_DEBUG_ASSERT(threads_data != NULL);
3172 
3173   if (__kmp_tasking_mode == tskm_task_teams &&
3174       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3175     // Release any threads sleeping at the barrier, so that they can steal
3176     // tasks and execute them.  In extra barrier mode, tasks do not sleep
3177     // at the separate tasking barrier, so this isn't a problem.
3178     for (i = 0; i < nthreads; i++) {
3179       void *sleep_loc;
3180       kmp_info_t *thread = threads_data[i].td.td_thr;
3181 
3182       if (i == this_thr->th.th_info.ds.ds_tid) {
3183         continue;
3184       }
3185       // Since we haven't locked the thread's suspend mutex lock at this
3186       // point, there is a small window where a thread might be putting
3187       // itself to sleep, but hasn't set the th_sleep_loc field yet.
3188       // To work around this, __kmp_execute_tasks_template() periodically checks
3189       // see if other threads are sleeping (using the same random mechanism that
3190       // is used for task stealing) and awakens them if they are.
3191       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3192           NULL) {
3193         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3194                       __kmp_gtid_from_thread(this_thr),
3195                       __kmp_gtid_from_thread(thread)));
3196         __kmp_null_resume_wrapper(thread);
3197       } else {
3198         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3199                       __kmp_gtid_from_thread(this_thr),
3200                       __kmp_gtid_from_thread(thread)));
3201       }
3202     }
3203   }
3204 
3205   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3206                 __kmp_gtid_from_thread(this_thr)));
3207 }
3208 
/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
 * of like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
3217  *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access
 * to each thread in the team, so that it can steal work from it.
3222  *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
3231  *
3232  * We currently use the existence of the threads array as an indicator that
3233  * tasks were spawned since the last barrier.  If the structure is to be
3234  * useful outside the context of tasking, then this will have to change, but
3235  * not setting the field minimizes the performance impact of tasking on
3236  * barriers, when no explicit tasks were spawned (pushed, actually).
3237  */
3238 
// Head of the free list of recycled task_team structs; all accesses after
// the unlocked peek are protected by __kmp_task_team_lock below.
static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3244 
3245 // __kmp_alloc_task_deque:
3246 // Allocates a task deque for a particular thread, and initialize the necessary
3247 // data structures relating to the deque.  This only happens once per thread
3248 // per task team since task teams are recycled. No lock is needed during
3249 // allocation since each thread allocates its own deque.
3250 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3251                                    kmp_thread_data_t *thread_data) {
3252   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3253   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3254 
3255   // Initialize last stolen task field to "none"
3256   thread_data->td.td_deque_last_stolen = -1;
3257 
3258   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3259   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3260   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3261 
3262   KE_TRACE(
3263       10,
3264       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3265        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3266   // Allocate space for task deque, and zero the deque
3267   // Cannot use __kmp_thread_calloc() because threads not around for
3268   // kmp_reap_task_team( ).
3269   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3270       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3271   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3272 }
3273 
// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  if (thread_data->td.td_deque != NULL) {
    // Clear the count and the pointer under the deque lock.
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    TCW_4(thread_data->td.td_deque_ntasks, 0);
    __kmp_free(thread_data->td.td_deque);
    thread_data->td.td_deque = NULL; // guards against stale use / double free
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  }

#ifdef BUILD_TIED_TASK_STACK
  // GEH: Figure out what to do here for td_susp_tied_tasks
  // NOTE(review): 'gtid' is not declared in this function, so this branch
  // cannot compile if BUILD_TIED_TASK_STACK is ever defined -- confirm intent.
  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
  }
#endif // BUILD_TIED_TASK_STACK
}
3293 
// __kmp_realloc_task_threads_data:
// Allocates a threads_data array for a task team, either by allocating an
// initial array or enlarging an existing array.  Only the first thread to get
// the lock allocs or enlarges the array and re-initializes the array elements.
// That thread returns "TRUE", the rest return "FALSE".
// Assumes that the new array size is given by task_team -> tt.tt_nproc.
// The current size is given by task_team -> tt.tt_max_threads.
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team) {
  kmp_thread_data_t **threads_data_p;
  kmp_int32 nthreads, maxthreads;
  int is_init_thread = FALSE; // TRUE iff this thread performed the setup

  // Fast path: setup already completed and published (tt_found_tasks is
  // written with TCW_SYNC_4 after a memory barrier, below).
  if (TCR_4(task_team->tt.tt_found_tasks)) {
    // Already reallocated and initialized.
    return FALSE;
  }

  threads_data_p = &task_team->tt.tt_threads_data;
  nthreads = task_team->tt.tt_nproc;
  maxthreads = task_team->tt.tt_max_threads;

  // All threads must lock when they encounter the first task of the implicit
  // task region to make sure threads_data fields are (re)initialized before
  // used.
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);

  if (!TCR_4(task_team->tt.tt_found_tasks)) {
    // first thread to enable tasking
    kmp_team_t *team = thread->th.th_team;
    int i;

    is_init_thread = TRUE;
    if (maxthreads < nthreads) {

      if (*threads_data_p != NULL) {
        // Growing an existing array: copy old entries into a larger one.
        kmp_thread_data_t *old_data = *threads_data_p;
        kmp_thread_data_t *new_data = NULL;

        KE_TRACE(
            10,
            ("__kmp_realloc_task_threads_data: T#%d reallocating "
             "threads data for task_team %p, new_size = %d, old_size = %d\n",
             __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
        // Reallocate threads_data to have more elements than current array
        // Cannot use __kmp_thread_realloc() because threads not around for
        // kmp_reap_task_team( ).  Note all new array entries are initialized
        // to zero by __kmp_allocate().
        new_data = (kmp_thread_data_t *)__kmp_allocate(
            nthreads * sizeof(kmp_thread_data_t));
        // copy old data to new data
        KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
                     (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));

#ifdef BUILD_TIED_TASK_STACK
        // GEH: Figure out if this is the right thing to do
        // NOTE(review): *threads_data_p still points at old_data here; the
        // freshly added entries live in new_data, which is not installed
        // until below. Confirm whether this loop should iterate over
        // new_data instead (dead code unless BUILD_TIED_TASK_STACK is set).
        for (i = maxthreads; i < nthreads; i++) {
          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
        }
#endif // BUILD_TIED_TASK_STACK
       // Install the new data and free the old data
        (*threads_data_p) = new_data;
        __kmp_free(old_data);
      } else {
        KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
                      "threads data for task_team %p, size = %d\n",
                      __kmp_gtid_from_thread(thread), task_team, nthreads));
        // Make the initial allocate for threads_data array, and zero entries
        // Cannot use __kmp_thread_calloc() because threads not around for
        // kmp_reap_task_team( ).
        *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
            nthreads * sizeof(kmp_thread_data_t));
#ifdef BUILD_TIED_TASK_STACK
        // GEH: Figure out if this is the right thing to do
        for (i = 0; i < nthreads; i++) {
          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
        }
#endif // BUILD_TIED_TASK_STACK
      }
      task_team->tt.tt_max_threads = nthreads;
    } else {
      // If array has (more than) enough elements, go ahead and use it
      KMP_DEBUG_ASSERT(*threads_data_p != NULL);
    }

    // initialize threads_data pointers back to thread_info structures
    for (i = 0; i < nthreads; i++) {
      kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
      thread_data->td.td_thr = team->t.t_threads[i];

      if (thread_data->td.td_deque_last_stolen >= nthreads) {
        // The last stolen field survives across teams / barrier, and the number
        // of threads may have changed.  It's possible (likely?) that a new
        // parallel region will exhibit the same behavior as previous region.
        thread_data->td.td_deque_last_stolen = -1;
      }
    }

    // Publish completion: the barrier orders all initialization above before
    // tt_found_tasks becomes visible to threads taking the fast path.
    KMP_MB();
    TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
  }

  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
  return is_init_thread;
}
3401 
3402 // __kmp_free_task_threads_data:
3403 // Deallocates a threads_data array for a task team, including any attached
3404 // tasking deques.  Only occurs at library shutdown.
3405 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3406   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3407   if (task_team->tt.tt_threads_data != NULL) {
3408     int i;
3409     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3410       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3411     }
3412     __kmp_free(task_team->tt.tt_threads_data);
3413     task_team->tt.tt_threads_data = NULL;
3414   }
3415   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3416 }
3417 
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible.  Also initializes data
// structures.
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {
  kmp_task_team_t *task_team = NULL;
  int nthreads;

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), team));

  // Unlocked peek first; the list is re-checked under the lock before
  // actually popping an entry.
  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Take a task team from the task team pool
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    if (__kmp_free_task_teams != NULL) {
      task_team = __kmp_free_task_teams;
      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
      task_team->tt.tt_next = NULL;
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }

  if (task_team == NULL) {
    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
                  "task team for team %p\n",
                  __kmp_gtid_from_thread(thread), team));
    // Allocate a new task team if one is not available. Cannot use
    // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
    // suppress race conditions detection on synchronization flags in debug mode
    // this helps to analyze library internals eliminating false positives
    __itt_suppress_mark_range(
        __itt_suppress_range, __itt_suppress_threading_errors,
        &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
    __itt_suppress_mark_range(__itt_suppress_range,
                              __itt_suppress_threading_errors,
                              CCAST(kmp_uint32 *, &task_team->tt.tt_active),
                              sizeof(task_team->tt.tt_active));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
    // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
    // task_team->tt.tt_threads_data = NULL;
    // task_team->tt.tt_max_threads = 0;
    // task_team->tt.tt_next = NULL;
  }

  // (Re)initialize the fields that must be fresh for every use, whether the
  // struct was recycled from the free list or newly allocated.
  TCW_4(task_team->tt.tt_found_tasks, FALSE);
  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;

  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
  TCW_4(task_team->tt.tt_active, TRUE);

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                "unfinished_threads init'd to %d\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
  return task_team;
}
3480 
3481 // __kmp_free_task_team:
3482 // Frees the task team associated with a specific thread, and adds it
3483 // to the global task team free list.
3484 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3485   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3486                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3487 
3488   // Put task team back on free list
3489   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3490 
3491   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3492   task_team->tt.tt_next = __kmp_free_task_teams;
3493   TCW_PTR(__kmp_free_task_teams, task_team);
3494 
3495   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3496 }
3497 
3498 // __kmp_reap_task_teams:
3499 // Free all the task teams on the task team free list.
3500 // Should only be done during library shutdown.
3501 // Cannot do anything that needs a thread structure or gtid since they are
3502 // already gone.
3503 void __kmp_reap_task_teams(void) {
3504   kmp_task_team_t *task_team;
3505 
3506   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3507     // Free all task_teams on the free list
3508     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3509     while ((task_team = __kmp_free_task_teams) != NULL) {
3510       __kmp_free_task_teams = task_team->tt.tt_next;
3511       task_team->tt.tt_next = NULL;
3512 
3513       // Free threads_data if necessary
3514       if (task_team->tt.tt_threads_data != NULL) {
3515         __kmp_free_task_threads_data(task_team);
3516       }
3517       __kmp_free(task_team);
3518     }
3519     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3520   }
3521 }
3522 
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks.  Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done; // TRUE once no pool thread still references a task team

  KMP_INIT_YIELD(spins);

  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      // Thread has already dropped its task team reference; skip it.
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      // A dead thread can never drop its reference; clear it on its behalf.
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        void *sleep_loc;
        // If the thread is sleeping, awaken it.
        // NOTE(review): the trace below passes the same thread's gtid for
        // both %d fields -- the first one looks like it was meant to be the
        // calling thread; confirm before relying on this trace output.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(thread);
        }
      }
    }
    if (done) {
      break;
    }

    // If oversubscribed or have waited a bit, yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
}
3584 
// __kmp_task_team_setup:  Create a task_team for the current team, but use
// an already created, unused one if it already exists.
// Called with the two-slot (parity) task team scheme in mind:
// t_task_team[th_task_state] is the team currently in use, while the other
// slot is prepared here for the upcoming region. "always" forces allocation
// even when the team has only one thread.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
                  " for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct(above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the primary thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state; // the other parity slot
    KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team], team->t.t_id, other_team));
    } else { // Leave the old task team struct in place for the upcoming region;
      // adjust as needed
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      // Reinitialize only if the team was deactivated or its size changed.
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
                          team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team], team->t.t_id, other_team));
    }
  }

  // For regular thread, task enabling should be called when the task is going
  // to be pushed to a dequeue. However, for the hidden helper thread, we need
  // it ahead of time so that some operations can be performed without race
  // condition.
  if (this_thr == __kmp_hidden_helper_main_thread) {
    for (int i = 0; i < 2; ++i) {
      kmp_task_team_t *task_team = team->t.t_task_team[i];
      if (KMP_TASKING_ENABLED(task_team)) {
        continue;
      }
      __kmp_enable_tasking(task_team, this_thr);
      for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
        kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
        // Pre-allocate each hidden helper's deque so tasks can be pushed
        // later without racing on lazy deque creation.
        if (thread_data->td.td_deque == NULL) {
          __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
        }
      }
    }
  }
}
3663 
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier.  This may be
// called by any thread, but only for teams with # threads > 1.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to
  // (th_task_state alternates between 0 and 1 with the barrier parity).
  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);

  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            team->t.t_id, this_thr->th.th_task_state));
}
3684 
// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
// barrier gather phase. Only called by primary thread if #threads in team > 1
// or if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, primary
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only primary thread checks termination condition.
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *,
                &task_team->tt.tt_unfinished_threads),
          0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
    // Full memory barrier: the deactivation stores above must be visible
    // before this thread's task team pointer is cleared.
    KMP_MB();

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}
3731 
// __kmp_tasking_barrier:
// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  // Spin on the current task team's unfinished-threads counter until it
  // reaches 0, executing available tasks while waiting.
  std::atomic<kmp_uint32> *spin = RCAST(
      std::atomic<kmp_uint32> *,
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32<false, false> spin_flag(spin, 0U);
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */

    // Bail out if the runtime is shutting down; abort the thread on error.
    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE);
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */
}
3766 
// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
//
// thread: thread used when the target deque needs reallocation
// tid:    index of the target deque in the task team's threads_data
// task:   task to enqueue
// pass:   expansion threshold; once the deque has grown to
//         pass * INITIAL_TASK_DEQUE_SIZE entries, a full deque is expanded
//         instead of being skipped
// Returns true iff the task was enqueued.
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  // Unlocked fullness check first: cheap skip when the deque looks full and
  // is still below the pass threshold.
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Re-check under the lock; another thread may have consumed entries.
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    // The deque may have filled between the unlocked check and taking the
    // lock.
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit; // result is still false

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

// Both the success path (falls through) and the full-deque path (goto) end
// here to drop the deque lock.
release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}
3851 
#define PROXY_TASK_FLAG 0x40000000
/* The finish of the proxy tasks is divided in two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_task counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_task of the proxy task to synchronize the top and bottom
   half. */
// __kmp_first_top_half_finish_proxy: the part of the top half that may run
// before the bottom half is queued: mark the task complete, leave its
// taskgroup, and set PROXY_TASK_FLAG as an "imaginary child" so the bottom
// half cannot free the task until the second top half clears it.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary children for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
}
3884 
// __kmp_second_top_half_finish_proxy: the part of the top half that must run
// AFTER the bottom half is queued: decrement the parent's incomplete-children
// counter and clear the imaginary child (PROXY_TASK_FLAG) set by the first
// top half, allowing the bottom half to free the task.
// Note: in release builds the #if splice below leaves only the bare
// KMP_ATOMIC_DEC statement; KMP_DEBUG_ASSERT compiles away.
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
#if KMP_DEBUG
  kmp_int32 children = 0;
  // Predecrement simulated by "- 1" calculation
  children = -1 +
#endif
      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary children
  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
}
3897 
// __kmp_bottom_half_finish_proxy: the team-side part of proxy-task
// completion: release the task's dependences and free the task (and any
// freeable ancestors). Must run on a thread of the task's team, after the
// first top half has marked the task complete.
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  // (PROXY_TASK_FLAG is cleared by __kmp_second_top_half_finish_proxy).
  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
          PROXY_TASK_FLAG) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
3915 
3916 /*!
3917 @ingroup TASKING
3918 @param gtid Global Thread ID of encountering thread
3919 @param ptask Task which execution is completed
3920 
3921 Execute the completion of a proxy task from a thread of that is part of the
3922 team. Run first and bottom halves directly.
3923 */
3924 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3925   KMP_DEBUG_ASSERT(ptask != NULL);
3926   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3927   KA_TRACE(
3928       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3929            gtid, taskdata));
3930   __kmp_assert_valid_gtid(gtid);
3931   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3932 
3933   __kmp_first_top_half_finish_proxy(taskdata);
3934   __kmp_second_top_half_finish_proxy(taskdata);
3935   __kmp_bottom_half_finish_proxy(gtid, ptask);
3936 
3937   KA_TRACE(10,
3938            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3939             gtid, taskdata));
3940 }
3941 
3942 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
3943   KMP_DEBUG_ASSERT(ptask != NULL);
3944   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3945 
3946   // Enqueue task to complete bottom half completion from a thread within the
3947   // corresponding team
3948   kmp_team_t *team = taskdata->td_team;
3949   kmp_int32 nthreads = team->t.t_nproc;
3950   kmp_info_t *thread;
3951 
3952   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3953   // but we cannot use __kmp_get_random here
3954   kmp_int32 start_k = start % nthreads;
3955   kmp_int32 pass = 1;
3956   kmp_int32 k = start_k;
3957 
3958   do {
3959     // For now we're just linearly trying to find a thread
3960     thread = team->t.t_threads[k];
3961     k = (k + 1) % nthreads;
3962 
3963     // we did a full pass through all the threads
3964     if (k == start_k)
3965       pass = pass << 1;
3966 
3967   } while (!__kmp_give_task(thread, k, ptask, pass));
3968 }
3969 
3970 /*!
3971 @ingroup TASKING
3972 @param ptask Task which execution is completed
3973 
3974 Execute the completion of a proxy task from a thread that could not belong to
3975 the team.
3976 */
3977 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3978   KMP_DEBUG_ASSERT(ptask != NULL);
3979   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3980 
3981   KA_TRACE(
3982       10,
3983       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3984        taskdata));
3985 
3986   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3987 
3988   __kmp_first_top_half_finish_proxy(taskdata);
3989 
3990   __kmpc_give_task(ptask);
3991 
3992   __kmp_second_top_half_finish_proxy(taskdata);
3993 
3994   KA_TRACE(
3995       10,
3996       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3997        taskdata));
3998 }
3999 
4000 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4001                                                 kmp_task_t *task) {
4002   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4003   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4004     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4005     td->td_allow_completion_event.ed.task = task;
4006     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4007   }
4008   return &td->td_allow_completion_event;
4009 }
4010 
// __kmp_fulfill_event: fulfill an allow-completion event. If the associated
// task has already run and detached (became a proxy), complete the proxy now;
// otherwise just mark the event fulfilled under the event's lock.
void __kmp_fulfill_event(kmp_event_t *event) {
  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
    kmp_task_t *ptask = event->ed.task;
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
    bool detached = false;
    int gtid = __kmp_get_gtid();

    // The associated task might have completed or could be completing at this
    // point.
    // We need to take the lock to avoid races
    __kmp_acquire_tas_lock(&event->lock, gtid);
    if (taskdata->td_flags.proxy == TASK_PROXY) {
      // Task already ran and detached; complete the proxy below, outside the
      // lock.
      detached = true;
    } else {
#if OMPT_SUPPORT
      // The OMPT event must occur under mutual exclusion,
      // otherwise the tool might access ptask after free
      if (UNLIKELY(ompt_enabled.enabled))
        __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
#endif
    }
    event->type = KMP_EVENT_UNINITIALIZED;
    __kmp_release_tas_lock(&event->lock, gtid);

    if (detached) {
#if OMPT_SUPPORT
      // We free ptask afterwards and know the task is finished,
      // so locking is not necessary
      if (UNLIKELY(ompt_enabled.enabled))
        __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
#endif
      // If the task detached complete the proxy task
      if (gtid >= 0) {
        kmp_team_t *team = taskdata->td_team;
        kmp_info_t *thread = __kmp_get_thread();
        // Prefer the in-team completion path when the current thread belongs
        // to the task's team.
        if (thread->th.th_team == team) {
          __kmpc_proxy_task_completed(gtid, ptask);
          return;
        }
      }

      // fallback
      // Current thread is not part of the team (or has no gtid): hand the
      // bottom half to a team thread via the out-of-order path.
      __kmpc_proxy_task_completed_ooo(ptask);
    }
  }
}
4057 
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// for taskloop
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated
// returns:  a pointer to the allocated kmp_task_t structure (task).
//
// The copy is a raw memcpy of the source's whole allocation, after which the
// fields that must be unique per task (id, shareds pointer, parent links,
// child counters) are patched up.
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // Shareds live inside the same allocation; recompute the pointer for the
    // new block from the source's offset.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  // task inherits the taskgroup from the parent task
  taskdata->td_taskgroup = parent_task->td_taskgroup;
  // tied task needs to initialize the td_last_tied at creation,
  // untied one does this when it is scheduled for execution
  if (taskdata->td_flags.tiedness == TASK_TIED)
    taskdata->td_last_tied = taskdata;

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}
4129 
4130 // Routine optionally generated by the compiler for setting the lastprivate flag
4131 // and calling needed constructors for private/firstprivate objects
4132 // (used to form taskloop tasks from pattern task)
4133 // Parameters: dest task, src task, lastprivate flag.
4134 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4135 
4136 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4137 
// class to encapsulate manipulating loop bounds in a taskloop task.
// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
class kmp_taskloop_bounds_t {
  kmp_task_t *task; // task whose loop bounds are accessed
  const kmp_taskdata_t *taskdata; // taskdata of "task" (flags, bound sizes)
  size_t lower_offset; // byte offset of the lower bound within the task
  size_t upper_offset; // byte offset of the upper bound within the task

public:
  // Construct from explicit pointers to the bounds inside the task; the byte
  // offsets are recorded so duplicated tasks can locate their own copies.
  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
        lower_offset((char *)lb - (char *)task),
        upper_offset((char *)ub - (char *)task) {
    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
  }
  // Construct for a new task, reusing the offsets computed for another task.
  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
  size_t get_lower_offset() const { return lower_offset; }
  size_t get_upper_offset() const { return upper_offset; }
  // Read the loop lower bound. For GOMP-native tasks the bounds are
  // long-sized values at shareds[0] (lb) and shareds[1] (ub).
  kmp_uint64 get_lb() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the lower bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + lower_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
        retval = (kmp_int64)*lb;
      } else {
        kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
        retval = (kmp_int64)*lb;
      }
    }
#else
    (void)taskdata;
    retval = *(kmp_int64 *)((char *)task + lower_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  // Read the loop upper bound (GOMP layout: second long-sized slot).
  kmp_uint64 get_ub() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the upper bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + upper_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      } else {
        kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + upper_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  // Write the loop lower bound (see get_lb for the GOMP layout).
  void set_lb(kmp_uint64 lb) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the lower bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + lower_offset) = lb;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
        *lower = (kmp_uint32)lb;
      } else {
        kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
        *lower = (kmp_uint64)lb;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
#endif // defined(KMP_GOMP_COMPAT)
  }
  // Write the loop upper bound (see get_ub for the GOMP layout).
  void set_ub(kmp_uint64 ub) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the upper bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + upper_offset) = ub;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
        *upper = (kmp_uint32)ub;
      } else {
        kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
        *upper = (kmp_uint64)ub;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
#endif // defined(KMP_GOMP_COMPAT)
  }
};
4242 
4243 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4244 //
4245 // loc        Source location information
4246 // gtid       Global thread ID
4247 // task       Pattern task, exposes the loop iteration range
4248 // lb         Pointer to loop lower bound in task structure
4249 // ub         Pointer to loop upper bound in task structure
4250 // st         Loop stride
4251 // ub_glob    Global upper bound (used for lastprivate check)
4252 // num_tasks  Number of tasks to execute
4253 // grainsize  Number of loop iterations per task
4254 // extras     Number of chunks with grainsize+1 iterations
4255 // last_chunk Reduction of grainsize for last task
4256 // tc         Iterations count
4257 // task_dup   Tasks duplication routine
4258 // codeptr_ra Return address for OMPT events
// Create all num_tasks tasks sequentially (no recursive splitting), each
// covering roughly grainsize iterations, then retire the pattern task
// without executing its body.
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_int64 last_chunk, kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0; // set to 1 only for the chunk containing ub_glob

  // Chunking parameters must account for every iteration: in strict mode
  // (last_chunk < 0) the final chunk is shortened by -last_chunk; otherwise
  // the first 'extras' chunks each get one additional iteration.
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
                ub_glob, st, task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    // Clamp the chunk end to the range end; this is what shortens the final
    // chunk in strict mode. NOTE(review): comparison is unsigned — confirm it
    // cannot mis-fire for negative-stride ranges where *ub < lower.
    if (upper > *ub) {
      upper = *ub;
    }
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        // last chunk iff one more stride step would overshoot the global ub
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      // GOMP-style (native) tasks use a non-inclusive upper bound
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
                           // etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}
4353 
// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
typedef struct __taskloop_params {
  kmp_task_t *task; // pattern task for the subrange
  kmp_uint64 *lb; // pointer to the loop lower bound inside *task
  kmp_uint64 *ub; // pointer to the loop upper bound inside *task
  void *task_dup; // task duplication routine (a p_task_dup_t)
  kmp_int64 st; // loop stride
  kmp_uint64 ub_glob; // global upper bound (used for lastprivate check)
  kmp_uint64 num_tasks; // number of tasks to create for the subrange
  kmp_uint64 grainsize; // loop iterations per task
  kmp_uint64 extras; // number of chunks with grainsize+1 iterations
  kmp_int64 last_chunk; // grainsize reduction of last task (strict mode)
  kmp_uint64 tc; // iteration (trip) count of the subrange
  kmp_uint64 num_t_min; // threshold to switch from recursive to linear
#if OMPT_SUPPORT
  void *codeptr_ra; // return address for OMPT events
#endif
} __taskloop_params_t;
4373 
// Forward declaration of the recursive taskloop scheduler (defined below).
// Parameters mirror __kmp_taskloop_linear plus the num_t_min threshold:
// (loc, gtid, task, lb, ub, st, ub_glob, num_tasks, grainsize, extras,
//  last_chunk, tc, num_t_min, [codeptr_ra,] task_dup)
void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                          kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
                          kmp_uint64,
#if OMPT_SUPPORT
                          void *,
#endif
                          void *);
4382 
4383 // Execute part of the taskloop submitted as a task.
4384 int __kmp_taskloop_task(int gtid, void *ptask) {
4385   __taskloop_params_t *p =
4386       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4387   kmp_task_t *task = p->task;
4388   kmp_uint64 *lb = p->lb;
4389   kmp_uint64 *ub = p->ub;
4390   void *task_dup = p->task_dup;
4391   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4392   kmp_int64 st = p->st;
4393   kmp_uint64 ub_glob = p->ub_glob;
4394   kmp_uint64 num_tasks = p->num_tasks;
4395   kmp_uint64 grainsize = p->grainsize;
4396   kmp_uint64 extras = p->extras;
4397   kmp_int64 last_chunk = p->last_chunk;
4398   kmp_uint64 tc = p->tc;
4399   kmp_uint64 num_t_min = p->num_t_min;
4400 #if OMPT_SUPPORT
4401   void *codeptr_ra = p->codeptr_ra;
4402 #endif
4403 #if KMP_DEBUG
4404   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4405   KMP_DEBUG_ASSERT(task != NULL);
4406   KA_TRACE(20,
4407            ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4408             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4409             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4410             st, task_dup));
4411 #endif
4412   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4413   if (num_tasks > num_t_min)
4414     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4415                          grainsize, extras, last_chunk, tc, num_t_min,
4416 #if OMPT_SUPPORT
4417                          codeptr_ra,
4418 #endif
4419                          task_dup);
4420   else
4421     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4422                           grainsize, extras, last_chunk, tc,
4423 #if OMPT_SUPPORT
4424                           codeptr_ra,
4425 #endif
4426                           task_dup);
4427 
4428   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4429   return 0;
4430 }
4431 
4432 // Schedule part of the taskloop as a task,
4433 // execute the rest of the taskloop.
4434 //
4435 // loc        Source location information
4436 // gtid       Global thread ID
4437 // task       Pattern task, exposes the loop iteration range
4438 // lb         Pointer to loop lower bound in task structure
4439 // ub         Pointer to loop upper bound in task structure
4440 // st         Loop stride
4441 // ub_glob    Global upper bound (used for lastprivate check)
4442 // num_tasks  Number of tasks to execute
4443 // grainsize  Number of loop iterations per task
4444 // extras     Number of chunks with grainsize+1 iterations
4445 // last_chunk Reduction of grainsize for last task
4446 // tc         Iterations count
4447 // num_t_min  Threshold to launch tasks recursively
4448 // task_dup   Tasks duplication routine
4449 // codeptr_ra Return address for OMPT events
// Split the iteration range in two halves: offload the 2nd half as an
// auxiliary task (__kmp_taskloop_task) so another thread can keep splitting
// it, and continue processing the 1st half on this thread.
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_int64 last_chunk, kmp_uint64 tc,
                          kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20,
           ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  //  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  // chunking parameters must exactly account for all iterations
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (last_chunk < 0) {
    // strict mode: the shortened final chunk always lands in the 2nd half
    ext0 = ext1 = 0;
    last_chunk1 = last_chunk;
    tc0 = grainsize * n_tsk0;
    tc1 = tc - tc0;
  } else if (n_tsk0 <= extras) {
    // every task of the 1st half gets an extra iteration
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1); // last iteration of the 1st half
  lb1 = ub0 + st; // first iteration of the 2nd half

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  // restore current task
  thread->th.th_current_task = current_task;
  // pack the 2nd-half parameters into the auxiliary task's shareds
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->last_chunk = last_chunk1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, last_chunk0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, last_chunk0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
}
4568 
// Common driver for the taskloop construct: computes the trip count,
// derives num_tasks/grainsize/extras (and last_chunk in strict mode) from
// the schedule clause, then dispatches to linear or recursive scheduling.
// Wraps the whole thing in an implicit taskgroup unless nogroup is set.
//
// sched:     0 = no clause, 1 = grainsize given, 2 = num_tasks given
// modifier:  1 if the 'strict' modifier was present on the clause
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int nogroup, int sched, kmp_uint64 grainsize,
                           int modifier, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_int64 last_chunk =
      0; // reduce grainsize of last task by last_chunk in strict mode
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d, %d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
                task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    KMP_FALLTHROUGH();
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      // distribute tc iterations over the requested number of tasks;
      // the remainder becomes 'extras' chunks of grainsize+1
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1;
      grainsize = tc; // too big grainsize requested, adjust values
      extras = 0;
    } else {
      if (modifier) {
        // strict: keep grainsize exactly, shorten only the last chunk
        num_tasks = (tc + grainsize - 1) / grainsize;
        last_chunk = tc - (num_tasks * grainsize);
        extras = 0;
      } else {
        num_tasks = tc / grainsize;
        // adjust grainsize for balanced distribution of iterations
        grainsize = tc / num_tasks;
        extras = tc % num_tasks;
      }
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }

  // the derived parameters must account for every iteration
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
}
4733 
4734 /*!
4735 @ingroup TASKING
4736 @param loc       Source location information
4737 @param gtid      Global thread ID
4738 @param task      Task structure
4739 @param if_val    Value of the if clause
4740 @param lb        Pointer to loop lower bound in task structure
4741 @param ub        Pointer to loop upper bound in task structure
4742 @param st        Loop stride
4743 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4744 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4745 @param grainsize Schedule value if specified
4746 @param task_dup  Tasks duplication routine
4747 
4748 Execute the taskloop construct.
4749 */
4750 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4751                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4752                      int sched, kmp_uint64 grainsize, void *task_dup) {
4753   __kmp_assert_valid_gtid(gtid);
4754   KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
4755   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4756                  0, task_dup);
4757   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4758 }
4759 
4760 /*!
4761 @ingroup TASKING
4762 @param loc       Source location information
4763 @param gtid      Global thread ID
4764 @param task      Task structure
4765 @param if_val    Value of the if clause
4766 @param lb        Pointer to loop lower bound in task structure
4767 @param ub        Pointer to loop upper bound in task structure
4768 @param st        Loop stride
4769 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4770 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4771 @param grainsize Schedule value if specified
@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
4773 @param task_dup  Tasks duplication routine
4774 
4775 Execute the taskloop construct.
4776 */
4777 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4778                        kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4779                        int nogroup, int sched, kmp_uint64 grainsize,
4780                        int modifier, void *task_dup) {
4781   __kmp_assert_valid_gtid(gtid);
4782   KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
4783   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4784                  modifier, task_dup);
4785   KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
4786 }
4787