1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
/* forward declaration */
// File-local (static) helpers that are used before their definitions are seen.
// __kmp_enable_tasking: called by __kmp_push_task when
// KMP_TASKING_ENABLED(task_team) is false.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
// __kmp_alloc_task_deque: called by __kmp_push_task when the pushing thread's
// td_deque is still NULL.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
32 
33 #ifdef BUILD_TIED_TASK_STACK
34 
35 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
36 //  from top do bottom
37 //
38 //  gtid: global thread identifier for thread containing stack
39 //  thread_data: thread data for task team thread containing stack
40 //  threshold: value above which the trace statement triggers
41 //  location: string identifying call site of this function (for trace)
42 static void __kmp_trace_task_stack(kmp_int32 gtid,
43                                    kmp_thread_data_t *thread_data,
44                                    int threshold, char *location) {
45   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
46   kmp_taskdata_t **stack_top = task_stack->ts_top;
47   kmp_int32 entries = task_stack->ts_entries;
48   kmp_taskdata_t *tied_task;
49 
50   KA_TRACE(
51       threshold,
52       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
53        "first_block = %p, stack_top = %p \n",
54        location, gtid, entries, task_stack->ts_first_block, stack_top));
55 
56   KMP_DEBUG_ASSERT(stack_top != NULL);
57   KMP_DEBUG_ASSERT(entries > 0);
58 
59   while (entries != 0) {
60     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
61     // fix up ts_top if we need to pop from previous block
62     if (entries & TASK_STACK_INDEX_MASK == 0) {
63       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
64 
65       stack_block = stack_block->sb_prev;
66       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
67     }
68 
69     // finish bookkeeping
70     stack_top--;
71     entries--;
72 
73     tied_task = *stack_top;
74 
75     KMP_DEBUG_ASSERT(tied_task != NULL);
76     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
77 
78     KA_TRACE(threshold,
79              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
80               "stack_top=%p, tied_task=%p\n",
81               location, gtid, entries, stack_top, tied_task));
82   }
83   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
84 
85   KA_TRACE(threshold,
86            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
87             location, gtid));
88 }
89 
90 //  __kmp_init_task_stack: initialize the task stack for the first time
91 //  after a thread_data structure is created.
92 //  It should not be necessary to do this again (assuming the stack works).
93 //
94 //  gtid: global thread identifier of calling thread
95 //  thread_data: thread data for task team thread containing stack
96 static void __kmp_init_task_stack(kmp_int32 gtid,
97                                   kmp_thread_data_t *thread_data) {
98   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
99   kmp_stack_block_t *first_block;
100 
101   // set up the first block of the stack
102   first_block = &task_stack->ts_first_block;
103   task_stack->ts_top = (kmp_taskdata_t **)first_block;
104   memset((void *)first_block, '\0',
105          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
106 
107   // initialize the stack to be empty
108   task_stack->ts_entries = TASK_STACK_EMPTY;
109   first_block->sb_next = NULL;
110   first_block->sb_prev = NULL;
111 }
112 
113 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
114 //
115 //  gtid: global thread identifier for calling thread
116 //  thread_data: thread info for thread containing stack
117 static void __kmp_free_task_stack(kmp_int32 gtid,
118                                   kmp_thread_data_t *thread_data) {
119   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
120   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
121 
122   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
123   // free from the second block of the stack
124   while (stack_block != NULL) {
125     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
126 
127     stack_block->sb_next = NULL;
128     stack_block->sb_prev = NULL;
129     if (stack_block != &task_stack->ts_first_block) {
130       __kmp_thread_free(thread,
131                         stack_block); // free the block, if not the first
132     }
133     stack_block = next_block;
134   }
135   // initialize the stack to be empty
136   task_stack->ts_entries = 0;
137   task_stack->ts_top = NULL;
138 }
139 
140 //  __kmp_push_task_stack: Push the tied task onto the task stack.
141 //     Grow the stack if necessary by allocating another block.
142 //
143 //  gtid: global thread identifier for calling thread
144 //  thread: thread info for thread containing stack
145 //  tied_task: the task to push on the stack
146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
147                                   kmp_taskdata_t *tied_task) {
148   // GEH - need to consider what to do if tt_threads_data not allocated yet
149   kmp_thread_data_t *thread_data =
150       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
151   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
152 
153   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
154     return; // Don't push anything on stack if team or team tasks are serialized
155   }
156 
157   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
158   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
159 
160   KA_TRACE(20,
161            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
162             gtid, thread, tied_task));
163   // Store entry
164   *(task_stack->ts_top) = tied_task;
165 
166   // Do bookkeeping for next push
167   task_stack->ts_top++;
168   task_stack->ts_entries++;
169 
170   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
171     // Find beginning of this task block
172     kmp_stack_block_t *stack_block =
173         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
174 
175     // Check if we already have a block
176     if (stack_block->sb_next !=
177         NULL) { // reset ts_top to beginning of next block
178       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
179     } else { // Alloc new block and link it up
180       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
181           thread, sizeof(kmp_stack_block_t));
182 
183       task_stack->ts_top = &new_block->sb_block[0];
184       stack_block->sb_next = new_block;
185       new_block->sb_prev = stack_block;
186       new_block->sb_next = NULL;
187 
188       KA_TRACE(
189           30,
190           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
191            gtid, tied_task, new_block));
192     }
193   }
194   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
195                 tied_task));
196 }
197 
198 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
199 //  the task, just check to make sure it matches the ending task passed in.
200 //
201 //  gtid: global thread identifier for the calling thread
202 //  thread: thread info structure containing stack
203 //  tied_task: the task popped off the stack
204 //  ending_task: the task that is ending (should match popped task)
205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
206                                  kmp_taskdata_t *ending_task) {
207   // GEH - need to consider what to do if tt_threads_data not allocated yet
208   kmp_thread_data_t *thread_data =
209       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
210   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
211   kmp_taskdata_t *tied_task;
212 
213   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
214     // Don't pop anything from stack if team or team tasks are serialized
215     return;
216   }
217 
218   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
219   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
220 
221   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
222                 thread));
223 
224   // fix up ts_top if we need to pop from previous block
225   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
226     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
227 
228     stack_block = stack_block->sb_prev;
229     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
230   }
231 
232   // finish bookkeeping
233   task_stack->ts_top--;
234   task_stack->ts_entries--;
235 
236   tied_task = *(task_stack->ts_top);
237 
238   KMP_DEBUG_ASSERT(tied_task != NULL);
239   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
240   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
241 
242   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
243                 tied_task));
244   return;
245 }
246 #endif /* BUILD_TIED_TASK_STACK */
247 
248 // returns 1 if new task is allowed to execute, 0 otherwise
249 // checks Task Scheduling constraint (if requested) and
250 // mutexinoutset dependencies if any
251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
252                                   const kmp_taskdata_t *tasknew,
253                                   const kmp_taskdata_t *taskcurr) {
254   if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
255     // Check if the candidate obeys the Task Scheduling Constraints (TSC)
256     // only descendant of all deferred tied tasks can be scheduled, checking
257     // the last one is enough, as it in turn is the descendant of all others
258     kmp_taskdata_t *current = taskcurr->td_last_tied;
259     KMP_DEBUG_ASSERT(current != NULL);
260     // check if the task is not suspended on barrier
261     if (current->td_flags.tasktype == TASK_EXPLICIT ||
262         current->td_taskwait_thread > 0) { // <= 0 on barrier
263       kmp_int32 level = current->td_level;
264       kmp_taskdata_t *parent = tasknew->td_parent;
265       while (parent != current && parent->td_level > level) {
266         // check generation up to the level of the current task
267         parent = parent->td_parent;
268         KMP_DEBUG_ASSERT(parent != NULL);
269       }
270       if (parent != current)
271         return false;
272     }
273   }
274   // Check mutexinoutset dependencies, acquire locks
275   kmp_depnode_t *node = tasknew->td_depnode;
276   if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
277     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
278       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
279       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
280         continue;
281       // could not get the lock, release previous locks
282       for (int j = i - 1; j >= 0; --j)
283         __kmp_release_lock(node->dn.mtx_locks[j], gtid);
284       return false;
285     }
286     // negative num_locks means all locks acquired successfully
287     node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
288   }
289   return true;
290 }
291 
292 // __kmp_realloc_task_deque:
293 // Re-allocates a task deque for a particular thread, copies the content from
294 // the old deque and adjusts the necessary data structures relating to the
295 // deque. This operation must be done with the deque_lock being held
296 static void __kmp_realloc_task_deque(kmp_info_t *thread,
297                                      kmp_thread_data_t *thread_data) {
298   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
299   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
300   kmp_int32 new_size = 2 * size;
301 
302   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
303                 "%d] for thread_data %p\n",
304                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
305 
306   kmp_taskdata_t **new_deque =
307       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
308 
309   int i, j;
310   for (i = thread_data->td.td_deque_head, j = 0; j < size;
311        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
312     new_deque[j] = thread_data->td.td_deque[i];
313 
314   __kmp_free(thread_data->td.td_deque);
315 
316   thread_data->td.td_deque_head = 0;
317   thread_data->td.td_deque_tail = size;
318   thread_data->td.td_deque = new_deque;
319   thread_data->td.td_deque_size = new_size;
320 }
321 
//  __kmp_push_task: Add a task to the thread's deque
//
//  gtid: global thread id of the thread pushing the task
//  task: task thunk to enqueue
//
//  Returns TASK_SUCCESSFULLY_PUSHED when the task was queued on the thread's
//  deque or handed over to a hidden helper thread. Returns TASK_NOT_PUSHED
//  when the task is flagged task_serial, or when the deque is full, task
//  throttling is enabled and __kmp_task_is_allowed() accepts the task
//  (NOTE(review): the caller is presumably expected to execute the task itself
//  in that case -- confirm against the callers). In every other full-deque
//  case the deque is grown and the task is pushed.
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // If we encounter a hidden helper task, and the current thread is not a
  // hidden helper thread, we have to give the task to any hidden helper thread
  // starting from its shadow one.
  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
    // Signal the hidden helper threads.
    __kmp_hidden_helper_worker_thread_signal();
    return TASK_SUCCESSFULLY_PUSHED;
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate. If the task is hidden_helper,
  // we don't need it either because we have initialized the dequeue for hidden
  // helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  // 'locked' records whether td_deque_lock is already held by the time the
  // common push code below is reached
  int locked = 0;
  // Check if deque is full
  // (first check is done without the lock; it is repeated under the lock
  // before the deque is actually grown)
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        // the lock must be dropped on this early-return path
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // From here on td_deque_lock is held on every path.
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
447 
448 // __kmp_pop_current_task_from_thread: set up current task from called thread
449 // when team ends
450 //
451 // this_thr: thread structure to set current_task in.
452 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
453   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
454                 "this_thread=%p, curtask=%p, "
455                 "curtask_parent=%p\n",
456                 0, this_thr, this_thr->th.th_current_task,
457                 this_thr->th.th_current_task->td_parent));
458 
459   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
460 
461   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
462                 "this_thread=%p, curtask=%p, "
463                 "curtask_parent=%p\n",
464                 0, this_thr, this_thr->th.th_current_task,
465                 this_thr->th.th_current_task->td_parent));
466 }
467 
468 // __kmp_push_current_task_to_thread: set up current task in called thread for a
469 // new team
470 //
471 // this_thr: thread structure to set up
472 // team: team for implicit task data
473 // tid: thread within team to set up
474 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
475                                        int tid) {
476   // current task of the thread is a parent of the new just created implicit
477   // tasks of new team
478   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
479                 "curtask=%p "
480                 "parent_task=%p\n",
481                 tid, this_thr, this_thr->th.th_current_task,
482                 team->t.t_implicit_task_taskdata[tid].td_parent));
483 
484   KMP_DEBUG_ASSERT(this_thr != NULL);
485 
486   if (tid == 0) {
487     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
488       team->t.t_implicit_task_taskdata[0].td_parent =
489           this_thr->th.th_current_task;
490       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
491     }
492   } else {
493     team->t.t_implicit_task_taskdata[tid].td_parent =
494         team->t.t_implicit_task_taskdata[0].td_parent;
495     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
496   }
497 
498   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
499                 "curtask=%p "
500                 "parent_task=%p\n",
501                 tid, this_thr, this_thr->th.th_current_task,
502                 team->t.t_implicit_task_taskdata[tid].td_parent));
503 }
504 
505 // __kmp_task_start: bookkeeping for a task starting execution
506 //
507 // GTID: global thread id of calling thread
508 // task: task starting execution
509 // current_task: task suspending
510 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
511                              kmp_taskdata_t *current_task) {
512   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
513   kmp_info_t *thread = __kmp_threads[gtid];
514 
515   KA_TRACE(10,
516            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
517             gtid, taskdata, current_task));
518 
519   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
520 
521   // mark currently executing task as suspended
522   // TODO: GEH - make sure root team implicit task is initialized properly.
523   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
524   current_task->td_flags.executing = 0;
525 
526 // Add task to stack if tied
527 #ifdef BUILD_TIED_TASK_STACK
528   if (taskdata->td_flags.tiedness == TASK_TIED) {
529     __kmp_push_task_stack(gtid, thread, taskdata);
530   }
531 #endif /* BUILD_TIED_TASK_STACK */
532 
533   // mark starting task as executing and as current task
534   thread->th.th_current_task = taskdata;
535 
536   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
537                    taskdata->td_flags.tiedness == TASK_UNTIED);
538   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
539                    taskdata->td_flags.tiedness == TASK_UNTIED);
540   taskdata->td_flags.started = 1;
541   taskdata->td_flags.executing = 1;
542   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
543   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
544 
545   // GEH TODO: shouldn't we pass some sort of location identifier here?
546   // APT: yes, we will pass location here.
547   // need to store current thread state (in a thread or taskdata structure)
548   // before setting work_state, otherwise wrong state is set after end of task
549 
550   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
551 
552   return;
553 }
554 
555 #if OMPT_SUPPORT
556 //------------------------------------------------------------------------------
557 // __ompt_task_init:
558 //   Initialize OMPT fields maintained by a task. This will only be called after
559 //   ompt_start_tool, so we already know whether ompt is enabled or not.
560 
561 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
562   // The calls to __ompt_task_init already have the ompt_enabled condition.
563   task->ompt_task_info.task_data.value = 0;
564   task->ompt_task_info.frame.exit_frame = ompt_data_none;
565   task->ompt_task_info.frame.enter_frame = ompt_data_none;
566   task->ompt_task_info.frame.exit_frame_flags =
567       ompt_frame_runtime | ompt_frame_framepointer;
568   task->ompt_task_info.frame.enter_frame_flags =
569       ompt_frame_runtime | ompt_frame_framepointer;
570 }
571 
572 // __ompt_task_start:
573 //   Build and trigger task-begin event
574 static inline void __ompt_task_start(kmp_task_t *task,
575                                      kmp_taskdata_t *current_task,
576                                      kmp_int32 gtid) {
577   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
578   ompt_task_status_t status = ompt_task_switch;
579   if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
580     status = ompt_task_yield;
581     __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
582   }
583   /* let OMPT know that we're about to run this task */
584   if (ompt_enabled.ompt_callback_task_schedule) {
585     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
586         &(current_task->ompt_task_info.task_data), status,
587         &(taskdata->ompt_task_info.task_data));
588   }
589   taskdata->ompt_task_info.scheduling_parent = current_task;
590 }
591 
592 // __ompt_task_finish:
593 //   Build and trigger final task-schedule event
594 static inline void __ompt_task_finish(kmp_task_t *task,
595                                       kmp_taskdata_t *resumed_task,
596                                       ompt_task_status_t status) {
597   if (ompt_enabled.ompt_callback_task_schedule) {
598     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
599     if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
600         taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
601       status = ompt_task_cancel;
602     }
603 
604     /* let OMPT know that we're returning to the callee task */
605     ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
606         &(taskdata->ompt_task_info.task_data), status,
607         (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
608   }
609 }
610 #endif
611 
612 template <bool ompt>
613 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
614                                                kmp_task_t *task,
615                                                void *frame_address,
616                                                void *return_address) {
617   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
618   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
619 
620   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
621                 "current_task=%p\n",
622                 gtid, loc_ref, taskdata, current_task));
623 
624   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
625     // untied task needs to increment counter so that the task structure is not
626     // freed prematurely
627     kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
628     KMP_DEBUG_USE_VAR(counter);
629     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
630                   "incremented for task %p\n",
631                   gtid, counter, taskdata));
632   }
633 
634   taskdata->td_flags.task_serial =
635       1; // Execute this task immediately, not deferred.
636   __kmp_task_start(gtid, task, current_task);
637 
638 #if OMPT_SUPPORT
639   if (ompt) {
640     if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
641       current_task->ompt_task_info.frame.enter_frame.ptr =
642           taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
643       current_task->ompt_task_info.frame.enter_frame_flags =
644           taskdata->ompt_task_info.frame.exit_frame_flags =
645               ompt_frame_application | ompt_frame_framepointer;
646     }
647     if (ompt_enabled.ompt_callback_task_create) {
648       ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
649       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
650           &(parent_info->task_data), &(parent_info->frame),
651           &(taskdata->ompt_task_info.task_data),
652           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
653           return_address);
654     }
655     __ompt_task_start(task, current_task, gtid);
656   }
657 #endif // OMPT_SUPPORT
658 
659   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
660                 loc_ref, taskdata));
661 }
662 
663 #if OMPT_SUPPORT
// __kmpc_omp_task_begin_if0_ompt: OMPT-enabled instantiation of the begin_if0
// template; forwards all arguments unchanged.
// NOTE(review): marked OMPT_NOINLINE, presumably so the frame depth assumed by
// the caller's OMPT_GET_FRAME_ADDRESS(1) stays stable -- confirm.
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
672 #endif // OMPT_SUPPORT
673 
// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
//
// When OMPT support is compiled in and enabled, the call is routed through the
// OMPT instantiation together with the captured frame and return addresses;
// otherwise the non-OMPT instantiation runs with NULL for both.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    // store the return address first so it can be loaded for the call below
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
693 
694 #ifdef TASK_UNUSED
695 // __kmpc_omp_task_begin: report that a given task has started execution
696 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
697 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
698   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
699 
700   KA_TRACE(
701       10,
702       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
703        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
704 
705   __kmp_task_start(gtid, task, current_task);
706 
707   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
708                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
709   return;
710 }
711 #endif // TASK_UNUSED
712 
// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
//
// In debug builds the task's flags and child counters are checked first. The
// task is marked freed before its memory is handed back to the allocator
// selected by USE_FAST_MEMORY.
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  // allocated children may remain only for a serialized task
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif
  // only the pointer value is printed below; the memory is no longer touched
  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
741 
742 // __kmp_free_task_and_ancestors: free the current task and ancestors without
743 // children
744 //
745 // gtid: Global thread ID of calling thread
746 // taskdata: task to free
747 // thread: thread data structure of caller
748 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
749                                           kmp_taskdata_t *taskdata,
750                                           kmp_info_t *thread) {
751   // Proxy tasks must always be allowed to free their parents
752   // because they can be run in background even in serial mode.
753   kmp_int32 team_serial =
754       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
755       !taskdata->td_flags.proxy;
756   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
757 
758   kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
759   KMP_DEBUG_ASSERT(children >= 0);
760 
761   // Now, go up the ancestor tree to see if any ancestors can now be freed.
762   while (children == 0) {
763     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
764 
765     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
766                   "and freeing itself\n",
767                   gtid, taskdata));
768 
769     // --- Deallocate my ancestor task ---
770     __kmp_free_task(gtid, taskdata, thread);
771 
772     taskdata = parent_taskdata;
773 
774     if (team_serial)
775       return;
776     // Stop checking ancestors at implicit task instead of walking up ancestor
777     // tree to avoid premature deallocation of ancestors.
778     if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
779       if (taskdata->td_dephash) { // do we need to cleanup dephash?
780         int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
781         kmp_tasking_flags_t flags_old = taskdata->td_flags;
782         if (children == 0 && flags_old.complete == 1) {
783           kmp_tasking_flags_t flags_new = flags_old;
784           flags_new.complete = 0;
785           if (KMP_COMPARE_AND_STORE_ACQ32(
786                   RCAST(kmp_int32 *, &taskdata->td_flags),
787                   *RCAST(kmp_int32 *, &flags_old),
788                   *RCAST(kmp_int32 *, &flags_new))) {
789             KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
790                            "dephash of implicit task %p\n",
791                            gtid, taskdata));
792             // cleanup dephash of finished implicit task
793             __kmp_dephash_free_entries(thread, taskdata->td_dephash);
794           }
795         }
796       }
797       return;
798     }
799     // Predecrement simulated by "- 1" calculation
800     children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
801     KMP_DEBUG_ASSERT(children >= 0);
802   }
803 
804   KA_TRACE(
805       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
806            "not freeing it yet\n",
807            gtid, taskdata, children));
808 }
809 
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
//
// Outline of the body, in order:
//  1. An untied task whose td_untied_count is still positive after the
//     decrement is only partially done: the resumed task is made current
//     again and the function returns without completing or freeing the task.
//  2. If the destructors_thunk flag is set, the compiler-generated destructor
//     thunk is invoked.
//  3. A detachable task whose completion event is still in the
//     KMP_EVENT_ALLOW_COMPLETION state is turned into a proxy task
//     ("detached") instead of being completed.
//  4. Otherwise the task is marked complete, dependences are released and the
//     parent/taskgroup counters are decremented where they are tracked, and
//     the task plus any childless ancestors are freed.
//  5. resumed_task becomes the thread's current task and is marked executing.
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if KMP_DEBUG
  // Only used by the debug assertion and the trace below.
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    // Predecrement simulated by "- 1" calculation
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Set to true when the task is detached (proxified) below instead of being
  // completed by this call.
  bool detach = false;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    // The event type is checked once without the lock and re-checked after
    // acquiring it, since it may change in between.
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, we switch back
        // the omp_fulfill_event signals completion
        // locking is necessary to avoid a race with ompt_task_late_fulfill
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now
        // NOTE(review): the proxy-flag store and the lock release that follow
        // still touch taskdata; presumably the warning is meant to apply once
        // the event lock has been released -- confirm.

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif

    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    // (the condition below additionally covers hidden helper tasks)
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE ||
        taskdata->td_flags.hidden_helper) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
      // In non-debug builds only the atomic decrement remains; the result is
      // not stored.
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  // A detached task is not freed by this call.
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}
987 
988 template <bool ompt>
989 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
990                                                   kmp_int32 gtid,
991                                                   kmp_task_t *task) {
992   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
993                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
994   KMP_DEBUG_ASSERT(gtid >= 0);
995   // this routine will provide task to resume
996   __kmp_task_finish<ompt>(gtid, task, NULL);
997 
998   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
999                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1000 
1001 #if OMPT_SUPPORT
1002   if (ompt) {
1003     ompt_frame_t *ompt_frame;
1004     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1005     ompt_frame->enter_frame = ompt_data_none;
1006     ompt_frame->enter_frame_flags =
1007         ompt_frame_runtime | ompt_frame_framepointer;
1008   }
1009 #endif
1010 
1011   return;
1012 }
1013 
1014 #if OMPT_SUPPORT
// __kmpc_omp_task_complete_if0_ompt: forwards to the ompt=true instantiation
// of __kmpc_omp_task_complete_if0_template with the same arguments.
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
1020 #endif // OMPT_SUPPORT
1021 
1022 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1023 //
1024 // loc_ref: source location information; points to end of task block.
1025 // gtid: global thread number.
1026 // task: task thunk for the completed task.
1027 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1028                                   kmp_task_t *task) {
1029 #if OMPT_SUPPORT
1030   if (UNLIKELY(ompt_enabled.enabled)) {
1031     __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1032     return;
1033   }
1034 #endif
1035   __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1036 }
1037 
1038 #ifdef TASK_UNUSED
1039 // __kmpc_omp_task_complete: report that a task has completed execution
1040 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1041 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1042                               kmp_task_t *task) {
1043   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1044                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1045 
1046   __kmp_task_finish<false>(gtid, task,
1047                            NULL); // Not sure how to find task to resume
1048 
1049   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1050                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1051   return;
1052 }
1053 #endif // TASK_UNUSED
1054 
1055 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1056 // task for a given thread
1057 //
1058 // loc_ref:  reference to source location of parallel region
1059 // this_thr:  thread data structure corresponding to implicit task
1060 // team: team for this_thr
1061 // tid: thread id of given thread within team
1062 // set_curr_task: TRUE if need to push current task to thread
1063 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
1064 // have already been done elsewhere.
1065 // TODO: Get better loc_ref.  Value passed in may be NULL
1066 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1067                               kmp_team_t *team, int tid, int set_curr_task) {
1068   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1069 
1070   KF_TRACE(
1071       10,
1072       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1073        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1074 
1075   task->td_task_id = KMP_GEN_TASK_ID();
1076   task->td_team = team;
1077   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
1078   //    in debugger)
1079   task->td_ident = loc_ref;
1080   task->td_taskwait_ident = NULL;
1081   task->td_taskwait_counter = 0;
1082   task->td_taskwait_thread = 0;
1083 
1084   task->td_flags.tiedness = TASK_TIED;
1085   task->td_flags.tasktype = TASK_IMPLICIT;
1086   task->td_flags.proxy = TASK_FULL;
1087 
1088   // All implicit tasks are executed immediately, not deferred
1089   task->td_flags.task_serial = 1;
1090   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1091   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1092 
1093   task->td_flags.started = 1;
1094   task->td_flags.executing = 1;
1095   task->td_flags.complete = 0;
1096   task->td_flags.freed = 0;
1097 
1098   task->td_depnode = NULL;
1099   task->td_last_tied = task;
1100   task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1101 
1102   if (set_curr_task) { // only do this init first time thread is created
1103     KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1104     // Not used: don't need to deallocate implicit task
1105     KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1106     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1107     task->td_dephash = NULL;
1108     __kmp_push_current_task_to_thread(this_thr, team, tid);
1109   } else {
1110     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1111     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1112   }
1113 
1114 #if OMPT_SUPPORT
1115   if (UNLIKELY(ompt_enabled.enabled))
1116     __ompt_task_init(task, tid);
1117 #endif
1118 
1119   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1120                 team, task));
1121 }
1122 
1123 // __kmp_finish_implicit_task: Release resources associated to implicit tasks
1124 // at the end of parallel regions. Some resources are kept for reuse in the next
1125 // parallel region.
1126 //
1127 // thread:  thread data structure corresponding to implicit task
1128 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1129   kmp_taskdata_t *task = thread->th.th_current_task;
1130   if (task->td_dephash) {
1131     int children;
1132     task->td_flags.complete = 1;
1133     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1134     kmp_tasking_flags_t flags_old = task->td_flags;
1135     if (children == 0 && flags_old.complete == 1) {
1136       kmp_tasking_flags_t flags_new = flags_old;
1137       flags_new.complete = 0;
1138       if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1139                                       *RCAST(kmp_int32 *, &flags_old),
1140                                       *RCAST(kmp_int32 *, &flags_new))) {
1141         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1142                        "dephash of implicit task %p\n",
1143                        thread->th.th_info.ds.ds_gtid, task));
1144         __kmp_dephash_free_entries(thread, task->td_dephash);
1145       }
1146     }
1147   }
1148 }
1149 
1150 // __kmp_free_implicit_task: Release resources associated to implicit tasks
1151 // when these are destroyed regions
1152 //
1153 // thread:  thread data structure corresponding to implicit task
1154 void __kmp_free_implicit_task(kmp_info_t *thread) {
1155   kmp_taskdata_t *task = thread->th.th_current_task;
1156   if (task && task->td_dephash) {
1157     __kmp_dephash_free(thread, task->td_dephash);
1158     task->td_dephash = NULL;
1159   }
1160 }
1161 
1162 // Round up a size to a power of two specified by val: Used to insert padding
1163 // between structures co-allocated using a single malloc() call
1164 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1165   if (size & (val - 1)) {
1166     size &= ~(val - 1);
1167     if (size <= KMP_SIZE_T_MAX - val) {
1168       size += val; // Round up if there is no overflow.
1169     }
1170   }
1171   return size;
1172 } // __kmp_round_up_to_va
1173 
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// A single block is allocated that holds, in order, the kmp_taskdata_t, the
// kmp_task_t (including private variables) and, after pointer-size padding,
// the shareds array. The new task becomes a child of the calling thread's
// current task.
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// Note that the pointed-to flags may be modified by this routine
// (hidden_helper, final, tiedness and merged_if0 are written below).
// sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  // NOTE(review): the size_t arguments are printed with %ld here and in the
  // traces further down -- confirm this is fine for the trace formatter on
  // targets where long is narrower than size_t.
  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  // A child of a final task is itself final.
  if (parent_task->td_flags.final) {
    // NOTE(review): this branch is empty and has no effect -- presumably a
    // leftover; confirm whether it can be removed.
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    // Proxy tasks are forced to be untied and merged_if0.
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      // 1 indicates setup the current team regardless of nthreads
      __kmp_task_team_setup(thread, team, 1);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    // Record on the task team that proxy/detachable or hidden helper tasks
    // have been seen; the flags are only written when not already set.
    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  // (Both sizes traced below are served by the single allocation that
  // follows.)
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = thread->th.th_team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  // Start from the caller-provided flags; the runtime-owned flags are
  // overwritten below.
  taskdata->td_flags = *flags;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;
  // If it is hidden helper task, we need to set the team and task team
  // correspondingly.
  if (flags->hidden_helper) {
    kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
    taskdata->td_team = shadow_thread->th.th_team;
    taskdata->td_task_team = shadow_thread->th.th_task_team;
  }

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized or if it is a proxy or detachable or hidden helper task
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
      flags->hidden_helper ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
    if (flags->hidden_helper) {
      // A hidden helper task is never treated as serialized, overriding the
      // task_serial value computed above.
      taskdata->td_flags.task_serial = FALSE;
      // Increment the number of hidden helper tasks to be executed
      KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));

  return task;
}
1404 
1405 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1406                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1407                                   size_t sizeof_shareds,
1408                                   kmp_routine_entry_t task_entry) {
1409   kmp_task_t *retval;
1410   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1411   __kmp_assert_valid_gtid(gtid);
1412   input_flags->native = FALSE;
1413   // __kmp_task_alloc() sets up all other runtime flags
1414   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1415                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1416                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1417                 input_flags->proxy ? "proxy" : "",
1418                 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1419                 sizeof_shareds, task_entry));
1420 
1421   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1422                             sizeof_shareds, task_entry);
1423 
1424   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1425 
1426   return retval;
1427 }
1428 
1429 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1430                                          kmp_int32 flags,
1431                                          size_t sizeof_kmp_task_t,
1432                                          size_t sizeof_shareds,
1433                                          kmp_routine_entry_t task_entry,
1434                                          kmp_int64 device_id) {
1435   auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1436   // target task is untied defined in the specification
1437   input_flags.tiedness = TASK_UNTIED;
1438 
1439   if (__kmp_enable_hidden_helper)
1440     input_flags.hidden_helper = TRUE;
1441 
1442   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1443                                sizeof_shareds, task_entry);
1444 }
1445 
1446 /*!
1447 @ingroup TASKING
1448 @param loc_ref location of the original task directive
1449 @param gtid Global Thread ID of encountering thread
1450 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1451 task''
1452 @param naffins Number of affinity items
1453 @param affin_list List of affinity items
1454 @return Returns non-zero if registering affinity information was not successful.
1455  Returns 0 if registration was successful
1456 This entry registers the affinity information attached to a task with the task
1457 thunk structure kmp_taskdata_t.
1458 */
1459 kmp_int32
1460 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1461                                   kmp_task_t *new_task, kmp_int32 naffins,
1462                                   kmp_task_affinity_info_t *affin_list) {
1463   return 0;
1464 }
1465 
//  __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
//
// Outline of what happens below:
//  1. A proxy task that is already marked complete only runs its bottom-half
//     finish and returns.
//  2. With OMPT enabled, the thread's ompt_thread_info is saved here and
//     restored before the task is finished (non-proxy tasks only).
//  3. Hidden helper tasks decrement __kmp_unexecuted_hidden_helper_tasks.
//  4. Non-proxy tasks are bracketed by __kmp_task_start()/__kmp_task_finish().
//  5. When cancellation is enabled and either the task's taskgroup has a
//     cancel request or the team's request is cancel_parallel, the task
//     routine is not called (the task is "discarded"); start/finish handling
//     still runs.
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
               taskdata->td_flags.complete == 1)) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Decrement the counter of hidden helper tasks to be executed
  if (taskdata->td_flags.hidden_helper) {
    // Hidden helper tasks can only be executed by hidden helper threads
    KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
    KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
  }

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (UNLIKELY(__kmp_omp_cancellation)) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    // Discard when the task's taskgroup has a cancel request or the team's
    // cancel request is cancel_parallel.
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        // Report which kind of cancellation caused the discard.
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      // An untied task inherits td_last_tied from the task it runs on top of.
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Pick the partitioned timer according to the thread state in which the
    // task is being executed; it is popped right after the routine returns.
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_task_begin();
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // cur_time is only written and read while kmp_itt_count_task is non-zero.
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      // Native thunks are called with the shareds pointer as their only
      // argument.
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_task_end();
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      // 'thread' and 'oldInfo' were set in the OMPT block near the top of this
      // function, under the same ompt_enabled.enabled check.
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}
1656 
1657 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1658 //
1659 // loc_ref: location of original task pragma (ignored)
1660 // gtid: Global Thread ID of encountering thread
1661 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1662 // Returns:
1663 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1664 //    be resumed later.
1665 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1666 //    resumed later.
1667 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1668                                 kmp_task_t *new_task) {
1669   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1670 
1671   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1672                 loc_ref, new_taskdata));
1673 
1674 #if OMPT_SUPPORT
1675   kmp_taskdata_t *parent;
1676   if (UNLIKELY(ompt_enabled.enabled)) {
1677     parent = new_taskdata->td_parent;
1678     if (ompt_enabled.ompt_callback_task_create) {
1679       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1680           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1681           &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1682           OMPT_GET_RETURN_ADDRESS(0));
1683     }
1684   }
1685 #endif
1686 
1687   /* Should we execute the new task or queue it? For now, let's just always try
1688      to queue it.  If the queue fills up, then we'll execute it.  */
1689 
1690   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1691   { // Execute this task immediately
1692     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1693     new_taskdata->td_flags.task_serial = 1;
1694     __kmp_invoke_task(gtid, new_task, current_task);
1695   }
1696 
1697   KA_TRACE(
1698       10,
1699       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1700        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1701        gtid, loc_ref, new_taskdata));
1702 
1703 #if OMPT_SUPPORT
1704   if (UNLIKELY(ompt_enabled.enabled)) {
1705     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1706   }
1707 #endif
1708   return TASK_CURRENT_NOT_QUEUED;
1709 }
1710 
1711 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1712 //
1713 // gtid: Global Thread ID of encountering thread
1714 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1715 // serialize_immediate: if TRUE then if the task is executed immediately its
1716 // execution will be serialized
1717 // Returns:
1718 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1719 //    be resumed later.
1720 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1721 //    resumed later.
1722 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1723                          bool serialize_immediate) {
1724   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1725 
1726   /* Should we execute the new task or queue it? For now, let's just always try
1727      to queue it.  If the queue fills up, then we'll execute it.  */
1728   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1729       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1730   { // Execute this task immediately
1731     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1732     if (serialize_immediate)
1733       new_taskdata->td_flags.task_serial = 1;
1734     __kmp_invoke_task(gtid, new_task, current_task);
1735   }
1736 
1737   return TASK_CURRENT_NOT_QUEUED;
1738 }
1739 
// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
//    (The value comes straight from __kmp_omp_task().)
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

  // new_taskdata is only needed for tracing and for the OMPT code below.
#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));
  __kmp_assert_valid_gtid(gtid);

#if OMPT_SUPPORT
  // parent stays NULL unless OMPT is enabled and the task has not been started
  // yet; the cleanup after __kmp_omp_task() relies on that.
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (!new_taskdata->td_flags.started) {
      OMPT_STORE_RETURN_ADDRESS(gtid);
      parent = new_taskdata->td_parent;
      // Record an enter frame for the parent only if none is set already.
      if (!parent->ompt_task_info.frame.enter_frame.ptr) {
        parent->ompt_task_info.frame.enter_frame.ptr =
            OMPT_GET_FRAME_ADDRESS(0);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            &(parent->ompt_task_info.task_data),
            &(parent->ompt_task_info.frame),
            &(new_taskdata->ompt_task_info.task_data),
            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    } else {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_switch);
      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
    }
  }
#endif

  // Immediate execution (if it happens) is serialized: serialize_immediate is
  // passed as true.
  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  // Reset the parent's enter frame, but only when a parent was captured above.
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
1805 
1806 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1807 // a taskloop task with the correct OMPT return address
1808 //
1809 // loc_ref: location of original task pragma (ignored)
1810 // gtid: Global Thread ID of encountering thread
1811 // new_task: non-thread-switchable task thunk allocated by
1812 // __kmp_omp_task_alloc()
1813 // codeptr_ra: return address for OMPT callback
1814 // Returns:
1815 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1816 //    be resumed later.
1817 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1818 //    resumed later.
1819 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1820                                   kmp_task_t *new_task, void *codeptr_ra) {
1821   kmp_int32 res;
1822   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1823 
1824 #if KMP_DEBUG || OMPT_SUPPORT
1825   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1826 #endif
1827   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1828                 new_taskdata));
1829 
1830 #if OMPT_SUPPORT
1831   kmp_taskdata_t *parent = NULL;
1832   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1833     parent = new_taskdata->td_parent;
1834     if (!parent->ompt_task_info.frame.enter_frame.ptr)
1835       parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1836     if (ompt_enabled.ompt_callback_task_create) {
1837       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1838           &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1839           &(new_taskdata->ompt_task_info.task_data),
1840           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1841           codeptr_ra);
1842     }
1843   }
1844 #endif
1845 
1846   res = __kmp_omp_task(gtid, new_task, true);
1847 
1848   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1849                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1850                 gtid, loc_ref, new_taskdata));
1851 #if OMPT_SUPPORT
1852   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1853     parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1854   }
1855 #endif
1856   return res;
1857 }
1858 
// __kmpc_omp_taskwait_template: shared implementation of taskwait.
//
// ompt (template parameter): when true, the OMPT sync_region and
//   sync_region_wait callbacks are issued around the wait and the current
//   task's enter_frame is set from frame_address and cleared afterwards.
// loc_ref: source location; stored in the current task as td_taskwait_ident
// gtid: global thread ID of the waiting thread
// frame_address, return_address: only used on the OMPT path
// Returns: always TASK_CURRENT_NOT_QUEUED.
//
// When __kmp_tasking_mode is tskm_immediate_exec nothing is done apart from
// tracing.
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  kmp_taskdata_t *taskdata = nullptr;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

      // Begin events: sync_region first, then sync_region_wait; the end events
      // below are issued in the reverse order.
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    // Stored as gtid + 1 and negated once the wait is over (see below).
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

    // Waiting is required when the task is neither team-serial nor final ...
    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

    // ... or when the thread's task team has seen proxy tasks ...
    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
    // If hidden helper thread is encountered, we must enable wait here.
    must_wait =
        must_wait ||
        (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
         thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);

    if (must_wait) {
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *,
                &(taskdata->td_incomplete_child_tasks)),
          0U);
      // Keep executing tasks until this task has no incomplete children left.
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

  }

  // taskdata is still nullptr here if tasking mode is tskm_immediate_exec.
  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
1970 
#if OMPT_SUPPORT && OMPT_OPTIONAL
// __kmpc_omp_taskwait_ompt: non-inlined trampoline into the OMPT-enabled
// instantiation of the taskwait template. All arguments are forwarded as-is.
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  const kmp_int32 status = __kmpc_omp_taskwait_template<true>(
      loc_ref, gtid, frame_address, return_address);
  return status;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1980 
1981 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1982 // complete
1983 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1984 #if OMPT_SUPPORT && OMPT_OPTIONAL
1985   if (UNLIKELY(ompt_enabled.enabled)) {
1986     OMPT_STORE_RETURN_ADDRESS(gtid);
1987     return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1988                                     OMPT_LOAD_RETURN_ADDRESS(gtid));
1989   }
1990 #endif
1991   return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1992 }
1993 
// __kmpc_omp_taskyield: switch to a different task
//
// loc_ref: source location; stored in the current task as td_taskwait_ident
// gtid: global thread ID of the yielding thread (validated here)
// end_part: only used in the trace output of this routine
// Returns: always TASK_CURRENT_NOT_QUEUED.
//
// Nothing but tracing happens when tasking mode is tskm_immediate_exec or
// __kmp_init_parallel is not set.
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata = NULL;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    // Stored as gtid + 1 and negated once the yield is over (see below).
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
    // Other tasks are only executed when the current task is not team-serial
    // and the thread has a task team with tasking enabled.
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
          // Flag the thread as yielding for the duration of the call below.
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
          __kmp_execute_tasks_32(
              thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
        }
      }
    }
#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  // taskdata is still NULL here if the block above was skipped.
  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
2060 
// Task Reduction implementation
//
// Note: the initial implementation did not take into account the possibility
// of specifying omp_orig in the initializer of a UDR (user-defined reduction).
// The corrected implementation takes the omp_orig object into account.
// The compiler is free to use the old implementation if omp_orig is not
// specified.
2067 
2068 /*!
2069 @ingroup BASIC_TYPES
2070 @{
2071 */
2072 
/*!
Flags for special info per task reduction item.

The flags are copied verbatim from the compiler-provided descriptor into the
library's kmp_taskred_data_t by __kmp_task_reduction_init().
*/
typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads).
      When set, __kmp_task_reduction_init() only allocates an array of
      per-thread pointers and does not call the initializer routine. */
  unsigned lazy_priv : 1;
  /*! remaining 31 bits; not referenced in this part of the file */
  unsigned reserved31 : 31;
} kmp_taskred_flags_t;
2081 
/*!
Internal struct for reduction data item related info set up by compiler.

This is the descriptor consumed by __kmpc_task_reduction_init(); it has no
reduce_orig field, and its initializer routine is called with one argument.
*/
typedef struct kmp_task_red_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item in bytes */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (single parameter) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine (must not be NULL) */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;
2094 
/*!
Internal struct for reduction data item related info saved by the library.

One element per reduction item is filled in by __kmp_task_reduction_init()
from the compiler-provided descriptor.
*/
typedef struct kmp_taskred_data {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item; stored rounded up to a multiple
                           of CACHE_LINE by __kmp_task_reduction_init() */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
  void *reduce_priv; /**< array of thread specific items; with lazy_priv set it
                          is an array of per-thread pointers instead */
  void *reduce_pend; /**< end of private data for faster comparison op; only
                          set by __kmp_task_reduction_init() when lazy_priv is
                          not set */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_comb; /**< data combiner routine */
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_orig; /**< original item (can be used in UDR initializer); NULL
                          when the descriptor type has no reduce_orig field */
} kmp_taskred_data_t;
2110 
/*!
Internal struct for reduction data item related info set up by compiler.

New interface: added reduce_orig field to provide omp_orig for UDR initializer.
With this descriptor the initializer routine is called with two arguments: the
private item and the reduce_orig pointer saved by the library.
*/
typedef struct kmp_taskred_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  void *reduce_orig; /**< original reduction item used for initialization; if
                          NULL, the library uses reduce_shar instead */
  size_t reduce_size; /**< size of data item */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine (must not be NULL) */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_taskred_input_t;
2126 /*!
2127 @}
2128 */
2129 
2130 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2131 template <>
2132 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2133                                              kmp_task_red_input_t &src) {
2134   item.reduce_orig = NULL;
2135 }
2136 template <>
2137 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2138                                             kmp_taskred_input_t &src) {
2139   if (src.reduce_orig != NULL) {
2140     item.reduce_orig = src.reduce_orig;
2141   } else {
2142     item.reduce_orig = src.reduce_shar;
2143   } // non-NULL reduce_orig means new interface used
2144 }
2145 
2146 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2147 template <>
2148 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2149                                            size_t offset) {
2150   ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2151 }
2152 template <>
2153 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2154                                           size_t offset) {
2155   ((void (*)(void *, void *))item.reduce_init)(
2156       (char *)(item.reduce_priv) + offset, item.reduce_orig);
2157 }
2158 
2159 template <typename T>
2160 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2161   __kmp_assert_valid_gtid(gtid);
2162   kmp_info_t *thread = __kmp_threads[gtid];
2163   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2164   kmp_uint32 nth = thread->th.th_team_nproc;
2165   kmp_taskred_data_t *arr;
2166 
2167   // check input data just in case
2168   KMP_ASSERT(tg != NULL);
2169   KMP_ASSERT(data != NULL);
2170   KMP_ASSERT(num > 0);
2171   if (nth == 1) {
2172     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2173                   gtid, tg));
2174     return (void *)tg;
2175   }
2176   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2177                 gtid, tg, num));
2178   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2179       thread, num * sizeof(kmp_taskred_data_t));
2180   for (int i = 0; i < num; ++i) {
2181     size_t size = data[i].reduce_size - 1;
2182     // round the size up to cache line per thread-specific item
2183     size += CACHE_LINE - size % CACHE_LINE;
2184     KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2185     arr[i].reduce_shar = data[i].reduce_shar;
2186     arr[i].reduce_size = size;
2187     arr[i].flags = data[i].flags;
2188     arr[i].reduce_comb = data[i].reduce_comb;
2189     arr[i].reduce_init = data[i].reduce_init;
2190     arr[i].reduce_fini = data[i].reduce_fini;
2191     __kmp_assign_orig<T>(arr[i], data[i]);
2192     if (!arr[i].flags.lazy_priv) {
2193       // allocate cache-line aligned block and fill it with zeros
2194       arr[i].reduce_priv = __kmp_allocate(nth * size);
2195       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2196       if (arr[i].reduce_init != NULL) {
2197         // initialize all thread-specific items
2198         for (size_t j = 0; j < nth; ++j) {
2199           __kmp_call_init<T>(arr[i], j * size);
2200         }
2201       }
2202     } else {
2203       // only allocate space for pointers now,
2204       // objects will be lazily allocated/initialized if/when requested
2205       // note that __kmp_allocate zeroes the allocated memory
2206       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2207     }
2208   }
2209   tg->reduce_data = (void *)arr;
2210   tg->reduce_num_data = num;
2211   return (void *)tg;
2212 }
2213 
2214 /*!
2215 @ingroup TASKING
2216 @param gtid      Global thread ID
2217 @param num       Number of data items to reduce
2218 @param data      Array of data for reduction
2219 @return The taskgroup identifier
2220 
2221 Initialize task reduction for the taskgroup.
2222 
2223 Note: this entry supposes the optional compiler-generated initializer routine
2224 has single parameter - pointer to object to be initialized. That means
2225 the reduction either does not use omp_orig object, or the omp_orig is accessible
2226 without help of the runtime library.
2227 */
2228 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2229   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2230 }
2231 
2232 /*!
2233 @ingroup TASKING
2234 @param gtid      Global thread ID
2235 @param num       Number of data items to reduce
2236 @param data      Array of data for reduction
2237 @return The taskgroup identifier
2238 
2239 Initialize task reduction for the taskgroup.
2240 
2241 Note: this entry supposes the optional compiler-generated initializer routine
2242 has two parameters, pointer to object to be initialized and pointer to omp_orig
2243 */
2244 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2245   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2246 }
2247 
2248 // Copy task reduction data (except for shared pointers).
2249 template <typename T>
2250 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2251                                     kmp_taskgroup_t *tg, void *reduce_data) {
2252   kmp_taskred_data_t *arr;
2253   KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2254                 " from data %p\n",
2255                 thr, tg, reduce_data));
2256   arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2257       thr, num * sizeof(kmp_taskred_data_t));
2258   // threads will share private copies, thunk routines, sizes, flags, etc.:
2259   KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2260   for (int i = 0; i < num; ++i) {
2261     arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2262   }
2263   tg->reduce_data = (void *)arr;
2264   tg->reduce_num_data = num;
2265 }
2266 
2267 /*!
2268 @ingroup TASKING
2269 @param gtid    Global thread ID
2270 @param tskgrp  The taskgroup ID (optional)
2271 @param data    Shared location of the item
2272 @return The pointer to per-thread data
2273 
2274 Get thread-specific location of data item
2275 */
2276 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2277   __kmp_assert_valid_gtid(gtid);
2278   kmp_info_t *thread = __kmp_threads[gtid];
2279   kmp_int32 nth = thread->th.th_team_nproc;
2280   if (nth == 1)
2281     return data; // nothing to do
2282 
2283   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2284   if (tg == NULL)
2285     tg = thread->th.th_current_task->td_taskgroup;
2286   KMP_ASSERT(tg != NULL);
2287   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2288   kmp_int32 num = tg->reduce_num_data;
2289   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2290 
2291   KMP_ASSERT(data != NULL);
2292   while (tg != NULL) {
2293     for (int i = 0; i < num; ++i) {
2294       if (!arr[i].flags.lazy_priv) {
2295         if (data == arr[i].reduce_shar ||
2296             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2297           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2298       } else {
2299         // check shared location first
2300         void **p_priv = (void **)(arr[i].reduce_priv);
2301         if (data == arr[i].reduce_shar)
2302           goto found;
2303         // check if we get some thread specific location as parameter
2304         for (int j = 0; j < nth; ++j)
2305           if (data == p_priv[j])
2306             goto found;
2307         continue; // not found, continue search
2308       found:
2309         if (p_priv[tid] == NULL) {
2310           // allocate thread specific object lazily
2311           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2312           if (arr[i].reduce_init != NULL) {
2313             if (arr[i].reduce_orig != NULL) { // new interface
2314               ((void (*)(void *, void *))arr[i].reduce_init)(
2315                   p_priv[tid], arr[i].reduce_orig);
2316             } else { // old interface (single parameter)
2317               ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2318             }
2319           }
2320         }
2321         return p_priv[tid];
2322       }
2323     }
2324     tg = tg->parent;
2325     arr = (kmp_taskred_data_t *)(tg->reduce_data);
2326     num = tg->reduce_num_data;
2327   }
2328   KMP_ASSERT2(0, "Unknown task reduction item");
2329   return NULL; // ERROR, this line never executed
2330 }
2331 
2332 // Finalize task reduction.
2333 // Called from __kmpc_end_taskgroup()
2334 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2335   kmp_int32 nth = th->th.th_team_nproc;
2336   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2337   kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2338   kmp_int32 num = tg->reduce_num_data;
2339   for (int i = 0; i < num; ++i) {
2340     void *sh_data = arr[i].reduce_shar;
2341     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2342     void (*f_comb)(void *, void *) =
2343         (void (*)(void *, void *))(arr[i].reduce_comb);
2344     if (!arr[i].flags.lazy_priv) {
2345       void *pr_data = arr[i].reduce_priv;
2346       size_t size = arr[i].reduce_size;
2347       for (int j = 0; j < nth; ++j) {
2348         void *priv_data = (char *)pr_data + j * size;
2349         f_comb(sh_data, priv_data); // combine results
2350         if (f_fini)
2351           f_fini(priv_data); // finalize if needed
2352       }
2353     } else {
2354       void **pr_data = (void **)(arr[i].reduce_priv);
2355       for (int j = 0; j < nth; ++j) {
2356         if (pr_data[j] != NULL) {
2357           f_comb(sh_data, pr_data[j]); // combine results
2358           if (f_fini)
2359             f_fini(pr_data[j]); // finalize if needed
2360           __kmp_free(pr_data[j]);
2361         }
2362       }
2363     }
2364     __kmp_free(arr[i].reduce_priv);
2365   }
2366   __kmp_thread_free(th, arr);
2367   tg->reduce_data = NULL;
2368   tg->reduce_num_data = 0;
2369 }
2370 
2371 // Cleanup task reduction data for parallel or worksharing,
2372 // do not touch task private data other threads still working with.
2373 // Called from __kmpc_end_taskgroup()
2374 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2375   __kmp_thread_free(th, tg->reduce_data);
2376   tg->reduce_data = NULL;
2377   tg->reduce_num_data = 0;
2378 }
2379 
// Common implementation of task reduction initialization for a parallel or
// worksharing construct (reduction with the task modifier).
//
// loc:   source location info, forwarded to __kmpc_taskgroup()
// gtid:  global thread identifier of the calling thread
// is_ws: index into the team's t_tg_reduce_data[] slots
// num:   number of reduction items described by data
// data:  array of num input descriptors of type T
//
// A new taskgroup is always started for the calling thread. The first thread
// of the team that claims the slot builds the reduction data and publishes a
// copy of the descriptors through team->t.t_tg_reduce_data[is_ws]; the other
// threads wait for that pointer and derive their own descriptor array from
// it. Returns the new taskgroup.
template <typename T>
void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                         int num, T *data) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_int32 nth = thr->th.th_team_nproc;
  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
  if (nth == 1) {
    // single-thread team: no reduction data is set up, only the taskgroup
    KA_TRACE(10,
             ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
              gtid, thr->th.th_current_task->td_taskgroup));
    return (void *)thr->th.th_current_task->td_taskgroup;
  }
  kmp_team_t *team = thr->th.th_team;
  void *reduce_data;
  kmp_taskgroup_t *tg;
  // Slot protocol as used below: NULL = not initialized, (void *)1 = one
  // thread is initializing, any other value = pointer to published data.
  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
  if (reduce_data == NULL &&
      __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
                                 (void *)1)) {
    // single thread enters this block to initialize common reduction data
    KMP_DEBUG_ASSERT(reduce_data == NULL);
    // first initialize own data, then make a copy other threads can use
    tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
    reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
    KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
    // fini counters should be 0 at this point
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
    // publish the copy; the release store pairs with the acquire load in the
    // waiting loop of the other threads
    KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
  } else {
    while (
        (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
        (void *)1) { // wait for task reduction initialization
      KMP_CPU_PAUSE();
    }
    KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
    // taskgroup created by the __kmpc_taskgroup() call above
    tg = thr->th.th_current_task->td_taskgroup;
    __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
  }
  return tg;
}
2422 
2423 /*!
2424 @ingroup TASKING
2425 @param loc       Source location info
2426 @param gtid      Global thread ID
2427 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2428 @param num       Number of data items to reduce
2429 @param data      Array of data for reduction
2430 @return The taskgroup identifier
2431 
2432 Initialize task reduction for a parallel or worksharing.
2433 
2434 Note: this entry supposes the optional compiler-generated initializer routine
2435 has single parameter - pointer to object to be initialized. That means
2436 the reduction either does not use omp_orig object, or the omp_orig is accessible
2437 without help of the runtime library.
2438 */
2439 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2440                                           int num, void *data) {
2441   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2442                                             (kmp_task_red_input_t *)data);
2443 }
2444 
2445 /*!
2446 @ingroup TASKING
2447 @param loc       Source location info
2448 @param gtid      Global thread ID
2449 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2450 @param num       Number of data items to reduce
2451 @param data      Array of data for reduction
2452 @return The taskgroup identifier
2453 
2454 Initialize task reduction for a parallel or worksharing.
2455 
2456 Note: this entry supposes the optional compiler-generated initializer routine
2457 has two parameters, pointer to object to be initialized and pointer to omp_orig
2458 */
2459 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2460                                    void *data) {
2461   return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2462                                             (kmp_taskred_input_t *)data);
2463 }
2464 
2465 /*!
2466 @ingroup TASKING
2467 @param loc       Source location info
2468 @param gtid      Global thread ID
2469 @param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
2470 
2471 Finalize task reduction for a parallel or worksharing.
2472 */
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  // is_ws is not used here: this entry only ends the current taskgroup, and
  // __kmpc_end_taskgroup() performs the reduction finalization itself.
  __kmpc_end_taskgroup(loc, gtid);
}
2476 
2477 // __kmpc_taskgroup: Start a new taskgroup
2478 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2479   __kmp_assert_valid_gtid(gtid);
2480   kmp_info_t *thread = __kmp_threads[gtid];
2481   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2482   kmp_taskgroup_t *tg_new =
2483       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2484   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2485   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2486   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2487   tg_new->parent = taskdata->td_taskgroup;
2488   tg_new->reduce_data = NULL;
2489   tg_new->reduce_num_data = 0;
2490   tg_new->gomp_data = NULL;
2491   taskdata->td_taskgroup = tg_new;
2492 
2493 #if OMPT_SUPPORT && OMPT_OPTIONAL
2494   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2495     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2496     if (!codeptr)
2497       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2498     kmp_team_t *team = thread->th.th_team;
2499     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2500     // FIXME: I think this is wrong for lwt!
2501     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2502 
2503     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2504         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2505         &(my_task_data), codeptr);
2506   }
2507 #endif
2508 }
2509 
// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
//                       and its descendants are complete
//
// loc:  source location info (recorded as the taskwait ident while waiting)
// gtid: global thread identifier of the calling thread
//
// Steps performed:
//  1. unless tasking runs in immediate-exec mode, execute tasks until the
//     taskgroup's count drops to zero;
//  2. if the taskgroup carries task reduction data (and no gomp_data),
//     finalize or clean it up depending on whether it belongs to a parallel
//     region, a worksharing construct, or a plain taskgroup;
//  3. restore the parent taskgroup for the current task and free the
//     taskgroup descriptor.
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // OMPT data is captured once up front and reused by all callbacks below
  kmp_team_t *team;
  ompt_data_t my_task_data;
  ompt_data_t my_parallel_data;
  void *codeptr = nullptr;
  if (UNLIKELY(ompt_enabled.enabled)) {
    team = thread->th.th_team;
    my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    my_parallel_data = team->t.ompt_team_info.parallel_data;
    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
  }
#endif

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark task as waiting not on a barrier
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc;
    taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

    // Waiting is skipped only for a serialized team whose task team (if any)
    // has seen neither proxy tasks nor hidden helper tasks.
    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
      // keep executing tasks until the taskgroup's count reaches zero
      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

  // NOTE(review): taskgroups with gomp_data set are skipped here; presumably
  // the GOMP compatibility layer handles their reduction data -- confirm.
  if (taskgroup->reduce_data != NULL &&
      !taskgroup->gomp_data) { // need to reduce?
    int cnt;
    void *reduce_data;
    kmp_team_t *t = thread->th.th_team;
    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check if <priv> data of the first reduction variable shared for the team
    // (the team-level slots [0] and [1] are told apart by comparing the
    // reduce_priv pointer of their first item with ours)
    void *priv0 = arr[0].reduce_priv;
    if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
        ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
      // finishing task reduction on parallel
      // NOTE(review): cnt is compared against nproc - 1 to detect the last
      // thread, so KMP_ATOMIC_INC presumably yields the pre-increment value.
      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        // finalize task reduction:
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in the team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
    } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
                   NULL &&
               ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
      // finishing task reduction on worksharing
      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
    } else {
      // finishing task reduction on taskgroup
      __kmp_task_reduction_fini(thread, taskgroup);
    }
  }
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}
2654 
// __kmp_remove_my_task: remove a task from my own deque
//
// thread:         calling thread (its current task is passed to the
//                 __kmp_task_is_allowed() check)
// gtid:           global thread identifier of the calling thread
// task_team:      task team whose tt_threads_data holds the thread's deque
// is_constrained: forwarded to __kmp_task_is_allowed()
//
// Takes the task at the tail of the calling thread's deque. Returns the task,
// or NULL if the deque is empty or the tail task is not allowed to run now.
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *thread_data;
  kmp_uint32 tail;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
                   NULL); // Caller should check this condition

  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];

  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Cheap check without the lock: nothing to do for an empty deque
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Check again now that the deque lock is held
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // td_deque_tail is one past the last entry; step back to the last entry
  tail = (thread_data->td.td_deque_tail - 1) &
         TASK_DEQUE_MASK(thread_data->td); // Wrap index.
  taskdata = thread_data->td.td_deque[tail];

  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
                             thread->th.th_current_task)) {
    // The TSC does not allow taking the tail task; leave the deque unchanged
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // Commit the removal: move the tail back and decrement the task count
  thread_data->td.td_deque_tail = tail;
  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}
2724 
// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
//
// victim_thr:         thread whose deque is examined
// gtid:               global thread identifier of the calling (stealing)
//                     thread
// task_team:          task team holding the per-thread deques
// unfinished_threads: counter incremented again if the caller had already
//                     marked itself finished (*thread_finished != 0)
// thread_finished:    in/out flag, reset to FALSE on a successful steal
// is_constrained:     forwarded to __kmp_task_is_allowed()
//
// Tries the task at the head of the victim's deque first. If that task is
// not allowed and the task team has encountered untied tasks, the rest of the
// deque is scanned for any allowed task. Returns the stolen task or NULL.
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *current;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 target;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim_thr->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Cheap check without the lock: nothing to steal from an empty deque
  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
  // Check again after we acquire the lock
  if (ntasks == 0) {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
  current = __kmp_threads[gtid]->th.th_current_task;
  // First candidate is the task at the head of the victim's deque
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Bump head pointer and Wrap.
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow to steal victim task
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int i;
    // walk through victim's deque trying to steal any task
    // (starts at i = 1 because the head entry was already rejected above)
    target = victim_td->td.td_deque_head;
    taskdata = NULL;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      taskdata = victim_td->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
        break; // found victim task
      } else {
        taskdata = NULL;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    // Close the gap left by the task taken from the middle of the deque
    int prev = target;
    for (i = i + 1; i < ntasks; ++i) {
      // shift remaining tasks in the deque left by 1
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    // target now indexes the last occupied slot, i.e. the old tail - 1
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // primary thread victim) might be prematurely released from the barrier!!!
#if KMP_DEBUG
    kmp_int32 count =
#endif
        KMP_ATOMIC_INC(unfinished_threads);
    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));
    *thread_finished = FALSE;
  }
  // One task less in the victim's deque (ntasks was read under the lock)
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}
2855 
2856 // __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
2858 //
2859 // final_spin is TRUE if this is the spin at the release barrier.
2860 // thread_finished indicates whether the thread is finished executing all
2861 // the tasks it has on its deque, and is at the release barrier.
2862 // spinner is the location on which to spin.
2863 // spinner == NULL means only execute a single task and return.
2864 // checker is the value to check to terminate the spin.
// Return value: TRUE as soon as the spin condition is seen to be satisfied
// after running a task (or, when flag is NULL, after one task has been
// executed), or when it is satisfied in the final spin after this thread has
// been counted as finished.  FALSE when the thread has no task team or no
// current task, when its task team pointer has been cleared, or when no more
// work can be found.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  std::atomic<kmp_int32> *unfinished_threads;
  // victim_tid: -2 means "no victim looked up yet", -1 means "no remembered
  // victim" (the value stored in td_deque_last_stolen when nothing was stolen).
  // use_own_tasks: try this thread's own deque before stealing.
  // new_victim: set once a steal from a newly chosen victim has succeeded.
  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
                      tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  // Nothing to do without a task team or a current task.
  if (task_team == NULL || current_task == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);

  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
                   task_team->tt.tt_hidden_helper_task_encountered);
  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        // asleep stays 1 unless a victim to steal from has been selected.
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim_tid == -2) { // haven't stolen anything yet
          victim_tid = threads_data[tid].td.td_deque_last_stolen;
          if (victim_tid !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim_tid].td.td_thr;
        }
        if (victim_tid != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed.  Arch says that's not such a great idea.
            victim_tid = __kmp_get_random(thread) % (nthreads - 1);
            if (victim_tid >= tid) {
              ++victim_tid; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim_tid].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier.  If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(other_thread);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty.  Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
            threads_data[tid].td.td_deque_last_stolen = victim_tid;
            // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim_tid = -2; // no successful victim found
        }
      }

      if (task == NULL)
        break; // break out of tasking loop

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      // A NULL flag means "execute a single task and return".
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      // The task team pointer may have been cleared while the task ran; stop
      // looking for tasks and let the checks below decide what to return.
      if (thread->th.th_task_team == NULL) {
        break;
      }
      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

    // The task source has been exhausted. If in final spin loop of barrier,
    // check if termination condition is satisfied. The work queue may be empty
    // but there might be proxy tasks still executing.
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
      // First, decrement the #unfinished threads, if that has not already been
      // done.  This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
// 'count' exists only in debug builds, where it feeds the trace below; in
// other builds the statement reduces to the bare atomic decrement.
#if KMP_DEBUG
        kmp_int32 count = -1 +
#endif
            KMP_ATOMIC_DEC(unfinished_threads);
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, primary thread has recognized that
    // there are no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1 &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
      use_own_tasks = 1;
    else {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}
3074 
3075 template <bool C, bool S>
3076 int __kmp_execute_tasks_32(
3077     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3078     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3079     kmp_int32 is_constrained) {
3080   return __kmp_execute_tasks_template(
3081       thread, gtid, flag, final_spin,
3082       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3083 }
3084 
3085 template <bool C, bool S>
3086 int __kmp_execute_tasks_64(
3087     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3088     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3089     kmp_int32 is_constrained) {
3090   return __kmp_execute_tasks_template(
3091       thread, gtid, flag, final_spin,
3092       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3093 }
3094 
3095 template <bool C, bool S>
3096 int __kmp_atomic_execute_tasks_64(
3097     kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3098     int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3099     kmp_int32 is_constrained) {
3100   return __kmp_execute_tasks_template(
3101       thread, gtid, flag, final_spin,
3102       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3103 }
3104 
3105 int __kmp_execute_tasks_oncore(
3106     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3107     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3108     kmp_int32 is_constrained) {
3109   return __kmp_execute_tasks_template(
3110       thread, gtid, flag, final_spin,
3111       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3112 }
3113 
// Explicit instantiations of the templated entry points above for the
// <C, S> argument combinations listed here, so that their definitions are
// emitted from this translation unit.
template int
__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
                                     kmp_flag_32<false, false> *, int,
                                     int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<false, true> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<true, false> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_atomic_execute_tasks_64<false, true>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_atomic_execute_tasks_64<true, false>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3138 
3139 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3140 // next barrier so they can assist in executing enqueued tasks.
3141 // First thread in allocates the task team atomically.
3142 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3143                                  kmp_info_t *this_thr) {
3144   kmp_thread_data_t *threads_data;
3145   int nthreads, i, is_init_thread;
3146 
3147   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3148                 __kmp_gtid_from_thread(this_thr)));
3149 
3150   KMP_DEBUG_ASSERT(task_team != NULL);
3151   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3152 
3153   nthreads = task_team->tt.tt_nproc;
3154   KMP_DEBUG_ASSERT(nthreads > 0);
3155   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3156 
3157   // Allocate or increase the size of threads_data if necessary
3158   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3159 
3160   if (!is_init_thread) {
3161     // Some other thread already set up the array.
3162     KA_TRACE(
3163         20,
3164         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3165          __kmp_gtid_from_thread(this_thr)));
3166     return;
3167   }
3168   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3169   KMP_DEBUG_ASSERT(threads_data != NULL);
3170 
3171   if (__kmp_tasking_mode == tskm_task_teams &&
3172       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3173     // Release any threads sleeping at the barrier, so that they can steal
3174     // tasks and execute them.  In extra barrier mode, tasks do not sleep
3175     // at the separate tasking barrier, so this isn't a problem.
3176     for (i = 0; i < nthreads; i++) {
3177       void *sleep_loc;
3178       kmp_info_t *thread = threads_data[i].td.td_thr;
3179 
3180       if (i == this_thr->th.th_info.ds.ds_tid) {
3181         continue;
3182       }
3183       // Since we haven't locked the thread's suspend mutex lock at this
3184       // point, there is a small window where a thread might be putting
3185       // itself to sleep, but hasn't set the th_sleep_loc field yet.
3186       // To work around this, __kmp_execute_tasks_template() periodically checks
3187       // see if other threads are sleeping (using the same random mechanism that
3188       // is used for task stealing) and awakens them if they are.
3189       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3190           NULL) {
3191         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3192                       __kmp_gtid_from_thread(this_thr),
3193                       __kmp_gtid_from_thread(thread)));
3194         __kmp_null_resume_wrapper(thread);
3195       } else {
3196         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3197                       __kmp_gtid_from_thread(this_thr),
3198                       __kmp_gtid_from_thread(thread)));
3199       }
3200     }
3201   }
3202 
3203   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3204                 __kmp_gtid_from_thread(this_thr)));
3205 }
3206 
3207 /* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
 * of like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
3215  *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
3220  *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and is then released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
3229  *
3230  * We currently use the existence of the threads array as an indicator that
3231  * tasks were spawned since the last barrier.  If the structure is to be
3232  * useful outside the context of tasking, then this will have to change, but
3233  * not setting the field minimizes the performance impact of tasking on
3234  * barriers, when no explicit tasks were spawned (pushed, actually).
3235  */
3236 
// Head of the list of recycled task teams; entries are chained through
// tt.tt_next.  The list is pushed to, popped from and drained while holding
// __kmp_task_team_lock.
static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3242 
3243 // __kmp_alloc_task_deque:
3244 // Allocates a task deque for a particular thread, and initialize the necessary
3245 // data structures relating to the deque.  This only happens once per thread
3246 // per task team since task teams are recycled. No lock is needed during
3247 // allocation since each thread allocates its own deque.
3248 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3249                                    kmp_thread_data_t *thread_data) {
3250   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3251   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3252 
3253   // Initialize last stolen task field to "none"
3254   thread_data->td.td_deque_last_stolen = -1;
3255 
3256   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3257   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3258   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3259 
3260   KE_TRACE(
3261       10,
3262       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3263        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3264   // Allocate space for task deque, and zero the deque
3265   // Cannot use __kmp_thread_calloc() because threads not around for
3266   // kmp_reap_task_team( ).
3267   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3268       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3269   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3270 }
3271 
3272 // __kmp_free_task_deque:
3273 // Deallocates a task deque for a particular thread. Happens at library
3274 // deallocation so don't need to reset all thread data fields.
3275 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3276   if (thread_data->td.td_deque != NULL) {
3277     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3278     TCW_4(thread_data->td.td_deque_ntasks, 0);
3279     __kmp_free(thread_data->td.td_deque);
3280     thread_data->td.td_deque = NULL;
3281     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3282   }
3283 
3284 #ifdef BUILD_TIED_TASK_STACK
3285   // GEH: Figure out what to do here for td_susp_tied_tasks
3286   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3287     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3288   }
3289 #endif // BUILD_TIED_TASK_STACK
3290 }
3291 
3292 // __kmp_realloc_task_threads_data:
3293 // Allocates a threads_data array for a task team, either by allocating an
3294 // initial array or enlarging an existing array.  Only the first thread to get
3295 // the lock allocs or enlarges the array and re-initializes the array elements.
3296 // That thread returns "TRUE", the rest return "FALSE".
3297 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3298 // The current size is given by task_team -> tt.tt_max_threads.
3299 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3300                                            kmp_task_team_t *task_team) {
3301   kmp_thread_data_t **threads_data_p;
3302   kmp_int32 nthreads, maxthreads;
3303   int is_init_thread = FALSE;
3304 
3305   if (TCR_4(task_team->tt.tt_found_tasks)) {
3306     // Already reallocated and initialized.
3307     return FALSE;
3308   }
3309 
3310   threads_data_p = &task_team->tt.tt_threads_data;
3311   nthreads = task_team->tt.tt_nproc;
3312   maxthreads = task_team->tt.tt_max_threads;
3313 
3314   // All threads must lock when they encounter the first task of the implicit
3315   // task region to make sure threads_data fields are (re)initialized before
3316   // used.
3317   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3318 
3319   if (!TCR_4(task_team->tt.tt_found_tasks)) {
3320     // first thread to enable tasking
3321     kmp_team_t *team = thread->th.th_team;
3322     int i;
3323 
3324     is_init_thread = TRUE;
3325     if (maxthreads < nthreads) {
3326 
3327       if (*threads_data_p != NULL) {
3328         kmp_thread_data_t *old_data = *threads_data_p;
3329         kmp_thread_data_t *new_data = NULL;
3330 
3331         KE_TRACE(
3332             10,
3333             ("__kmp_realloc_task_threads_data: T#%d reallocating "
3334              "threads data for task_team %p, new_size = %d, old_size = %d\n",
3335              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3336         // Reallocate threads_data to have more elements than current array
3337         // Cannot use __kmp_thread_realloc() because threads not around for
3338         // kmp_reap_task_team( ).  Note all new array entries are initialized
3339         // to zero by __kmp_allocate().
3340         new_data = (kmp_thread_data_t *)__kmp_allocate(
3341             nthreads * sizeof(kmp_thread_data_t));
3342         // copy old data to new data
3343         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3344                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3345 
3346 #ifdef BUILD_TIED_TASK_STACK
3347         // GEH: Figure out if this is the right thing to do
3348         for (i = maxthreads; i < nthreads; i++) {
3349           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3350           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3351         }
3352 #endif // BUILD_TIED_TASK_STACK
3353        // Install the new data and free the old data
3354         (*threads_data_p) = new_data;
3355         __kmp_free(old_data);
3356       } else {
3357         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3358                       "threads data for task_team %p, size = %d\n",
3359                       __kmp_gtid_from_thread(thread), task_team, nthreads));
3360         // Make the initial allocate for threads_data array, and zero entries
3361         // Cannot use __kmp_thread_calloc() because threads not around for
3362         // kmp_reap_task_team( ).
3363         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3364             nthreads * sizeof(kmp_thread_data_t));
3365 #ifdef BUILD_TIED_TASK_STACK
3366         // GEH: Figure out if this is the right thing to do
3367         for (i = 0; i < nthreads; i++) {
3368           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3369           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3370         }
3371 #endif // BUILD_TIED_TASK_STACK
3372       }
3373       task_team->tt.tt_max_threads = nthreads;
3374     } else {
3375       // If array has (more than) enough elements, go ahead and use it
3376       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3377     }
3378 
3379     // initialize threads_data pointers back to thread_info structures
3380     for (i = 0; i < nthreads; i++) {
3381       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3382       thread_data->td.td_thr = team->t.t_threads[i];
3383 
3384       if (thread_data->td.td_deque_last_stolen >= nthreads) {
3385         // The last stolen field survives across teams / barrier, and the number
3386         // of threads may have changed.  It's possible (likely?) that a new
3387         // parallel region will exhibit the same behavior as previous region.
3388         thread_data->td.td_deque_last_stolen = -1;
3389       }
3390     }
3391 
3392     KMP_MB();
3393     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3394   }
3395 
3396   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3397   return is_init_thread;
3398 }
3399 
3400 // __kmp_free_task_threads_data:
3401 // Deallocates a threads_data array for a task team, including any attached
3402 // tasking deques.  Only occurs at library shutdown.
3403 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3404   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3405   if (task_team->tt.tt_threads_data != NULL) {
3406     int i;
3407     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3408       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3409     }
3410     __kmp_free(task_team->tt.tt_threads_data);
3411     task_team->tt.tt_threads_data = NULL;
3412   }
3413   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3414 }
3415 
3416 // __kmp_allocate_task_team:
3417 // Allocates a task team associated with a specific team, taking it from
3418 // the global task team free list if possible.  Also initializes data
3419 // structures.
3420 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3421                                                  kmp_team_t *team) {
3422   kmp_task_team_t *task_team = NULL;
3423   int nthreads;
3424 
3425   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3426                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3427 
3428   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3429     // Take a task team from the task team pool
3430     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3431     if (__kmp_free_task_teams != NULL) {
3432       task_team = __kmp_free_task_teams;
3433       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3434       task_team->tt.tt_next = NULL;
3435     }
3436     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3437   }
3438 
3439   if (task_team == NULL) {
3440     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3441                   "task team for team %p\n",
3442                   __kmp_gtid_from_thread(thread), team));
3443     // Allocate a new task team if one is not available. Cannot use
3444     // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3445     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3446     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3447 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3448     // suppress race conditions detection on synchronization flags in debug mode
3449     // this helps to analyze library internals eliminating false positives
3450     __itt_suppress_mark_range(
3451         __itt_suppress_range, __itt_suppress_threading_errors,
3452         &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3453     __itt_suppress_mark_range(__itt_suppress_range,
3454                               __itt_suppress_threading_errors,
3455                               CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3456                               sizeof(task_team->tt.tt_active));
3457 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3458     // Note: __kmp_allocate zeroes returned memory, othewise we would need:
3459     // task_team->tt.tt_threads_data = NULL;
3460     // task_team->tt.tt_max_threads = 0;
3461     // task_team->tt.tt_next = NULL;
3462   }
3463 
3464   TCW_4(task_team->tt.tt_found_tasks, FALSE);
3465   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3466   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3467 
3468   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3469   TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3470   TCW_4(task_team->tt.tt_active, TRUE);
3471 
3472   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3473                 "unfinished_threads init'd to %d\n",
3474                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3475                 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3476   return task_team;
3477 }
3478 
3479 // __kmp_free_task_team:
3480 // Frees the task team associated with a specific thread, and adds it
3481 // to the global task team free list.
3482 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3483   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3484                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3485 
3486   // Put task team back on free list
3487   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3488 
3489   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3490   task_team->tt.tt_next = __kmp_free_task_teams;
3491   TCW_PTR(__kmp_free_task_teams, task_team);
3492 
3493   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3494 }
3495 
3496 // __kmp_reap_task_teams:
3497 // Free all the task teams on the task team free list.
3498 // Should only be done during library shutdown.
3499 // Cannot do anything that needs a thread structure or gtid since they are
3500 // already gone.
3501 void __kmp_reap_task_teams(void) {
3502   kmp_task_team_t *task_team;
3503 
3504   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3505     // Free all task_teams on the free list
3506     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3507     while ((task_team = __kmp_free_task_teams) != NULL) {
3508       __kmp_free_task_teams = task_team->tt.tt_next;
3509       task_team->tt.tt_next = NULL;
3510 
3511       // Free threads_data if necessary
3512       if (task_team->tt.tt_threads_data != NULL) {
3513         __kmp_free_task_threads_data(task_team);
3514       }
3515       __kmp_free(task_team);
3516     }
3517     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3518   }
3519 }
3520 
3521 // __kmp_wait_to_unref_task_teams:
3522 // Some threads could still be in the fork barrier release code, possibly
3523 // trying to steal tasks.  Wait for each thread to unreference its task team.
3524 void __kmp_wait_to_unref_task_teams(void) {
3525   kmp_info_t *thread;
3526   kmp_uint32 spins;
3527   int done;
3528 
3529   KMP_INIT_YIELD(spins);
3530 
3531   for (;;) {
3532     done = TRUE;
3533 
3534     // TODO: GEH - this may be is wrong because some sync would be necessary
3535     // in case threads are added to the pool during the traversal. Need to
3536     // verify that lock for thread pool is held when calling this routine.
3537     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3538          thread = thread->th.th_next_pool) {
3539 #if KMP_OS_WINDOWS
3540       DWORD exit_val;
3541 #endif
3542       if (TCR_PTR(thread->th.th_task_team) == NULL) {
3543         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3544                       __kmp_gtid_from_thread(thread)));
3545         continue;
3546       }
3547 #if KMP_OS_WINDOWS
3548       // TODO: GEH - add this check for Linux* OS / OS X* as well?
3549       if (!__kmp_is_thread_alive(thread, &exit_val)) {
3550         thread->th.th_task_team = NULL;
3551         continue;
3552       }
3553 #endif
3554 
3555       done = FALSE; // Because th_task_team pointer is not NULL for this thread
3556 
3557       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3558                     "unreference task_team\n",
3559                     __kmp_gtid_from_thread(thread)));
3560 
3561       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3562         void *sleep_loc;
3563         // If the thread is sleeping, awaken it.
3564         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3565             NULL) {
3566           KA_TRACE(
3567               10,
3568               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3569                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3570           __kmp_null_resume_wrapper(thread);
3571         }
3572       }
3573     }
3574     if (done) {
3575       break;
3576     }
3577 
3578     // If oversubscribed or have waited a bit, yield.
3579     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3580   }
3581 }
3582 
3583 // __kmp_task_team_setup:  Create a task_team for the current team, but use
3584 // an already created, unused one if it already exists.
3585 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3586   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3587 
3588   // If this task_team hasn't been created yet, allocate it. It will be used in
3589   // the region after the next.
3590   // If it exists, it is the current task team and shouldn't be touched yet as
3591   // it may still be in use.
3592   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3593       (always || team->t.t_nproc > 1)) {
3594     team->t.t_task_team[this_thr->th.th_task_state] =
3595         __kmp_allocate_task_team(this_thr, team);
3596     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3597                   " for team %d at parity=%d\n",
3598                   __kmp_gtid_from_thread(this_thr),
3599                   team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3600                   this_thr->th.th_task_state));
3601   }
3602 
3603   // After threads exit the release, they will call sync, and then point to this
3604   // other task_team; make sure it is allocated and properly initialized. As
3605   // threads spin in the barrier release phase, they will continue to use the
3606   // previous task_team struct(above), until they receive the signal to stop
3607   // checking for tasks (they can't safely reference the kmp_team_t struct,
3608   // which could be reallocated by the primary thread). No task teams are formed
3609   // for serialized teams.
3610   if (team->t.t_nproc > 1) {
3611     int other_team = 1 - this_thr->th.th_task_state;
3612     KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3613     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3614       team->t.t_task_team[other_team] =
3615           __kmp_allocate_task_team(this_thr, team);
3616       KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3617                     "task_team %p for team %d at parity=%d\n",
3618                     __kmp_gtid_from_thread(this_thr),
3619                     team->t.t_task_team[other_team], team->t.t_id, other_team));
3620     } else { // Leave the old task team struct in place for the upcoming region;
3621       // adjust as needed
3622       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3623       if (!task_team->tt.tt_active ||
3624           team->t.t_nproc != task_team->tt.tt_nproc) {
3625         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3626         TCW_4(task_team->tt.tt_found_tasks, FALSE);
3627         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3628         KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3629                           team->t.t_nproc);
3630         TCW_4(task_team->tt.tt_active, TRUE);
3631       }
3632       // if team size has changed, the first thread to enable tasking will
3633       // realloc threads_data if necessary
3634       KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3635                     "%p for team %d at parity=%d\n",
3636                     __kmp_gtid_from_thread(this_thr),
3637                     team->t.t_task_team[other_team], team->t.t_id, other_team));
3638     }
3639   }
3640 
3641   // For regular thread, task enabling should be called when the task is going
3642   // to be pushed to a dequeue. However, for the hidden helper thread, we need
3643   // it ahead of time so that some operations can be performed without race
3644   // condition.
3645   if (this_thr == __kmp_hidden_helper_main_thread) {
3646     for (int i = 0; i < 2; ++i) {
3647       kmp_task_team_t *task_team = team->t.t_task_team[i];
3648       if (KMP_TASKING_ENABLED(task_team)) {
3649         continue;
3650       }
3651       __kmp_enable_tasking(task_team, this_thr);
3652       for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3653         kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3654         if (thread_data->td.td_deque == NULL) {
3655           __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3656         }
3657       }
3658     }
3659   }
3660 }
3661 
3662 // __kmp_task_team_sync: Propagation of task team data from team to threads
3663 // which happens just after the release phase of a team barrier.  This may be
3664 // called by any thread, but only for teams with # threads > 1.
3665 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3666   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3667 
3668   // Toggle the th_task_state field, to switch which task_team this thread
3669   // refers to
3670   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3671 
3672   // It is now safe to propagate the task team pointer from the team struct to
3673   // the current thread.
3674   TCW_PTR(this_thr->th.th_task_team,
3675           team->t.t_task_team[this_thr->th.th_task_state]);
3676   KA_TRACE(20,
3677            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3678             "%p from Team #%d (parity=%d)\n",
3679             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3680             team->t.t_id, this_thr->th.th_task_state));
3681 }
3682 
3683 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
3684 // barrier gather phase. Only called by primary thread if #threads in team > 1
3685 // or if proxy tasks were created.
3686 //
3687 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3688 // by passing in 0 optionally as the last argument. When wait is zero, primary
3689 // thread does not wait for unfinished_threads to reach 0.
3690 void __kmp_task_team_wait(
3691     kmp_info_t *this_thr,
3692     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3693   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3694 
3695   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3696   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3697 
3698   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3699     if (wait) {
3700       KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
3701                     "(for unfinished_threads to reach 0) on task_team = %p\n",
3702                     __kmp_gtid_from_thread(this_thr), task_team));
3703       // Worker threads may have dropped through to release phase, but could
3704       // still be executing tasks. Wait here for tasks to complete. To avoid
3705       // memory contention, only primary thread checks termination condition.
3706       kmp_flag_32<false, false> flag(
3707           RCAST(std::atomic<kmp_uint32> *,
3708                 &task_team->tt.tt_unfinished_threads),
3709           0U);
3710       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3711     }
3712     // Deactivate the old task team, so that the worker threads will stop
3713     // referencing it while spinning.
3714     KA_TRACE(
3715         20,
3716         ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
3717          "setting active to false, setting local and team's pointer to NULL\n",
3718          __kmp_gtid_from_thread(this_thr), task_team));
3719     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3720                      task_team->tt.tt_found_proxy_tasks == TRUE);
3721     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3722     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3723     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3724     KMP_MB();
3725 
3726     TCW_PTR(this_thr->th.th_task_team, NULL);
3727   }
3728 }
3729 
3730 // __kmp_tasking_barrier:
3731 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
3732 // Internal function to execute all tasks prior to a regular barrier or a join
3733 // barrier. It is a full barrier itself, which unfortunately turns regular
3734 // barriers into double barriers and join barriers into 1 1/2 barriers.
3735 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3736   std::atomic<kmp_uint32> *spin = RCAST(
3737       std::atomic<kmp_uint32> *,
3738       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3739   int flag = FALSE;
3740   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3741 
3742 #if USE_ITT_BUILD
3743   KMP_FSYNC_SPIN_INIT(spin, NULL);
3744 #endif /* USE_ITT_BUILD */
3745   kmp_flag_32<false, false> spin_flag(spin, 0U);
3746   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3747                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3748 #if USE_ITT_BUILD
3749     // TODO: What about itt_sync_obj??
3750     KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3751 #endif /* USE_ITT_BUILD */
3752 
3753     if (TCR_4(__kmp_global.g.g_done)) {
3754       if (__kmp_global.g.g_abort)
3755         __kmp_abort_thread();
3756       break;
3757     }
3758     KMP_YIELD(TRUE);
3759   }
3760 #if USE_ITT_BUILD
3761   KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3762 #endif /* USE_ITT_BUILD */
3763 }
3764 
3765 // __kmp_give_task puts a task into a given thread queue if:
3766 //  - the queue for that thread was created
3767 //  - there's space in that queue
3768 // Because of this, __kmp_push_task needs to check if there's space after
3769 // getting the lock
3770 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3771                             kmp_int32 pass) {
3772   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3773   kmp_task_team_t *task_team = taskdata->td_task_team;
3774 
3775   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3776                 taskdata, tid));
3777 
3778   // If task_team is NULL something went really bad...
3779   KMP_DEBUG_ASSERT(task_team != NULL);
3780 
3781   bool result = false;
3782   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3783 
3784   if (thread_data->td.td_deque == NULL) {
3785     // There's no queue in this thread, go find another one
3786     // We're guaranteed that at least one thread has a queue
3787     KA_TRACE(30,
3788              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3789               tid, taskdata));
3790     return result;
3791   }
3792 
3793   if (TCR_4(thread_data->td.td_deque_ntasks) >=
3794       TASK_DEQUE_SIZE(thread_data->td)) {
3795     KA_TRACE(
3796         30,
3797         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3798          taskdata, tid));
3799 
3800     // if this deque is bigger than the pass ratio give a chance to another
3801     // thread
3802     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3803       return result;
3804 
3805     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3806     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3807         TASK_DEQUE_SIZE(thread_data->td)) {
3808       // expand deque to push the task which is not allowed to execute
3809       __kmp_realloc_task_deque(thread, thread_data);
3810     }
3811 
3812   } else {
3813 
3814     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3815 
3816     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3817         TASK_DEQUE_SIZE(thread_data->td)) {
3818       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3819                     "thread %d.\n",
3820                     taskdata, tid));
3821 
3822       // if this deque is bigger than the pass ratio give a chance to another
3823       // thread
3824       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3825         goto release_and_exit;
3826 
3827       __kmp_realloc_task_deque(thread, thread_data);
3828     }
3829   }
3830 
3831   // lock is held here, and there is space in the deque
3832 
3833   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3834   // Wrap index.
3835   thread_data->td.td_deque_tail =
3836       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3837   TCW_4(thread_data->td.td_deque_ntasks,
3838         TCR_4(thread_data->td.td_deque_ntasks) + 1);
3839 
3840   result = true;
3841   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3842                 taskdata, tid));
3843 
3844 release_and_exit:
3845   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3846 
3847   return result;
3848 }
3849 
// Marker bit OR-ed into a proxy task's td_incomplete_child_tasks by the first
// top half and cleared again by the second top half; the bottom half spins
// while it is set.
#define PROXY_TASK_FLAG 0x40000000
/* The finish of a proxy task is divided into two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_tasks counter of the
   parent is decremented the threads can leave the barriers. So, the bottom
   half needs to be queued before the counter is decremented. The top half is
   therefore divided into two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_tasks counter of the proxy task to synchronize the top
   and bottom halves. */
3867 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3868   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3869   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3870   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3871   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3872 
3873   taskdata->td_flags.complete = 1; // mark the task as completed
3874 
3875   if (taskdata->td_taskgroup)
3876     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3877 
3878   // Create an imaginary children for this task so the bottom half cannot
3879   // release the task before we have completed the second top half
3880   KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
3881 }
3882 
3883 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3884 #if KMP_DEBUG
3885   kmp_int32 children = 0;
3886   // Predecrement simulated by "- 1" calculation
3887   children = -1 +
3888 #endif
3889       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
3890   KMP_DEBUG_ASSERT(children >= 0);
3891 
3892   // Remove the imaginary children
3893   KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
3894 }
3895 
3896 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3897   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3898   kmp_info_t *thread = __kmp_threads[gtid];
3899 
3900   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3901   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3902                    1); // top half must run before bottom half
3903 
3904   // We need to wait to make sure the top half is finished
3905   // Spinning here should be ok as this should happen quickly
3906   while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
3907           PROXY_TASK_FLAG) > 0)
3908     ;
3909 
3910   __kmp_release_deps(gtid, taskdata);
3911   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3912 }
3913 
3914 /*!
3915 @ingroup TASKING
3916 @param gtid Global Thread ID of encountering thread
3917 @param ptask Task which execution is completed
3918 
3919 Execute the completion of a proxy task from a thread of that is part of the
3920 team. Run first and bottom halves directly.
3921 */
3922 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3923   KMP_DEBUG_ASSERT(ptask != NULL);
3924   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3925   KA_TRACE(
3926       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3927            gtid, taskdata));
3928   __kmp_assert_valid_gtid(gtid);
3929   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3930 
3931   __kmp_first_top_half_finish_proxy(taskdata);
3932   __kmp_second_top_half_finish_proxy(taskdata);
3933   __kmp_bottom_half_finish_proxy(gtid, ptask);
3934 
3935   KA_TRACE(10,
3936            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3937             gtid, taskdata));
3938 }
3939 
3940 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
3941   KMP_DEBUG_ASSERT(ptask != NULL);
3942   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3943 
3944   // Enqueue task to complete bottom half completion from a thread within the
3945   // corresponding team
3946   kmp_team_t *team = taskdata->td_team;
3947   kmp_int32 nthreads = team->t.t_nproc;
3948   kmp_info_t *thread;
3949 
3950   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3951   // but we cannot use __kmp_get_random here
3952   kmp_int32 start_k = start % nthreads;
3953   kmp_int32 pass = 1;
3954   kmp_int32 k = start_k;
3955 
3956   do {
3957     // For now we're just linearly trying to find a thread
3958     thread = team->t.t_threads[k];
3959     k = (k + 1) % nthreads;
3960 
3961     // we did a full pass through all the threads
3962     if (k == start_k)
3963       pass = pass << 1;
3964 
3965   } while (!__kmp_give_task(thread, k, ptask, pass));
3966 }
3967 
3968 /*!
3969 @ingroup TASKING
3970 @param ptask Task which execution is completed
3971 
3972 Execute the completion of a proxy task from a thread that could not belong to
3973 the team.
3974 */
3975 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3976   KMP_DEBUG_ASSERT(ptask != NULL);
3977   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3978 
3979   KA_TRACE(
3980       10,
3981       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3982        taskdata));
3983 
3984   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3985 
3986   __kmp_first_top_half_finish_proxy(taskdata);
3987 
3988   __kmpc_give_task(ptask);
3989 
3990   __kmp_second_top_half_finish_proxy(taskdata);
3991 
3992   KA_TRACE(
3993       10,
3994       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3995        taskdata));
3996 }
3997 
3998 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3999                                                 kmp_task_t *task) {
4000   kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4001   if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4002     td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4003     td->td_allow_completion_event.ed.task = task;
4004     __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4005   }
4006   return &td->td_allow_completion_event;
4007 }
4008 
4009 void __kmp_fulfill_event(kmp_event_t *event) {
4010   if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4011     kmp_task_t *ptask = event->ed.task;
4012     kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4013     bool detached = false;
4014     int gtid = __kmp_get_gtid();
4015 
4016     // The associated task might have completed or could be completing at this
4017     // point.
4018     // We need to take the lock to avoid races
4019     __kmp_acquire_tas_lock(&event->lock, gtid);
4020     if (taskdata->td_flags.proxy == TASK_PROXY) {
4021       detached = true;
4022     } else {
4023 #if OMPT_SUPPORT
4024       // The OMPT event must occur under mutual exclusion,
4025       // otherwise the tool might access ptask after free
4026       if (UNLIKELY(ompt_enabled.enabled))
4027         __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4028 #endif
4029     }
4030     event->type = KMP_EVENT_UNINITIALIZED;
4031     __kmp_release_tas_lock(&event->lock, gtid);
4032 
4033     if (detached) {
4034 #if OMPT_SUPPORT
4035       // We free ptask afterwards and know the task is finished,
4036       // so locking is not necessary
4037       if (UNLIKELY(ompt_enabled.enabled))
4038         __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4039 #endif
4040       // If the task detached complete the proxy task
4041       if (gtid >= 0) {
4042         kmp_team_t *team = taskdata->td_team;
4043         kmp_info_t *thread = __kmp_get_thread();
4044         if (thread->th.th_team == team) {
4045           __kmpc_proxy_task_completed(gtid, ptask);
4046           return;
4047         }
4048       }
4049 
4050       // fallback
4051       __kmpc_proxy_task_completed_ooo(ptask);
4052     }
4053   }
4054 }
4055 
4056 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4057 // for taskloop
4058 //
4059 // thread:   allocating thread
4060 // task_src: pointer to source task to be duplicated
4061 // returns:  a pointer to the allocated kmp_task_t structure (task).
4062 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4063   kmp_task_t *task;
4064   kmp_taskdata_t *taskdata;
4065   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4066   kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4067   size_t shareds_offset;
4068   size_t task_size;
4069 
4070   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4071                 task_src));
4072   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4073                    TASK_FULL); // it should not be proxy task
4074   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4075   task_size = taskdata_src->td_size_alloc;
4076 
4077   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4078   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4079                 task_size));
4080 #if USE_FAST_MEMORY
4081   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4082 #else
4083   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4084 #endif /* USE_FAST_MEMORY */
4085   KMP_MEMCPY(taskdata, taskdata_src, task_size);
4086 
4087   task = KMP_TASKDATA_TO_TASK(taskdata);
4088 
4089   // Initialize new task (only specific fields not affected by memcpy)
4090   taskdata->td_task_id = KMP_GEN_TASK_ID();
4091   if (task->shareds != NULL) { // need setup shareds pointer
4092     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4093     task->shareds = &((char *)taskdata)[shareds_offset];
4094     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4095                      0);
4096   }
4097   taskdata->td_alloc_thread = thread;
4098   taskdata->td_parent = parent_task;
4099   // task inherits the taskgroup from the parent task
4100   taskdata->td_taskgroup = parent_task->td_taskgroup;
4101   // tied task needs to initialize the td_last_tied at creation,
4102   // untied one does this when it is scheduled for execution
4103   if (taskdata->td_flags.tiedness == TASK_TIED)
4104     taskdata->td_last_tied = taskdata;
4105 
4106   // Only need to keep track of child task counts if team parallel and tasking
4107   // not serialized
4108   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4109     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4110     if (parent_task->td_taskgroup)
4111       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4112     // Only need to keep track of allocated child tasks for explicit tasks since
4113     // implicit not deallocated
4114     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4115       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4116   }
4117 
4118   KA_TRACE(20,
4119            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4120             thread, taskdata, taskdata->td_parent));
4121 #if OMPT_SUPPORT
4122   if (UNLIKELY(ompt_enabled.enabled))
4123     __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4124 #endif
4125   return task;
4126 }
4127 
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

// Compile-time check that long is either 4 or 8 bytes wide.
// NOTE(review): presumably this backs the GOMP-compatible taskloop bounds
// handling, which only distinguishes 4-byte and 8-byte bounds -- confirm.
KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4135 
4136 // class to encapsulate manipulating loop bounds in a taskloop task.
4137 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4138 // the loop bound variables.
4139 class kmp_taskloop_bounds_t {
4140   kmp_task_t *task;
4141   const kmp_taskdata_t *taskdata;
4142   size_t lower_offset;
4143   size_t upper_offset;
4144 
4145 public:
4146   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4147       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4148         lower_offset((char *)lb - (char *)task),
4149         upper_offset((char *)ub - (char *)task) {
4150     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4151     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4152   }
4153   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4154       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4155         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4156   size_t get_lower_offset() const { return lower_offset; }
4157   size_t get_upper_offset() const { return upper_offset; }
4158   kmp_uint64 get_lb() const {
4159     kmp_int64 retval;
4160 #if defined(KMP_GOMP_COMPAT)
4161     // Intel task just returns the lower bound normally
4162     if (!taskdata->td_flags.native) {
4163       retval = *(kmp_int64 *)((char *)task + lower_offset);
4164     } else {
4165       // GOMP task has to take into account the sizeof(long)
4166       if (taskdata->td_size_loop_bounds == 4) {
4167         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4168         retval = (kmp_int64)*lb;
4169       } else {
4170         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4171         retval = (kmp_int64)*lb;
4172       }
4173     }
4174 #else
4175     (void)taskdata;
4176     retval = *(kmp_int64 *)((char *)task + lower_offset);
4177 #endif // defined(KMP_GOMP_COMPAT)
4178     return retval;
4179   }
4180   kmp_uint64 get_ub() const {
4181     kmp_int64 retval;
4182 #if defined(KMP_GOMP_COMPAT)
4183     // Intel task just returns the upper bound normally
4184     if (!taskdata->td_flags.native) {
4185       retval = *(kmp_int64 *)((char *)task + upper_offset);
4186     } else {
4187       // GOMP task has to take into account the sizeof(long)
4188       if (taskdata->td_size_loop_bounds == 4) {
4189         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4190         retval = (kmp_int64)*ub;
4191       } else {
4192         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4193         retval = (kmp_int64)*ub;
4194       }
4195     }
4196 #else
4197     retval = *(kmp_int64 *)((char *)task + upper_offset);
4198 #endif // defined(KMP_GOMP_COMPAT)
4199     return retval;
4200   }
4201   void set_lb(kmp_uint64 lb) {
4202 #if defined(KMP_GOMP_COMPAT)
4203     // Intel task just sets the lower bound normally
4204     if (!taskdata->td_flags.native) {
4205       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4206     } else {
4207       // GOMP task has to take into account the sizeof(long)
4208       if (taskdata->td_size_loop_bounds == 4) {
4209         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4210         *lower = (kmp_uint32)lb;
4211       } else {
4212         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4213         *lower = (kmp_uint64)lb;
4214       }
4215     }
4216 #else
4217     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4218 #endif // defined(KMP_GOMP_COMPAT)
4219   }
4220   void set_ub(kmp_uint64 ub) {
4221 #if defined(KMP_GOMP_COMPAT)
4222     // Intel task just sets the upper bound normally
4223     if (!taskdata->td_flags.native) {
4224       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4225     } else {
4226       // GOMP task has to take into account the sizeof(long)
4227       if (taskdata->td_size_loop_bounds == 4) {
4228         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4229         *upper = (kmp_uint32)ub;
4230       } else {
4231         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4232         *upper = (kmp_uint64)ub;
4233       }
4234     }
4235 #else
4236     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4237 #endif // defined(KMP_GOMP_COMPAT)
4238   }
4239 };
4240 
4241 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4242 //
4243 // loc        Source location information
4244 // gtid       Global thread ID
4245 // task       Pattern task, exposes the loop iteration range
4246 // lb         Pointer to loop lower bound in task structure
4247 // ub         Pointer to loop upper bound in task structure
4248 // st         Loop stride
4249 // ub_glob    Global upper bound (used for lastprivate check)
4250 // num_tasks  Number of tasks to execute
4251 // grainsize  Number of loop iterations per task
4252 // extras     Number of chunks with grainsize+1 iterations
4253 // last_chunk Reduction of grainsize for last task
4254 // tc         Iterations count
4255 // task_dup   Tasks duplication routine
4256 // codeptr_ra Return address for OMPT events
4257 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4258                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4259                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4260                            kmp_uint64 grainsize, kmp_uint64 extras,
4261                            kmp_int64 last_chunk, kmp_uint64 tc,
4262 #if OMPT_SUPPORT
4263                            void *codeptr_ra,
4264 #endif
4265                            void *task_dup) {
4266   KMP_COUNT_BLOCK(OMP_TASKLOOP);
4267   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4268   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4269   // compiler provides global bounds here
4270   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4271   kmp_uint64 lower = task_bounds.get_lb();
4272   kmp_uint64 upper = task_bounds.get_ub();
4273   kmp_uint64 i;
4274   kmp_info_t *thread = __kmp_threads[gtid];
4275   kmp_taskdata_t *current_task = thread->th.th_current_task;
4276   kmp_task_t *next_task;
4277   kmp_int32 lastpriv = 0;
4278 
4279   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4280                              (last_chunk < 0 ? last_chunk : extras));
4281   KMP_DEBUG_ASSERT(num_tasks > extras);
4282   KMP_DEBUG_ASSERT(num_tasks > 0);
4283   KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4284                 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4285                 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4286                 ub_glob, st, task_dup));
4287 
4288   // Launch num_tasks tasks, assign grainsize iterations each task
4289   for (i = 0; i < num_tasks; ++i) {
4290     kmp_uint64 chunk_minus_1;
4291     if (extras == 0) {
4292       chunk_minus_1 = grainsize - 1;
4293     } else {
4294       chunk_minus_1 = grainsize;
4295       --extras; // first extras iterations get bigger chunk (grainsize+1)
4296     }
4297     upper = lower + st * chunk_minus_1;
4298     if (upper > *ub) {
4299       upper = *ub;
4300     }
4301     if (i == num_tasks - 1) {
4302       // schedule the last task, set lastprivate flag if needed
4303       if (st == 1) { // most common case
4304         KMP_DEBUG_ASSERT(upper == *ub);
4305         if (upper == ub_glob)
4306           lastpriv = 1;
4307       } else if (st > 0) { // positive loop stride
4308         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4309         if ((kmp_uint64)st > ub_glob - upper)
4310           lastpriv = 1;
4311       } else { // negative loop stride
4312         KMP_DEBUG_ASSERT(upper + st < *ub);
4313         if (upper - ub_glob < (kmp_uint64)(-st))
4314           lastpriv = 1;
4315       }
4316     }
4317     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4318     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4319     kmp_taskloop_bounds_t next_task_bounds =
4320         kmp_taskloop_bounds_t(next_task, task_bounds);
4321 
4322     // adjust task-specific bounds
4323     next_task_bounds.set_lb(lower);
4324     if (next_taskdata->td_flags.native) {
4325       next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4326     } else {
4327       next_task_bounds.set_ub(upper);
4328     }
4329     if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4330                            // etc.
4331       ptask_dup(next_task, task, lastpriv);
4332     KA_TRACE(40,
4333              ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4334               "upper %lld stride %lld, (offsets %p %p)\n",
4335               gtid, i, next_task, lower, upper, st,
4336               next_task_bounds.get_lower_offset(),
4337               next_task_bounds.get_upper_offset()));
4338 #if OMPT_SUPPORT
4339     __kmp_omp_taskloop_task(NULL, gtid, next_task,
4340                             codeptr_ra); // schedule new task
4341 #else
4342     __kmp_omp_task(gtid, next_task, true); // schedule new task
4343 #endif
4344     lower = upper + st; // adjust lower bound for the next iteration
4345   }
4346   // free the pattern task and exit
4347   __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4348   // do not execute the pattern task, just do internal bookkeeping
4349   __kmp_task_finish<false>(gtid, task, current_task);
4350 }
4351 
// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
// Filled in by __kmp_taskloop_recur for the half of the range it hands off and
// read back by __kmp_taskloop_task when the auxiliary task runs.
typedef struct __taskloop_params {
  kmp_task_t *task; // pattern task duplicated for the handed-off sub-range
  kmp_uint64 *lb; // pointer to the lower bound inside that pattern task
  kmp_uint64 *ub; // pointer to the upper bound inside that pattern task
  void *task_dup; // tasks duplication routine
  kmp_int64 st; // loop stride
  kmp_uint64 ub_glob; // global upper bound (used for lastprivate check)
  kmp_uint64 num_tasks; // number of tasks to create for the sub-range
  kmp_uint64 grainsize; // number of loop iterations per task
  kmp_uint64 extras; // number of chunks with grainsize+1 iterations
  kmp_int64 last_chunk; // reduction of grainsize for last task
  kmp_uint64 tc; // iterations count of the sub-range
  kmp_uint64 num_t_min; // threshold to launch tasks recursively
#if OMPT_SUPPORT
  void *codeptr_ra; // return address for OMPT events
#endif
} __taskloop_params_t;
4371 
4372 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4373                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4374                           kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4375                           kmp_uint64,
4376 #if OMPT_SUPPORT
4377                           void *,
4378 #endif
4379                           void *);
4380 
4381 // Execute part of the taskloop submitted as a task.
4382 int __kmp_taskloop_task(int gtid, void *ptask) {
4383   __taskloop_params_t *p =
4384       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4385   kmp_task_t *task = p->task;
4386   kmp_uint64 *lb = p->lb;
4387   kmp_uint64 *ub = p->ub;
4388   void *task_dup = p->task_dup;
4389   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4390   kmp_int64 st = p->st;
4391   kmp_uint64 ub_glob = p->ub_glob;
4392   kmp_uint64 num_tasks = p->num_tasks;
4393   kmp_uint64 grainsize = p->grainsize;
4394   kmp_uint64 extras = p->extras;
4395   kmp_int64 last_chunk = p->last_chunk;
4396   kmp_uint64 tc = p->tc;
4397   kmp_uint64 num_t_min = p->num_t_min;
4398 #if OMPT_SUPPORT
4399   void *codeptr_ra = p->codeptr_ra;
4400 #endif
4401 #if KMP_DEBUG
4402   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4403   KMP_DEBUG_ASSERT(task != NULL);
4404   KA_TRACE(20,
4405            ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4406             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4407             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4408             st, task_dup));
4409 #endif
4410   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4411   if (num_tasks > num_t_min)
4412     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4413                          grainsize, extras, last_chunk, tc, num_t_min,
4414 #if OMPT_SUPPORT
4415                          codeptr_ra,
4416 #endif
4417                          task_dup);
4418   else
4419     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4420                           grainsize, extras, last_chunk, tc,
4421 #if OMPT_SUPPORT
4422                           codeptr_ra,
4423 #endif
4424                           task_dup);
4425 
4426   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4427   return 0;
4428 }
4429 
4430 // Schedule part of the taskloop as a task,
4431 // execute the rest of the taskloop.
4432 //
4433 // loc        Source location information
4434 // gtid       Global thread ID
4435 // task       Pattern task, exposes the loop iteration range
4436 // lb         Pointer to loop lower bound in task structure
4437 // ub         Pointer to loop upper bound in task structure
4438 // st         Loop stride
4439 // ub_glob    Global upper bound (used for lastprivate check)
4440 // num_tasks  Number of tasks to execute
4441 // grainsize  Number of loop iterations per task
4442 // extras     Number of chunks with grainsize+1 iterations
4443 // last_chunk Reduction of grainsize for last task
4444 // tc         Iterations count
4445 // num_t_min  Threshold to launch tasks recursively
4446 // task_dup   Tasks duplication routine
4447 // codeptr_ra Return address for OMPT events
4448 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4449                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4450                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4451                           kmp_uint64 grainsize, kmp_uint64 extras,
4452                           kmp_int64 last_chunk, kmp_uint64 tc,
4453                           kmp_uint64 num_t_min,
4454 #if OMPT_SUPPORT
4455                           void *codeptr_ra,
4456 #endif
4457                           void *task_dup) {
4458   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4459   KMP_DEBUG_ASSERT(task != NULL);
4460   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4461   KA_TRACE(20,
4462            ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4463             " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4464             gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4465             st, task_dup));
4466   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4467   kmp_uint64 lower = *lb;
4468   kmp_info_t *thread = __kmp_threads[gtid];
4469   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
4470   kmp_task_t *next_task;
4471   size_t lower_offset =
4472       (char *)lb - (char *)task; // remember offset of lb in the task structure
4473   size_t upper_offset =
4474       (char *)ub - (char *)task; // remember offset of ub in the task structure
4475 
4476   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4477                              (last_chunk < 0 ? last_chunk : extras));
4478   KMP_DEBUG_ASSERT(num_tasks > extras);
4479   KMP_DEBUG_ASSERT(num_tasks > 0);
4480 
4481   // split the loop in two halves
4482   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4483   kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4484   kmp_uint64 gr_size0 = grainsize;
4485   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4486   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4487   if (last_chunk < 0) {
4488     ext0 = ext1 = 0;
4489     last_chunk1 = last_chunk;
4490     tc0 = grainsize * n_tsk0;
4491     tc1 = tc - tc0;
4492   } else if (n_tsk0 <= extras) {
4493     gr_size0++; // integrate extras into grainsize
4494     ext0 = 0; // no extra iters in 1st half
4495     ext1 = extras - n_tsk0; // remaining extras
4496     tc0 = gr_size0 * n_tsk0;
4497     tc1 = tc - tc0;
4498   } else { // n_tsk0 > extras
4499     ext1 = 0; // no extra iters in 2nd half
4500     ext0 = extras;
4501     tc1 = grainsize * n_tsk1;
4502     tc0 = tc - tc1;
4503   }
4504   ub0 = lower + st * (tc0 - 1);
4505   lb1 = ub0 + st;
4506 
4507   // create pattern task for 2nd half of the loop
4508   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4509   // adjust lower bound (upper bound is not changed) for the 2nd half
4510   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4511   if (ptask_dup != NULL) // construct firstprivates, etc.
4512     ptask_dup(next_task, task, 0);
4513   *ub = ub0; // adjust upper bound for the 1st half
4514 
4515   // create auxiliary task for 2nd half of the loop
4516   // make sure new task has same parent task as the pattern task
4517   kmp_taskdata_t *current_task = thread->th.th_current_task;
4518   thread->th.th_current_task = taskdata->td_parent;
4519   kmp_task_t *new_task =
4520       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4521                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4522   // restore current task
4523   thread->th.th_current_task = current_task;
4524   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4525   p->task = next_task;
4526   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4527   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4528   p->task_dup = task_dup;
4529   p->st = st;
4530   p->ub_glob = ub_glob;
4531   p->num_tasks = n_tsk1;
4532   p->grainsize = grainsize;
4533   p->extras = ext1;
4534   p->last_chunk = last_chunk1;
4535   p->tc = tc1;
4536   p->num_t_min = num_t_min;
4537 #if OMPT_SUPPORT
4538   p->codeptr_ra = codeptr_ra;
4539 #endif
4540 
4541 #if OMPT_SUPPORT
4542   // schedule new task with correct return address for OMPT events
4543   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4544 #else
4545   __kmp_omp_task(gtid, new_task, true); // schedule new task
4546 #endif
4547 
4548   // execute the 1st half of current subrange
4549   if (n_tsk0 > num_t_min)
4550     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4551                          ext0, last_chunk0, tc0, num_t_min,
4552 #if OMPT_SUPPORT
4553                          codeptr_ra,
4554 #endif
4555                          task_dup);
4556   else
4557     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4558                           gr_size0, ext0, last_chunk0, tc0,
4559 #if OMPT_SUPPORT
4560                           codeptr_ra,
4561 #endif
4562                           task_dup);
4563 
4564   KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4565 }
4566 
4567 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4568                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4569                            int nogroup, int sched, kmp_uint64 grainsize,
4570                            int modifier, void *task_dup) {
4571   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4572   KMP_DEBUG_ASSERT(task != NULL);
4573   if (nogroup == 0) {
4574 #if OMPT_SUPPORT && OMPT_OPTIONAL
4575     OMPT_STORE_RETURN_ADDRESS(gtid);
4576 #endif
4577     __kmpc_taskgroup(loc, gtid);
4578   }
4579 
4580   // =========================================================================
4581   // calculate loop parameters
4582   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4583   kmp_uint64 tc;
4584   // compiler provides global bounds here
4585   kmp_uint64 lower = task_bounds.get_lb();
4586   kmp_uint64 upper = task_bounds.get_ub();
4587   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4588   kmp_uint64 num_tasks = 0, extras = 0;
4589   kmp_int64 last_chunk =
4590       0; // reduce grainsize of last task by last_chunk in strict mode
4591   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4592   kmp_info_t *thread = __kmp_threads[gtid];
4593   kmp_taskdata_t *current_task = thread->th.th_current_task;
4594 
4595   KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4596                 "grain %llu(%d, %d), dup %p\n",
4597                 gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4598                 task_dup));
4599 
4600   // compute trip count
4601   if (st == 1) { // most common case
4602     tc = upper - lower + 1;
4603   } else if (st < 0) {
4604     tc = (lower - upper) / (-st) + 1;
4605   } else { // st > 0
4606     tc = (upper - lower) / st + 1;
4607   }
4608   if (tc == 0) {
4609     KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4610     // free the pattern task and exit
4611     __kmp_task_start(gtid, task, current_task);
4612     // do not execute anything for zero-trip loop
4613     __kmp_task_finish<false>(gtid, task, current_task);
4614     return;
4615   }
4616 
4617 #if OMPT_SUPPORT && OMPT_OPTIONAL
4618   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4619   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4620   if (ompt_enabled.ompt_callback_work) {
4621     ompt_callbacks.ompt_callback(ompt_callback_work)(
4622         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4623         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4624   }
4625 #endif
4626 
4627   if (num_tasks_min == 0)
4628     // TODO: can we choose better default heuristic?
4629     num_tasks_min =
4630         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4631 
4632   // compute num_tasks/grainsize based on the input provided
4633   switch (sched) {
4634   case 0: // no schedule clause specified, we can choose the default
4635     // let's try to schedule (team_size*10) tasks
4636     grainsize = thread->th.th_team_nproc * 10;
4637     KMP_FALLTHROUGH();
4638   case 2: // num_tasks provided
4639     if (grainsize > tc) {
4640       num_tasks = tc; // too big num_tasks requested, adjust values
4641       grainsize = 1;
4642       extras = 0;
4643     } else {
4644       num_tasks = grainsize;
4645       grainsize = tc / num_tasks;
4646       extras = tc % num_tasks;
4647     }
4648     break;
4649   case 1: // grainsize provided
4650     if (grainsize > tc) {
4651       num_tasks = 1;
4652       grainsize = tc; // too big grainsize requested, adjust values
4653       extras = 0;
4654     } else {
4655       if (modifier) {
4656         num_tasks = (tc + grainsize - 1) / grainsize;
4657         last_chunk = tc - (num_tasks * grainsize);
4658         extras = 0;
4659       } else {
4660         num_tasks = tc / grainsize;
4661         // adjust grainsize for balanced distribution of iterations
4662         grainsize = tc / num_tasks;
4663         extras = tc % num_tasks;
4664       }
4665     }
4666     break;
4667   default:
4668     KMP_ASSERT2(0, "unknown scheduling of taskloop");
4669   }
4670 
4671   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4672                              (last_chunk < 0 ? last_chunk : extras));
4673   KMP_DEBUG_ASSERT(num_tasks > extras);
4674   KMP_DEBUG_ASSERT(num_tasks > 0);
4675   // =========================================================================
4676 
4677   // check if clause value first
4678   // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
4679   if (if_val == 0) { // if(0) specified, mark task as serial
4680     taskdata->td_flags.task_serial = 1;
4681     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4682     // always start serial tasks linearly
4683     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4684                           grainsize, extras, last_chunk, tc,
4685 #if OMPT_SUPPORT
4686                           OMPT_GET_RETURN_ADDRESS(0),
4687 #endif
4688                           task_dup);
4689     // !taskdata->td_flags.native => currently force linear spawning of tasks
4690     // for GOMP_taskloop
4691   } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4692     KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4693                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4694                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
4695                   last_chunk));
4696     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4697                          grainsize, extras, last_chunk, tc, num_tasks_min,
4698 #if OMPT_SUPPORT
4699                          OMPT_GET_RETURN_ADDRESS(0),
4700 #endif
4701                          task_dup);
4702   } else {
4703     KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4704                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4705                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
4706                   last_chunk));
4707     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4708                           grainsize, extras, last_chunk, tc,
4709 #if OMPT_SUPPORT
4710                           OMPT_GET_RETURN_ADDRESS(0),
4711 #endif
4712                           task_dup);
4713   }
4714 
4715 #if OMPT_SUPPORT && OMPT_OPTIONAL
4716   if (ompt_enabled.ompt_callback_work) {
4717     ompt_callbacks.ompt_callback(ompt_callback_work)(
4718         ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4719         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4720   }
4721 #endif
4722 
4723   if (nogroup == 0) {
4724 #if OMPT_SUPPORT && OMPT_OPTIONAL
4725     OMPT_STORE_RETURN_ADDRESS(gtid);
4726 #endif
4727     __kmpc_end_taskgroup(loc, gtid);
4728   }
4729   KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
4730 }
4731 
4732 /*!
4733 @ingroup TASKING
4734 @param loc       Source location information
4735 @param gtid      Global thread ID
4736 @param task      Task structure
4737 @param if_val    Value of the if clause
4738 @param lb        Pointer to loop lower bound in task structure
4739 @param ub        Pointer to loop upper bound in task structure
4740 @param st        Loop stride
4741 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4742 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4743 @param grainsize Schedule value if specified
4744 @param task_dup  Tasks duplication routine
4745 
4746 Execute the taskloop construct.
4747 */
4748 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4749                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4750                      int sched, kmp_uint64 grainsize, void *task_dup) {
4751   __kmp_assert_valid_gtid(gtid);
4752   KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
4753   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4754                  0, task_dup);
4755   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4756 }
4757 
4758 /*!
4759 @ingroup TASKING
4760 @param loc       Source location information
4761 @param gtid      Global thread ID
4762 @param task      Task structure
4763 @param if_val    Value of the if clause
4764 @param lb        Pointer to loop lower bound in task structure
4765 @param ub        Pointer to loop upper bound in task structure
4766 @param st        Loop stride
4767 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4768 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4769 @param grainsize Schedule value if specified
4770 @param modifer   Modifier 'strict' for sched, 1 if present, 0 otherwise
4771 @param task_dup  Tasks duplication routine
4772 
4773 Execute the taskloop construct.
4774 */
4775 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4776                        kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4777                        int nogroup, int sched, kmp_uint64 grainsize,
4778                        int modifier, void *task_dup) {
4779   __kmp_assert_valid_gtid(gtid);
4780   KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
4781   __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4782                  modifier, task_dup);
4783   KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
4784 }
4785