1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_i18n.h"
16 #include "kmp_itt.h"
17 #include "kmp_stats.h"
18 #include "kmp_wait_release.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28                                  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30                                    kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32                                            kmp_task_team_t *task_team);
33 
34 #ifdef OMP_45_ENABLED
35 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
36 #endif
37 
38 #ifdef BUILD_TIED_TASK_STACK
39 
40 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
41 //  from top do bottom
42 //
43 //  gtid: global thread identifier for thread containing stack
44 //  thread_data: thread data for task team thread containing stack
45 //  threshold: value above which the trace statement triggers
46 //  location: string identifying call site of this function (for trace)
47 static void __kmp_trace_task_stack(kmp_int32 gtid,
48                                    kmp_thread_data_t *thread_data,
49                                    int threshold, char *location) {
50   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
51   kmp_taskdata_t **stack_top = task_stack->ts_top;
52   kmp_int32 entries = task_stack->ts_entries;
53   kmp_taskdata_t *tied_task;
54 
55   KA_TRACE(
56       threshold,
57       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
58        "first_block = %p, stack_top = %p \n",
59        location, gtid, entries, task_stack->ts_first_block, stack_top));
60 
61   KMP_DEBUG_ASSERT(stack_top != NULL);
62   KMP_DEBUG_ASSERT(entries > 0);
63 
64   while (entries != 0) {
65     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
66     // fix up ts_top if we need to pop from previous block
67     if (entries & TASK_STACK_INDEX_MASK == 0) {
68       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
69 
70       stack_block = stack_block->sb_prev;
71       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
72     }
73 
74     // finish bookkeeping
75     stack_top--;
76     entries--;
77 
78     tied_task = *stack_top;
79 
80     KMP_DEBUG_ASSERT(tied_task != NULL);
81     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
82 
83     KA_TRACE(threshold,
84              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
85               "stack_top=%p, tied_task=%p\n",
86               location, gtid, entries, stack_top, tied_task));
87   }
88   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
89 
90   KA_TRACE(threshold,
91            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
92             location, gtid));
93 }
94 
95 //  __kmp_init_task_stack: initialize the task stack for the first time
96 //  after a thread_data structure is created.
97 //  It should not be necessary to do this again (assuming the stack works).
98 //
99 //  gtid: global thread identifier of calling thread
100 //  thread_data: thread data for task team thread containing stack
101 static void __kmp_init_task_stack(kmp_int32 gtid,
102                                   kmp_thread_data_t *thread_data) {
103   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
104   kmp_stack_block_t *first_block;
105 
106   // set up the first block of the stack
107   first_block = &task_stack->ts_first_block;
108   task_stack->ts_top = (kmp_taskdata_t **)first_block;
109   memset((void *)first_block, '\0',
110          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
111 
112   // initialize the stack to be empty
113   task_stack->ts_entries = TASK_STACK_EMPTY;
114   first_block->sb_next = NULL;
115   first_block->sb_prev = NULL;
116 }
117 
118 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
119 //
120 //  gtid: global thread identifier for calling thread
121 //  thread_data: thread info for thread containing stack
122 static void __kmp_free_task_stack(kmp_int32 gtid,
123                                   kmp_thread_data_t *thread_data) {
124   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
125   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
126 
127   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
128   // free from the second block of the stack
129   while (stack_block != NULL) {
130     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
131 
132     stack_block->sb_next = NULL;
133     stack_block->sb_prev = NULL;
134     if (stack_block != &task_stack->ts_first_block) {
135       __kmp_thread_free(thread,
136                         stack_block); // free the block, if not the first
137     }
138     stack_block = next_block;
139   }
140   // initialize the stack to be empty
141   task_stack->ts_entries = 0;
142   task_stack->ts_top = NULL;
143 }
144 
145 //  __kmp_push_task_stack: Push the tied task onto the task stack.
146 //     Grow the stack if necessary by allocating another block.
147 //
148 //  gtid: global thread identifier for calling thread
149 //  thread: thread info for thread containing stack
150 //  tied_task: the task to push on the stack
151 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
152                                   kmp_taskdata_t *tied_task) {
153   // GEH - need to consider what to do if tt_threads_data not allocated yet
154   kmp_thread_data_t *thread_data =
155       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
156   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
157 
158   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
159     return; // Don't push anything on stack if team or team tasks are serialized
160   }
161 
162   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
163   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
164 
165   KA_TRACE(20,
166            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
167             gtid, thread, tied_task));
168   // Store entry
169   *(task_stack->ts_top) = tied_task;
170 
171   // Do bookkeeping for next push
172   task_stack->ts_top++;
173   task_stack->ts_entries++;
174 
175   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
176     // Find beginning of this task block
177     kmp_stack_block_t *stack_block =
178         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
179 
180     // Check if we already have a block
181     if (stack_block->sb_next !=
182         NULL) { // reset ts_top to beginning of next block
183       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
184     } else { // Alloc new block and link it up
185       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
186           thread, sizeof(kmp_stack_block_t));
187 
188       task_stack->ts_top = &new_block->sb_block[0];
189       stack_block->sb_next = new_block;
190       new_block->sb_prev = stack_block;
191       new_block->sb_next = NULL;
192 
193       KA_TRACE(
194           30,
195           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
196            gtid, tied_task, new_block));
197     }
198   }
199   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
200                 tied_task));
201 }
202 
203 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
204 //  the task, just check to make sure it matches the ending task passed in.
205 //
206 //  gtid: global thread identifier for the calling thread
207 //  thread: thread info structure containing stack
208 //  tied_task: the task popped off the stack
209 //  ending_task: the task that is ending (should match popped task)
210 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
211                                  kmp_taskdata_t *ending_task) {
212   // GEH - need to consider what to do if tt_threads_data not allocated yet
213   kmp_thread_data_t *thread_data =
214       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
215   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
216   kmp_taskdata_t *tied_task;
217 
218   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
219     // Don't pop anything from stack if team or team tasks are serialized
220     return;
221   }
222 
223   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
224   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
225 
226   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
227                 thread));
228 
229   // fix up ts_top if we need to pop from previous block
230   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
231     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
232 
233     stack_block = stack_block->sb_prev;
234     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
235   }
236 
237   // finish bookkeeping
238   task_stack->ts_top--;
239   task_stack->ts_entries--;
240 
241   tied_task = *(task_stack->ts_top);
242 
243   KMP_DEBUG_ASSERT(tied_task != NULL);
244   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
245   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
246 
247   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
248                 tied_task));
249   return;
250 }
251 #endif /* BUILD_TIED_TASK_STACK */
252 
//  __kmp_push_task: Add a task to the thread's deque.
//
//  gtid: global thread id of the encountering thread
//  task: task thunk to enqueue on that thread's own deque
//  Returns TASK_SUCCESSFULLY_PUSHED when the task was deferred, or
//  TASK_NOT_PUSHED when the caller must execute the task immediately
//  (serialized team, or deque full).
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  // Check if deque is full -- unlocked fast-path check; rechecked under the
  // lock below where OMP 4.5 proxy tasks may race with us
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Lock the deque for the task push operation
  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

#if OMP_45_ENABLED
  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }
#else
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
#endif

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index: the deque is a power-of-two-sized circular buffer.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count

  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
345 
346 // __kmp_pop_current_task_from_thread: set up current task from called thread
347 // when team ends
348 //
349 // this_thr: thread structure to set current_task in.
350 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
351   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
352                 "this_thread=%p, curtask=%p, "
353                 "curtask_parent=%p\n",
354                 0, this_thr, this_thr->th.th_current_task,
355                 this_thr->th.th_current_task->td_parent));
356 
357   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
358 
359   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
360                 "this_thread=%p, curtask=%p, "
361                 "curtask_parent=%p\n",
362                 0, this_thr, this_thr->th.th_current_task,
363                 this_thr->th.th_current_task->td_parent));
364 }
365 
366 // __kmp_push_current_task_to_thread: set up current task in called thread for a
367 // new team
368 //
369 // this_thr: thread structure to set up
370 // team: team for implicit task data
371 // tid: thread within team to set up
372 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
373                                        int tid) {
374   // current task of the thread is a parent of the new just created implicit
375   // tasks of new team
376   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
377                 "curtask=%p "
378                 "parent_task=%p\n",
379                 tid, this_thr, this_thr->th.th_current_task,
380                 team->t.t_implicit_task_taskdata[tid].td_parent));
381 
382   KMP_DEBUG_ASSERT(this_thr != NULL);
383 
384   if (tid == 0) {
385     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
386       team->t.t_implicit_task_taskdata[0].td_parent =
387           this_thr->th.th_current_task;
388       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
389     }
390   } else {
391     team->t.t_implicit_task_taskdata[tid].td_parent =
392         team->t.t_implicit_task_taskdata[0].td_parent;
393     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
394   }
395 
396   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
397                 "curtask=%p "
398                 "parent_task=%p\n",
399                 tid, this_thr, this_thr->th.th_current_task,
400                 team->t.t_implicit_task_taskdata[tid].td_parent));
401 }
402 
// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may already be started/executing when it is resumed on
  // another thread; only a fresh tied task must have both flags clear.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}
452 
453 #if OMPT_SUPPORT
454 //------------------------------------------------------------------------------
455 // __ompt_task_init:
456 //   Initialize OMPT fields maintained by a task. This will only be called after
457 //   ompt_start_tool, so we already know whether ompt is enabled or not.
458 
459 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
460   // The calls to __ompt_task_init already have the ompt_enabled condition.
461   task->ompt_task_info.task_data.value = 0;
462   task->ompt_task_info.frame.exit_frame = NULL;
463   task->ompt_task_info.frame.enter_frame = NULL;
464 #if OMP_40_ENABLED
465   task->ompt_task_info.ndeps = 0;
466   task->ompt_task_info.deps = NULL;
467 #endif /* OMP_40_ENABLED */
468 }
469 
// __ompt_task_start:
//   Build and trigger task-begin event
//
// task: the task about to run
// current_task: the task being switched away from
// gtid: global thread id of the executing thread
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_others;
  // A pending taskyield on this thread changes the reported prior-task status
  // (and the flag is consumed here).
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  // Remember who scheduled us so __ompt_task_finish can report the resumed
  // task when none is supplied explicitly.
  taskdata->ompt_task_info.scheduling_parent = current_task;
}
489 
// __ompt_task_finish:
//   Build and trigger final task-schedule event
//
// task: the task that is finishing
// resumed_task: task being resumed (may be NULL; see fallback chain below)
// status: completion status reported to the tool (defaults to
//         ompt_task_complete; overridden to ompt_task_cancel on taskgroup
//         cancellation)
static inline void
__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
                   ompt_task_status_t status = ompt_task_complete) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
    status = ompt_task_cancel;
  }

  /* let OMPT know that we're returning to the callee task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    // Next-task fallback order: explicit resumed_task, then the task that
    // scheduled us, then our parent.
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        &((resumed_task ? resumed_task
                        : (taskdata->ompt_task_info.scheduling_parent
                               ? taskdata->ompt_task_info.scheduling_parent
                               : taskdata->td_parent))
              ->ompt_task_info.task_data));
  }
}
512 #endif
513 
// __kmpc_omp_task_begin_if0_template: common implementation for starting an
// if(0) (undeferred) task, with OMPT bookkeeping compiled in or out via the
// 'ompt' template parameter.
//
// loc_ref: source location of the task construct
// gtid: global thread number
// task: task thunk to start
// frame_address / return_address: caller context for OMPT (NULL when ompt is
//                                 false)
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    // Publish caller frame for the tool, announce task creation, then the
    // task-schedule (begin) event -- in that order.
    if (current_task->ompt_task_info.frame.enter_frame == NULL) {
      current_task->ompt_task_info.frame.enter_frame =
          taskdata->ompt_task_info.frame.exit_frame = frame_address;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}
561 
562 #if OMPT_SUPPORT
// Out-of-line OMPT variant: instantiates the template with OMPT bookkeeping
// enabled. OMPT_NOINLINE keeps this in its own stack frame so the
// frame/return addresses captured by the caller stay meaningful.
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
571 #endif // OMPT_SUPPORT
572 
// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Stash the return address before the (noinline) OMPT variant, which
    // reloads it, so the tool sees the compiler-generated call site.
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  // No tool attached: run the template with OMPT bookkeeping compiled out.
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
592 
593 #ifdef TASK_UNUSED
594 // __kmpc_omp_task_begin: report that a given task has started execution
595 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
596 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
597   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
598 
599   KA_TRACE(
600       10,
601       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
602        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
603 
604   __kmp_task_start(gtid, task, current_task);
605 
606   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
607                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
608   return;
609 }
610 #endif // TASK_UNUSED
611 
// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values:
  // the task must be explicit, complete, not executing, not yet freed, and
  // must have no live children (unless serialized, where the allocated-child
  // count is not tracked).
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  // Mark freed before releasing the memory so stale references can be caught
  // by the asserts above.
  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
// (the shareds block was allocated contiguously with the taskdata)
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
642 
// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Atomically drop our reference; "- 1" turns the post-decrement return
  // value into the new count.
  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    // Save the parent before freeing: taskdata is invalid after
    // __kmp_free_task.
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
      return;

    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}
695 
696 // __kmp_task_finish: bookkeeping to do when a task finishes execution
697 //
698 // gtid: global thread ID for calling thread
699 // task: task to be finished
700 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
701 template <bool ompt>
702 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
703                               kmp_taskdata_t *resumed_task) {
704   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
705   kmp_info_t *thread = __kmp_threads[gtid];
706   kmp_task_team_t *task_team =
707       thread->th.th_task_team; // might be NULL for serial teams...
708   kmp_int32 children = 0;
709 
710   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
711                 "task %p\n",
712                 gtid, taskdata, resumed_task));
713 
714   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
715 
716 // Pop task from stack if tied
717 #ifdef BUILD_TIED_TASK_STACK
718   if (taskdata->td_flags.tiedness == TASK_TIED) {
719     __kmp_pop_task_stack(gtid, thread, taskdata);
720   }
721 #endif /* BUILD_TIED_TASK_STACK */
722 
723   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
724     // untied task needs to check the counter so that the task structure is not
725     // freed prematurely
726     kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
727     KA_TRACE(
728         20,
729         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
730          gtid, counter, taskdata));
731     if (counter > 0) {
732       // untied task is not done, to be continued possibly by other thread, do
733       // not free it now
734       if (resumed_task == NULL) {
735         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
736         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
737         // task is the parent
738       }
739       thread->th.th_current_task = resumed_task; // restore current_task
740       resumed_task->td_flags.executing = 1; // resume previous task
741       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
742                     "resuming task %p\n",
743                     gtid, taskdata, resumed_task));
744       return;
745     }
746   }
747 #if OMPT_SUPPORT
748   if (ompt)
749     __ompt_task_finish(task, resumed_task);
750 #endif
751 
752   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
753   taskdata->td_flags.complete = 1; // mark the task as completed
754   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
755   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
756 
757   // Only need to keep track of count if team parallel and tasking not
758   // serialized
759   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
760     // Predecrement simulated by "- 1" calculation
761     children =
762         KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
763     KMP_DEBUG_ASSERT(children >= 0);
764 #if OMP_40_ENABLED
765     if (taskdata->td_taskgroup)
766       KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
767 #if OMP_45_ENABLED
768   }
769   // if we found proxy tasks there could exist a dependency chain
770   // with the proxy task as origin
771   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
772       (task_team && task_team->tt.tt_found_proxy_tasks)) {
773 #endif
774     __kmp_release_deps(gtid, taskdata);
775 #endif
776   }
777 
778   // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
780   // code, the flag will be reset to 1 again by this same function
781   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
782   taskdata->td_flags.executing = 0; // suspend the finishing task
783 
784   KA_TRACE(
785       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
786            gtid, taskdata, children));
787 
788 #if OMP_40_ENABLED
789   /* If the tasks' destructor thunk flag has been set, we need to invoke the
790      destructor thunk that has been generated by the compiler. The code is
791      placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
793      released tasks.  The OpenMP spec is not specific on when the destructors
794      are invoked, so we should be free to choose. */
795   if (taskdata->td_flags.destructors_thunk) {
796     kmp_routine_entry_t destr_thunk = task->data1.destructors;
797     KMP_ASSERT(destr_thunk);
798     destr_thunk(gtid, task);
799   }
800 #endif // OMP_40_ENABLED
801 
802   // bookkeeping for resuming task:
803   // GEH - note tasking_ser => task_serial
804   KMP_DEBUG_ASSERT(
805       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
806       taskdata->td_flags.task_serial);
807   if (taskdata->td_flags.task_serial) {
808     if (resumed_task == NULL) {
809       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
810       // task is the parent
811     }
812   } else {
813     KMP_DEBUG_ASSERT(resumed_task !=
814                      NULL); // verify that resumed task is passed as arguemnt
815   }
816 
817   // Free this task and then ancestor tasks if they have no children.
818   // Restore th_current_task first as suggested by John:
819   // johnmc: if an asynchronous inquiry peers into the runtime system
820   // it doesn't see the freed task as the current task.
821   thread->th.th_current_task = resumed_task;
822   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
823 
824   // TODO: GEH - make sure root team implicit task is initialized properly.
825   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
826   resumed_task->td_flags.executing = 1; // resume previous task
827 
828   KA_TRACE(
829       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
830            gtid, taskdata, resumed_task));
831 
832   return;
833 }
834 
// __kmpc_omp_task_complete_if0_template: common implementation behind the
// __kmpc_omp_task_complete_if0 entry points. Finishes the given explicit
// task; the compile-time template parameter selects whether OMPT
// bookkeeping is performed.
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    // Clear the enter frame recorded for the current task region now that
    // the task has finished.
    omp_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = NULL;
  }
#endif

  return;
}
857 
#if OMPT_SUPPORT
// OMPT-instrumented entry point: instantiates the template with ompt=true.
// NOTE(review): OMPT_NOINLINE presumably keeps this call out-of-line so OMPT
// frame handling is consistent — confirm against its definition in kmp.h.
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT
865 
// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  // Take the OMPT-instrumented path only when a tool is attached.
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}
881 
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information (used for tracing only).
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED
898 
// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref:  reference to source location of parallel region
// this_thr:  thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.  Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  // Implicit task storage lives in the team structure, indexed by tid.
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
  //    in debugger)
  task->td_ident = loc_ref;
  // No taskwait is in progress yet for this implicit task.
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  // Implicit tasks are always tied.
  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
#if OMP_45_ENABLED
  task->td_flags.proxy = TASK_FULL;
#endif

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // The implicit task starts out already running and not yet complete.
  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

#if OMP_40_ENABLED
  task->td_depnode = NULL;
#endif
  task->td_last_tied = task;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
#if OMP_40_ENABLED
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
#endif
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    // Re-initialization path: child-task counters must already be drained.
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}
971 
972 // __kmp_finish_implicit_task: Release resources associated to implicit tasks
973 // at the end of parallel regions. Some resources are kept for reuse in the next
974 // parallel region.
975 //
976 // thread:  thread data structure corresponding to implicit task
977 void __kmp_finish_implicit_task(kmp_info_t *thread) {
978   kmp_taskdata_t *task = thread->th.th_current_task;
979   if (task->td_dephash)
980     __kmp_dephash_free_entries(thread, task->td_dephash);
981 }
982 
983 // __kmp_free_implicit_task: Release resources associated to implicit tasks
984 // when these are destroyed regions
985 //
986 // thread:  thread data structure corresponding to implicit task
987 void __kmp_free_implicit_task(kmp_info_t *thread) {
988   kmp_taskdata_t *task = thread->th.th_current_task;
989   if (task && task->td_dephash) {
990     __kmp_dephash_free(thread, task->td_dephash);
991     task->td_dephash = NULL;
992   }
993 }
994 
995 // Round up a size to a power of two specified by val: Used to insert padding
996 // between structures co-allocated using a single malloc() call
997 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
998   if (size & (val - 1)) {
999     size &= ~(val - 1);
1000     if (size <= KMP_SIZE_T_MAX - val) {
1001       size += val; // Round up if there is no overflow.
1002     }
1003   }
1004   return size;
1005 } // __kmp_round_up_to_va
1006 
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  // Middle initialization must have happened before tasks can be allocated.
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  // A task created inside a final task is itself final.
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
      // NOTE(review): intentionally empty branch — no extra handling for
      // merged_if0 in a final context; confirm this placeholder is needed.
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY) {
    // Proxy tasks are forced to be untied and merged_if0.
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    // Record that a proxy task exists so dependency chains rooted at a proxy
    // task are considered later (tt_found_proxy_tasks is read at task finish).
    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  // Copy the caller-supplied flags into the persistent taskdata flags.
  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  // Newly allocated task has not started yet.
  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
// Only need to keep track of child task counts if team parallel and tasking not
// serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}
1229 
// __kmpc_omp_task_alloc: compiler-visible wrapper around __kmp_task_alloc.
//
// loc_ref: source location information
// gtid: global thread number
// flags: task flags packed as kmp_int32; reinterpreted in place as
//        kmp_tasking_flags_t (both types overlay the same 32 bits)
// sizeof_kmp_task_t: size in bytes of kmp_task_t including private vars
// sizeof_shareds: size in bytes of the array of pointers to shared vars
// task_entry: pointer to task code entry point generated by compiler
// returns: pointer to the allocated kmp_task_t structure
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}
1260 
//  __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
#if OMP_40_ENABLED
  // Set when cancellation discards this task's body (see below).
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  kmp_uint64 cur_time;
  if (__kmp_forkjoin_frames_mode == 3) {
    // Get the current time stamp to measure task execution time to correct
    // barrier imbalance time
    cur_time = __itt_get_timestamp();
  }
#endif

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  kmp_info_t *thread;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? omp_state_work_serial
                                            : omp_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
#if OMP_45_ENABLED
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *this_team = this_thr->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  // NOTE: this "if (!discard) {" closes many lines below, just before the
  // matching "#endif // OMP_40_ENABLED" — mind the cross-#if brace pairing.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      // An untied task inherits the last-tied task pointer from the task it
      // runs inside of.
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    // Attribute the task's execution time to the context it was run from.
    KMP_COUNT_BLOCK(TASK_executed);
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED
#endif // OMP_40_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      // GOMP-style thunk takes only the shareds pointer, no gtid.
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if OMP_40_ENABLED
  }
#endif // OMP_40_ENABLED


#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      // Restore the thread state saved before the task ran.
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = NULL;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - correct arrive time after the task finished
  if (__kmp_forkjoin_frames_mode == 3) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    if (this_thr->th.th_bar_arrive_time) {
      this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
  }
#endif
  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}
1447 
1448 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1449 //
1450 // loc_ref: location of original task pragma (ignored)
1451 // gtid: Global Thread ID of encountering thread
1452 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1453 // Returns:
1454 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1455 //    be resumed later.
1456 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1457 //    resumed later.
1458 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1459                                 kmp_task_t *new_task) {
1460   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1461 
1462   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1463                 loc_ref, new_taskdata));
1464 
1465 #if OMPT_SUPPORT
1466   kmp_taskdata_t *parent;
1467   if (UNLIKELY(ompt_enabled.enabled)) {
1468     parent = new_taskdata->td_parent;
1469     if (ompt_enabled.ompt_callback_task_create) {
1470       ompt_data_t task_data = ompt_data_none;
1471       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1472           parent ? &(parent->ompt_task_info.task_data) : &task_data,
1473           parent ? &(parent->ompt_task_info.frame) : NULL,
1474           &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1475           OMPT_GET_RETURN_ADDRESS(0));
1476     }
1477   }
1478 #endif
1479 
1480   /* Should we execute the new task or queue it? For now, let's just always try
1481      to queue it.  If the queue fills up, then we'll execute it.  */
1482 
1483   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1484   { // Execute this task immediately
1485     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1486     new_taskdata->td_flags.task_serial = 1;
1487     __kmp_invoke_task(gtid, new_task, current_task);
1488   }
1489 
1490   KA_TRACE(
1491       10,
1492       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1493        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1494        gtid, loc_ref, new_taskdata));
1495 
1496   ANNOTATE_HAPPENS_BEFORE(new_task);
1497 #if OMPT_SUPPORT
1498   if (UNLIKELY(ompt_enabled.enabled)) {
1499     parent->ompt_task_info.frame.enter_frame = NULL;
1500   }
1501 #endif
1502   return TASK_CURRENT_NOT_QUEUED;
1503 }
1504 
// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
// execution will be serialized
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

/* Should we execute the new task or queue it? For now, let's just always try to
   queue it.  If the queue fills up, then we'll execute it.  */
#if OMP_45_ENABLED
  // A proxy task is never pushed to a deque; it always takes the
  // immediate-invocation path below.
  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
      __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
#else
  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
#endif
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    if (serialize_immediate)
      new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  ANNOTATE_HAPPENS_BEFORE(new_task);
  return TASK_CURRENT_NOT_QUEUED;
}
1538 
// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  // new_taskdata is only needed for tracing and OMPT callbacks; skip the
  // conversion in pure release, non-OMPT builds.
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (!new_taskdata->td_flags.started) {
      // First scheduling of this task: record the runtime entry frame in the
      // parent's OMPT info and fire the task-create callback.
      OMPT_STORE_RETURN_ADDRESS(gtid);
      parent = new_taskdata->td_parent;
      if (!parent->ompt_task_info.frame.enter_frame) {
        parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_data_t task_data = ompt_data_none;
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            parent ? &(parent->ompt_task_info.task_data) : &task_data,
            parent ? &(parent->ompt_task_info.frame) : NULL,
            &(new_taskdata->ompt_task_info.task_data),
            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    } else {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_others);
      new_taskdata->ompt_task_info.frame.exit_frame = NULL;
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  // The runtime call is over; clear the enter frame recorded above.
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = NULL;
  }
#endif
  return res;
}
1603 
1604 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1605 // a taskloop task with the correct OMPT return address
1606 //
1607 // loc_ref: location of original task pragma (ignored)
1608 // gtid: Global Thread ID of encountering thread
1609 // new_task: non-thread-switchable task thunk allocated by
1610 // __kmp_omp_task_alloc()
1611 // codeptr_ra: return address for OMPT callback
1612 // Returns:
1613 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1614 //    be resumed later.
1615 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1616 //    resumed later.
1617 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1618                                   kmp_task_t *new_task, void *codeptr_ra) {
1619   kmp_int32 res;
1620   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1621 
1622 #if KMP_DEBUG || OMPT_SUPPORT
1623   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1624 #endif
1625   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1626                 new_taskdata));
1627 
1628 #if OMPT_SUPPORT
1629   kmp_taskdata_t *parent = NULL;
1630   if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1631     parent = new_taskdata->td_parent;
1632     if (!parent->ompt_task_info.frame.enter_frame)
1633       parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1634     if (ompt_enabled.ompt_callback_task_create) {
1635       ompt_data_t task_data = ompt_data_none;
1636       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1637           parent ? &(parent->ompt_task_info.task_data) : &task_data,
1638           parent ? &(parent->ompt_task_info.frame) : NULL,
1639           &(new_taskdata->ompt_task_info.task_data),
1640           ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1641           codeptr_ra);
1642     }
1643   }
1644 #endif
1645 
1646   res = __kmp_omp_task(gtid, new_task, true);
1647 
1648   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1649                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1650                 gtid, loc_ref, new_taskdata));
1651 #if OMPT_SUPPORT
1652   if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1653     parent->ompt_task_info.frame.enter_frame = NULL;
1654   }
1655 #endif
1656   return res;
1657 }
1658 
// __kmpc_omp_taskwait_template: Common implementation of taskwait -- wait
// until all tasks generated by the current task are complete. The 'ompt'
// template parameter compiles OMPT instrumentation in or out, so the
// tool-free path carries no overhead.
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));

  // NOTE(review): taskdata remains uninitialized when tasking mode is
  // tskm_immediate_exec; the trailing KA_TRACE below then prints an
  // indeterminate pointer in debug builds -- confirm intended.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      // Publish the runtime entry frame and announce the start of the
      // taskwait sync region to the tool.
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame = frame_address;

      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // No waiting needed if the team is serialized or the task is final;
    // OMP 4.5 proxy tasks may still complete asynchronously, so their
    // presence forces a wait even then.
    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

#if OMP_45_ENABLED
    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
#endif
    if (must_wait) {
      // Help execute other tasks until this task's incomplete-children
      // counter drains to zero.
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &(taskdata->td_incomplete_child_tasks)),
                       0U);
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      // Mirror the scope_begin callbacks above in reverse order, then clear
      // the enter frame.
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      taskdata->ompt_task_info.frame.enter_frame = NULL;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

    ANNOTATE_HAPPENS_AFTER(taskdata);
  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
1764 
#if OMPT_SUPPORT
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  // Forward to the shared template with OMPT instrumentation enabled; kept
  // out-of-line (OMPT_NOINLINE) so the frame/return addresses handed to tool
  // callbacks stay stable.
  kmp_int32 ret = __kmpc_omp_taskwait_template<true>(loc_ref, gtid,
                                                     frame_address,
                                                     return_address);
  return ret;
}
#endif // OMPT_SUPPORT
1774 
// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Take the noinline OMPT path so the frame and return addresses captured
    // here remain meaningful for tool callbacks.
    OMPT_STORE_RETURN_ADDRESS(gtid);
    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(1),
                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}
1787 
// __kmpc_omp_taskyield: switch to a different task
//
// loc_ref: location of the taskyield pragma
// gtid: Global Thread ID of the encountering thread
// end_part: traced only; not otherwise used by this implementation
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));

  // NOTE(review): taskdata remains uninitialized when the branch below is
  // not taken; the trailing KA_TRACE then prints an indeterminate pointer in
  // debug builds -- confirm intended.
  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
          // Tell the tool that any task scheduled now runs on behalf of
          // this taskyield.
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
          // Try to execute other pending tasks at this yield point.
          __kmp_execute_tasks_32(
              thread, gtid, NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
        }
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
1853 
1854 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1855 #if OMP_45_ENABLED
1856 // Task Reduction implementation
1857 
// per-item flags the compiler passes to describe how a reduction item
// should be handled
typedef struct kmp_task_red_flags {
  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
  unsigned reserved31 : 31; // remaining bits reserved for future use
} kmp_task_red_flags_t;
1862 
// internal structure for reduction data item related info
// (runtime-side copy of kmp_task_red_input_t plus the private-storage
// pointers filled in by __kmpc_task_reduction_init)
typedef struct kmp_task_red_data {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item, rounded up to a cache line
  void *reduce_priv; // thread specific data
  void *reduce_pend; // end of private data for comparison op
  void *reduce_init; // data initialization routine
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_data_t;
1874 
// structure sent us by compiler - one per reduction item
// (layout must match what the compiler emits for the task_reduction clause)
typedef struct kmp_task_red_input {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item
  void *reduce_init; // data initialization routine
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine (mandatory)
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_input_t;
1884 
1885 /*!
1886 @ingroup TASKING
1887 @param gtid      Global thread ID
1888 @param num       Number of data items to reduce
1889 @param data      Array of data for reduction
1890 @return The taskgroup identifier
1891 
1892 Initialize task reduction for the taskgroup.
1893 */
1894 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
1895   kmp_info_t *thread = __kmp_threads[gtid];
1896   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
1897   kmp_int32 nth = thread->th.th_team_nproc;
1898   kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
1899   kmp_task_red_data_t *arr;
1900 
1901   // check input data just in case
1902   KMP_ASSERT(tg != NULL);
1903   KMP_ASSERT(data != NULL);
1904   KMP_ASSERT(num > 0);
1905   if (nth == 1) {
1906     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1907                   gtid, tg));
1908     return (void *)tg;
1909   }
1910   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1911                 gtid, tg, num));
1912   arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
1913       thread, num * sizeof(kmp_task_red_data_t));
1914   for (int i = 0; i < num; ++i) {
1915     void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
1916     size_t size = input[i].reduce_size - 1;
1917     // round the size up to cache line per thread-specific item
1918     size += CACHE_LINE - size % CACHE_LINE;
1919     KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1920     arr[i].reduce_shar = input[i].reduce_shar;
1921     arr[i].reduce_size = size;
1922     arr[i].reduce_init = input[i].reduce_init;
1923     arr[i].reduce_fini = input[i].reduce_fini;
1924     arr[i].reduce_comb = input[i].reduce_comb;
1925     arr[i].flags = input[i].flags;
1926     if (!input[i].flags.lazy_priv) {
1927       // allocate cache-line aligned block and fill it with zeros
1928       arr[i].reduce_priv = __kmp_allocate(nth * size);
1929       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
1930       if (f_init != NULL) {
1931         // initialize thread-specific items
1932         for (int j = 0; j < nth; ++j) {
1933           f_init((char *)(arr[i].reduce_priv) + j * size);
1934         }
1935       }
1936     } else {
1937       // only allocate space for pointers now,
1938       // objects will be lazily allocated/initialized once requested
1939       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
1940     }
1941   }
1942   tg->reduce_data = (void *)arr;
1943   tg->reduce_num_data = num;
1944   return (void *)tg;
1945 }
1946 
1947 /*!
1948 @ingroup TASKING
1949 @param gtid    Global thread ID
1950 @param tskgrp  The taskgroup ID (optional)
1951 @param data    Shared location of the item
1952 @return The pointer to per-thread data
1953 
1954 Get thread-specific location of data item
1955 */
1956 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
1957   kmp_info_t *thread = __kmp_threads[gtid];
1958   kmp_int32 nth = thread->th.th_team_nproc;
1959   if (nth == 1)
1960     return data; // nothing to do
1961 
1962   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
1963   if (tg == NULL)
1964     tg = thread->th.th_current_task->td_taskgroup;
1965   KMP_ASSERT(tg != NULL);
1966   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
1967   kmp_int32 num = tg->reduce_num_data;
1968   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1969 
1970   KMP_ASSERT(data != NULL);
1971   while (tg != NULL) {
1972     for (int i = 0; i < num; ++i) {
1973       if (!arr[i].flags.lazy_priv) {
1974         if (data == arr[i].reduce_shar ||
1975             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1976           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1977       } else {
1978         // check shared location first
1979         void **p_priv = (void **)(arr[i].reduce_priv);
1980         if (data == arr[i].reduce_shar)
1981           goto found;
1982         // check if we get some thread specific location as parameter
1983         for (int j = 0; j < nth; ++j)
1984           if (data == p_priv[j])
1985             goto found;
1986         continue; // not found, continue search
1987       found:
1988         if (p_priv[tid] == NULL) {
1989           // allocate thread specific object lazily
1990           void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
1991           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1992           if (f_init != NULL) {
1993             f_init(p_priv[tid]);
1994           }
1995         }
1996         return p_priv[tid];
1997       }
1998     }
1999     tg = tg->parent;
2000     arr = (kmp_task_red_data_t *)(tg->reduce_data);
2001     num = tg->reduce_num_data;
2002   }
2003   KMP_ASSERT2(0, "Unknown task reduction item");
2004   return NULL; // ERROR, this line never executed
2005 }
2006 
2007 // Finalize task reduction.
2008 // Called from __kmpc_end_taskgroup()
2009 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2010   kmp_int32 nth = th->th.th_team_nproc;
2011   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2012   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
2013   kmp_int32 num = tg->reduce_num_data;
2014   for (int i = 0; i < num; ++i) {
2015     void *sh_data = arr[i].reduce_shar;
2016     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2017     void (*f_comb)(void *, void *) =
2018         (void (*)(void *, void *))(arr[i].reduce_comb);
2019     if (!arr[i].flags.lazy_priv) {
2020       void *pr_data = arr[i].reduce_priv;
2021       size_t size = arr[i].reduce_size;
2022       for (int j = 0; j < nth; ++j) {
2023         void *priv_data = (char *)pr_data + j * size;
2024         f_comb(sh_data, priv_data); // combine results
2025         if (f_fini)
2026           f_fini(priv_data); // finalize if needed
2027       }
2028     } else {
2029       void **pr_data = (void **)(arr[i].reduce_priv);
2030       for (int j = 0; j < nth; ++j) {
2031         if (pr_data[j] != NULL) {
2032           f_comb(sh_data, pr_data[j]); // combine results
2033           if (f_fini)
2034             f_fini(pr_data[j]); // finalize if needed
2035           __kmp_free(pr_data[j]);
2036         }
2037       }
2038     }
2039     __kmp_free(arr[i].reduce_priv);
2040   }
2041   __kmp_thread_free(th, arr);
2042   tg->reduce_data = NULL;
2043   tg->reduce_num_data = 0;
2044 }
2045 #endif
2046 
2047 #if OMP_40_ENABLED
2048 // __kmpc_taskgroup: Start a new taskgroup
2049 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2050   kmp_info_t *thread = __kmp_threads[gtid];
2051   kmp_taskdata_t *taskdata = thread->th.th_current_task;
2052   kmp_taskgroup_t *tg_new =
2053       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2054   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2055   KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2056   KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2057   tg_new->parent = taskdata->td_taskgroup;
2058 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
2059 #if OMP_45_ENABLED
2060   tg_new->reduce_data = NULL;
2061   tg_new->reduce_num_data = 0;
2062 #endif
2063   taskdata->td_taskgroup = tg_new;
2064 
2065 #if OMPT_SUPPORT && OMPT_OPTIONAL
2066   if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2067     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2068     if (!codeptr)
2069       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2070     kmp_team_t *team = thread->th.th_team;
2071     ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2072     // FIXME: I think this is wrong for lwt!
2073     ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2074 
2075     ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2076         ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2077         &(my_task_data), codeptr);
2078   }
2079 #endif
2080 }
2081 
// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
//                       and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Capture tool data up front; codeptr falls back to this frame's return
  // address if none was stored by the compiler-generated caller.
  kmp_team_t *team;
  ompt_data_t my_task_data;
  ompt_data_t my_parallel_data;
  void *codeptr;
  if (UNLIKELY(ompt_enabled.enabled)) {
    team = thread->th.th_team;
    my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    my_parallel_data = team->t.ompt_team_info.parallel_data;
    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
  }
#endif

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark task as waiting not on a barrier
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc;
    taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

    // Wait for the taskgroup's task count to drain. With OMP 4.5, a
    // serialized team may still have proxy tasks completing asynchronously,
    // so it must wait too.
#if OMP_45_ENABLED
    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         thread->th.th_task_team->tt.tt_found_proxy_tasks))
#else
    if (!taskdata->td_flags.team_serial)
#endif
    {
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
                       0U);
      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
        // Help execute other tasks while waiting.
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

// TODO: change to OMP_50_ENABLED, need to change build tools for this to work
#if OMP_45_ENABLED
  if (taskgroup->reduce_data != NULL) // need to reduce?
    __kmp_task_reduction_fini(thread, taskgroup);
#endif
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));
  ANNOTATE_HAPPENS_AFTER(taskdata);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}
2185 #endif
2186 
2187 // __kmp_remove_my_task: remove a task from my own deque
2188 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2189                                         kmp_task_team_t *task_team,
2190                                         kmp_int32 is_constrained) {
2191   kmp_task_t *task;
2192   kmp_taskdata_t *taskdata;
2193   kmp_thread_data_t *thread_data;
2194   kmp_uint32 tail;
2195 
2196   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2197   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2198                    NULL); // Caller should check this condition
2199 
2200   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2201 
2202   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2203                 gtid, thread_data->td.td_deque_ntasks,
2204                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2205 
2206   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2207     KA_TRACE(10,
2208              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2209               "ntasks=%d head=%u tail=%u\n",
2210               gtid, thread_data->td.td_deque_ntasks,
2211               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2212     return NULL;
2213   }
2214 
2215   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2216 
2217   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2218     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2219     KA_TRACE(10,
2220              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2221               "ntasks=%d head=%u tail=%u\n",
2222               gtid, thread_data->td.td_deque_ntasks,
2223               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2224     return NULL;
2225   }
2226 
2227   tail = (thread_data->td.td_deque_tail - 1) &
2228          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2229   taskdata = thread_data->td.td_deque[tail];
2230 
2231   if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
2232     // we need to check if the candidate obeys task scheduling constraint (TSC)
2233     // only descendant of all deferred tied tasks can be scheduled, checking
2234     // the last one is enough, as it in turn is the descendant of all others
2235     kmp_taskdata_t *current = thread->th.th_current_task->td_last_tied;
2236     KMP_DEBUG_ASSERT(current != NULL);
2237     // check if last tied task is not suspended on barrier
2238     if (current->td_flags.tasktype == TASK_EXPLICIT ||
2239         current->td_taskwait_thread > 0) { // <= 0 on barrier
2240       kmp_int32 level = current->td_level;
2241       kmp_taskdata_t *parent = taskdata->td_parent;
2242       while (parent != current && parent->td_level > level) {
2243         parent = parent->td_parent; // check generation up to the level of the
2244         // current task
2245         KMP_DEBUG_ASSERT(parent != NULL);
2246       }
2247       if (parent != current) {
2248         // The TSC does not allow to steal victim task
2249         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2250         KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2251                       "ntasks=%d head=%u tail=%u\n",
2252                       gtid, thread_data->td.td_deque_ntasks,
2253                       thread_data->td.td_deque_head,
2254                       thread_data->td.td_deque_tail));
2255         return NULL;
2256       }
2257     }
2258   }
2259 
2260   thread_data->td.td_deque_tail = tail;
2261   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2262 
2263   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2264 
2265   KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
2266                 "ntasks=%d head=%u tail=%u\n",
2267                 gtid, taskdata, thread_data->td.td_deque_ntasks,
2268                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2269 
2270   task = KMP_TASKDATA_TO_TASK(taskdata);
2271   return task;
2272 }
2273 
// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
//
// victim_thr: thread whose deque we try to steal from
// gtid: global thread id of the stealing thread
// task_team: task team both threads are working on
// unfinished_threads: barrier counter of threads still executing tasks;
//   re-incremented here if the caller had already marked itself finished
// thread_finished: in/out flag; TRUE means the caller has already decremented
//   unfinished_threads at the barrier; reset to FALSE on a successful steal
// is_constrained: nonzero if the task scheduling constraint (TSC) applies,
//   i.e. only descendants of all deferred tied tasks may be scheduled
// Returns the stolen task, or NULL if no suitable task was found.
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *current;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 level, target;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim_thr->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Quick unlocked check; an empty deque means there is nothing to steal.
  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
  // Check again after we acquire the lock
  if (ntasks == 0) {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);

  // Tentatively take the task at the head (oldest) end of the victim's deque.
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
    // we need to check if the candidate obeys task scheduling constraint (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    current = __kmp_threads[gtid]->th.th_current_task->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if last tied task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      level = current->td_level;
      // Walk the candidate's ancestor chain down to the level of the current
      // task; the candidate passes only if "current" is one of its ancestors.
      kmp_taskdata_t *parent = taskdata->td_parent;
      while (parent != current && parent->td_level > level) {
        parent = parent->td_parent; // check generation up to the level of the
        // current task
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current) {
        if (!task_team->tt.tt_untied_task_encountered) {
          // The TSC does not allow to steal victim task
          __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
          KA_TRACE(10,
                   ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
          return NULL;
        }
        taskdata = NULL; // will check other tasks in victim's deque
      }
    }
  }
  if (taskdata != NULL) {
    // Head task is acceptable: Bump head pointer and Wrap.
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    // Head task failed the TSC and untied tasks exist somewhere; scan the
    // rest of the deque for any task that may legally be scheduled.
    // NOTE: taskdata can only be NULL here if the constrained branch above
    // executed, so "current" and "level" are initialized at this point.
    int i;
    // walk through victim's deque trying to steal any task
    target = victim_td->td.td_deque_head;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      taskdata = victim_td->td.td_deque[target];
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        // check if the candidate obeys the TSC
        kmp_taskdata_t *parent = taskdata->td_parent;
        // check generation up to the level of the current task
        while (parent != current && parent->td_level > level) {
          parent = parent->td_parent;
          KMP_DEBUG_ASSERT(parent != NULL);
        }
        if (parent != current) {
          // The TSC does not allow to steal the candidate
          taskdata = NULL;
          continue;
        } else {
          // found victim tied task
          break;
        }
      } else {
        // found victim untied task
        break;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int prev = target;
    // Close the hole left at slot "target": shift each remaining task toward
    // the head by one slot, then pull the tail back correspondingly.
    for (i = i + 1; i < ntasks; ++i) {
      // shift remaining tasks in the deque left by 1
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // master victim) might be prematurely released from the barrier!!!
    kmp_int32 count;

    count = KMP_ATOMIC_INC(unfinished_threads);

    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));

    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}
2441 
// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  std::atomic<kmp_int32> *unfinished_threads;
  // victim_tid: -2 = no steal attempted yet this call, -1 = no remembered
  // victim, otherwise the tid of the current candidate victim.
  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
                      tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  // No task team means there are no tasks for this thread to execute.
  if (task_team == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
#if OMP_45_ENABLED
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
#else
  KMP_DEBUG_ASSERT(nthreads > 1);
#endif
  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim_tid == -2) { // haven't stolen anything yet
          victim_tid = threads_data[tid].td.td_deque_last_stolen;
          if (victim_tid !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim_tid].td.td_thr;
        }
        if (victim_tid != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed.  Arch says that's not such a great idea.
            victim_tid = __kmp_get_random(thread) % (nthreads - 1);
            if (victim_tid >= tid) {
              ++victim_tid; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim_tid].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier.  If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
                                        other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty.  Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
            threads_data[tid].td.td_deque_last_stolen = victim_tid;
            // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim_tid = -2; // no successful victim found
        }
      }

      if (task == NULL) // break out of tasking loop
        break;

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      // Yield before executing next task
      KMP_YIELD(__kmp_library == library_throughput);
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

// The task source has been exhausted. If in final spin loop of barrier, check
// if termination condition is satisfied.
#if OMP_45_ENABLED
    // The work queue may be empty but there might be proxy tasks still
    // executing
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
#else
    if (final_spin)
#endif
    {
      // First, decrement the #unfinished threads, if that has not already been
      // done.  This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
        kmp_int32 count;

        // KMP_ATOMIC_DEC returns the old value; subtract 1 for the new one.
        count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}
2671 
2672 int __kmp_execute_tasks_32(
2673     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2674     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2675     kmp_int32 is_constrained) {
2676   return __kmp_execute_tasks_template(
2677       thread, gtid, flag, final_spin,
2678       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2679 }
2680 
2681 int __kmp_execute_tasks_64(
2682     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2683     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2684     kmp_int32 is_constrained) {
2685   return __kmp_execute_tasks_template(
2686       thread, gtid, flag, final_spin,
2687       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2688 }
2689 
2690 int __kmp_execute_tasks_oncore(
2691     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2692     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2693     kmp_int32 is_constrained) {
2694   return __kmp_execute_tasks_template(
2695       thread, gtid, flag, final_spin,
2696       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2697 }
2698 
// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
//
// task_team: the team's task team structure (already allocated by caller)
// this_thr: the thread that encountered the first task of the region
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if ((__kmp_tasking_mode == tskm_task_teams) &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them.  In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      // Skip ourselves; we are obviously awake.
      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // see if other threads are sleeping (using the same random mechanism that
      // is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}
2766 
2767 /* // TODO: Check the comment consistency
2768  * Utility routines for "task teams".  A task team (kmp_task_t) is kind of
2769  * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
2771  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2772  * longer assume that the kmp_team_t structure is intact (at any moment, the
2773  * master thread may exit the barrier code and free the team data structure,
2774  * and return the threads to the thread pool).
2775  *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access
 * to each thread in the team, so that it can steal work from it.
2780  *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
2784  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
2785  * of the kmp_task_team_t structs for consecutive barriers can overlap
2786  * (and will, unless the master thread is the last thread to exit the barrier
2787  * release phase, which is not typical).
2788  *
2789  * The existence of such a struct is useful outside the context of tasking,
2790  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2791  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2792  * libraries.
2793  *
2794  * We currently use the existence of the threads array as an indicator that
2795  * tasks were spawned since the last barrier.  If the structure is to be
2796  * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
2798  * barriers, when no explicit tasks were spawned (pushed, actually).
2799  */
2800 
// Free list of recycled task_team structures; entries are pushed/popped
// under __kmp_task_team_lock below.
static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2806 
2807 // __kmp_alloc_task_deque:
2808 // Allocates a task deque for a particular thread, and initialize the necessary
2809 // data structures relating to the deque.  This only happens once per thread
2810 // per task team since task teams are recycled. No lock is needed during
2811 // allocation since each thread allocates its own deque.
2812 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2813                                    kmp_thread_data_t *thread_data) {
2814   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2815   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2816 
2817   // Initialize last stolen task field to "none"
2818   thread_data->td.td_deque_last_stolen = -1;
2819 
2820   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2821   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2822   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2823 
2824   KE_TRACE(
2825       10,
2826       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2827        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2828   // Allocate space for task deque, and zero the deque
2829   // Cannot use __kmp_thread_calloc() because threads not around for
2830   // kmp_reap_task_team( ).
2831   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2832       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2833   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2834 }
2835 
2836 // __kmp_realloc_task_deque:
2837 // Re-allocates a task deque for a particular thread, copies the content from
2838 // the old deque and adjusts the necessary data structures relating to the
2839 // deque. This operation must be done with a the deque_lock being held
2840 static void __kmp_realloc_task_deque(kmp_info_t *thread,
2841                                      kmp_thread_data_t *thread_data) {
2842   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2843   kmp_int32 new_size = 2 * size;
2844 
2845   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
2846                 "%d] for thread_data %p\n",
2847                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
2848 
2849   kmp_taskdata_t **new_deque =
2850       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
2851 
2852   int i, j;
2853   for (i = thread_data->td.td_deque_head, j = 0; j < size;
2854        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
2855     new_deque[j] = thread_data->td.td_deque[i];
2856 
2857   __kmp_free(thread_data->td.td_deque);
2858 
2859   thread_data->td.td_deque_head = 0;
2860   thread_data->td.td_deque_tail = size;
2861   thread_data->td.td_deque = new_deque;
2862   thread_data->td.td_deque_size = new_size;
2863 }
2864 
// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  if (thread_data->td.td_deque != NULL) {
    // Hold the deque lock while tearing down so no concurrent reader can
    // observe a freed deque with a nonzero task count.
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    TCW_4(thread_data->td.td_deque_ntasks, 0);
    __kmp_free(thread_data->td.td_deque);
    thread_data->td.td_deque = NULL;
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  }

#ifdef BUILD_TIED_TASK_STACK
  // GEH: Figure out what to do here for td_susp_tied_tasks
  // NOTE(review): `gtid` is not declared in this function, so this branch
  // cannot compile when BUILD_TIED_TASK_STACK is defined — confirm whether a
  // gtid parameter should be threaded through or __kmp_get_gtid() used.
  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
  }
#endif // BUILD_TIED_TASK_STACK
}
2884 
2885 // __kmp_realloc_task_threads_data:
2886 // Allocates a threads_data array for a task team, either by allocating an
2887 // initial array or enlarging an existing array.  Only the first thread to get
2888 // the lock allocs or enlarges the array and re-initializes the array eleemnts.
2889 // That thread returns "TRUE", the rest return "FALSE".
2890 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2891 // The current size is given by task_team -> tt.tt_max_threads.
2892 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2893                                            kmp_task_team_t *task_team) {
2894   kmp_thread_data_t **threads_data_p;
2895   kmp_int32 nthreads, maxthreads;
2896   int is_init_thread = FALSE;
2897 
2898   if (TCR_4(task_team->tt.tt_found_tasks)) {
2899     // Already reallocated and initialized.
2900     return FALSE;
2901   }
2902 
2903   threads_data_p = &task_team->tt.tt_threads_data;
2904   nthreads = task_team->tt.tt_nproc;
2905   maxthreads = task_team->tt.tt_max_threads;
2906 
2907   // All threads must lock when they encounter the first task of the implicit
2908   // task region to make sure threads_data fields are (re)initialized before
2909   // used.
2910   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2911 
2912   if (!TCR_4(task_team->tt.tt_found_tasks)) {
2913     // first thread to enable tasking
2914     kmp_team_t *team = thread->th.th_team;
2915     int i;
2916 
2917     is_init_thread = TRUE;
2918     if (maxthreads < nthreads) {
2919 
2920       if (*threads_data_p != NULL) {
2921         kmp_thread_data_t *old_data = *threads_data_p;
2922         kmp_thread_data_t *new_data = NULL;
2923 
2924         KE_TRACE(
2925             10,
2926             ("__kmp_realloc_task_threads_data: T#%d reallocating "
2927              "threads data for task_team %p, new_size = %d, old_size = %d\n",
2928              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
2929         // Reallocate threads_data to have more elements than current array
2930         // Cannot use __kmp_thread_realloc() because threads not around for
2931         // kmp_reap_task_team( ).  Note all new array entries are initialized
2932         // to zero by __kmp_allocate().
2933         new_data = (kmp_thread_data_t *)__kmp_allocate(
2934             nthreads * sizeof(kmp_thread_data_t));
2935         // copy old data to new data
2936         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
2937                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
2938 
2939 #ifdef BUILD_TIED_TASK_STACK
2940         // GEH: Figure out if this is the right thing to do
2941         for (i = maxthreads; i < nthreads; i++) {
2942           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2943           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2944         }
2945 #endif // BUILD_TIED_TASK_STACK
2946         // Install the new data and free the old data
2947         (*threads_data_p) = new_data;
2948         __kmp_free(old_data);
2949       } else {
2950         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
2951                       "threads data for task_team %p, size = %d\n",
2952                       __kmp_gtid_from_thread(thread), task_team, nthreads));
2953         // Make the initial allocate for threads_data array, and zero entries
2954         // Cannot use __kmp_thread_calloc() because threads not around for
2955         // kmp_reap_task_team( ).
2956         ANNOTATE_IGNORE_WRITES_BEGIN();
2957         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
2958             nthreads * sizeof(kmp_thread_data_t));
2959         ANNOTATE_IGNORE_WRITES_END();
2960 #ifdef BUILD_TIED_TASK_STACK
2961         // GEH: Figure out if this is the right thing to do
2962         for (i = 0; i < nthreads; i++) {
2963           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2964           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2965         }
2966 #endif // BUILD_TIED_TASK_STACK
2967       }
2968       task_team->tt.tt_max_threads = nthreads;
2969     } else {
2970       // If array has (more than) enough elements, go ahead and use it
2971       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
2972     }
2973 
2974     // initialize threads_data pointers back to thread_info structures
2975     for (i = 0; i < nthreads; i++) {
2976       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2977       thread_data->td.td_thr = team->t.t_threads[i];
2978 
2979       if (thread_data->td.td_deque_last_stolen >= nthreads) {
2980         // The last stolen field survives across teams / barrier, and the number
2981         // of threads may have changed.  It's possible (likely?) that a new
2982         // parallel region will exhibit the same behavior as previous region.
2983         thread_data->td.td_deque_last_stolen = -1;
2984       }
2985     }
2986 
2987     KMP_MB();
2988     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
2989   }
2990 
2991   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2992   return is_init_thread;
2993 }
2994 
2995 // __kmp_free_task_threads_data:
2996 // Deallocates a threads_data array for a task team, including any attached
2997 // tasking deques.  Only occurs at library shutdown.
2998 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
2999   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3000   if (task_team->tt.tt_threads_data != NULL) {
3001     int i;
3002     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3003       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3004     }
3005     __kmp_free(task_team->tt.tt_threads_data);
3006     task_team->tt.tt_threads_data = NULL;
3007   }
3008   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3009 }
3010 
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible.  Also initializes data
// structures.
//
// thread: calling thread, used for tracing (the two KA_TRACEs guard
//         thread == NULL, but the KE_TRACE on the allocation path does not —
//         presumably thread is always non-NULL there; TODO confirm)
// team:   team this task team will serve
// returns: a task team, either recycled from __kmp_free_task_teams or
//          freshly zero-allocated, with its per-use fields reinitialized
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {
  kmp_task_team_t *task_team = NULL;
  int nthreads;

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), team));

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Take a task team from the task team pool.  The unlocked TCR_PTR read
    // above is only a hint; the list head is re-checked under the lock.
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    if (__kmp_free_task_teams != NULL) {
      task_team = __kmp_free_task_teams;
      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
      task_team->tt.tt_next = NULL;
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }

  if (task_team == NULL) {
    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
                  "task team for team %p\n",
                  __kmp_gtid_from_thread(thread), team));
    // Allocate a new task team if one is not available.
    // Cannot use __kmp_thread_malloc() because threads not around for
    // kmp_reap_task_team( ).
    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
    // AC: __kmp_allocate zeroes returned memory
    // task_team -> tt.tt_threads_data = NULL;
    // task_team -> tt.tt_max_threads = 0;
    // task_team -> tt.tt_next = NULL;
  }

  // (Re)initialize the fields that must be reset on every use, whether the
  // struct was recycled from the free list or just allocated.
  TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;

  // Every thread of the team must check in before the task team is finished.
  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
  TCW_4(task_team->tt.tt_active, TRUE);

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                "unfinished_threads init'd to %d\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
  return task_team;
}
3064 
// __kmp_free_task_team:
// Frees the task team associated with a specific thread, and adds it
// to the global task team free list.  The struct itself is not deallocated
// here; final destruction happens in __kmp_reap_task_teams() at shutdown.
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
                thread ? __kmp_gtid_from_thread(thread) : -1, task_team));

  // Put task team back on free list
  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);

  // The struct must already be unlinked (tt_next is cleared whenever a task
  // team is popped off the free list in __kmp_allocate_task_team).
  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
  task_team->tt.tt_next = __kmp_free_task_teams;
  TCW_PTR(__kmp_free_task_teams, task_team);

  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
}
3081 
3082 // __kmp_reap_task_teams:
3083 // Free all the task teams on the task team free list.
3084 // Should only be done during library shutdown.
3085 // Cannot do anything that needs a thread structure or gtid since they are
3086 // already gone.
3087 void __kmp_reap_task_teams(void) {
3088   kmp_task_team_t *task_team;
3089 
3090   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3091     // Free all task_teams on the free list
3092     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3093     while ((task_team = __kmp_free_task_teams) != NULL) {
3094       __kmp_free_task_teams = task_team->tt.tt_next;
3095       task_team->tt.tt_next = NULL;
3096 
3097       // Free threads_data if necessary
3098       if (task_team->tt.tt_threads_data != NULL) {
3099         __kmp_free_task_threads_data(task_team);
3100       }
3101       __kmp_free(task_team);
3102     }
3103     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3104   }
3105 }
3106 
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks.  Wait for each thread to unreference its task team.
// Loops over the thread pool, waking sleeping threads as needed, until every
// pooled thread has published a NULL th_task_team pointer.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done;

  KMP_INIT_YIELD(spins);

  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be is wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      // A NULL task team pointer means this thread has already let go.
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      // A dead thread can never clear its own pointer; do it on its behalf.
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If we are oversubscribed, or have waited a bit (and library mode is
    // throughput), yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
  }
}
3170 
// __kmp_task_team_setup:  Create a task_team for the current team, but use
// an already created, unused one if it already exists.
// Task teams are double-buffered: t_task_team[0] and t_task_team[1] alternate
// across barrier phases, indexed by the master's th_task_state parity.
// The trace strings indicate this is executed by the master thread.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct(above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region;
      // adjust as needed
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      // Reactivate the recycled task team only if it was deactivated or the
      // team size changed since its last use.
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
                          team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}
3233 
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier.  This may be
// called by any thread, but only for teams with # threads > 1.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to (the two slots of t_task_team alternate across barriers).
  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
}
3253 
// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &task_team->tt.tt_unfinished_threads),
                       0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
#if OMP_45_ENABLED
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#else
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
#endif
    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
    // Full fence: make the deactivation above visible before this thread's
    // task team pointer is cleared below.
    KMP_MB();

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}
3303 
// __kmp_tasking_barrier:
// This routine may only called when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  // Spin location: the count of team threads that still have unfinished tasks.
  std::atomic<kmp_uint32> *spin = RCAST(
      std::atomic<kmp_uint32> *,
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32 spin_flag(spin, 0U);
  // Execute queued tasks until the spin location reaches 0, yielding between
  // attempts and bailing out early on library shutdown/abort.
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */

    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE); // GH: We always yield here
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */
}
3338 
3339 #if OMP_45_ENABLED
3340 
// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
//
// thread: thread used as the allocating thread if the deque must grow
// tid:    index of the target deque in tt_threads_data
// task:   task to enqueue
// pass:   full-sweep counter from the caller; a full deque is only grown
//         once its size/INITIAL_TASK_DEQUE_SIZE ratio falls below pass
// returns: true iff the task was enqueued
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  // Unlocked fullness check first; re-checked under the lock below.
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    __kmp_realloc_task_deque(thread, thread_data);

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    // The deque may have filled up between the unlocked check and taking the
    // lock, so test again while holding it.
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit; // result is still false: nothing was enqueued

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}
3421 
3422 /* The finish of the proxy tasks is divided in two pieces:
3423     - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team
3425 
3426    In order to run the bottom half the task gets queued back into one of the
3427    threads of the team. Once the td_incomplete_child_task counter of the parent
3428    is decremented the threads can leave the barriers. So, the bottom half needs
3429    to be queued before the counter is decremented. The top half is therefore
3430    divided in two parts:
3431     - things that can be run before queuing the bottom half
3432     - things that must be run after queuing the bottom half
3433 
3434    This creates a second race as the bottom half can free the task before the
3435    second top half is executed. To avoid this we use the
3436    td_incomplete_child_task of the proxy task to synchronize the top and bottom
3437    half. */
// First part of the proxy-task top half: runs before the bottom half is
// queued back into the team.  Marks the task complete, takes it out of its
// taskgroup, and pins it with an extra (imaginary) child reference so the
// bottom half cannot free it until the second top half runs.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary children for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
}
3453 
3454 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3455   kmp_int32 children = 0;
3456 
3457   // Predecrement simulated by "- 1" calculation
3458   children =
3459       KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3460   KMP_DEBUG_ASSERT(children >= 0);
3461 
3462   // Remove the imaginary children
3463   KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3464 }
3465 
// Bottom half of proxy-task completion; runs on a thread of the task's team.
// Waits for the second top half to drop the imaginary child reference, then
// releases dependences and frees the task.
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
3482 
3483 /*!
3484 @ingroup TASKING
3485 @param gtid Global Thread ID of encountering thread
3486 @param ptask Task which execution is completed
3487 
3488 Execute the completation of a proxy task from a thread of that is part of the
3489 team. Run first and bottom halves directly.
3490 */
3491 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3492   KMP_DEBUG_ASSERT(ptask != NULL);
3493   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3494   KA_TRACE(
3495       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3496            gtid, taskdata));
3497 
3498   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3499 
3500   __kmp_first_top_half_finish_proxy(taskdata);
3501   __kmp_second_top_half_finish_proxy(taskdata);
3502   __kmp_bottom_half_finish_proxy(gtid, ptask);
3503 
3504   KA_TRACE(10,
3505            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3506             gtid, taskdata));
3507 }
3508 
3509 /*!
3510 @ingroup TASKING
3511 @param ptask Task which execution is completed
3512 
3513 Execute the completation of a proxy task from a thread that could not belong to
3514 the team.
3515 */
3516 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3517   KMP_DEBUG_ASSERT(ptask != NULL);
3518   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3519 
3520   KA_TRACE(
3521       10,
3522       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3523        taskdata));
3524 
3525   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3526 
3527   __kmp_first_top_half_finish_proxy(taskdata);
3528 
3529   // Enqueue task to complete bottom half completion from a thread within the
3530   // corresponding team
3531   kmp_team_t *team = taskdata->td_team;
3532   kmp_int32 nthreads = team->t.t_nproc;
3533   kmp_info_t *thread;
3534 
3535   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3536   // but we cannot use __kmp_get_random here
3537   kmp_int32 start_k = 0;
3538   kmp_int32 pass = 1;
3539   kmp_int32 k = start_k;
3540 
3541   do {
3542     // For now we're just linearly trying to find a thread
3543     thread = team->t.t_threads[k];
3544     k = (k + 1) % nthreads;
3545 
3546     // we did a full pass through all the threads
3547     if (k == start_k)
3548       pass = pass << 1;
3549 
3550   } while (!__kmp_give_task(thread, k, ptask, pass));
3551 
3552   __kmp_second_top_half_finish_proxy(taskdata);
3553 
3554   KA_TRACE(
3555       10,
3556       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3557        taskdata));
3558 }
3559 
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// for taskloop
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated (must be a non-proxy
//           explicit task)
// returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  // Start from a bitwise copy of the whole source allocation, then patch the
  // fields that must differ for the new task.
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // shareds live inside the same allocation; recompute the pointer relative
    // to the new block using the source's offset.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_taskgroup =
      parent_task
          ->td_taskgroup; // task inherits the taskgroup from the parent task

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}
3629 
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

// The GOMP-compat loop-bound code below only handles 4- and 8-byte long.
KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3637 
3638 // class to encapsulate manipulating loop bounds in a taskloop task.
3639 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
3640 // the loop bound variables.
3641 class kmp_taskloop_bounds_t {
3642   kmp_task_t *task;
3643   const kmp_taskdata_t *taskdata;
3644   size_t lower_offset;
3645   size_t upper_offset;
3646 
3647 public:
3648   kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
3649       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
3650         lower_offset((char *)lb - (char *)task),
3651         upper_offset((char *)ub - (char *)task) {
3652     KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
3653     KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
3654   }
3655   kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
3656       : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
3657         lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
3658   size_t get_lower_offset() const { return lower_offset; }
3659   size_t get_upper_offset() const { return upper_offset; }
3660   kmp_uint64 get_lb() const {
3661     kmp_int64 retval;
3662 #if defined(KMP_GOMP_COMPAT)
3663     // Intel task just returns the lower bound normally
3664     if (!taskdata->td_flags.native) {
3665       retval = *(kmp_int64 *)((char *)task + lower_offset);
3666     } else {
3667       // GOMP task has to take into account the sizeof(long)
3668       if (taskdata->td_size_loop_bounds == 4) {
3669         kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
3670         retval = (kmp_int64)*lb;
3671       } else {
3672         kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
3673         retval = (kmp_int64)*lb;
3674       }
3675     }
3676 #else
3677     retval = *(kmp_int64 *)((char *)task + lower_offset);
3678 #endif // defined(KMP_GOMP_COMPAT)
3679     return retval;
3680   }
3681   kmp_uint64 get_ub() const {
3682     kmp_int64 retval;
3683 #if defined(KMP_GOMP_COMPAT)
3684     // Intel task just returns the upper bound normally
3685     if (!taskdata->td_flags.native) {
3686       retval = *(kmp_int64 *)((char *)task + upper_offset);
3687     } else {
3688       // GOMP task has to take into account the sizeof(long)
3689       if (taskdata->td_size_loop_bounds == 4) {
3690         kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
3691         retval = (kmp_int64)*ub;
3692       } else {
3693         kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
3694         retval = (kmp_int64)*ub;
3695       }
3696     }
3697 #else
3698     retval = *(kmp_int64 *)((char *)task + upper_offset);
3699 #endif // defined(KMP_GOMP_COMPAT)
3700     return retval;
3701   }
3702   void set_lb(kmp_uint64 lb) {
3703 #if defined(KMP_GOMP_COMPAT)
3704     // Intel task just sets the lower bound normally
3705     if (!taskdata->td_flags.native) {
3706       *(kmp_uint64 *)((char *)task + lower_offset) = lb;
3707     } else {
3708       // GOMP task has to take into account the sizeof(long)
3709       if (taskdata->td_size_loop_bounds == 4) {
3710         kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
3711         *lower = (kmp_uint32)lb;
3712       } else {
3713         kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
3714         *lower = (kmp_uint64)lb;
3715       }
3716     }
3717 #else
3718     *(kmp_uint64 *)((char *)task + lower_offset) = lb;
3719 #endif // defined(KMP_GOMP_COMPAT)
3720   }
3721   void set_ub(kmp_uint64 ub) {
3722 #if defined(KMP_GOMP_COMPAT)
3723     // Intel task just sets the upper bound normally
3724     if (!taskdata->td_flags.native) {
3725       *(kmp_uint64 *)((char *)task + upper_offset) = ub;
3726     } else {
3727       // GOMP task has to take into account the sizeof(long)
3728       if (taskdata->td_size_loop_bounds == 4) {
3729         kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
3730         *upper = (kmp_uint32)ub;
3731       } else {
3732         kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
3733         *upper = (kmp_uint64)ub;
3734       }
3735     }
3736 #else
3737     *(kmp_uint64 *)((char *)task + upper_offset) = ub;
3738 #endif // defined(KMP_GOMP_COMPAT)
3739   }
3740 };
3741 
3742 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3743 //
3744 // loc        Source location information
3745 // gtid       Global thread ID
3746 // task       Pattern task, exposes the loop iteration range
3747 // lb         Pointer to loop lower bound in task structure
3748 // ub         Pointer to loop upper bound in task structure
3749 // st         Loop stride
3750 // ub_glob    Global upper bound (used for lastprivate check)
3751 // num_tasks  Number of tasks to execute
3752 // grainsize  Number of loop iterations per task
3753 // extras     Number of chunks with grainsize+1 iterations
3754 // tc         Iterations count
3755 // task_dup   Tasks duplication routine
3756 // codeptr_ra Return address for OMPT events
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;

  // Invariant: trip count is fully covered by num_tasks chunks of grainsize
  // iterations plus 'extras' chunks receiving one extra iteration each.
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
                task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      // GOMP (native) tasks use an exclusive upper bound: bump it one stride
      // step past the last iteration of this chunk.
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                           codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}
3846 
// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
typedef struct __taskloop_params {
  kmp_task_t *task; // pattern task for the subrange
  kmp_uint64 *lb; // pointer to loop lower bound in the task structure
  kmp_uint64 *ub; // pointer to loop upper bound in the task structure
  void *task_dup; // task duplication routine (p_task_dup_t)
  kmp_int64 st; // loop stride
  kmp_uint64 ub_glob; // global upper bound (for lastprivate check)
  kmp_uint64 num_tasks; // number of tasks to execute for this subrange
  kmp_uint64 grainsize; // number of loop iterations per task
  kmp_uint64 extras; // number of chunks with grainsize+1 iterations
  kmp_uint64 tc; // iterations count of the subrange
  kmp_uint64 num_t_min; // threshold below which tasks are generated linearly
#if OMPT_SUPPORT
  void *codeptr_ra; // return address for OMPT events
#endif
} __taskloop_params_t;
3865 
3866 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
3867                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
3868                           kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
3869 #if OMPT_SUPPORT
3870                           void *,
3871 #endif
3872                           void *);
3873 
// Execute part of the taskloop submitted as a task.
3875 int __kmp_taskloop_task(int gtid, void *ptask) {
3876   __taskloop_params_t *p =
3877       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
3878   kmp_task_t *task = p->task;
3879   kmp_uint64 *lb = p->lb;
3880   kmp_uint64 *ub = p->ub;
3881   void *task_dup = p->task_dup;
3882   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3883   kmp_int64 st = p->st;
3884   kmp_uint64 ub_glob = p->ub_glob;
3885   kmp_uint64 num_tasks = p->num_tasks;
3886   kmp_uint64 grainsize = p->grainsize;
3887   kmp_uint64 extras = p->extras;
3888   kmp_uint64 tc = p->tc;
3889   kmp_uint64 num_t_min = p->num_t_min;
3890 #if OMPT_SUPPORT
3891   void *codeptr_ra = p->codeptr_ra;
3892 #endif
3893 #if KMP_DEBUG
3894   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3895   KMP_DEBUG_ASSERT(task != NULL);
3896   KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
3897                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
3898                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
3899                 task_dup));
3900 #endif
3901   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
3902   if (num_tasks > num_t_min)
3903     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3904                          grainsize, extras, tc, num_t_min,
3905 #if OMPT_SUPPORT
3906                          codeptr_ra,
3907 #endif
3908                          task_dup);
3909   else
3910     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3911                           grainsize, extras, tc,
3912 #if OMPT_SUPPORT
3913                           codeptr_ra,
3914 #endif
3915                           task_dup);
3916 
3917   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
3918   return 0;
3919 }
3920 
// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
3923 //
3924 // loc        Source location information
3925 // gtid       Global thread ID
3926 // task       Pattern task, exposes the loop iteration range
3927 // lb         Pointer to loop lower bound in task structure
3928 // ub         Pointer to loop upper bound in task structure
3929 // st         Loop stride
3930 // ub_glob    Global upper bound (used for lastprivate check)
3931 // num_tasks  Number of tasks to execute
3932 // grainsize  Number of loop iterations per task
3933 // extras     Number of chunks with grainsize+1 iterations
3934 // tc         Iterations count
// num_t_min  Threshold to launch tasks recursively
3936 // task_dup   Tasks duplication routine
3937 // codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_uint64 tc, kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
#endif
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  //  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  // distribute the 'extras' (+1 iteration) chunks between the two halves
  if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
}
4043 
4044 /*!
4045 @ingroup TASKING
4046 @param loc       Source location information
4047 @param gtid      Global thread ID
4048 @param task      Task structure
4049 @param if_val    Value of the if clause
4050 @param lb        Pointer to loop lower bound in task structure
4051 @param ub        Pointer to loop upper bound in task structure
4052 @param st        Loop stride
4053 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
4054 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
4055 @param grainsize Schedule value if specified
4056 @param task_dup  Tasks duplication routine
4057 
4058 Execute the taskloop construct.
4059 */
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);

  // implicit taskgroup around the whole taskloop unless nogroup specified
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    // intentional fallthrough: handle the computed value as if num_tasks
    // had been provided
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1; // too big grainsize requested, adjust values
      grainsize = tc;
      extras = 0;
    } else {
      num_tasks = tc / grainsize;
      // adjust grainsize for balanced distribution of iterations
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}
4210 
4211 #endif
4212