1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_stats.h"
20 #include "kmp_wait_release.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* forward declaration */
29 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
30                                  kmp_info_t *this_thr);
31 static void __kmp_alloc_task_deque(kmp_info_t *thread,
32                                    kmp_thread_data_t *thread_data);
33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
34                                            kmp_task_team_t *task_team);
35 
36 #ifdef OMP_45_ENABLED
37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
38 #endif
39 
40 #ifdef BUILD_TIED_TASK_STACK
41 
//  __kmp_trace_task_stack: print the tied tasks from the task stack in order
//  from top to bottom
44 //
45 //  gtid: global thread identifier for thread containing stack
46 //  thread_data: thread data for task team thread containing stack
47 //  threshold: value above which the trace statement triggers
48 //  location: string identifying call site of this function (for trace)
49 static void __kmp_trace_task_stack(kmp_int32 gtid,
50                                    kmp_thread_data_t *thread_data,
51                                    int threshold, char *location) {
52   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
53   kmp_taskdata_t **stack_top = task_stack->ts_top;
54   kmp_int32 entries = task_stack->ts_entries;
55   kmp_taskdata_t *tied_task;
56 
57   KA_TRACE(
58       threshold,
59       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
60        "first_block = %p, stack_top = %p \n",
61        location, gtid, entries, task_stack->ts_first_block, stack_top));
62 
63   KMP_DEBUG_ASSERT(stack_top != NULL);
64   KMP_DEBUG_ASSERT(entries > 0);
65 
66   while (entries != 0) {
67     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
68     // fix up ts_top if we need to pop from previous block
69     if (entries & TASK_STACK_INDEX_MASK == 0) {
70       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
71 
72       stack_block = stack_block->sb_prev;
73       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
74     }
75 
76     // finish bookkeeping
77     stack_top--;
78     entries--;
79 
80     tied_task = *stack_top;
81 
82     KMP_DEBUG_ASSERT(tied_task != NULL);
83     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
84 
85     KA_TRACE(threshold,
86              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
87               "stack_top=%p, tied_task=%p\n",
88               location, gtid, entries, stack_top, tied_task));
89   }
90   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
91 
92   KA_TRACE(threshold,
93            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
94             location, gtid));
95 }
96 
97 //  __kmp_init_task_stack: initialize the task stack for the first time
98 //  after a thread_data structure is created.
99 //  It should not be necessary to do this again (assuming the stack works).
100 //
101 //  gtid: global thread identifier of calling thread
102 //  thread_data: thread data for task team thread containing stack
103 static void __kmp_init_task_stack(kmp_int32 gtid,
104                                   kmp_thread_data_t *thread_data) {
105   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
106   kmp_stack_block_t *first_block;
107 
108   // set up the first block of the stack
109   first_block = &task_stack->ts_first_block;
110   task_stack->ts_top = (kmp_taskdata_t **)first_block;
111   memset((void *)first_block, '\0',
112          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
113 
114   // initialize the stack to be empty
115   task_stack->ts_entries = TASK_STACK_EMPTY;
116   first_block->sb_next = NULL;
117   first_block->sb_prev = NULL;
118 }
119 
120 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
121 //
122 //  gtid: global thread identifier for calling thread
123 //  thread_data: thread info for thread containing stack
124 static void __kmp_free_task_stack(kmp_int32 gtid,
125                                   kmp_thread_data_t *thread_data) {
126   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
127   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
128 
129   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
130   // free from the second block of the stack
131   while (stack_block != NULL) {
132     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
133 
134     stack_block->sb_next = NULL;
135     stack_block->sb_prev = NULL;
136     if (stack_block != &task_stack->ts_first_block) {
137       __kmp_thread_free(thread,
138                         stack_block); // free the block, if not the first
139     }
140     stack_block = next_block;
141   }
142   // initialize the stack to be empty
143   task_stack->ts_entries = 0;
144   task_stack->ts_top = NULL;
145 }
146 
147 //  __kmp_push_task_stack: Push the tied task onto the task stack.
148 //     Grow the stack if necessary by allocating another block.
149 //
150 //  gtid: global thread identifier for calling thread
151 //  thread: thread info for thread containing stack
152 //  tied_task: the task to push on the stack
153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
154                                   kmp_taskdata_t *tied_task) {
155   // GEH - need to consider what to do if tt_threads_data not allocated yet
156   kmp_thread_data_t *thread_data =
157       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
158   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
159 
160   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
161     return; // Don't push anything on stack if team or team tasks are serialized
162   }
163 
164   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
165   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
166 
167   KA_TRACE(20,
168            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
169             gtid, thread, tied_task));
170   // Store entry
171   *(task_stack->ts_top) = tied_task;
172 
173   // Do bookkeeping for next push
174   task_stack->ts_top++;
175   task_stack->ts_entries++;
176 
177   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
178     // Find beginning of this task block
179     kmp_stack_block_t *stack_block =
180         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
181 
182     // Check if we already have a block
183     if (stack_block->sb_next !=
184         NULL) { // reset ts_top to beginning of next block
185       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
186     } else { // Alloc new block and link it up
187       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
188           thread, sizeof(kmp_stack_block_t));
189 
190       task_stack->ts_top = &new_block->sb_block[0];
191       stack_block->sb_next = new_block;
192       new_block->sb_prev = stack_block;
193       new_block->sb_next = NULL;
194 
195       KA_TRACE(
196           30,
197           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
198            gtid, tied_task, new_block));
199     }
200   }
201   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
202                 tied_task));
203 }
204 
205 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
206 //  the task, just check to make sure it matches the ending task passed in.
207 //
208 //  gtid: global thread identifier for the calling thread
209 //  thread: thread info structure containing stack
210 //  tied_task: the task popped off the stack
211 //  ending_task: the task that is ending (should match popped task)
212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
213                                  kmp_taskdata_t *ending_task) {
214   // GEH - need to consider what to do if tt_threads_data not allocated yet
215   kmp_thread_data_t *thread_data =
216       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
217   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
218   kmp_taskdata_t *tied_task;
219 
220   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
221     // Don't pop anything from stack if team or team tasks are serialized
222     return;
223   }
224 
225   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
226   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
227 
228   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
229                 thread));
230 
231   // fix up ts_top if we need to pop from previous block
232   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
233     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
234 
235     stack_block = stack_block->sb_prev;
236     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
237   }
238 
239   // finish bookkeeping
240   task_stack->ts_top--;
241   task_stack->ts_entries--;
242 
243   tied_task = *(task_stack->ts_top);
244 
245   KMP_DEBUG_ASSERT(tied_task != NULL);
246   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
247   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
248 
249   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
250                 tied_task));
251   return;
252 }
253 #endif /* BUILD_TIED_TASK_STACK */
254 
255 //  __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized;
  // a serialized task is never deferred, so there is no deque to push to.
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    // Lazily activate tasking for the team on the first deferred task.
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  // Check if deque is full (unlocked fast check; rechecked under the lock
  // below for the OMP_45 proxy-task case).
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Lock the deque for the task push operation
  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

#if OMP_45_ENABLED
  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }
#else
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
#endif

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index: the deque is a power-of-two-sized circular buffer.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count

  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
346 
347 // __kmp_pop_current_task_from_thread: set up current task from called thread
348 // when team ends
349 //
350 // this_thr: thread structure to set current_task in.
351 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
352   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
353                 "this_thread=%p, curtask=%p, "
354                 "curtask_parent=%p\n",
355                 0, this_thr, this_thr->th.th_current_task,
356                 this_thr->th.th_current_task->td_parent));
357 
358   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
359 
360   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
361                 "this_thread=%p, curtask=%p, "
362                 "curtask_parent=%p\n",
363                 0, this_thr, this_thr->th.th_current_task,
364                 this_thr->th.th_current_task->td_parent));
365 }
366 
367 // __kmp_push_current_task_to_thread: set up current task in called thread for a
368 // new team
369 //
370 // this_thr: thread structure to set up
371 // team: team for implicit task data
372 // tid: thread within team to set up
373 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
374                                        int tid) {
375   // current task of the thread is a parent of the new just created implicit
376   // tasks of new team
377   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
378                 "curtask=%p "
379                 "parent_task=%p\n",
380                 tid, this_thr, this_thr->th.th_current_task,
381                 team->t.t_implicit_task_taskdata[tid].td_parent));
382 
383   KMP_DEBUG_ASSERT(this_thr != NULL);
384 
385   if (tid == 0) {
386     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
387       team->t.t_implicit_task_taskdata[0].td_parent =
388           this_thr->th.th_current_task;
389       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
390     }
391   } else {
392     team->t.t_implicit_task_taskdata[tid].td_parent =
393         team->t.t_implicit_task_taskdata[0].td_parent;
394     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
395   }
396 
397   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
398                 "curtask=%p "
399                 "parent_task=%p\n",
400                 tid, this_thr, this_thr->th.th_current_task,
401                 team->t.t_implicit_task_taskdata[tid].td_parent));
402 }
403 
404 // __kmp_task_start: bookkeeping for a task starting execution
405 //
406 // GTID: global thread id of calling thread
407 // task: task starting execution
408 // current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may be started more than once (each continuation restarts
  // it), so 'started'/'executing' may already be set for TASK_UNTIED.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

#if OMPT_SUPPORT
  // Report the task-begin event to a registered OMPT tool, passing the
  // parent's task id/frame when a parent exists.
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_begin)(
        parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
        parent ? &(parent->ompt_task_info.frame) : NULL,
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
  }
#endif
#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
  /* OMPT emit all dependences if requested by the tool */
  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
      ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
    ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
        taskdata->ompt_task_info.ndeps);
    /* We can now free the allocated memory for the dependencies */
    KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
    taskdata->ompt_task_info.deps = NULL;
    taskdata->ompt_task_info.ndeps = 0;
  }
#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */

  return;
}
476 
477 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
478 // execution
479 //
480 // loc_ref: source location information; points to beginning of task block.
481 // gtid: global thread number.
482 // task: task thunk for the started task.
483 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
484                                kmp_task_t *task) {
485   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
486   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
487 
488   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
489                 "current_task=%p\n",
490                 gtid, loc_ref, taskdata, current_task));
491 
492   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
493     // untied task needs to increment counter so that the task structure is not
494     // freed prematurely
495     kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
496     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
497                   "incremented for task %p\n",
498                   gtid, counter, taskdata));
499   }
500 
501   taskdata->td_flags.task_serial =
502       1; // Execute this task immediately, not deferred.
503   __kmp_task_start(gtid, task, current_task);
504 
505   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
506                 loc_ref, taskdata));
507 
508   return;
509 }
510 
511 #ifdef TASK_UNUSED
512 // __kmpc_omp_task_begin: report that a given task has started execution
513 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
514 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
515   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
516 
517   KA_TRACE(
518       10,
519       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
520        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
521 
522   __kmp_task_start(gtid, task, current_task);
523 
524   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
525                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
526   return;
527 }
528 #endif // TASK_UNUSED
529 
530 // __kmp_free_task: free the current task space and the space for shareds
531 //
532 // gtid: Global thread ID of calling thread
533 // taskdata: task to free
534 // thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values:
  // the task must be an explicit task that has completed, is no longer
  // executing, has not already been freed, and has no live children
  // (allocated-children may be nonzero only in the serialized case).
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
// (presumably a single allocation covering both -- one free call suffices)
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
560 
561 // __kmp_free_task_and_ancestors: free the current task and ancestors without
562 // children
563 //
564 // gtid: Global thread ID of calling thread
565 // taskdata: task to free
566 // thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Atomically drop this task's allocation reference; the "- 1" converts the
  // fetch-and-dec's returned old value into the new count.
  kmp_int32 children =
      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_allocated_child_tasks)) -
      1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
      return;

    // Predecrement simulated by "- 1" calculation
    children = KMP_TEST_THEN_DEC32(
                   (kmp_int32 *)(&taskdata->td_allocated_child_tasks)) -
               1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  // Loop exited with children > 0: the current ancestor still has live
  // allocated children, so it cannot be freed yet.
  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}
617 
618 // __kmp_task_finish: bookkeeping to do when a task finishes execution
619 //
620 // gtid: global thread ID for calling thread
621 // task: task to be finished
622 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

#if OMPT_SUPPORT
  // Report task end to a registered OMPT tool.
  // NOTE(review): 'parent' is computed but unused here -- candidate for
  // removal; kept byte-identical in this documentation pass.
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_end)(
        taskdata->ompt_task_info.task_id);
  }
#endif

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized.
  // NOTE: the braces below are deliberately woven across the OMP_40/OMP_45
  // preprocessor conditionals: with OMP_45_ENABLED the 'if' body is closed
  // early and __kmp_release_deps is guarded by a second condition that also
  // admits teams with proxy tasks; without it the release stays inside the
  // first 'if'. Do not "fix" the brace structure without checking every
  // preprocessor configuration.
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_TEST_THEN_DEC32(
            (kmp_int32 *)(&taskdata->td_parent->td_incomplete_child_tasks)) -
        1;
    KMP_DEBUG_ASSERT(children >= 0);
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
#if OMP_45_ENABLED
  }
  // if we found proxy tasks there could exist a dependency chain
  // with the proxy task as origin
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
      (task_team && task_team->tt.tt_found_proxy_tasks)) {
#endif
    __kmp_release_deps(gtid, taskdata);
#endif
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    } else
#if OMP_45_ENABLED
        // With proxy tasks the caller may legitimately pass a non-parent task
        // to resume, so skip the parent check in that case.
        if (!(task_team && task_team->tt.tt_found_proxy_tasks))
#endif
    {
      // verify resumed task passed in points to parent
      KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}
768 
769 // __kmpc_omp_task_complete_if0: report that a task has completed execution
770 //
771 // loc_ref: source location information; points to end of task block.
772 // gtid: global thread number.
773 // task: task thunk for the completed task.
774 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
775                                   kmp_task_t *task) {
776   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
777                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
778   // this routine will provide task to resume
779   __kmp_task_finish(gtid, task, NULL);
780 
781   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
782                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
783   return;
784 }
785 
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  // No obvious candidate for the task to resume here; pass NULL and let
  // __kmp_task_finish work it out.
  __kmp_task_finish(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED
801 
#if OMPT_SUPPORT
// __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
// only be called after ompt_tool, so we already know whether ompt is enabled
// or not.
static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
                                        void *function) {
  if (!ompt_enabled)
    return; // tool support disabled: leave OMPT fields untouched

  task->ompt_task_info.task_id = __ompt_task_id_new(tid);
  task->ompt_task_info.function = function;
  task->ompt_task_info.frame.exit_runtime_frame = NULL;
  task->ompt_task_info.frame.reenter_runtime_frame = NULL;
#if OMP_40_ENABLED
  // No dependences recorded yet for a freshly initialized task
  task->ompt_task_info.ndeps = 0;
  task->ompt_task_info.deps = NULL;
#endif /* OMP_40_ENABLED */
}
#endif
820 
821 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
822 // task for a given thread
823 //
824 // loc_ref:  reference to source location of parallel region
825 // this_thr:  thread data structure corresponding to implicit task
826 // team: team for this_thr
827 // tid: thread id of given thread within team
828 // set_curr_task: TRUE if need to push current task to thread
829 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
830 // have already been done elsewhere.
831 // TODO: Get better loc_ref.  Value passed in may be NULL
832 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
833                               kmp_team_t *team, int tid, int set_curr_task) {
834   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
835 
836   KF_TRACE(
837       10,
838       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
839        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
840 
841   task->td_task_id = KMP_GEN_TASK_ID();
842   task->td_team = team;
843   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
844   //    in debugger)
845   task->td_ident = loc_ref;
846   task->td_taskwait_ident = NULL;
847   task->td_taskwait_counter = 0;
848   task->td_taskwait_thread = 0;
849 
850   task->td_flags.tiedness = TASK_TIED;
851   task->td_flags.tasktype = TASK_IMPLICIT;
852 #if OMP_45_ENABLED
853   task->td_flags.proxy = TASK_FULL;
854 #endif
855 
856   // All implicit tasks are executed immediately, not deferred
857   task->td_flags.task_serial = 1;
858   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
859   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
860 
861   task->td_flags.started = 1;
862   task->td_flags.executing = 1;
863   task->td_flags.complete = 0;
864   task->td_flags.freed = 0;
865 
866 #if OMP_40_ENABLED
867   task->td_depnode = NULL;
868 #endif
869 
870   if (set_curr_task) { // only do this init first time thread is created
871     task->td_incomplete_child_tasks = 0;
872     // Not used: don't need to deallocate implicit task
873     task->td_allocated_child_tasks = 0;
874 #if OMP_40_ENABLED
875     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
876     task->td_dephash = NULL;
877 #endif
878     __kmp_push_current_task_to_thread(this_thr, team, tid);
879   } else {
880     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
881     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
882   }
883 
884 #if OMPT_SUPPORT
885   __kmp_task_init_ompt(task, tid, NULL);
886 #endif
887 
888   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
889                 team, task));
890 }
891 
892 // __kmp_finish_implicit_task: Release resources associated to implicit tasks
893 // at the end of parallel regions. Some resources are kept for reuse in the next
894 // parallel region.
895 //
896 // thread:  thread data structure corresponding to implicit task
897 void __kmp_finish_implicit_task(kmp_info_t *thread) {
898   kmp_taskdata_t *task = thread->th.th_current_task;
899   if (task->td_dephash)
900     __kmp_dephash_free_entries(thread, task->td_dephash);
901 }
902 
903 // __kmp_free_implicit_task: Release resources associated to implicit tasks
// when the threads or regions that own them are destroyed
905 //
906 // thread:  thread data structure corresponding to implicit task
907 void __kmp_free_implicit_task(kmp_info_t *thread) {
908   kmp_taskdata_t *task = thread->th.th_current_task;
909   if (task->td_dephash)
910     __kmp_dephash_free(thread, task->td_dephash);
911   task->td_dephash = NULL;
912 }
913 
914 // Round up a size to a power of two specified by val: Used to insert padding
915 // between structures co-allocated using a single malloc() call
916 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
917   if (size & (val - 1)) {
918     size &= ~(val - 1);
919     if (size <= KMP_SIZE_T_MAX - val) {
920       size += val; // Round up if there is no overflow.
921     }; // if
922   }; // if
923   return size;
924 } // __kmp_round_up_to_va
925 
926 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
927 //
928 // loc_ref: source location information
929 // gtid: global thread number.
930 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
931 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
932 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
933 // private vars accessed in task.
934 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
935 // in task.
936 // task_entry: Pointer to task code entry point generated by compiler.
937 // returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  // A task created inside a final task inherits finality (OpenMP final
  // clause semantics).
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

#if OMP_45_ENABLED
  // Proxy tasks (target-nowait support) are forced untied/merged and require
  // the tasking machinery to be up even in a serialized team.
  if (flags->proxy == TASK_PROXY) {
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  // kmp_task_t lives directly after kmp_taskdata_t in the same allocation
  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  // Fill in the taskdata bookkeeping fields
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  taskdata->td_untied_count = 0;
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  taskdata->td_incomplete_child_tasks = 0;
  taskdata->td_allocated_child_tasks = 1; // start at one because counts current
// task and children
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif

// Only need to keep track of child task counts if team parallel and tasking not
// serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    // Atomically bump the parent's (and taskgroup's) live-child counters;
    // other threads may be decrementing them concurrently in task_finish.
    KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_incomplete_child_tasks));
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_TEST_THEN_INC32(
          (kmp_int32 *)(&taskdata->td_parent->td_allocated_child_tasks));
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
#endif

  return task;
}
1136 
1137 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1138                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1139                                   size_t sizeof_shareds,
1140                                   kmp_routine_entry_t task_entry) {
1141   kmp_task_t *retval;
1142   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1143 
1144   input_flags->native = FALSE;
1145 // __kmp_task_alloc() sets up all other runtime flags
1146 
1147 #if OMP_45_ENABLED
1148   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1149                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1150                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1151                 input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1152                 sizeof_shareds, task_entry));
1153 #else
1154   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1155                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1156                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1157                 sizeof_kmp_task_t, sizeof_shareds, task_entry));
1158 #endif
1159 
1160   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1161                             sizeof_shareds, task_entry);
1162 
1163   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1164 
1165   return retval;
1166 }
1167 
1168 //  __kmp_invoke_task: invoke the specified task
1169 //
1170 // gtid: global thread ID of caller
1171 // task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  // cur_time is only assigned/read when __kmp_forkjoin_frames_mode == 3
  // (both sites below are guarded by the same condition).
  kmp_uint64 cur_time;
#if OMP_40_ENABLED
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__kmp_forkjoin_frames_mode == 3) {
    // Get the current time stamp to measure task execution time to correct
    // barrier imbalance time
    cur_time = __itt_get_timestamp();
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if OMPT_SUPPORT
  ompt_thread_info_t oldInfo;
  kmp_info_t *thread;
  if (ompt_enabled) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_runtime_frame =
        __builtin_frame_address(0);
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *this_team = this_thr->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  // NOTE: this opening brace is closed near the end of the function, inside
  // a matching #if OMP_40_ENABLED block -- keep the structure intact.
  if (!discard) {
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Attribute the task's execution time to the runtime context it was
    // scheduled from (barrier, taskwait, taskgroup, ...)
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're about to run this task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          current_task->ompt_task_info.task_id,
          taskdata->ompt_task_info.task_id);
    }
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      // GOMP-style thunks take only the shareds pointer, no gtid
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're returning to the callee task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          taskdata->ompt_task_info.task_id,
          current_task->ompt_task_info.task_id);
    }
#endif

#if OMP_40_ENABLED
  }
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT
  if (ompt_enabled) {
    thread->th.ompt_thread_info = oldInfo;
    taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
    __kmp_task_finish(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - correct arrive time after the task finished
  if (__kmp_forkjoin_frames_mode == 3) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    if (this_thr->th.th_bar_arrive_time) {
      this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
  }
#endif
  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}
1344 
1345 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1346 //
1347 // loc_ref: location of original task pragma (ignored)
1348 // gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new
// task''
1350 // Returns:
1351 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1352 //    be resumed later.
1353 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1354 //    resumed later.
1355 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1356                                 kmp_task_t *new_task) {
1357   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1358 
1359   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1360                 loc_ref, new_taskdata));
1361 
1362   /* Should we execute the new task or queue it? For now, let's just always try
1363      to queue it.  If the queue fills up, then we'll execute it.  */
1364 
1365   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1366   { // Execute this task immediately
1367     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1368     new_taskdata->td_flags.task_serial = 1;
1369     __kmp_invoke_task(gtid, new_task, current_task);
1370   }
1371 
1372   KA_TRACE(
1373       10,
1374       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1375        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1376        gtid, loc_ref, new_taskdata));
1377 
1378   ANNOTATE_HAPPENS_BEFORE(new_task);
1379   return TASK_CURRENT_NOT_QUEUED;
1380 }
1381 
1382 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1383 //
1384 // gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1386 // serialize_immediate: if TRUE then if the task is executed immediately its
1387 // execution will be serialized
1388 // Returns:
1389 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1390 //    be resumed later.
1391 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1392 //    resumed later.
1393 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1394                          bool serialize_immediate) {
1395   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1396 
1397 #if OMPT_SUPPORT
1398   if (ompt_enabled) {
1399     new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1400         __builtin_frame_address(1);
1401   }
1402 #endif
1403 
1404 /* Should we execute the new task or queue it? For now, let's just always try to
1405    queue it.  If the queue fills up, then we'll execute it.  */
1406 #if OMP_45_ENABLED
1407   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1408       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1409 #else
1410   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1411 #endif
1412   { // Execute this task immediately
1413     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1414     if (serialize_immediate)
1415       new_taskdata->td_flags.task_serial = 1;
1416     __kmp_invoke_task(gtid, new_task, current_task);
1417   }
1418 
1419 #if OMPT_SUPPORT
1420   if (ompt_enabled) {
1421     new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1422   }
1423 #endif
1424 
1425   ANNOTATE_HAPPENS_BEFORE(new_task);
1426   return TASK_CURRENT_NOT_QUEUED;
1427 }
1428 
1429 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1430 // non-thread-switchable task from the parent thread only!
1431 //
1432 // loc_ref: location of original task pragma (ignored)
1433 // gtid: Global Thread ID of encountering thread
1434 // new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
1436 // Returns:
1437 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1438 //    be resumed later.
1439 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1440 //    resumed later.
1441 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1442                           kmp_task_t *new_task) {
1443   kmp_int32 res;
1444   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1445 
1446 #if KMP_DEBUG
1447   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1448 #endif
1449   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1450                 new_taskdata));
1451 
1452   res = __kmp_omp_task(gtid, new_task, true);
1453 
1454   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1455                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1456                 gtid, loc_ref, new_taskdata));
1457   return res;
1458 }
1459 
1460 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1461 // complete
1462 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1463   kmp_taskdata_t *taskdata;
1464   kmp_info_t *thread;
1465   int thread_finished = FALSE;
1466   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1467 
1468   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1469 
1470   if (__kmp_tasking_mode != tskm_immediate_exec) {
1471     thread = __kmp_threads[gtid];
1472     taskdata = thread->th.th_current_task;
1473 #if OMPT_SUPPORT && OMPT_TRACE
1474     ompt_task_id_t my_task_id;
1475     ompt_parallel_id_t my_parallel_id;
1476 
1477     if (ompt_enabled) {
1478       kmp_team_t *team = thread->th.th_team;
1479       my_task_id = taskdata->ompt_task_info.task_id;
1480       my_parallel_id = team->t.ompt_team_info.parallel_id;
1481 
1482       taskdata->ompt_task_info.frame.reenter_runtime_frame =
1483           __builtin_frame_address(1);
1484       if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1485         ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id,
1486                                                                 my_task_id);
1487       }
1488     }
1489 #endif
1490 
1491 // Debugger: The taskwait is active. Store location and thread encountered the
1492 // taskwait.
1493 #if USE_ITT_BUILD
1494 // Note: These values are used by ITT events as well.
1495 #endif /* USE_ITT_BUILD */
1496     taskdata->td_taskwait_counter += 1;
1497     taskdata->td_taskwait_ident = loc_ref;
1498     taskdata->td_taskwait_thread = gtid + 1;
1499 
1500 #if USE_ITT_BUILD
1501     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1502     if (itt_sync_obj != NULL)
1503       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1504 #endif /* USE_ITT_BUILD */
1505 
1506     bool must_wait =
1507         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1508 
1509 #if OMP_45_ENABLED
1510     must_wait = must_wait || (thread->th.th_task_team != NULL &&
1511                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
1512 #endif
1513     if (must_wait) {
1514       kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
1515       while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) {
1516         flag.execute_tasks(thread, gtid, FALSE,
1517                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1518                            __kmp_task_stealing_constraint);
1519       }
1520     }
1521 #if USE_ITT_BUILD
1522     if (itt_sync_obj != NULL)
1523       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1524 #endif /* USE_ITT_BUILD */
1525 
1526     // Debugger:  The taskwait is completed. Location remains, but thread is
1527     // negated.
1528     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1529 
1530 #if OMPT_SUPPORT && OMPT_TRACE
1531     if (ompt_enabled) {
1532       if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1533         ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id,
1534                                                               my_task_id);
1535       }
1536       taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1537     }
1538 #endif
1539     ANNOTATE_HAPPENS_AFTER(taskdata);
1540   }
1541 
1542   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1543                 "returning TASK_CURRENT_NOT_QUEUED\n",
1544                 gtid, taskdata));
1545 
1546   return TASK_CURRENT_NOT_QUEUED;
1547 }
1548 
1549 // __kmpc_omp_taskyield: switch to a different task
1550 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1551   kmp_taskdata_t *taskdata;
1552   kmp_info_t *thread;
1553   int thread_finished = FALSE;
1554 
1555   KMP_COUNT_BLOCK(OMP_TASKYIELD);
1556   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1557 
1558   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1559                 gtid, loc_ref, end_part));
1560 
1561   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1562     thread = __kmp_threads[gtid];
1563     taskdata = thread->th.th_current_task;
1564 // Should we model this as a task wait or not?
1565 // Debugger: The taskwait is active. Store location and thread encountered the
1566 // taskwait.
1567 #if USE_ITT_BUILD
1568 // Note: These values are used by ITT events as well.
1569 #endif /* USE_ITT_BUILD */
1570     taskdata->td_taskwait_counter += 1;
1571     taskdata->td_taskwait_ident = loc_ref;
1572     taskdata->td_taskwait_thread = gtid + 1;
1573 
1574 #if USE_ITT_BUILD
1575     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1576     if (itt_sync_obj != NULL)
1577       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1578 #endif /* USE_ITT_BUILD */
1579     if (!taskdata->td_flags.team_serial) {
1580       kmp_task_team_t *task_team = thread->th.th_task_team;
1581       if (task_team != NULL) {
1582         if (KMP_TASKING_ENABLED(task_team)) {
1583           __kmp_execute_tasks_32(
1584               thread, gtid, NULL, FALSE,
1585               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1586               __kmp_task_stealing_constraint);
1587         }
1588       }
1589     }
1590 #if USE_ITT_BUILD
1591     if (itt_sync_obj != NULL)
1592       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1593 #endif /* USE_ITT_BUILD */
1594 
1595     // Debugger:  The taskwait is completed. Location remains, but thread is
1596     // negated.
1597     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1598   }
1599 
1600   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1601                 "returning TASK_CURRENT_NOT_QUEUED\n",
1602                 gtid, taskdata));
1603 
1604   return TASK_CURRENT_NOT_QUEUED;
1605 }
1606 
1607 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1608 #if OMP_45_ENABLED
1609 // Task Reduction implementation
1610 
// Flags the compiler passes per reduction item (see kmp_task_red_input_t).
typedef struct kmp_task_red_flags {
  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
  unsigned reserved31 : 31; // reserved bits; currently unused
} kmp_task_red_flags_t;
1615 
// internal structure for reduction data item related info;
// one per item, stored in the array hung off kmp_taskgroup_t::reduce_data
typedef struct kmp_task_red_data {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item (rounded up to cache line)
  void *reduce_priv; // thread specific data (block of copies, or ptr array
                     // when lazy_priv is set)
  void *reduce_pend; // end of private data for comparison op (eager case only)
  void *reduce_init; // data initialization routine
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_data_t;
1627 
// structure sent us by compiler - one per reduction item
// (input to __kmpc_task_reduction_init)
typedef struct kmp_task_red_input {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item
  void *reduce_init; // data initialization routine (may be NULL)
  void *reduce_fini; // data finalization routine (may be NULL)
  void *reduce_comb; // data combiner routine (mandatory)
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_input_t;
1637 
1638 /*!
1639 @ingroup TASKING
1640 @param gtid      Global thread ID
1641 @param num       Number of data items to reduce
1642 @param data      Array of data for reduction
1643 @return The taskgroup identifier
1644 
1645 Initialize task reduction for the taskgroup.
1646 */
1647 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
1648   kmp_info_t *thread = __kmp_threads[gtid];
1649   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
1650   kmp_int32 nth = thread->th.th_team_nproc;
1651   kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
1652   kmp_task_red_data_t *arr;
1653 
1654   // check input data just in case
1655   KMP_ASSERT(tg != NULL);
1656   KMP_ASSERT(data != NULL);
1657   KMP_ASSERT(num > 0);
1658   if (nth == 1) {
1659     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1660                   gtid, tg));
1661     return (void *)tg;
1662   }
1663   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1664                 gtid, tg, num));
1665   arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
1666       thread, num * sizeof(kmp_task_red_data_t));
1667   for (int i = 0; i < num; ++i) {
1668     void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
1669     size_t size = input[i].reduce_size - 1;
1670     // round the size up to cache line per thread-specific item
1671     size += CACHE_LINE - size % CACHE_LINE;
1672     KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1673     arr[i].reduce_shar = input[i].reduce_shar;
1674     arr[i].reduce_size = size;
1675     arr[i].reduce_init = input[i].reduce_init;
1676     arr[i].reduce_fini = input[i].reduce_fini;
1677     arr[i].reduce_comb = input[i].reduce_comb;
1678     arr[i].flags = input[i].flags;
1679     if (!input[i].flags.lazy_priv) {
1680       // allocate cache-line aligned block and fill it with zeros
1681       arr[i].reduce_priv = __kmp_allocate(nth * size);
1682       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
1683       if (f_init != NULL) {
1684         // initialize thread-specific items
1685         for (int j = 0; j < nth; ++j) {
1686           f_init((char *)(arr[i].reduce_priv) + j * size);
1687         }
1688       }
1689     } else {
1690       // only allocate space for pointers now,
1691       // objects will be lazily allocated/initialized once requested
1692       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
1693     }
1694   }
1695   tg->reduce_data = (void *)arr;
1696   tg->reduce_num_data = num;
1697   return (void *)tg;
1698 }
1699 
1700 /*!
1701 @ingroup TASKING
1702 @param gtid    Global thread ID
1703 @param tskgrp  The taskgroup ID (optional)
1704 @param data    Shared location of the item
1705 @return The pointer to per-thread data
1706 
1707 Get thread-specific location of data item
1708 */
1709 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
1710   kmp_info_t *thread = __kmp_threads[gtid];
1711   kmp_int32 nth = thread->th.th_team_nproc;
1712   if (nth == 1)
1713     return data; // nothing to do
1714 
1715   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
1716   if (tg == NULL)
1717     tg = thread->th.th_current_task->td_taskgroup;
1718   KMP_ASSERT(tg != NULL);
1719   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
1720   kmp_int32 num = tg->reduce_num_data;
1721   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1722 
1723   KMP_ASSERT(data != NULL);
1724   while (tg != NULL) {
1725     for (int i = 0; i < num; ++i) {
1726       if (!arr[i].flags.lazy_priv) {
1727         if (data == arr[i].reduce_shar ||
1728             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1729           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1730       } else {
1731         // check shared location first
1732         void **p_priv = (void **)(arr[i].reduce_priv);
1733         if (data == arr[i].reduce_shar)
1734           goto found;
1735         // check if we get some thread specific location as parameter
1736         for (int j = 0; j < nth; ++j)
1737           if (data == p_priv[j])
1738             goto found;
1739         continue; // not found, continue search
1740       found:
1741         if (p_priv[tid] == NULL) {
1742           // allocate thread specific object lazily
1743           void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
1744           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1745           if (f_init != NULL) {
1746             f_init(p_priv[tid]);
1747           }
1748         }
1749         return p_priv[tid];
1750       }
1751     }
1752     tg = tg->parent;
1753     arr = (kmp_task_red_data_t *)(tg->reduce_data);
1754     num = tg->reduce_num_data;
1755   }
1756   KMP_ASSERT2(0, "Unknown task reduction item");
1757   return NULL; // ERROR, this line never executed
1758 }
1759 
1760 // Finalize task reduction.
1761 // Called from __kmpc_end_taskgroup()
1762 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
1763   kmp_int32 nth = th->th.th_team_nproc;
1764   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
1765   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
1766   kmp_int32 num = tg->reduce_num_data;
1767   for (int i = 0; i < num; ++i) {
1768     void *sh_data = arr[i].reduce_shar;
1769     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
1770     void (*f_comb)(void *, void *) =
1771         (void (*)(void *, void *))(arr[i].reduce_comb);
1772     if (!arr[i].flags.lazy_priv) {
1773       void *pr_data = arr[i].reduce_priv;
1774       size_t size = arr[i].reduce_size;
1775       for (int j = 0; j < nth; ++j) {
1776         void *priv_data = (char *)pr_data + j * size;
1777         f_comb(sh_data, priv_data); // combine results
1778         if (f_fini)
1779           f_fini(priv_data); // finalize if needed
1780       }
1781     } else {
1782       void **pr_data = (void **)(arr[i].reduce_priv);
1783       for (int j = 0; j < nth; ++j) {
1784         if (pr_data[j] != NULL) {
1785           f_comb(sh_data, pr_data[j]); // combine results
1786           if (f_fini)
1787             f_fini(pr_data[j]); // finalize if needed
1788           __kmp_free(pr_data[j]);
1789         }
1790       }
1791     }
1792     __kmp_free(arr[i].reduce_priv);
1793   }
1794   __kmp_thread_free(th, arr);
1795   tg->reduce_data = NULL;
1796   tg->reduce_num_data = 0;
1797 }
1798 #endif
1799 
1800 #if OMP_40_ENABLED
1801 // __kmpc_taskgroup: Start a new taskgroup
1802 void __kmpc_taskgroup(ident_t *loc, int gtid) {
1803   kmp_info_t *thread = __kmp_threads[gtid];
1804   kmp_taskdata_t *taskdata = thread->th.th_current_task;
1805   kmp_taskgroup_t *tg_new =
1806       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
1807   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
1808   tg_new->count = 0;
1809   tg_new->cancel_request = cancel_noreq;
1810   tg_new->parent = taskdata->td_taskgroup;
1811 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1812 #if OMP_45_ENABLED
1813   tg_new->reduce_data = NULL;
1814   tg_new->reduce_num_data = 0;
1815 #endif
1816   taskdata->td_taskgroup = tg_new;
1817 }
1818 
// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
//                       and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

// Wait only if tasks may still be outstanding: a non-serialized team, or
// (OMP 4.5+) a serialized team whose task team found proxy tasks.
#if OMP_45_ENABLED
    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         thread->th.th_task_team->tt.tt_found_proxy_tasks))
#else
    if (!taskdata->td_flags.team_serial)
#endif
    {
      // Spin on the taskgroup's outstanding-task counter, executing other
      // tasks while waiting for it to reach zero.
      kmp_flag_32 flag(&(taskgroup->count), 0U);
      while (TCR_4(taskgroup->count) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }

#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

// TODO: change to OMP_50_ENABLED, need to change build tools for this to work
#if OMP_45_ENABLED
  if (taskgroup->reduce_data != NULL) // need to reduce?
    __kmp_task_reduction_fini(thread, taskgroup);
#endif
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));
  ANNOTATE_HAPPENS_AFTER(taskdata);
}
1876 #endif
1877 
1878 // __kmp_remove_my_task: remove a task from my own deque
1879 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
1880                                         kmp_task_team_t *task_team,
1881                                         kmp_int32 is_constrained) {
1882   kmp_task_t *task;
1883   kmp_taskdata_t *taskdata;
1884   kmp_thread_data_t *thread_data;
1885   kmp_uint32 tail;
1886 
1887   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1888   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
1889                    NULL); // Caller should check this condition
1890 
1891   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
1892 
1893   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1894                 gtid, thread_data->td.td_deque_ntasks,
1895                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1896 
1897   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1898     KA_TRACE(10,
1899              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
1900               "ntasks=%d head=%u tail=%u\n",
1901               gtid, thread_data->td.td_deque_ntasks,
1902               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1903     return NULL;
1904   }
1905 
1906   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
1907 
1908   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1909     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1910     KA_TRACE(10,
1911              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1912               "ntasks=%d head=%u tail=%u\n",
1913               gtid, thread_data->td.td_deque_ntasks,
1914               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1915     return NULL;
1916   }
1917 
1918   tail = (thread_data->td.td_deque_tail - 1) &
1919          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
1920   taskdata = thread_data->td.td_deque[tail];
1921 
1922   if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1923     // we need to check if the candidate obeys task scheduling constraint:
1924     // only child of current task can be scheduled
1925     kmp_taskdata_t *current = thread->th.th_current_task;
1926     kmp_int32 level = current->td_level;
1927     kmp_taskdata_t *parent = taskdata->td_parent;
1928     while (parent != current && parent->td_level > level) {
1929       parent = parent->td_parent; // check generation up to the level of the
1930       // current task
1931       KMP_DEBUG_ASSERT(parent != NULL);
1932     }
1933     if (parent != current) {
1934       // If the tail task is not a child, then no other child can appear in the
1935       // deque.
1936       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1937       KA_TRACE(10,
1938                ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1939                 "ntasks=%d head=%u tail=%u\n",
1940                 gtid, thread_data->td.td_deque_ntasks,
1941                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1942       return NULL;
1943     }
1944   }
1945 
1946   thread_data->td.td_deque_tail = tail;
1947   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
1948 
1949   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1950 
1951   KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
1952                 "ntasks=%d head=%u tail=%u\n",
1953                 gtid, taskdata, thread_data->td.td_deque_ntasks,
1954                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1955 
1956   task = KMP_TASKDATA_TO_TASK(taskdata);
1957   return task;
1958 }
1959 
1960 // __kmp_steal_task: remove a task from another thread's deque
1961 // Assume that calling thread has already checked existence of
1962 // task_team thread_data before calling this routine.
1963 static kmp_task_t *
1964 __kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
1965                  volatile kmp_uint32 *unfinished_threads, int *thread_finished,
1966                  kmp_int32 is_constrained)
1967 {
1968   kmp_task_t *task;
1969   kmp_taskdata_t *taskdata;
1970   kmp_thread_data_t *victim_td, *threads_data;
1971   kmp_int32 victim_tid;
1972 
1973   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1974 
1975   threads_data = task_team->tt.tt_threads_data;
1976   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
1977 
1978   victim_tid = victim->th.th_info.ds.ds_tid;
1979   victim_td = &threads_data[victim_tid];
1980 
1981   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
1982                 "task_team=%p ntasks=%d "
1983                 "head=%u tail=%u\n",
1984                 gtid, __kmp_gtid_from_thread(victim), task_team,
1985                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1986                 victim_td->td.td_deque_tail));
1987 
1988   if ((TCR_4(victim_td->td.td_deque_ntasks) ==
1989        0) || // Caller should not check this condition
1990       (TCR_PTR(victim->th.th_task_team) !=
1991        task_team)) // GEH: why would this happen?
1992   {
1993     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
1994                   "task_team=%p "
1995                   "ntasks=%d head=%u tail=%u\n",
1996                   gtid, __kmp_gtid_from_thread(victim), task_team,
1997                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1998                   victim_td->td.td_deque_tail));
1999     return NULL;
2000   }
2001 
2002   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2003 
2004   // Check again after we acquire the lock
2005   if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
2006       (TCR_PTR(victim->th.th_task_team) !=
2007        task_team)) // GEH: why would this happen?
2008   {
2009     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2010     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2011                   "task_team=%p "
2012                   "ntasks=%d head=%u tail=%u\n",
2013                   gtid, __kmp_gtid_from_thread(victim), task_team,
2014                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2015                   victim_td->td.td_deque_tail));
2016     return NULL;
2017   }
2018 
2019   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2020 
2021   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2022   if (is_constrained) {
2023     // we need to check if the candidate obeys task scheduling constraint:
2024     // only descendant of current task can be scheduled
2025     kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
2026     kmp_int32 level = current->td_level;
2027     kmp_taskdata_t *parent = taskdata->td_parent;
2028     while (parent != current && parent->td_level > level) {
2029       parent = parent->td_parent; // check generation up to the level of the
2030       // current task
2031       KMP_DEBUG_ASSERT(parent != NULL);
2032     }
2033     if (parent != current) {
2034       // If the head task is not a descendant of the current task then do not
2035       // steal it. No other task in victim's deque can be a descendant of the
2036       // current task.
2037       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2038       KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
2039                     "T#%d: task_team=%p "
2040                     "ntasks=%d head=%u tail=%u\n",
2041                     gtid,
2042                     __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
2043                     task_team, victim_td->td.td_deque_ntasks,
2044                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2045       return NULL;
2046     }
2047   }
2048   // Bump head pointer and Wrap.
2049   victim_td->td.td_deque_head =
2050       (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2051   if (*thread_finished) {
2052     // We need to un-mark this victim as a finished victim.  This must be done
2053     // before releasing the lock, or else other threads (starting with the
2054     // master victim) might be prematurely released from the barrier!!!
2055     kmp_uint32 count;
2056 
2057     count = KMP_TEST_THEN_INC32((kmp_int32 *)unfinished_threads);
2058 
2059     KA_TRACE(
2060         20,
2061         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2062          gtid, count + 1, task_team));
2063 
2064     *thread_finished = FALSE;
2065   }
2066   TCW_4(victim_td->td.td_deque_ntasks,
2067         TCR_4(victim_td->td.td_deque_ntasks) - 1);
2068 
2069 
2070   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2071 
2072   KMP_COUNT_BLOCK(TASK_stolen);
2073   KA_TRACE(
2074       10,
2075       ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
2076        "ntasks=%d head=%u tail=%u\n",
2077        gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
2078        victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2079        victim_td->td.td_deque_tail));
2080 
2081   task = KMP_TASKDATA_TO_TASK(taskdata);
2082   return task;
2083 }
2084 
2085 
2086 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2087 // condition is statisfied (return true) or there are none left (return false).
2088 //
2089 // final_spin is TRUE if this is the spin at the release barrier.
2090 // thread_finished indicates whether the thread is finished executing all
2091 // the tasks it has on its deque, and is at the release barrier.
2092 // spinner is the location on which to spin.
2093 // spinner == NULL means only execute a single task and return.
2094 // checker is the value to check to terminate the spin.
2095 template <class C>
2096 static inline int __kmp_execute_tasks_template(
2097     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2098     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2099     kmp_int32 is_constrained) {
2100   kmp_task_team_t *task_team = thread->th.th_task_team;
2101   kmp_thread_data_t *threads_data;
2102   kmp_task_t *task;
2103   kmp_info_t *other_thread;
2104   kmp_taskdata_t *current_task = thread->th.th_current_task;
2105   volatile kmp_uint32 *unfinished_threads;
2106   kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
2107                       tid = thread->th.th_info.ds.ds_tid;
2108 
2109   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2110   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2111 
2112   if (task_team == NULL)
2113     return FALSE;
2114 
2115   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2116                 "*thread_finished=%d\n",
2117                 gtid, final_spin, *thread_finished));
2118 
2119   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2120   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2121   KMP_DEBUG_ASSERT(threads_data != NULL);
2122 
2123   nthreads = task_team->tt.tt_nproc;
2124   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2125 #if OMP_45_ENABLED
2126   KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2127 #else
2128   KMP_DEBUG_ASSERT(nthreads > 1);
2129 #endif
2130   KMP_DEBUG_ASSERT((int)(TCR_4(*unfinished_threads)) >= 0);
2131 
2132   while (1) { // Outer loop keeps trying to find tasks in case of single thread
2133     // getting tasks from target constructs
2134     while (1) { // Inner loop to find a task and execute it
2135       task = NULL;
2136       if (use_own_tasks) { // check on own queue first
2137         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2138       }
2139       if ((task == NULL) && (nthreads > 1)) { // Steal a task
2140         int asleep = 1;
2141         use_own_tasks = 0;
2142         // Try to steal from the last place I stole from successfully.
2143         if (victim == -2) { // haven't stolen anything yet
2144           victim = threads_data[tid].td.td_deque_last_stolen;
2145           if (victim !=
2146               -1) // if we have a last stolen from victim, get the thread
2147             other_thread = threads_data[victim].td.td_thr;
2148         }
2149         if (victim != -1) { // found last victim
2150           asleep = 0;
2151         } else if (!new_victim) { // no recent steals and we haven't already
2152           // used a new victim; select a random thread
2153           do { // Find a different thread to steal work from.
2154             // Pick a random thread. Initial plan was to cycle through all the
2155             // threads, and only return if we tried to steal from every thread,
2156             // and failed.  Arch says that's not such a great idea.
2157             victim = __kmp_get_random(thread) % (nthreads - 1);
2158             if (victim >= tid) {
2159               ++victim; // Adjusts random distribution to exclude self
2160             }
2161             // Found a potential victim
2162             other_thread = threads_data[victim].td.td_thr;
2163             // There is a slight chance that __kmp_enable_tasking() did not wake
2164             // up all threads waiting at the barrier.  If victim is sleeping,
2165             // then wake it up. Since we were going to pay the cache miss
2166             // penalty for referencing another thread's kmp_info_t struct
2167             // anyway,
2168             // the check shouldn't cost too much performance at this point. In
2169             // extra barrier mode, tasks do not sleep at the separate tasking
2170             // barrier, so this isn't a problem.
2171             asleep = 0;
2172             if ((__kmp_tasking_mode == tskm_task_teams) &&
2173                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2174                 (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
2175               asleep = 1;
2176               __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2177                                         other_thread->th.th_sleep_loc);
2178               // A sleeping thread should not have any tasks on it's queue.
2179               // There is a slight possibility that it resumes, steals a task
2180               // from another thread, which spawns more tasks, all in the time
2181               // that it takes this thread to check => don't write an assertion
2182               // that the victim's queue is empty.  Try stealing from a
2183               // different thread.
2184             }
2185           } while (asleep);
2186         }
2187 
2188         if (!asleep) {
2189           // We have a victim to try to steal from
2190           task = __kmp_steal_task(other_thread, gtid, task_team,
2191                                   unfinished_threads, thread_finished,
2192                                   is_constrained);
2193         }
2194         if (task != NULL) { // set last stolen to victim
2195           if (threads_data[tid].td.td_deque_last_stolen != victim) {
2196             threads_data[tid].td.td_deque_last_stolen = victim;
2197             // The pre-refactored code did not try more than 1 successful new
2198             // vicitm, unless the last one generated more local tasks;
2199             // new_victim keeps track of this
2200             new_victim = 1;
2201           }
2202         } else { // No tasks found; unset last_stolen
2203           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2204           victim = -2; // no successful victim found
2205         }
2206       }
2207 
2208       if (task == NULL) // break out of tasking loop
2209         break;
2210 
2211 // Found a task; execute it
2212 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2213       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2214         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2215           // get the object reliably
2216           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2217         }
2218         __kmp_itt_task_starting(itt_sync_obj);
2219       }
2220 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2221       __kmp_invoke_task(gtid, task, current_task);
2222 #if USE_ITT_BUILD
2223       if (itt_sync_obj != NULL)
2224         __kmp_itt_task_finished(itt_sync_obj);
2225 #endif /* USE_ITT_BUILD */
2226       // If this thread is only partway through the barrier and the condition is
2227       // met, then return now, so that the barrier gather/release pattern can
2228       // proceed. If this thread is in the last spin loop in the barrier,
2229       // waiting to be released, we know that the termination condition will not
2230       // be satisified, so don't waste any cycles checking it.
2231       if (flag == NULL || (!final_spin && flag->done_check())) {
2232         KA_TRACE(
2233             15,
2234             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2235              gtid));
2236         return TRUE;
2237       }
2238       if (thread->th.th_task_team == NULL) {
2239         break;
2240       }
2241       // Yield before executing next task
2242       KMP_YIELD(__kmp_library == library_throughput);
2243       // If execution of a stolen task results in more tasks being placed on our
2244       // run queue, reset use_own_tasks
2245       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2246         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2247                       "other tasks, restart\n",
2248                       gtid));
2249         use_own_tasks = 1;
2250         new_victim = 0;
2251       }
2252     }
2253 
2254 // The task source has been exhausted. If in final spin loop of barrier, check
2255 // if termination condition is satisfied.
2256 #if OMP_45_ENABLED
2257     // The work queue may be empty but there might be proxy tasks still
2258     // executing
2259     if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2260 #else
2261     if (final_spin)
2262 #endif
2263     {
2264       // First, decrement the #unfinished threads, if that has not already been
2265       // done.  This decrement might be to the spin location, and result in the
2266       // termination condition being satisfied.
2267       if (!*thread_finished) {
2268         kmp_uint32 count;
2269 
2270         count = KMP_TEST_THEN_DEC32((kmp_int32 *)unfinished_threads) - 1;
2271         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2272                       "unfinished_threads to %d task_team=%p\n",
2273                       gtid, count, task_team));
2274         *thread_finished = TRUE;
2275       }
2276 
2277       // It is now unsafe to reference thread->th.th_team !!!
2278       // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2279       // thread to pass through the barrier, where it might reset each thread's
2280       // th.th_team field for the next parallel region. If we can steal more
2281       // work, we know that this has not happened yet.
2282       if (flag != NULL && flag->done_check()) {
2283         KA_TRACE(
2284             15,
2285             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2286              gtid));
2287         return TRUE;
2288       }
2289     }
2290 
2291     // If this thread's task team is NULL, master has recognized that there are
2292     // no more tasks; bail out
2293     if (thread->th.th_task_team == NULL) {
2294       KA_TRACE(15,
2295                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2296       return FALSE;
2297     }
2298 
2299 #if OMP_45_ENABLED
2300     // We could be getting tasks from target constructs; if this is the only
2301     // thread, keep trying to execute tasks from own queue
2302     if (nthreads == 1)
2303       use_own_tasks = 1;
2304     else
2305 #endif
2306     {
2307       KA_TRACE(15,
2308                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2309       return FALSE;
2310     }
2311   }
2312 }
2313 
// __kmp_execute_tasks_32: 32-bit-flag instantiation of
// __kmp_execute_tasks_template; forwards all arguments unchanged. See the
// template for the full parameter/return contract.
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
2322 
// __kmp_execute_tasks_64: 64-bit-flag instantiation of
// __kmp_execute_tasks_template; forwards all arguments unchanged.
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
2331 
// __kmp_execute_tasks_oncore: oncore-flag instantiation of
// __kmp_execute_tasks_template; forwards all arguments unchanged.
int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}
2340 
2341 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2342 // next barrier so they can assist in executing enqueued tasks.
2343 // First thread in allocates the task team atomically.
2344 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
2345                                  kmp_info_t *this_thr) {
2346   kmp_thread_data_t *threads_data;
2347   int nthreads, i, is_init_thread;
2348 
2349   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
2350                 __kmp_gtid_from_thread(this_thr)));
2351 
2352   KMP_DEBUG_ASSERT(task_team != NULL);
2353   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2354 
2355   nthreads = task_team->tt.tt_nproc;
2356   KMP_DEBUG_ASSERT(nthreads > 0);
2357   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2358 
2359   // Allocate or increase the size of threads_data if necessary
2360   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
2361 
2362   if (!is_init_thread) {
2363     // Some other thread already set up the array.
2364     KA_TRACE(
2365         20,
2366         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2367          __kmp_gtid_from_thread(this_thr)));
2368     return;
2369   }
2370   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2371   KMP_DEBUG_ASSERT(threads_data != NULL);
2372 
2373   if ((__kmp_tasking_mode == tskm_task_teams) &&
2374       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
2375     // Release any threads sleeping at the barrier, so that they can steal
2376     // tasks and execute them.  In extra barrier mode, tasks do not sleep
2377     // at the separate tasking barrier, so this isn't a problem.
2378     for (i = 0; i < nthreads; i++) {
2379       volatile void *sleep_loc;
2380       kmp_info_t *thread = threads_data[i].td.td_thr;
2381 
2382       if (i == this_thr->th.th_info.ds.ds_tid) {
2383         continue;
2384       }
2385       // Since we haven't locked the thread's suspend mutex lock at this
2386       // point, there is a small window where a thread might be putting
2387       // itself to sleep, but hasn't set the th_sleep_loc field yet.
2388       // To work around this, __kmp_execute_tasks_template() periodically checks
2389       // see if other threads are sleeping (using the same random mechanism that
2390       // is used for task stealing) and awakens them if they are.
2391       if ((sleep_loc = TCR_PTR(thread->th.th_sleep_loc)) != NULL) {
2392         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2393                       __kmp_gtid_from_thread(this_thr),
2394                       __kmp_gtid_from_thread(thread)));
2395         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2396       } else {
2397         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2398                       __kmp_gtid_from_thread(this_thr),
2399                       __kmp_gtid_from_thread(thread)));
2400       }
2401     }
2402   }
2403 
2404   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
2405                 __kmp_gtid_from_thread(this_thr)));
2406 }
2407 
/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
 * of like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
2416  *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
2421  *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
2425  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
2426  * of the kmp_task_team_t structs for consecutive barriers can overlap
2427  * (and will, unless the master thread is the last thread to exit the barrier
2428  * release phase, which is not typical).
2429  *
2430  * The existence of such a struct is useful outside the context of tasking,
2431  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2432  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2433  * libraries.
2434  *
2435  * We currently use the existence of the threads array as an indicator that
2436  * tasks were spawned since the last barrier.  If the structure is to be
2437  * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
2439  * barriers, when no explicit tasks were spawned (pushed, actually).
2440  */
2441 
// Free list of recycled task_team structs, linked through tt.tt_next;
// all access is guarded by __kmp_task_team_lock below.
static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
static kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2447 
2448 // __kmp_alloc_task_deque:
2449 // Allocates a task deque for a particular thread, and initialize the necessary
2450 // data structures relating to the deque.  This only happens once per thread
2451 // per task team since task teams are recycled. No lock is needed during
2452 // allocation since each thread allocates its own deque.
2453 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2454                                    kmp_thread_data_t *thread_data) {
2455   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2456   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2457 
2458   // Initialize last stolen task field to "none"
2459   thread_data->td.td_deque_last_stolen = -1;
2460 
2461   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2462   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2463   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2464 
2465   KE_TRACE(
2466       10,
2467       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2468        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2469   // Allocate space for task deque, and zero the deque
2470   // Cannot use __kmp_thread_calloc() because threads not around for
2471   // kmp_reap_task_team( ).
2472   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2473       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2474   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2475 }
2476 
2477 // __kmp_realloc_task_deque:
2478 // Re-allocates a task deque for a particular thread, copies the content from
2479 // the old deque and adjusts the necessary data structures relating to the
2480 // deque. This operation must be done with a the deque_lock being held
2481 static void __kmp_realloc_task_deque(kmp_info_t *thread,
2482                                      kmp_thread_data_t *thread_data) {
2483   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2484   kmp_int32 new_size = 2 * size;
2485 
2486   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
2487                 "%d] for thread_data %p\n",
2488                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
2489 
2490   kmp_taskdata_t **new_deque =
2491       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
2492 
2493   int i, j;
2494   for (i = thread_data->td.td_deque_head, j = 0; j < size;
2495        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
2496     new_deque[j] = thread_data->td.td_deque[i];
2497 
2498   __kmp_free(thread_data->td.td_deque);
2499 
2500   thread_data->td.td_deque_head = 0;
2501   thread_data->td.td_deque_tail = size;
2502   thread_data->td.td_deque = new_deque;
2503   thread_data->td.td_deque_size = new_size;
2504 }
2505 
// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  // Hold the deque lock so no concurrent reader observes a half-freed deque.
  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  if (thread_data->td.td_deque != NULL) {
    // Publish "empty" before releasing the storage.
    TCW_4(thread_data->td.td_deque_ntasks, 0);
    __kmp_free(thread_data->td.td_deque);
    thread_data->td.td_deque = NULL;
  }
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

#ifdef BUILD_TIED_TASK_STACK
  // GEH: Figure out what to do here for td_susp_tied_tasks
  // NOTE(review): 'gtid' is not declared in this function, so this branch
  // cannot compile when BUILD_TIED_TASK_STACK is defined -- confirm the
  // intended source of the gtid before enabling that build mode.
  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
  }
#endif // BUILD_TIED_TASK_STACK
}
2526 
2527 // __kmp_realloc_task_threads_data:
2528 // Allocates a threads_data array for a task team, either by allocating an
2529 // initial array or enlarging an existing array.  Only the first thread to get
2530 // the lock allocs or enlarges the array and re-initializes the array eleemnts.
2531 // That thread returns "TRUE", the rest return "FALSE".
2532 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2533 // The current size is given by task_team -> tt.tt_max_threads.
2534 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2535                                            kmp_task_team_t *task_team) {
2536   kmp_thread_data_t **threads_data_p;
2537   kmp_int32 nthreads, maxthreads;
2538   int is_init_thread = FALSE;
2539 
2540   if (TCR_4(task_team->tt.tt_found_tasks)) {
2541     // Already reallocated and initialized.
2542     return FALSE;
2543   }
2544 
2545   threads_data_p = &task_team->tt.tt_threads_data;
2546   nthreads = task_team->tt.tt_nproc;
2547   maxthreads = task_team->tt.tt_max_threads;
2548 
2549   // All threads must lock when they encounter the first task of the implicit
2550   // task region to make sure threads_data fields are (re)initialized before
2551   // used.
2552   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2553 
2554   if (!TCR_4(task_team->tt.tt_found_tasks)) {
2555     // first thread to enable tasking
2556     kmp_team_t *team = thread->th.th_team;
2557     int i;
2558 
2559     is_init_thread = TRUE;
2560     if (maxthreads < nthreads) {
2561 
2562       if (*threads_data_p != NULL) {
2563         kmp_thread_data_t *old_data = *threads_data_p;
2564         kmp_thread_data_t *new_data = NULL;
2565 
2566         KE_TRACE(
2567             10,
2568             ("__kmp_realloc_task_threads_data: T#%d reallocating "
2569              "threads data for task_team %p, new_size = %d, old_size = %d\n",
2570              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
2571         // Reallocate threads_data to have more elements than current array
2572         // Cannot use __kmp_thread_realloc() because threads not around for
2573         // kmp_reap_task_team( ).  Note all new array entries are initialized
2574         // to zero by __kmp_allocate().
2575         new_data = (kmp_thread_data_t *)__kmp_allocate(
2576             nthreads * sizeof(kmp_thread_data_t));
2577         // copy old data to new data
2578         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
2579                      (void *)old_data, maxthreads * sizeof(kmp_taskdata_t *));
2580 
2581 #ifdef BUILD_TIED_TASK_STACK
2582         // GEH: Figure out if this is the right thing to do
2583         for (i = maxthreads; i < nthreads; i++) {
2584           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2585           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2586         }
2587 #endif // BUILD_TIED_TASK_STACK
2588         // Install the new data and free the old data
2589         (*threads_data_p) = new_data;
2590         __kmp_free(old_data);
2591       } else {
2592         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
2593                       "threads data for task_team %p, size = %d\n",
2594                       __kmp_gtid_from_thread(thread), task_team, nthreads));
2595         // Make the initial allocate for threads_data array, and zero entries
2596         // Cannot use __kmp_thread_calloc() because threads not around for
2597         // kmp_reap_task_team( ).
2598         ANNOTATE_IGNORE_WRITES_BEGIN();
2599         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
2600             nthreads * sizeof(kmp_thread_data_t));
2601         ANNOTATE_IGNORE_WRITES_END();
2602 #ifdef BUILD_TIED_TASK_STACK
2603         // GEH: Figure out if this is the right thing to do
2604         for (i = 0; i < nthreads; i++) {
2605           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2606           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2607         }
2608 #endif // BUILD_TIED_TASK_STACK
2609       }
2610       task_team->tt.tt_max_threads = nthreads;
2611     } else {
2612       // If array has (more than) enough elements, go ahead and use it
2613       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
2614     }
2615 
2616     // initialize threads_data pointers back to thread_info structures
2617     for (i = 0; i < nthreads; i++) {
2618       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2619       thread_data->td.td_thr = team->t.t_threads[i];
2620 
2621       if (thread_data->td.td_deque_last_stolen >= nthreads) {
2622         // The last stolen field survives across teams / barrier, and the number
2623         // of threads may have changed.  It's possible (likely?) that a new
2624         // parallel region will exhibit the same behavior as previous region.
2625         thread_data->td.td_deque_last_stolen = -1;
2626       }
2627     }
2628 
2629     KMP_MB();
2630     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
2631   }
2632 
2633   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2634   return is_init_thread;
2635 }
2636 
2637 // __kmp_free_task_threads_data:
2638 // Deallocates a threads_data array for a task team, including any attached
2639 // tasking deques.  Only occurs at library shutdown.
2640 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
2641   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2642   if (task_team->tt.tt_threads_data != NULL) {
2643     int i;
2644     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
2645       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
2646     }
2647     __kmp_free(task_team->tt.tt_threads_data);
2648     task_team->tt.tt_threads_data = NULL;
2649   }
2650   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2651 }
2652 
2653 // __kmp_allocate_task_team:
2654 // Allocates a task team associated with a specific team, taking it from
2655 // the global task team free list if possible.  Also initializes data
2656 // structures.
2657 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
2658                                                  kmp_team_t *team) {
2659   kmp_task_team_t *task_team = NULL;
2660   int nthreads;
2661 
2662   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
2663                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
2664 
2665   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2666     // Take a task team from the task team pool
2667     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2668     if (__kmp_free_task_teams != NULL) {
2669       task_team = __kmp_free_task_teams;
2670       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
2671       task_team->tt.tt_next = NULL;
2672     }
2673     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2674   }
2675 
2676   if (task_team == NULL) {
2677     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
2678                   "task team for team %p\n",
2679                   __kmp_gtid_from_thread(thread), team));
2680     // Allocate a new task team if one is not available.
2681     // Cannot use __kmp_thread_malloc() because threads not around for
2682     // kmp_reap_task_team( ).
2683     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
2684     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
2685     // AC: __kmp_allocate zeroes returned memory
2686     // task_team -> tt.tt_threads_data = NULL;
2687     // task_team -> tt.tt_max_threads = 0;
2688     // task_team -> tt.tt_next = NULL;
2689   }
2690 
2691   TCW_4(task_team->tt.tt_found_tasks, FALSE);
2692 #if OMP_45_ENABLED
2693   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2694 #endif
2695   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
2696 
2697   TCW_4(task_team->tt.tt_unfinished_threads, nthreads);
2698   TCW_4(task_team->tt.tt_active, TRUE);
2699 
2700   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
2701                 "unfinished_threads init'd to %d\n",
2702                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
2703                 task_team->tt.tt_unfinished_threads));
2704   return task_team;
2705 }
2706 
2707 // __kmp_free_task_team:
2708 // Frees the task team associated with a specific thread, and adds it
2709 // to the global task team free list.
2710 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
2711   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
2712                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
2713 
2714   // Put task team back on free list
2715   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2716 
2717   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
2718   task_team->tt.tt_next = __kmp_free_task_teams;
2719   TCW_PTR(__kmp_free_task_teams, task_team);
2720 
2721   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2722 }
2723 
2724 // __kmp_reap_task_teams:
2725 // Free all the task teams on the task team free list.
2726 // Should only be done during library shutdown.
2727 // Cannot do anything that needs a thread structure or gtid since they are
2728 // already gone.
2729 void __kmp_reap_task_teams(void) {
2730   kmp_task_team_t *task_team;
2731 
2732   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2733     // Free all task_teams on the free list
2734     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2735     while ((task_team = __kmp_free_task_teams) != NULL) {
2736       __kmp_free_task_teams = task_team->tt.tt_next;
2737       task_team->tt.tt_next = NULL;
2738 
2739       // Free threads_data if necessary
2740       if (task_team->tt.tt_threads_data != NULL) {
2741         __kmp_free_task_threads_data(task_team);
2742       }
2743       __kmp_free(task_team);
2744     }
2745     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2746   }
2747 }
2748 
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks.  Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done; // TRUE once no pool thread still references a task team

  KMP_INIT_YIELD(spins);

  // Spin until every thread in the pool has dropped its th_task_team.
  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be is wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = (kmp_info_t *)__kmp_thread_pool; thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      // Threads that already released their task team need no attention.
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      // A dead thread can never unreference its task team; clear it for it.
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        // NOTE(review): both %d arguments print the same (sleeping) thread's
        // gtid; the first presumably should be the caller's gtid -- confirm.
        if ((sleep_loc = TCR_PTR(thread->th.th_sleep_loc)) != NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If we are oversubscribed, or have waited a bit (and library mode is
    // throughput), yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
  }
}
2811 
// __kmp_task_team_setup:  Create a task_team for the current team, but use
// an already created, unused one if it already exists.
// Two task-team slots are kept per team (indexed by th_task_state parity) so
// that the lifetimes of consecutive barriers' task teams can overlap.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct(above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region;
      // adjust as needed
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      // Reactivate a deactivated struct, refreshing counts if the team size
      // changed since it was last used.
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
        TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}
2873 
2874 // __kmp_task_team_sync: Propagation of task team data from team to threads
2875 // which happens just after the release phase of a team barrier.  This may be
2876 // called by any thread, but only for teams with # threads > 1.
2877 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
2878   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2879 
2880   // Toggle the th_task_state field, to switch which task_team this thread
2881   // refers to
2882   this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2883   // It is now safe to propagate the task team pointer from the team struct to
2884   // the current thread.
2885   TCW_PTR(this_thr->th.th_task_team,
2886           team->t.t_task_team[this_thr->th.th_task_state]);
2887   KA_TRACE(20,
2888            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
2889             "%p from Team #%d (parity=%d)\n",
2890             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
2891             ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2892 }
2893 
// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
#if OMP_45_ENABLED
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#else
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
#endif
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
    // Ensure the deactivation is visible before clearing our own pointer.
    KMP_MB();

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}
2940 
2941 // __kmp_tasking_barrier:
// This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
2943 // Internal function to execute all tasks prior to a regular barrier or a join
2944 // barrier. It is a full barrier itself, which unfortunately turns regular
2945 // barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  // Spin location: count of threads with unfinished tasks in the task team
  // selected by this thread's current task state.
  volatile kmp_uint32 *spin =
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32 spin_flag(spin, 0U);
  // Keep executing tasks until the unfinished-threads count drops to zero;
  // execute_tasks() returns nonzero once the flag condition is satisfied.
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(spin);
#endif /* USE_ITT_BUILD */

    // Bail out of the barrier if the runtime is shutting down or aborting.
    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE); // GH: We always yield here
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED((void *)spin);
#endif /* USE_ITT_BUILD */
}
2974 
2975 #if OMP_45_ENABLED
2976 
2977 // __kmp_give_task puts a task into a given thread queue if:
2978 //  - the queue for that thread was created
2979 //  - there's space in that queue
2980 // Because of this, __kmp_push_task needs to check if there's space after
2981 // getting the lock
// Attempt to enqueue a (proxy bottom-half) task onto thread tid's deque.
// Returns true on success; false if that thread has no deque or its deque is
// too full relative to 'pass' (the caller's full-passes-so-far heuristic).
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  // First capacity check without the lock (cheap, may be stale); a second
  // check is repeated under the lock below before inserting.
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    // Deque looks full but is still small for the current pass: grow it
    // (lock taken here and held until release_and_exit).
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    __kmp_realloc_task_deque(thread, thread_data);

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    // Re-check under the lock: the deque may have filled up between the
    // unlocked check above and acquiring the lock.
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit; // result is still false here

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

// Success path falls through here; failure paths jump here with result=false.
release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}
3057 
3058 /* The finish of the proxy tasks is divided in two pieces:
3059     - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team
3061 
3062    In order to run the bottom half the task gets queued back into one of the
3063    threads of the team. Once the td_incomplete_child_task counter of the parent
3064    is decremented the threads can leave the barriers. So, the bottom half needs
3065    to be queued before the counter is decremented. The top half is therefore
3066    divided in two parts:
3067     - things that can be run before queuing the bottom half
3068     - things that must be run after queuing the bottom half
3069 
3070    This creates a second race as the bottom half can free the task before the
3071    second top half is executed. To avoid this we use the
3072    td_incomplete_child_task of the proxy task to synchronize the top and bottom
3073    half. */
// First part of the top half: everything that must happen BEFORE the bottom
// half is queued (see the race discussion above). Statement order matters.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  // Release the taskgroup (if any) the proxy task belongs to.
  if (taskdata->td_taskgroup)
    KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));

  // Create an imaginary children for this task so the bottom half cannot
  // release the task before we have completed the second top half
  TCI_4(taskdata->td_incomplete_child_tasks);
}
3089 
// Second part of the top half: everything that must happen AFTER the bottom
// half is queued. Releases the parent and then the bottom half's spin-wait.
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  children =
      KMP_TEST_THEN_DEC32(
          (kmp_int32 *)(&taskdata->td_parent->td_incomplete_child_tasks)) -
      1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary children (set in the first top half); this lets the
  // bottom half's spin-wait proceed and free the task.
  TCD_4(taskdata->td_incomplete_child_tasks);
}
3103 
// Bottom half of proxy-task completion: must run on a thread of the task's
// team. Releases dependences and frees the task (and eligible ancestors).
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  // (the second top half clears the imaginary child counter we wait on)
  while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
3120 
3121 /*!
3122 @ingroup TASKING
3123 @param gtid Global Thread ID of encountering thread
3124 @param ptask Task which execution is completed
3125 
Execute the completion of a proxy task from a thread that is part of the
team. Run first and bottom halves directly.
3128 */
3129 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3130   KMP_DEBUG_ASSERT(ptask != NULL);
3131   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3132   KA_TRACE(
3133       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3134            gtid, taskdata));
3135 
3136   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3137 
3138   __kmp_first_top_half_finish_proxy(taskdata);
3139   __kmp_second_top_half_finish_proxy(taskdata);
3140   __kmp_bottom_half_finish_proxy(gtid, ptask);
3141 
3142   KA_TRACE(10,
3143            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3144             gtid, taskdata));
3145 }
3146 
3147 /*!
3148 @ingroup TASKING
3149 @param ptask Task which execution is completed
3150 
Execute the completion of a proxy task from a thread that may not belong to
the team.
3153 */
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
       taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);

  // Enqueue task to complete bottom half completion from a thread within the
  // corresponding team
  kmp_team_t *team = taskdata->td_team;
  kmp_int32 nthreads = team->t.t_nproc;
  kmp_info_t *thread;

  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
  // but we cannot use __kmp_get_random here
  kmp_int32 start_k = 0;
  kmp_int32 pass = 1; // doubled after each full pass; relaxes the deque-size
                      // cutoff in __kmp_give_task so the loop terminates
  kmp_int32 k = start_k;

  do {
    // For now we're just linearly trying to find a thread
    thread = team->t.t_threads[k];
    k = (k + 1) % nthreads;

    // we did a full pass through all the threads
    if (k == start_k)
      pass = pass << 1;

    // NOTE(review): the tid argument is 'k' *after* the increment, so it does
    // not match the kmp_info_t selected above — verify this is intentional.
  } while (!__kmp_give_task(thread, k, ptask, pass));

  // Bottom half is queued; now it is safe to run the second top half, which
  // releases the parent and the bottom half's spin-wait.
  __kmp_second_top_half_finish_proxy(taskdata);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
       taskdata));
}
3197 
3198 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3199 // for taskloop
3200 //
3201 // thread:   allocating thread
3202 // task_src: pointer to source task to be duplicated
3203 // returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset; // byte offset of shareds within the allocation
  size_t task_size;      // total size of taskdata + task + shareds

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  // NOTE(review): allocation result is used without a NULL check — presumably
  // the allocators abort on OOM; confirm against their implementations.
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // Rebase the shareds pointer to the same offset inside the new block.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_taskgroup =
      parent_task
          ->td_taskgroup; // task inherits the taskgroup from the parent task

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_incomplete_child_tasks));
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_TEST_THEN_INC32(
          (kmp_int32 *)(&taskdata->td_parent->td_allocated_child_tasks));
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
                       (void *)task->routine);
#endif
  return task;
}
3267 
// Routine optionally generated by the compiler for setting the lastprivate
// flag and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3272 
3273 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3274 //
3275 // loc       Source location information
3276 // gtid      Global thread ID
3277 // task      Task with whole loop iteration range
3278 // lb        Pointer to loop lower bound
3279 // ub        Pointer to loop upper bound
3280 // st        Loop stride
3281 // sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3282 // grainsize Schedule value if specified
3283 // task_dup  Tasks duplication routine
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int sched, kmp_uint64 grainsize, void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 tc; // trip count of the loop
  kmp_uint64 lower = *lb; // compiler provides global bounds here
  kmp_uint64 upper = *ub;
  // num_tasks: tasks to create; extras: how many of them get one extra iter
  kmp_uint64 i, num_tasks = 0, extras = 0;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish(gtid, task, current_task);
    return;
  }

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    // fall through: the chosen default is treated as a num_tasks value
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1; // too big grainsize requested, adjust values
      grainsize = tc;
      extras = 0;
    } else {
      num_tasks = tc / grainsize;
      grainsize =
          tc /
          num_tasks; // adjust grainsize for balanced distribution of iterations
      extras = tc % num_tasks;
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }
  // Invariant: every iteration is covered exactly once.
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
                "%lld, extras %lld\n",
                gtid, num_tasks, grainsize, extras));

  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag
      lastpriv = 1;
#if KMP_DEBUG
      // The last chunk must end exactly at (or just before, for |st|>1) *ub.
      if (st == 1)
        KMP_DEBUG_ASSERT(upper == *ub);
      else if (st > 0)
        KMP_DEBUG_ASSERT(upper + st > *ub);
      else
        KMP_DEBUG_ASSERT(upper + st < *ub);
#endif
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    // Patch the duplicated task's private copies of lb/ub via the offsets
    // captured from the pattern task above.
    *(kmp_uint64 *)((char *)next_task + lower_offset) =
        lower; // adjust task-specific bounds
    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
    if (ptask_dup != NULL)
      ptask_dup(next_task, task,
                lastpriv); // set lastprivate flag, construct fistprivates, etc.
    KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
                  "%lld (offsets %p %p)\n",
                  gtid, next_task, lower, upper, lower_offset, upper_offset));
    __kmp_omp_task(gtid, next_task, true); // schedule new task
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task);
  // do not execute the pattern task, just do bookkeeping
  __kmp_task_finish(gtid, task, current_task);
}
3399 
3400 /*!
3401 @ingroup TASKING
3402 @param loc       Source location information
3403 @param gtid      Global thread ID
3404 @param task      Task structure
3405 @param if_val    Value of the if clause
3406 @param lb        Pointer to loop lower bound
3407 @param ub        Pointer to loop upper bound
3408 @param st        Loop stride
3409 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
3410 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3411 @param grainsize Schedule value if specified
3412 @param task_dup  Tasks duplication routine
3413 
3414 Execute the taskloop construct.
3415 */
3416 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3417                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
3418                      int sched, kmp_uint64 grainsize, void *task_dup) {
3419   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3420   KMP_DEBUG_ASSERT(task != NULL);
3421 
3422   KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub "
3423                 "%lld st %lld, grain %llu(%d)\n",
3424                 gtid, taskdata, *lb, *ub, st, grainsize, sched));
3425 
3426   // check if clause value first
3427   if (if_val == 0) { // if(0) specified, mark task as serial
3428     taskdata->td_flags.task_serial = 1;
3429     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3430   }
3431   if (nogroup == 0) {
3432     __kmpc_taskgroup(loc, gtid);
3433   }
3434 
3435   if (1 /* AC: use some heuristic here to choose task scheduling method */) {
3436     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize,
3437                           task_dup);
3438   }
3439 
3440   if (nogroup == 0) {
3441     __kmpc_end_taskgroup(loc, gtid);
3442   }
3443   KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3444 }
3445 
3446 #endif
3447