1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_stats.h"
20 #include "kmp_wait_release.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* forward declaration */
29 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
30                                  kmp_info_t *this_thr);
31 static void __kmp_alloc_task_deque(kmp_info_t *thread,
32                                    kmp_thread_data_t *thread_data);
33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
34                                            kmp_task_team_t *task_team);
35 
36 #ifdef OMP_45_ENABLED
37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
38 #endif
39 
40 #ifdef BUILD_TIED_TASK_STACK
41 
42 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
43 //  from top do bottom
44 //
45 //  gtid: global thread identifier for thread containing stack
46 //  thread_data: thread data for task team thread containing stack
47 //  threshold: value above which the trace statement triggers
48 //  location: string identifying call site of this function (for trace)
49 static void __kmp_trace_task_stack(kmp_int32 gtid,
50                                    kmp_thread_data_t *thread_data,
51                                    int threshold, char *location) {
52   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
53   kmp_taskdata_t **stack_top = task_stack->ts_top;
54   kmp_int32 entries = task_stack->ts_entries;
55   kmp_taskdata_t *tied_task;
56 
57   KA_TRACE(
58       threshold,
59       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
60        "first_block = %p, stack_top = %p \n",
61        location, gtid, entries, task_stack->ts_first_block, stack_top));
62 
63   KMP_DEBUG_ASSERT(stack_top != NULL);
64   KMP_DEBUG_ASSERT(entries > 0);
65 
66   while (entries != 0) {
67     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
68     // fix up ts_top if we need to pop from previous block
69     if (entries & TASK_STACK_INDEX_MASK == 0) {
70       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
71 
72       stack_block = stack_block->sb_prev;
73       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
74     }
75 
76     // finish bookkeeping
77     stack_top--;
78     entries--;
79 
80     tied_task = *stack_top;
81 
82     KMP_DEBUG_ASSERT(tied_task != NULL);
83     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
84 
85     KA_TRACE(threshold,
86              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
87               "stack_top=%p, tied_task=%p\n",
88               location, gtid, entries, stack_top, tied_task));
89   }
90   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
91 
92   KA_TRACE(threshold,
93            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
94             location, gtid));
95 }
96 
97 //  __kmp_init_task_stack: initialize the task stack for the first time
98 //  after a thread_data structure is created.
99 //  It should not be necessary to do this again (assuming the stack works).
100 //
101 //  gtid: global thread identifier of calling thread
102 //  thread_data: thread data for task team thread containing stack
103 static void __kmp_init_task_stack(kmp_int32 gtid,
104                                   kmp_thread_data_t *thread_data) {
105   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
106   kmp_stack_block_t *first_block;
107 
108   // set up the first block of the stack
109   first_block = &task_stack->ts_first_block;
110   task_stack->ts_top = (kmp_taskdata_t **)first_block;
111   memset((void *)first_block, '\0',
112          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
113 
114   // initialize the stack to be empty
115   task_stack->ts_entries = TASK_STACK_EMPTY;
116   first_block->sb_next = NULL;
117   first_block->sb_prev = NULL;
118 }
119 
120 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
121 //
122 //  gtid: global thread identifier for calling thread
123 //  thread_data: thread info for thread containing stack
124 static void __kmp_free_task_stack(kmp_int32 gtid,
125                                   kmp_thread_data_t *thread_data) {
126   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
127   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
128 
129   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
130   // free from the second block of the stack
131   while (stack_block != NULL) {
132     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
133 
134     stack_block->sb_next = NULL;
135     stack_block->sb_prev = NULL;
136     if (stack_block != &task_stack->ts_first_block) {
137       __kmp_thread_free(thread,
138                         stack_block); // free the block, if not the first
139     }
140     stack_block = next_block;
141   }
142   // initialize the stack to be empty
143   task_stack->ts_entries = 0;
144   task_stack->ts_top = NULL;
145 }
146 
147 //  __kmp_push_task_stack: Push the tied task onto the task stack.
148 //     Grow the stack if necessary by allocating another block.
149 //
150 //  gtid: global thread identifier for calling thread
151 //  thread: thread info for thread containing stack
152 //  tied_task: the task to push on the stack
153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
154                                   kmp_taskdata_t *tied_task) {
155   // GEH - need to consider what to do if tt_threads_data not allocated yet
156   kmp_thread_data_t *thread_data =
157       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
158   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
159 
160   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
161     return; // Don't push anything on stack if team or team tasks are serialized
162   }
163 
164   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
165   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
166 
167   KA_TRACE(20,
168            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
169             gtid, thread, tied_task));
170   // Store entry
171   *(task_stack->ts_top) = tied_task;
172 
173   // Do bookkeeping for next push
174   task_stack->ts_top++;
175   task_stack->ts_entries++;
176 
177   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
178     // Find beginning of this task block
179     kmp_stack_block_t *stack_block =
180         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
181 
182     // Check if we already have a block
183     if (stack_block->sb_next !=
184         NULL) { // reset ts_top to beginning of next block
185       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
186     } else { // Alloc new block and link it up
187       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
188           thread, sizeof(kmp_stack_block_t));
189 
190       task_stack->ts_top = &new_block->sb_block[0];
191       stack_block->sb_next = new_block;
192       new_block->sb_prev = stack_block;
193       new_block->sb_next = NULL;
194 
195       KA_TRACE(
196           30,
197           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
198            gtid, tied_task, new_block));
199     }
200   }
201   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
202                 tied_task));
203 }
204 
205 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
206 //  the task, just check to make sure it matches the ending task passed in.
207 //
208 //  gtid: global thread identifier for the calling thread
209 //  thread: thread info structure containing stack
210 //  tied_task: the task popped off the stack
211 //  ending_task: the task that is ending (should match popped task)
212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
213                                  kmp_taskdata_t *ending_task) {
214   // GEH - need to consider what to do if tt_threads_data not allocated yet
215   kmp_thread_data_t *thread_data =
216       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
217   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
218   kmp_taskdata_t *tied_task;
219 
220   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
221     // Don't pop anything from stack if team or team tasks are serialized
222     return;
223   }
224 
225   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
226   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
227 
228   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
229                 thread));
230 
231   // fix up ts_top if we need to pop from previous block
232   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
233     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
234 
235     stack_block = stack_block->sb_prev;
236     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
237   }
238 
239   // finish bookkeeping
240   task_stack->ts_top--;
241   task_stack->ts_entries--;
242 
243   tied_task = *(task_stack->ts_top);
244 
245   KMP_DEBUG_ASSERT(tied_task != NULL);
246   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
247   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
248 
249   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
250                 tied_task));
251   return;
252 }
253 #endif /* BUILD_TIED_TASK_STACK */
254 
255 //  __kmp_push_task: Add a task to the thread's deque
256 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
257   kmp_info_t *thread = __kmp_threads[gtid];
258   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
259   kmp_task_team_t *task_team = thread->th.th_task_team;
260   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
261   kmp_thread_data_t *thread_data;
262 
263   KA_TRACE(20,
264            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
265 
266   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
267     // untied task needs to increment counter so that the task structure is not
268     // freed prematurely
269     kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
270     KA_TRACE(
271         20,
272         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
273          gtid, counter, taskdata));
274   }
275 
276   // The first check avoids building task_team thread data if serialized
277   if (taskdata->td_flags.task_serial) {
278     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
279                   "TASK_NOT_PUSHED for task %p\n",
280                   gtid, taskdata));
281     return TASK_NOT_PUSHED;
282   }
283 
284   // Now that serialized tasks have returned, we can assume that we are not in
285   // immediate exec mode
286   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
287   if (!KMP_TASKING_ENABLED(task_team)) {
288     __kmp_enable_tasking(task_team, thread);
289   }
290   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
291   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
292 
293   // Find tasking deque specific to encountering thread
294   thread_data = &task_team->tt.tt_threads_data[tid];
295 
296   // No lock needed since only owner can allocate
297   if (thread_data->td.td_deque == NULL) {
298     __kmp_alloc_task_deque(thread, thread_data);
299   }
300 
301   // Check if deque is full
302   if (TCR_4(thread_data->td.td_deque_ntasks) >=
303       TASK_DEQUE_SIZE(thread_data->td)) {
304     KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
305                   "TASK_NOT_PUSHED for task %p\n",
306                   gtid, taskdata));
307     return TASK_NOT_PUSHED;
308   }
309 
310   // Lock the deque for the task push operation
311   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
312 
313 #if OMP_45_ENABLED
314   // Need to recheck as we can get a proxy task from a thread outside of OpenMP
315   if (TCR_4(thread_data->td.td_deque_ntasks) >=
316       TASK_DEQUE_SIZE(thread_data->td)) {
317     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
318     KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning "
319                   "TASK_NOT_PUSHED for task %p\n",
320                   gtid, taskdata));
321     return TASK_NOT_PUSHED;
322   }
323 #else
324   // Must have room since no thread can add tasks but calling thread
325   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
326                    TASK_DEQUE_SIZE(thread_data->td));
327 #endif
328 
329   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
330       taskdata; // Push taskdata
331   // Wrap index.
332   thread_data->td.td_deque_tail =
333       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
334   TCW_4(thread_data->td.td_deque_ntasks,
335         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
336 
337   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
338                 "task=%p ntasks=%d head=%u tail=%u\n",
339                 gtid, taskdata, thread_data->td.td_deque_ntasks,
340                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
341 
342   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
343 
344   return TASK_SUCCESSFULLY_PUSHED;
345 }
346 
347 // __kmp_pop_current_task_from_thread: set up current task from called thread
348 // when team ends
349 //
350 // this_thr: thread structure to set current_task in.
351 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
352   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
353                 "this_thread=%p, curtask=%p, "
354                 "curtask_parent=%p\n",
355                 0, this_thr, this_thr->th.th_current_task,
356                 this_thr->th.th_current_task->td_parent));
357 
358   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
359 
360   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
361                 "this_thread=%p, curtask=%p, "
362                 "curtask_parent=%p\n",
363                 0, this_thr, this_thr->th.th_current_task,
364                 this_thr->th.th_current_task->td_parent));
365 }
366 
367 // __kmp_push_current_task_to_thread: set up current task in called thread for a
368 // new team
369 //
370 // this_thr: thread structure to set up
371 // team: team for implicit task data
372 // tid: thread within team to set up
373 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
374                                        int tid) {
375   // current task of the thread is a parent of the new just created implicit
376   // tasks of new team
377   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
378                 "curtask=%p "
379                 "parent_task=%p\n",
380                 tid, this_thr, this_thr->th.th_current_task,
381                 team->t.t_implicit_task_taskdata[tid].td_parent));
382 
383   KMP_DEBUG_ASSERT(this_thr != NULL);
384 
385   if (tid == 0) {
386     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
387       team->t.t_implicit_task_taskdata[0].td_parent =
388           this_thr->th.th_current_task;
389       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
390     }
391   } else {
392     team->t.t_implicit_task_taskdata[tid].td_parent =
393         team->t.t_implicit_task_taskdata[0].td_parent;
394     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
395   }
396 
397   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
398                 "curtask=%p "
399                 "parent_task=%p\n",
400                 tid, this_thr, this_thr->th.th_current_task,
401                 team->t.t_implicit_task_taskdata[tid].td_parent));
402 }
403 
404 // __kmp_task_start: bookkeeping for a task starting execution
405 //
406 // GTID: global thread id of calling thread
407 // task: task starting execution
408 // current_task: task suspending
409 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
410                              kmp_taskdata_t *current_task) {
411   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
412   kmp_info_t *thread = __kmp_threads[gtid];
413 
414   KA_TRACE(10,
415            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
416             gtid, taskdata, current_task));
417 
418   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
419 
420   // mark currently executing task as suspended
421   // TODO: GEH - make sure root team implicit task is initialized properly.
422   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
423   current_task->td_flags.executing = 0;
424 
425 // Add task to stack if tied
426 #ifdef BUILD_TIED_TASK_STACK
427   if (taskdata->td_flags.tiedness == TASK_TIED) {
428     __kmp_push_task_stack(gtid, thread, taskdata);
429   }
430 #endif /* BUILD_TIED_TASK_STACK */
431 
432   // mark starting task as executing and as current task
433   thread->th.th_current_task = taskdata;
434 
435   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
436                    taskdata->td_flags.tiedness == TASK_UNTIED);
437   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
438                    taskdata->td_flags.tiedness == TASK_UNTIED);
439   taskdata->td_flags.started = 1;
440   taskdata->td_flags.executing = 1;
441   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
442   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
443 
444   // GEH TODO: shouldn't we pass some sort of location identifier here?
445   // APT: yes, we will pass location here.
446   // need to store current thread state (in a thread or taskdata structure)
447   // before setting work_state, otherwise wrong state is set after end of task
448 
449   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
450 
451 #if OMPT_SUPPORT
452   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
453     kmp_taskdata_t *parent = taskdata->td_parent;
454     ompt_callbacks.ompt_callback(ompt_event_task_begin)(
455         parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
456         parent ? &(parent->ompt_task_info.frame) : NULL,
457         taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
458   }
459 #endif
460 #if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
461   /* OMPT emit all dependences if requested by the tool */
462   if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
463       ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
464     ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
465         taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
466         taskdata->ompt_task_info.ndeps);
467     /* We can now free the allocated memory for the dependencies */
468     KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
469     taskdata->ompt_task_info.deps = NULL;
470     taskdata->ompt_task_info.ndeps = 0;
471   }
472 #endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
473 
474   return;
475 }
476 
477 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
478 // execution
479 //
480 // loc_ref: source location information; points to beginning of task block.
481 // gtid: global thread number.
482 // task: task thunk for the started task.
483 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
484                                kmp_task_t *task) {
485   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
486   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
487 
488   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
489                 "current_task=%p\n",
490                 gtid, loc_ref, taskdata, current_task));
491 
492   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
493     // untied task needs to increment counter so that the task structure is not
494     // freed prematurely
495     kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
496     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
497                   "incremented for task %p\n",
498                   gtid, counter, taskdata));
499   }
500 
501   taskdata->td_flags.task_serial =
502       1; // Execute this task immediately, not deferred.
503   __kmp_task_start(gtid, task, current_task);
504 
505   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
506                 loc_ref, taskdata));
507 
508   return;
509 }
510 
511 #ifdef TASK_UNUSED
512 // __kmpc_omp_task_begin: report that a given task has started execution
513 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
514 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
515   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
516 
517   KA_TRACE(
518       10,
519       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
520        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
521 
522   __kmp_task_start(gtid, task, current_task);
523 
524   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
525                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
526   return;
527 }
528 #endif // TASK_UNUSED
529 
530 // __kmp_free_task: free the current task space and the space for shareds
531 //
532 // gtid: Global thread ID of calling thread
533 // taskdata: task to free
534 // thread: thread data structure of caller
535 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
536                             kmp_info_t *thread) {
537   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
538                 taskdata));
539 
540   // Check to make sure all flags and counters have the correct values
541   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
542   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
543   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
544   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
545   KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
546                    taskdata->td_flags.task_serial == 1);
547   KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);
548 
549   taskdata->td_flags.freed = 1;
550   ANNOTATE_HAPPENS_BEFORE(taskdata);
551 // deallocate the taskdata and shared variable blocks associated with this task
552 #if USE_FAST_MEMORY
553   __kmp_fast_free(thread, taskdata);
554 #else /* ! USE_FAST_MEMORY */
555   __kmp_thread_free(thread, taskdata);
556 #endif
557 
558   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
559 }
560 
561 // __kmp_free_task_and_ancestors: free the current task and ancestors without
562 // children
563 //
564 // gtid: Global thread ID of calling thread
565 // taskdata: task to free
566 // thread: thread data structure of caller
567 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
568                                           kmp_taskdata_t *taskdata,
569                                           kmp_info_t *thread) {
570 #if OMP_45_ENABLED
571   // Proxy tasks must always be allowed to free their parents
572   // because they can be run in background even in serial mode.
573   kmp_int32 team_serial =
574       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
575       !taskdata->td_flags.proxy;
576 #else
577   kmp_int32 team_serial =
578       taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
579 #endif
580   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
581 
582   kmp_int32 children = KMP_TEST_THEN_DEC32(CCAST(
583                            kmp_int32 *, &taskdata->td_allocated_child_tasks)) -
584                        1;
585   KMP_DEBUG_ASSERT(children >= 0);
586 
587   // Now, go up the ancestor tree to see if any ancestors can now be freed.
588   while (children == 0) {
589     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
590 
591     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
592                   "and freeing itself\n",
593                   gtid, taskdata));
594 
595     // --- Deallocate my ancestor task ---
596     __kmp_free_task(gtid, taskdata, thread);
597 
598     taskdata = parent_taskdata;
599 
600     // Stop checking ancestors at implicit task instead of walking up ancestor
601     // tree to avoid premature deallocation of ancestors.
602     if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
603       return;
604 
605     // Predecrement simulated by "- 1" calculation
606     children = KMP_TEST_THEN_DEC32(
607                    CCAST(kmp_int32 *, &taskdata->td_allocated_child_tasks)) -
608                1;
609     KMP_DEBUG_ASSERT(children >= 0);
610   }
611 
612   KA_TRACE(
613       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
614            "not freeing it yet\n",
615            gtid, taskdata, children));
616 }
617 
618 // __kmp_task_finish: bookkeeping to do when a task finishes execution
619 //
620 // gtid: global thread ID for calling thread
621 // task: task to be finished
622 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
623 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
624                               kmp_taskdata_t *resumed_task) {
625   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
626   kmp_info_t *thread = __kmp_threads[gtid];
627   kmp_task_team_t *task_team =
628       thread->th.th_task_team; // might be NULL for serial teams...
629   kmp_int32 children = 0;
630 
631 #if OMPT_SUPPORT
632   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
633     kmp_taskdata_t *parent = taskdata->td_parent;
634     ompt_callbacks.ompt_callback(ompt_event_task_end)(
635         taskdata->ompt_task_info.task_id);
636   }
637 #endif
638 
639   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
640                 "task %p\n",
641                 gtid, taskdata, resumed_task));
642 
643   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
644 
645 // Pop task from stack if tied
646 #ifdef BUILD_TIED_TASK_STACK
647   if (taskdata->td_flags.tiedness == TASK_TIED) {
648     __kmp_pop_task_stack(gtid, thread, taskdata);
649   }
650 #endif /* BUILD_TIED_TASK_STACK */
651 
652   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
653     // untied task needs to check the counter so that the task structure is not
654     // freed prematurely
655     kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
656     KA_TRACE(
657         20,
658         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
659          gtid, counter, taskdata));
660     if (counter > 0) {
661       // untied task is not done, to be continued possibly by other thread, do
662       // not free it now
663       if (resumed_task == NULL) {
664         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
665         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
666         // task is the parent
667       }
668       thread->th.th_current_task = resumed_task; // restore current_task
669       resumed_task->td_flags.executing = 1; // resume previous task
670       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
671                     "resuming task %p\n",
672                     gtid, taskdata, resumed_task));
673       return;
674     }
675   }
676 
677   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
678   taskdata->td_flags.complete = 1; // mark the task as completed
679   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
680   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
681 
682   // Only need to keep track of count if team parallel and tasking not
683   // serialized
684   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
685     // Predecrement simulated by "- 1" calculation
686     children =
687         KMP_TEST_THEN_DEC32(CCAST(
688             kmp_int32 *, &taskdata->td_parent->td_incomplete_child_tasks)) -
689         1;
690     KMP_DEBUG_ASSERT(children >= 0);
691 #if OMP_40_ENABLED
692     if (taskdata->td_taskgroup)
693       KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
694 #if OMP_45_ENABLED
695   }
696   // if we found proxy tasks there could exist a dependency chain
697   // with the proxy task as origin
698   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
699       (task_team && task_team->tt.tt_found_proxy_tasks)) {
700 #endif
701     __kmp_release_deps(gtid, taskdata);
702 #endif
703   }
704 
705   // td_flags.executing must be marked as 0 after __kmp_release_deps has been
706   // called. Othertwise, if a task is executed immediately from the release_deps
707   // code, the flag will be reset to 1 again by this same function
708   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
709   taskdata->td_flags.executing = 0; // suspend the finishing task
710 
711   KA_TRACE(
712       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
713            gtid, taskdata, children));
714 
715 #if OMP_40_ENABLED
716   /* If the tasks' destructor thunk flag has been set, we need to invoke the
717      destructor thunk that has been generated by the compiler. The code is
718      placed here, since at this point other tasks might have been released
719      hence overlapping the destructor invokations with some other work in the
720      released tasks.  The OpenMP spec is not specific on when the destructors
721      are invoked, so we should be free to choose. */
722   if (taskdata->td_flags.destructors_thunk) {
723     kmp_routine_entry_t destr_thunk = task->data1.destructors;
724     KMP_ASSERT(destr_thunk);
725     destr_thunk(gtid, task);
726   }
727 #endif // OMP_40_ENABLED
728 
729   // bookkeeping for resuming task:
730   // GEH - note tasking_ser => task_serial
731   KMP_DEBUG_ASSERT(
732       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
733       taskdata->td_flags.task_serial);
734   if (taskdata->td_flags.task_serial) {
735     if (resumed_task == NULL) {
736       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
737       // task is the parent
738     } else
739 #if OMP_45_ENABLED
740         if (!(task_team && task_team->tt.tt_found_proxy_tasks))
741 #endif
742     {
743       // verify resumed task passed in points to parent
744       KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
745     }
746   } else {
747     KMP_DEBUG_ASSERT(resumed_task !=
748                      NULL); // verify that resumed task is passed as arguemnt
749   }
750 
751   // Free this task and then ancestor tasks if they have no children.
752   // Restore th_current_task first as suggested by John:
753   // johnmc: if an asynchronous inquiry peers into the runtime system
754   // it doesn't see the freed task as the current task.
755   thread->th.th_current_task = resumed_task;
756   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
757 
758   // TODO: GEH - make sure root team implicit task is initialized properly.
759   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
760   resumed_task->td_flags.executing = 1; // resume previous task
761 
762   KA_TRACE(
763       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
764            gtid, taskdata, resumed_task));
765 
766   return;
767 }
768 
769 // __kmpc_omp_task_complete_if0: report that a task has completed execution
770 //
771 // loc_ref: source location information; points to end of task block.
772 // gtid: global thread number.
773 // task: task thunk for the completed task.
774 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
775                                   kmp_task_t *task) {
776   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
777                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
778   // this routine will provide task to resume
779   __kmp_task_finish(gtid, task, NULL);
780 
781   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
782                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
783   return;
784 }
785 
786 #ifdef TASK_UNUSED
787 // __kmpc_omp_task_complete: report that a task has completed execution
788 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
789 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
790                               kmp_task_t *task) {
791   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
792                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
793 
794   __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume
795 
796   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
797                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
798   return;
799 }
800 #endif // TASK_UNUSED
801 
802 #if OMPT_SUPPORT
803 // __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
804 //  only be called after ompt_tool, so we already know whether ompt is enabled
805 // or not.
806 static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
807                                         void *function) {
808   if (ompt_enabled) {
809     task->ompt_task_info.task_id = __ompt_task_id_new(tid);
810     task->ompt_task_info.function = function;
811     task->ompt_task_info.frame.exit_runtime_frame = NULL;
812     task->ompt_task_info.frame.reenter_runtime_frame = NULL;
813 #if OMP_40_ENABLED
814     task->ompt_task_info.ndeps = 0;
815     task->ompt_task_info.deps = NULL;
816 #endif /* OMP_40_ENABLED */
817   }
818 }
819 #endif
820 
821 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
822 // task for a given thread
823 //
824 // loc_ref:  reference to source location of parallel region
825 // this_thr:  thread data structure corresponding to implicit task
826 // team: team for this_thr
827 // tid: thread id of given thread within team
828 // set_curr_task: TRUE if need to push current task to thread
829 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
830 // have already been done elsewhere.
831 // TODO: Get better loc_ref.  Value passed in may be NULL
832 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
833                               kmp_team_t *team, int tid, int set_curr_task) {
834   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
835 
836   KF_TRACE(
837       10,
838       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
839        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
840 
841   task->td_task_id = KMP_GEN_TASK_ID();
842   task->td_team = team;
843   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
844   //    in debugger)
845   task->td_ident = loc_ref;
846   task->td_taskwait_ident = NULL;
847   task->td_taskwait_counter = 0;
848   task->td_taskwait_thread = 0;
849 
850   task->td_flags.tiedness = TASK_TIED;
851   task->td_flags.tasktype = TASK_IMPLICIT;
852 #if OMP_45_ENABLED
853   task->td_flags.proxy = TASK_FULL;
854 #endif
855 
856   // All implicit tasks are executed immediately, not deferred
857   task->td_flags.task_serial = 1;
858   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
859   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
860 
861   task->td_flags.started = 1;
862   task->td_flags.executing = 1;
863   task->td_flags.complete = 0;
864   task->td_flags.freed = 0;
865 
866 #if OMP_40_ENABLED
867   task->td_depnode = NULL;
868 #endif
869 
870   if (set_curr_task) { // only do this init first time thread is created
871     task->td_incomplete_child_tasks = 0;
872     // Not used: don't need to deallocate implicit task
873     task->td_allocated_child_tasks = 0;
874 #if OMP_40_ENABLED
875     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
876     task->td_dephash = NULL;
877 #endif
878     __kmp_push_current_task_to_thread(this_thr, team, tid);
879   } else {
880     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
881     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
882   }
883 
884 #if OMPT_SUPPORT
885   __kmp_task_init_ompt(task, tid, NULL);
886 #endif
887 
888   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
889                 team, task));
890 }
891 
892 // __kmp_finish_implicit_task: Release resources associated to implicit tasks
893 // at the end of parallel regions. Some resources are kept for reuse in the next
894 // parallel region.
895 //
896 // thread:  thread data structure corresponding to implicit task
897 void __kmp_finish_implicit_task(kmp_info_t *thread) {
898   kmp_taskdata_t *task = thread->th.th_current_task;
899   if (task->td_dephash)
900     __kmp_dephash_free_entries(thread, task->td_dephash);
901 }
902 
903 // __kmp_free_implicit_task: Release resources associated to implicit tasks
904 // when these are destroyed regions
905 //
906 // thread:  thread data structure corresponding to implicit task
907 void __kmp_free_implicit_task(kmp_info_t *thread) {
908   kmp_taskdata_t *task = thread->th.th_current_task;
909   if (task->td_dephash)
910     __kmp_dephash_free(thread, task->td_dephash);
911   task->td_dephash = NULL;
912 }
913 
914 // Round up a size to a power of two specified by val: Used to insert padding
915 // between structures co-allocated using a single malloc() call
916 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
917   if (size & (val - 1)) {
918     size &= ~(val - 1);
919     if (size <= KMP_SIZE_T_MAX - val) {
920       size += val; // Round up if there is no overflow.
921     }; // if
922   }; // if
923   return size;
924 } // __kmp_round_up_to_va
925 
926 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
927 //
928 // loc_ref: source location information
929 // gtid: global thread number.
930 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
931 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
932 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
933 // private vars accessed in task.
934 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
935 // in task.
936 // task_entry: Pointer to task code entry point generated by compiler.
937 // returns: a pointer to the allocated kmp_task_t structure (task).
938 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
939                              kmp_tasking_flags_t *flags,
940                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
941                              kmp_routine_entry_t task_entry) {
942   kmp_task_t *task;
943   kmp_taskdata_t *taskdata;
944   kmp_info_t *thread = __kmp_threads[gtid];
945   kmp_team_t *team = thread->th.th_team;
946   kmp_taskdata_t *parent_task = thread->th.th_current_task;
947   size_t shareds_offset;
948 
949   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
950                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
951                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
952                 sizeof_shareds, task_entry));
953 
954   if (parent_task->td_flags.final) {
955     if (flags->merged_if0) {
956     }
957     flags->final = 1;
958   }
959 
960 #if OMP_45_ENABLED
961   if (flags->proxy == TASK_PROXY) {
962     flags->tiedness = TASK_UNTIED;
963     flags->merged_if0 = 1;
964 
965     /* are we running in a sequential parallel or tskm_immediate_exec... we need
966        tasking support enabled */
967     if ((thread->th.th_task_team) == NULL) {
968       /* This should only happen if the team is serialized
969           setup a task team and propagate it to the thread */
970       KMP_DEBUG_ASSERT(team->t.t_serialized);
971       KA_TRACE(30,
972                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
973                 gtid));
974       __kmp_task_team_setup(
975           thread, team,
976           1); // 1 indicates setup the current team regardless of nthreads
977       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
978     }
979     kmp_task_team_t *task_team = thread->th.th_task_team;
980 
981     /* tasking must be enabled now as the task might not be pushed */
982     if (!KMP_TASKING_ENABLED(task_team)) {
983       KA_TRACE(
984           30,
985           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
986       __kmp_enable_tasking(task_team, thread);
987       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
988       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
989       // No lock needed since only owner can allocate
990       if (thread_data->td.td_deque == NULL) {
991         __kmp_alloc_task_deque(thread, thread_data);
992       }
993     }
994 
995     if (task_team->tt.tt_found_proxy_tasks == FALSE)
996       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
997   }
998 #endif
999 
1000   // Calculate shared structure offset including padding after kmp_task_t struct
1001   // to align pointers in shared struct
1002   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1003   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1004 
1005   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1006   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1007                 shareds_offset));
1008   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1009                 sizeof_shareds));
1010 
1011 // Avoid double allocation here by combining shareds with taskdata
1012 #if USE_FAST_MEMORY
1013   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1014                                                                sizeof_shareds);
1015 #else /* ! USE_FAST_MEMORY */
1016   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1017                                                                sizeof_shareds);
1018 #endif /* USE_FAST_MEMORY */
1019   ANNOTATE_HAPPENS_AFTER(taskdata);
1020 
1021   task = KMP_TASKDATA_TO_TASK(taskdata);
1022 
1023 // Make sure task & taskdata are aligned appropriately
1024 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1025   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1026   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1027 #else
1028   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1029   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1030 #endif
1031   if (sizeof_shareds > 0) {
1032     // Avoid double allocation here by combining shareds with taskdata
1033     task->shareds = &((char *)taskdata)[shareds_offset];
1034     // Make sure shareds struct is aligned to pointer size
1035     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1036                      0);
1037   } else {
1038     task->shareds = NULL;
1039   }
1040   task->routine = task_entry;
1041   task->part_id = 0; // AC: Always start with 0 part id
1042 
1043   taskdata->td_task_id = KMP_GEN_TASK_ID();
1044   taskdata->td_team = team;
1045   taskdata->td_alloc_thread = thread;
1046   taskdata->td_parent = parent_task;
1047   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1048   taskdata->td_untied_count = 0;
1049   taskdata->td_ident = loc_ref;
1050   taskdata->td_taskwait_ident = NULL;
1051   taskdata->td_taskwait_counter = 0;
1052   taskdata->td_taskwait_thread = 0;
1053   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1054 #if OMP_45_ENABLED
1055   // avoid copying icvs for proxy tasks
1056   if (flags->proxy == TASK_FULL)
1057 #endif
1058     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1059 
1060   taskdata->td_flags.tiedness = flags->tiedness;
1061   taskdata->td_flags.final = flags->final;
1062   taskdata->td_flags.merged_if0 = flags->merged_if0;
1063 #if OMP_40_ENABLED
1064   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1065 #endif // OMP_40_ENABLED
1066 #if OMP_45_ENABLED
1067   taskdata->td_flags.proxy = flags->proxy;
1068   taskdata->td_task_team = thread->th.th_task_team;
1069   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1070 #endif
1071   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1072 
1073   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1074   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1075 
1076   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1077   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1078 
1079   // GEH - Note we serialize the task if the team is serialized to make sure
1080   // implicit parallel region tasks are not left until program termination to
1081   // execute. Also, it helps locality to execute immediately.
1082 
1083   taskdata->td_flags.task_serial =
1084       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1085        taskdata->td_flags.tasking_ser);
1086 
1087   taskdata->td_flags.started = 0;
1088   taskdata->td_flags.executing = 0;
1089   taskdata->td_flags.complete = 0;
1090   taskdata->td_flags.freed = 0;
1091 
1092   taskdata->td_flags.native = flags->native;
1093 
1094   taskdata->td_incomplete_child_tasks = 0;
1095   taskdata->td_allocated_child_tasks = 1; // start at one because counts current
1096 // task and children
1097 #if OMP_40_ENABLED
1098   taskdata->td_taskgroup =
1099       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1100   taskdata->td_dephash = NULL;
1101   taskdata->td_depnode = NULL;
1102 #endif
1103 
1104 // Only need to keep track of child task counts if team parallel and tasking not
1105 // serialized or if it is a proxy task
1106 #if OMP_45_ENABLED
1107   if (flags->proxy == TASK_PROXY ||
1108       !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1109 #else
1110   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1111 #endif
1112   {
1113     KMP_TEST_THEN_INC32(
1114         CCAST(kmp_int32 *, &parent_task->td_incomplete_child_tasks));
1115 #if OMP_40_ENABLED
1116     if (parent_task->td_taskgroup)
1117       KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
1118 #endif
1119     // Only need to keep track of allocated child tasks for explicit tasks since
1120     // implicit not deallocated
1121     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1122       KMP_TEST_THEN_INC32(
1123           CCAST(kmp_int32 *, &taskdata->td_parent->td_allocated_child_tasks));
1124     }
1125   }
1126 
1127   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1128                 gtid, taskdata, taskdata->td_parent));
1129   ANNOTATE_HAPPENS_BEFORE(task);
1130 
1131 #if OMPT_SUPPORT
1132   __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
1133 #endif
1134 
1135   return task;
1136 }
1137 
1138 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1139                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1140                                   size_t sizeof_shareds,
1141                                   kmp_routine_entry_t task_entry) {
1142   kmp_task_t *retval;
1143   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1144 
1145   input_flags->native = FALSE;
1146 // __kmp_task_alloc() sets up all other runtime flags
1147 
1148 #if OMP_45_ENABLED
1149   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1150                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1151                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1152                 input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1153                 sizeof_shareds, task_entry));
1154 #else
1155   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1156                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1157                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1158                 sizeof_kmp_task_t, sizeof_shareds, task_entry));
1159 #endif
1160 
1161   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1162                             sizeof_shareds, task_entry);
1163 
1164   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1165 
1166   return retval;
1167 }
1168 
1169 //  __kmp_invoke_task: invoke the specified task
1170 //
1171 // gtid: global thread ID of caller
1172 // task: the task to invoke
1173 // current_task: the task to resume after task invokation
1174 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1175                               kmp_taskdata_t *current_task) {
1176   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1177   kmp_uint64 cur_time;
1178 #if OMP_40_ENABLED
1179   int discard = 0 /* false */;
1180 #endif
1181   KA_TRACE(
1182       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1183            gtid, taskdata, current_task));
1184   KMP_DEBUG_ASSERT(task);
1185 #if OMP_45_ENABLED
1186   if (taskdata->td_flags.proxy == TASK_PROXY &&
1187       taskdata->td_flags.complete == 1) {
1188     // This is a proxy task that was already completed but it needs to run
1189     // its bottom-half finish
1190     KA_TRACE(
1191         30,
1192         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1193          gtid, taskdata));
1194 
1195     __kmp_bottom_half_finish_proxy(gtid, task);
1196 
1197     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1198                   "proxy task %p, resuming task %p\n",
1199                   gtid, taskdata, current_task));
1200 
1201     return;
1202   }
1203 #endif
1204 
1205 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1206   if (__kmp_forkjoin_frames_mode == 3) {
1207     // Get the current time stamp to measure task execution time to correct
1208     // barrier imbalance time
1209     cur_time = __itt_get_timestamp();
1210   }
1211 #endif
1212 
1213 #if OMP_45_ENABLED
1214   // Proxy tasks are not handled by the runtime
1215   if (taskdata->td_flags.proxy != TASK_PROXY) {
1216 #endif
1217     ANNOTATE_HAPPENS_AFTER(task);
1218     __kmp_task_start(gtid, task, current_task);
1219 #if OMP_45_ENABLED
1220   }
1221 #endif
1222 
1223 #if OMPT_SUPPORT
1224   ompt_thread_info_t oldInfo;
1225   kmp_info_t *thread;
1226   if (ompt_enabled) {
1227     // Store the threads states and restore them after the task
1228     thread = __kmp_threads[gtid];
1229     oldInfo = thread->th.ompt_thread_info;
1230     thread->th.ompt_thread_info.wait_id = 0;
1231     thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1232     taskdata->ompt_task_info.frame.exit_runtime_frame =
1233         __builtin_frame_address(0);
1234   }
1235 #endif
1236 
1237 #if OMP_40_ENABLED
1238   // TODO: cancel tasks if the parallel region has also been cancelled
1239   // TODO: check if this sequence can be hoisted above __kmp_task_start
1240   // if cancellation has been enabled for this run ...
1241   if (__kmp_omp_cancellation) {
1242     kmp_info_t *this_thr = __kmp_threads[gtid];
1243     kmp_team_t *this_team = this_thr->th.th_team;
1244     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1245     if ((taskgroup && taskgroup->cancel_request) ||
1246         (this_team->t.t_cancel_request == cancel_parallel)) {
1247       KMP_COUNT_BLOCK(TASK_cancelled);
1248       // this task belongs to a task group and we need to cancel it
1249       discard = 1 /* true */;
1250     }
1251   }
1252 
1253   // Invoke the task routine and pass in relevant data.
1254   // Thunks generated by gcc take a different argument list.
1255   if (!discard) {
1256 #if KMP_STATS_ENABLED
1257     KMP_COUNT_BLOCK(TASK_executed);
1258     switch (KMP_GET_THREAD_STATE()) {
1259     case FORK_JOIN_BARRIER:
1260       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1261       break;
1262     case PLAIN_BARRIER:
1263       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1264       break;
1265     case TASKYIELD:
1266       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1267       break;
1268     case TASKWAIT:
1269       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1270       break;
1271     case TASKGROUP:
1272       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1273       break;
1274     default:
1275       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1276       break;
1277     }
1278 #endif // KMP_STATS_ENABLED
1279 #endif // OMP_40_ENABLED
1280 
1281 #if OMPT_SUPPORT && OMPT_TRACE
1282     /* let OMPT know that we're about to run this task */
1283     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
1284       ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1285           current_task->ompt_task_info.task_id,
1286           taskdata->ompt_task_info.task_id);
1287     }
1288 #endif
1289 
1290 #ifdef KMP_GOMP_COMPAT
1291     if (taskdata->td_flags.native) {
1292       ((void (*)(void *))(*(task->routine)))(task->shareds);
1293     } else
1294 #endif /* KMP_GOMP_COMPAT */
1295     {
1296       (*(task->routine))(gtid, task);
1297     }
1298     KMP_POP_PARTITIONED_TIMER();
1299 
1300 #if OMPT_SUPPORT && OMPT_TRACE
1301     /* let OMPT know that we're returning to the callee task */
1302     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
1303       ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1304           taskdata->ompt_task_info.task_id,
1305           current_task->ompt_task_info.task_id);
1306     }
1307 #endif
1308 
1309 #if OMP_40_ENABLED
1310   }
1311 #endif // OMP_40_ENABLED
1312 
1313 #if OMPT_SUPPORT
1314   if (ompt_enabled) {
1315     thread->th.ompt_thread_info = oldInfo;
1316     taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
1317   }
1318 #endif
1319 
1320 #if OMP_45_ENABLED
1321   // Proxy tasks are not handled by the runtime
1322   if (taskdata->td_flags.proxy != TASK_PROXY) {
1323 #endif
1324     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1325     __kmp_task_finish(gtid, task, current_task);
1326 #if OMP_45_ENABLED
1327   }
1328 #endif
1329 
1330 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1331   // Barrier imbalance - correct arrive time after the task finished
1332   if (__kmp_forkjoin_frames_mode == 3) {
1333     kmp_info_t *this_thr = __kmp_threads[gtid];
1334     if (this_thr->th.th_bar_arrive_time) {
1335       this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1336     }
1337   }
1338 #endif
1339   KA_TRACE(
1340       30,
1341       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1342        gtid, taskdata, current_task));
1343   return;
1344 }
1345 
1346 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1347 //
1348 // loc_ref: location of original task pragma (ignored)
1349 // gtid: Global Thread ID of encountering thread
1350 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1351 // Returns:
1352 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1353 //    be resumed later.
1354 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1355 //    resumed later.
1356 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1357                                 kmp_task_t *new_task) {
1358   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1359 
1360   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1361                 loc_ref, new_taskdata));
1362 
1363   /* Should we execute the new task or queue it? For now, let's just always try
1364      to queue it.  If the queue fills up, then we'll execute it.  */
1365 
1366   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1367   { // Execute this task immediately
1368     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1369     new_taskdata->td_flags.task_serial = 1;
1370     __kmp_invoke_task(gtid, new_task, current_task);
1371   }
1372 
1373   KA_TRACE(
1374       10,
1375       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1376        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1377        gtid, loc_ref, new_taskdata));
1378 
1379   ANNOTATE_HAPPENS_BEFORE(new_task);
1380   return TASK_CURRENT_NOT_QUEUED;
1381 }
1382 
1383 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1384 //
1385 // gtid: Global Thread ID of encountering thread
1386 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1387 // serialize_immediate: if TRUE then if the task is executed immediately its
1388 // execution will be serialized
1389 // Returns:
1390 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1391 //    be resumed later.
1392 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1393 //    resumed later.
1394 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1395                          bool serialize_immediate) {
1396   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1397 
1398 #if OMPT_SUPPORT
1399   if (ompt_enabled) {
1400     new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1401         __builtin_frame_address(1);
1402   }
1403 #endif
1404 
1405 /* Should we execute the new task or queue it? For now, let's just always try to
1406    queue it.  If the queue fills up, then we'll execute it.  */
1407 #if OMP_45_ENABLED
1408   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1409       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1410 #else
1411   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1412 #endif
1413   { // Execute this task immediately
1414     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1415     if (serialize_immediate)
1416       new_taskdata->td_flags.task_serial = 1;
1417     __kmp_invoke_task(gtid, new_task, current_task);
1418   }
1419 
1420 #if OMPT_SUPPORT
1421   if (ompt_enabled) {
1422     new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1423   }
1424 #endif
1425 
1426   ANNOTATE_HAPPENS_BEFORE(new_task);
1427   return TASK_CURRENT_NOT_QUEUED;
1428 }
1429 
1430 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1431 // non-thread-switchable task from the parent thread only!
1432 //
1433 // loc_ref: location of original task pragma (ignored)
1434 // gtid: Global Thread ID of encountering thread
1435 // new_task: non-thread-switchable task thunk allocated by
1436 // __kmp_omp_task_alloc()
1437 // Returns:
1438 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1439 //    be resumed later.
1440 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1441 //    resumed later.
1442 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1443                           kmp_task_t *new_task) {
1444   kmp_int32 res;
1445   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1446 
1447 #if KMP_DEBUG
1448   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1449 #endif
1450   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1451                 new_taskdata));
1452 
1453   res = __kmp_omp_task(gtid, new_task, true);
1454 
1455   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1456                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1457                 gtid, loc_ref, new_taskdata));
1458   return res;
1459 }
1460 
1461 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1462 // complete
1463 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1464   kmp_taskdata_t *taskdata;
1465   kmp_info_t *thread;
1466   int thread_finished = FALSE;
1467   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1468 
1469   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1470 
1471   if (__kmp_tasking_mode != tskm_immediate_exec) {
1472     thread = __kmp_threads[gtid];
1473     taskdata = thread->th.th_current_task;
1474 #if OMPT_SUPPORT && OMPT_TRACE
1475     ompt_task_id_t my_task_id;
1476     ompt_parallel_id_t my_parallel_id;
1477 
1478     if (ompt_enabled) {
1479       kmp_team_t *team = thread->th.th_team;
1480       my_task_id = taskdata->ompt_task_info.task_id;
1481       my_parallel_id = team->t.ompt_team_info.parallel_id;
1482 
1483       taskdata->ompt_task_info.frame.reenter_runtime_frame =
1484           __builtin_frame_address(1);
1485       if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1486         ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id,
1487                                                                 my_task_id);
1488       }
1489     }
1490 #endif
1491 
1492 // Debugger: The taskwait is active. Store location and thread encountered the
1493 // taskwait.
1494 #if USE_ITT_BUILD
1495 // Note: These values are used by ITT events as well.
1496 #endif /* USE_ITT_BUILD */
1497     taskdata->td_taskwait_counter += 1;
1498     taskdata->td_taskwait_ident = loc_ref;
1499     taskdata->td_taskwait_thread = gtid + 1;
1500 
1501 #if USE_ITT_BUILD
1502     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1503     if (itt_sync_obj != NULL)
1504       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1505 #endif /* USE_ITT_BUILD */
1506 
1507     bool must_wait =
1508         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1509 
1510 #if OMP_45_ENABLED
1511     must_wait = must_wait || (thread->th.th_task_team != NULL &&
1512                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
1513 #endif
1514     if (must_wait) {
1515       kmp_flag_32 flag(
1516           RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks),
1517           0U);
1518       while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) {
1519         flag.execute_tasks(thread, gtid, FALSE,
1520                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1521                            __kmp_task_stealing_constraint);
1522       }
1523     }
1524 #if USE_ITT_BUILD
1525     if (itt_sync_obj != NULL)
1526       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1527 #endif /* USE_ITT_BUILD */
1528 
1529     // Debugger:  The taskwait is completed. Location remains, but thread is
1530     // negated.
1531     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1532 
1533 #if OMPT_SUPPORT && OMPT_TRACE
1534     if (ompt_enabled) {
1535       if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1536         ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id,
1537                                                               my_task_id);
1538       }
1539       taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1540     }
1541 #endif
1542     ANNOTATE_HAPPENS_AFTER(taskdata);
1543   }
1544 
1545   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1546                 "returning TASK_CURRENT_NOT_QUEUED\n",
1547                 gtid, taskdata));
1548 
1549   return TASK_CURRENT_NOT_QUEUED;
1550 }
1551 
1552 // __kmpc_omp_taskyield: switch to a different task
1553 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1554   kmp_taskdata_t *taskdata;
1555   kmp_info_t *thread;
1556   int thread_finished = FALSE;
1557 
1558   KMP_COUNT_BLOCK(OMP_TASKYIELD);
1559   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1560 
1561   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1562                 gtid, loc_ref, end_part));
1563 
1564   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1565     thread = __kmp_threads[gtid];
1566     taskdata = thread->th.th_current_task;
1567 // Should we model this as a task wait or not?
1568 // Debugger: The taskwait is active. Store location and thread encountered the
1569 // taskwait.
1570 #if USE_ITT_BUILD
1571 // Note: These values are used by ITT events as well.
1572 #endif /* USE_ITT_BUILD */
1573     taskdata->td_taskwait_counter += 1;
1574     taskdata->td_taskwait_ident = loc_ref;
1575     taskdata->td_taskwait_thread = gtid + 1;
1576 
1577 #if USE_ITT_BUILD
1578     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1579     if (itt_sync_obj != NULL)
1580       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1581 #endif /* USE_ITT_BUILD */
1582     if (!taskdata->td_flags.team_serial) {
1583       kmp_task_team_t *task_team = thread->th.th_task_team;
1584       if (task_team != NULL) {
1585         if (KMP_TASKING_ENABLED(task_team)) {
1586           __kmp_execute_tasks_32(
1587               thread, gtid, NULL, FALSE,
1588               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1589               __kmp_task_stealing_constraint);
1590         }
1591       }
1592     }
1593 #if USE_ITT_BUILD
1594     if (itt_sync_obj != NULL)
1595       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1596 #endif /* USE_ITT_BUILD */
1597 
1598     // Debugger:  The taskwait is completed. Location remains, but thread is
1599     // negated.
1600     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1601   }
1602 
1603   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1604                 "returning TASK_CURRENT_NOT_QUEUED\n",
1605                 gtid, taskdata));
1606 
1607   return TASK_CURRENT_NOT_QUEUED;
1608 }
1609 
1610 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1611 #if OMP_45_ENABLED
1612 // Task Reduction implementation
1613 
1614 typedef struct kmp_task_red_flags {
1615   unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
1616   unsigned reserved31 : 31;
1617 } kmp_task_red_flags_t;
1618 
1619 // internal structure for reduction data item related info
1620 typedef struct kmp_task_red_data {
1621   void *reduce_shar; // shared reduction item
1622   size_t reduce_size; // size of data item
1623   void *reduce_priv; // thread specific data
1624   void *reduce_pend; // end of private data for comparison op
1625   void *reduce_init; // data initialization routine
1626   void *reduce_fini; // data finalization routine
1627   void *reduce_comb; // data combiner routine
1628   kmp_task_red_flags_t flags; // flags for additional info from compiler
1629 } kmp_task_red_data_t;
1630 
1631 // structure sent us by compiler - one per reduction item
1632 typedef struct kmp_task_red_input {
1633   void *reduce_shar; // shared reduction item
1634   size_t reduce_size; // size of data item
1635   void *reduce_init; // data initialization routine
1636   void *reduce_fini; // data finalization routine
1637   void *reduce_comb; // data combiner routine
1638   kmp_task_red_flags_t flags; // flags for additional info from compiler
1639 } kmp_task_red_input_t;
1640 
1641 /*!
1642 @ingroup TASKING
1643 @param gtid      Global thread ID
1644 @param num       Number of data items to reduce
1645 @param data      Array of data for reduction
1646 @return The taskgroup identifier
1647 
1648 Initialize task reduction for the taskgroup.
1649 */
1650 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
1651   kmp_info_t *thread = __kmp_threads[gtid];
1652   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
1653   kmp_int32 nth = thread->th.th_team_nproc;
1654   kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
1655   kmp_task_red_data_t *arr;
1656 
1657   // check input data just in case
1658   KMP_ASSERT(tg != NULL);
1659   KMP_ASSERT(data != NULL);
1660   KMP_ASSERT(num > 0);
1661   if (nth == 1) {
1662     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1663                   gtid, tg));
1664     return (void *)tg;
1665   }
1666   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1667                 gtid, tg, num));
1668   arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
1669       thread, num * sizeof(kmp_task_red_data_t));
1670   for (int i = 0; i < num; ++i) {
1671     void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
1672     size_t size = input[i].reduce_size - 1;
1673     // round the size up to cache line per thread-specific item
1674     size += CACHE_LINE - size % CACHE_LINE;
1675     KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1676     arr[i].reduce_shar = input[i].reduce_shar;
1677     arr[i].reduce_size = size;
1678     arr[i].reduce_init = input[i].reduce_init;
1679     arr[i].reduce_fini = input[i].reduce_fini;
1680     arr[i].reduce_comb = input[i].reduce_comb;
1681     arr[i].flags = input[i].flags;
1682     if (!input[i].flags.lazy_priv) {
1683       // allocate cache-line aligned block and fill it with zeros
1684       arr[i].reduce_priv = __kmp_allocate(nth * size);
1685       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
1686       if (f_init != NULL) {
1687         // initialize thread-specific items
1688         for (int j = 0; j < nth; ++j) {
1689           f_init((char *)(arr[i].reduce_priv) + j * size);
1690         }
1691       }
1692     } else {
1693       // only allocate space for pointers now,
1694       // objects will be lazily allocated/initialized once requested
1695       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
1696     }
1697   }
1698   tg->reduce_data = (void *)arr;
1699   tg->reduce_num_data = num;
1700   return (void *)tg;
1701 }
1702 
1703 /*!
1704 @ingroup TASKING
1705 @param gtid    Global thread ID
1706 @param tskgrp  The taskgroup ID (optional)
1707 @param data    Shared location of the item
1708 @return The pointer to per-thread data
1709 
1710 Get thread-specific location of data item
1711 */
1712 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
1713   kmp_info_t *thread = __kmp_threads[gtid];
1714   kmp_int32 nth = thread->th.th_team_nproc;
1715   if (nth == 1)
1716     return data; // nothing to do
1717 
1718   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
1719   if (tg == NULL)
1720     tg = thread->th.th_current_task->td_taskgroup;
1721   KMP_ASSERT(tg != NULL);
1722   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
1723   kmp_int32 num = tg->reduce_num_data;
1724   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1725 
1726   KMP_ASSERT(data != NULL);
1727   while (tg != NULL) {
1728     for (int i = 0; i < num; ++i) {
1729       if (!arr[i].flags.lazy_priv) {
1730         if (data == arr[i].reduce_shar ||
1731             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1732           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1733       } else {
1734         // check shared location first
1735         void **p_priv = (void **)(arr[i].reduce_priv);
1736         if (data == arr[i].reduce_shar)
1737           goto found;
1738         // check if we get some thread specific location as parameter
1739         for (int j = 0; j < nth; ++j)
1740           if (data == p_priv[j])
1741             goto found;
1742         continue; // not found, continue search
1743       found:
1744         if (p_priv[tid] == NULL) {
1745           // allocate thread specific object lazily
1746           void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
1747           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1748           if (f_init != NULL) {
1749             f_init(p_priv[tid]);
1750           }
1751         }
1752         return p_priv[tid];
1753       }
1754     }
1755     tg = tg->parent;
1756     arr = (kmp_task_red_data_t *)(tg->reduce_data);
1757     num = tg->reduce_num_data;
1758   }
1759   KMP_ASSERT2(0, "Unknown task reduction item");
1760   return NULL; // ERROR, this line never executed
1761 }
1762 
1763 // Finalize task reduction.
1764 // Called from __kmpc_end_taskgroup()
1765 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
1766   kmp_int32 nth = th->th.th_team_nproc;
1767   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
1768   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
1769   kmp_int32 num = tg->reduce_num_data;
1770   for (int i = 0; i < num; ++i) {
1771     void *sh_data = arr[i].reduce_shar;
1772     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
1773     void (*f_comb)(void *, void *) =
1774         (void (*)(void *, void *))(arr[i].reduce_comb);
1775     if (!arr[i].flags.lazy_priv) {
1776       void *pr_data = arr[i].reduce_priv;
1777       size_t size = arr[i].reduce_size;
1778       for (int j = 0; j < nth; ++j) {
1779         void *priv_data = (char *)pr_data + j * size;
1780         f_comb(sh_data, priv_data); // combine results
1781         if (f_fini)
1782           f_fini(priv_data); // finalize if needed
1783       }
1784     } else {
1785       void **pr_data = (void **)(arr[i].reduce_priv);
1786       for (int j = 0; j < nth; ++j) {
1787         if (pr_data[j] != NULL) {
1788           f_comb(sh_data, pr_data[j]); // combine results
1789           if (f_fini)
1790             f_fini(pr_data[j]); // finalize if needed
1791           __kmp_free(pr_data[j]);
1792         }
1793       }
1794     }
1795     __kmp_free(arr[i].reduce_priv);
1796   }
1797   __kmp_thread_free(th, arr);
1798   tg->reduce_data = NULL;
1799   tg->reduce_num_data = 0;
1800 }
1801 #endif
1802 
1803 #if OMP_40_ENABLED
1804 // __kmpc_taskgroup: Start a new taskgroup
1805 void __kmpc_taskgroup(ident_t *loc, int gtid) {
1806   kmp_info_t *thread = __kmp_threads[gtid];
1807   kmp_taskdata_t *taskdata = thread->th.th_current_task;
1808   kmp_taskgroup_t *tg_new =
1809       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
1810   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
1811   tg_new->count = 0;
1812   tg_new->cancel_request = cancel_noreq;
1813   tg_new->parent = taskdata->td_taskgroup;
1814 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1815 #if OMP_45_ENABLED
1816   tg_new->reduce_data = NULL;
1817   tg_new->reduce_num_data = 0;
1818 #endif
1819   taskdata->td_taskgroup = tg_new;
1820 }
1821 
1822 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1823 //                       and its descendants are complete
1824 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
1825   kmp_info_t *thread = __kmp_threads[gtid];
1826   kmp_taskdata_t *taskdata = thread->th.th_current_task;
1827   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1828   int thread_finished = FALSE;
1829 
1830   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
1831   KMP_DEBUG_ASSERT(taskgroup != NULL);
1832   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
1833 
1834   if (__kmp_tasking_mode != tskm_immediate_exec) {
1835 #if USE_ITT_BUILD
1836     // For ITT the taskgroup wait is similar to taskwait until we need to
1837     // distinguish them
1838     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1839     if (itt_sync_obj != NULL)
1840       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1841 #endif /* USE_ITT_BUILD */
1842 
1843 #if OMP_45_ENABLED
1844     if (!taskdata->td_flags.team_serial ||
1845         (thread->th.th_task_team != NULL &&
1846          thread->th.th_task_team->tt.tt_found_proxy_tasks))
1847 #else
1848     if (!taskdata->td_flags.team_serial)
1849 #endif
1850     {
1851       kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U);
1852       while (TCR_4(taskgroup->count) != 0) {
1853         flag.execute_tasks(thread, gtid, FALSE,
1854                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1855                            __kmp_task_stealing_constraint);
1856       }
1857     }
1858 
1859 #if USE_ITT_BUILD
1860     if (itt_sync_obj != NULL)
1861       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1862 #endif /* USE_ITT_BUILD */
1863   }
1864   KMP_DEBUG_ASSERT(taskgroup->count == 0);
1865 
1866 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1867 #if OMP_45_ENABLED
1868   if (taskgroup->reduce_data != NULL) // need to reduce?
1869     __kmp_task_reduction_fini(thread, taskgroup);
1870 #endif
1871   // Restore parent taskgroup for the current task
1872   taskdata->td_taskgroup = taskgroup->parent;
1873   __kmp_thread_free(thread, taskgroup);
1874 
1875   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
1876                 gtid, taskdata));
1877   ANNOTATE_HAPPENS_AFTER(taskdata);
1878 }
1879 #endif
1880 
1881 // __kmp_remove_my_task: remove a task from my own deque
1882 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
1883                                         kmp_task_team_t *task_team,
1884                                         kmp_int32 is_constrained) {
1885   kmp_task_t *task;
1886   kmp_taskdata_t *taskdata;
1887   kmp_thread_data_t *thread_data;
1888   kmp_uint32 tail;
1889 
1890   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1891   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
1892                    NULL); // Caller should check this condition
1893 
1894   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
1895 
1896   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1897                 gtid, thread_data->td.td_deque_ntasks,
1898                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1899 
1900   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1901     KA_TRACE(10,
1902              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
1903               "ntasks=%d head=%u tail=%u\n",
1904               gtid, thread_data->td.td_deque_ntasks,
1905               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1906     return NULL;
1907   }
1908 
1909   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
1910 
1911   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1912     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1913     KA_TRACE(10,
1914              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1915               "ntasks=%d head=%u tail=%u\n",
1916               gtid, thread_data->td.td_deque_ntasks,
1917               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1918     return NULL;
1919   }
1920 
1921   tail = (thread_data->td.td_deque_tail - 1) &
1922          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
1923   taskdata = thread_data->td.td_deque[tail];
1924 
1925   if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1926     // we need to check if the candidate obeys task scheduling constraint:
1927     // only child of current task can be scheduled
1928     kmp_taskdata_t *current = thread->th.th_current_task;
1929     kmp_int32 level = current->td_level;
1930     kmp_taskdata_t *parent = taskdata->td_parent;
1931     while (parent != current && parent->td_level > level) {
1932       parent = parent->td_parent; // check generation up to the level of the
1933       // current task
1934       KMP_DEBUG_ASSERT(parent != NULL);
1935     }
1936     if (parent != current) {
1937       // If the tail task is not a child, then no other child can appear in the
1938       // deque.
1939       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1940       KA_TRACE(10,
1941                ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1942                 "ntasks=%d head=%u tail=%u\n",
1943                 gtid, thread_data->td.td_deque_ntasks,
1944                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1945       return NULL;
1946     }
1947   }
1948 
1949   thread_data->td.td_deque_tail = tail;
1950   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
1951 
1952   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1953 
1954   KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
1955                 "ntasks=%d head=%u tail=%u\n",
1956                 gtid, taskdata, thread_data->td.td_deque_ntasks,
1957                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1958 
1959   task = KMP_TASKDATA_TO_TASK(taskdata);
1960   return task;
1961 }
1962 
1963 // __kmp_steal_task: remove a task from another thread's deque
1964 // Assume that calling thread has already checked existence of
1965 // task_team thread_data before calling this routine.
1966 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
1967                                     kmp_task_team_t *task_team,
1968                                     volatile kmp_int32 *unfinished_threads,
1969                                     int *thread_finished,
1970                                     kmp_int32 is_constrained) {
1971   kmp_task_t *task;
1972   kmp_taskdata_t *taskdata;
1973   kmp_thread_data_t *victim_td, *threads_data;
1974   kmp_int32 victim_tid;
1975 
1976   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1977 
1978   threads_data = task_team->tt.tt_threads_data;
1979   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
1980 
1981   victim_tid = victim->th.th_info.ds.ds_tid;
1982   victim_td = &threads_data[victim_tid];
1983 
1984   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
1985                 "task_team=%p ntasks=%d "
1986                 "head=%u tail=%u\n",
1987                 gtid, __kmp_gtid_from_thread(victim), task_team,
1988                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1989                 victim_td->td.td_deque_tail));
1990 
1991   if ((TCR_4(victim_td->td.td_deque_ntasks) ==
1992        0) || // Caller should not check this condition
1993       (TCR_PTR(victim->th.th_task_team) !=
1994        task_team)) // GEH: why would this happen?
1995   {
1996     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
1997                   "task_team=%p "
1998                   "ntasks=%d head=%u tail=%u\n",
1999                   gtid, __kmp_gtid_from_thread(victim), task_team,
2000                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2001                   victim_td->td.td_deque_tail));
2002     return NULL;
2003   }
2004 
2005   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2006 
2007   // Check again after we acquire the lock
2008   if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
2009       (TCR_PTR(victim->th.th_task_team) !=
2010        task_team)) // GEH: why would this happen?
2011   {
2012     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2013     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2014                   "task_team=%p "
2015                   "ntasks=%d head=%u tail=%u\n",
2016                   gtid, __kmp_gtid_from_thread(victim), task_team,
2017                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2018                   victim_td->td.td_deque_tail));
2019     return NULL;
2020   }
2021 
2022   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2023 
2024   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2025   if (is_constrained) {
2026     // we need to check if the candidate obeys task scheduling constraint:
2027     // only descendant of current task can be scheduled
2028     kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
2029     kmp_int32 level = current->td_level;
2030     kmp_taskdata_t *parent = taskdata->td_parent;
2031     while (parent != current && parent->td_level > level) {
2032       parent = parent->td_parent; // check generation up to the level of the
2033       // current task
2034       KMP_DEBUG_ASSERT(parent != NULL);
2035     }
2036     if (parent != current) {
2037       // If the head task is not a descendant of the current task then do not
2038       // steal it. No other task in victim's deque can be a descendant of the
2039       // current task.
2040       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2041       KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
2042                     "T#%d: task_team=%p "
2043                     "ntasks=%d head=%u tail=%u\n",
2044                     gtid,
2045                     __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
2046                     task_team, victim_td->td.td_deque_ntasks,
2047                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2048       return NULL;
2049     }
2050   }
2051   // Bump head pointer and Wrap.
2052   victim_td->td.td_deque_head =
2053       (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2054   if (*thread_finished) {
2055     // We need to un-mark this victim as a finished victim.  This must be done
2056     // before releasing the lock, or else other threads (starting with the
2057     // master victim) might be prematurely released from the barrier!!!
2058     kmp_int32 count;
2059 
2060     count = KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, unfinished_threads));
2061 
2062     KA_TRACE(
2063         20,
2064         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2065          gtid, count + 1, task_team));
2066 
2067     *thread_finished = FALSE;
2068   }
2069   TCW_4(victim_td->td.td_deque_ntasks,
2070         TCR_4(victim_td->td.td_deque_ntasks) - 1);
2071 
2072   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2073 
2074   KMP_COUNT_BLOCK(TASK_stolen);
2075   KA_TRACE(
2076       10,
2077       ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
2078        "ntasks=%d head=%u tail=%u\n",
2079        gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
2080        victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2081        victim_td->td.td_deque_tail));
2082 
2083   task = KMP_TASKDATA_TO_TASK(taskdata);
2084   return task;
2085 }
2086 
2087 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2088 // condition is statisfied (return true) or there are none left (return false).
2089 //
2090 // final_spin is TRUE if this is the spin at the release barrier.
2091 // thread_finished indicates whether the thread is finished executing all
2092 // the tasks it has on its deque, and is at the release barrier.
2093 // spinner is the location on which to spin.
2094 // spinner == NULL means only execute a single task and return.
2095 // checker is the value to check to terminate the spin.
2096 template <class C>
2097 static inline int __kmp_execute_tasks_template(
2098     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2099     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2100     kmp_int32 is_constrained) {
2101   kmp_task_team_t *task_team = thread->th.th_task_team;
2102   kmp_thread_data_t *threads_data;
2103   kmp_task_t *task;
2104   kmp_info_t *other_thread;
2105   kmp_taskdata_t *current_task = thread->th.th_current_task;
2106   volatile kmp_int32 *unfinished_threads;
2107   kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
2108                       tid = thread->th.th_info.ds.ds_tid;
2109 
2110   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2111   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2112 
2113   if (task_team == NULL)
2114     return FALSE;
2115 
2116   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2117                 "*thread_finished=%d\n",
2118                 gtid, final_spin, *thread_finished));
2119 
2120   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2121   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2122   KMP_DEBUG_ASSERT(threads_data != NULL);
2123 
2124   nthreads = task_team->tt.tt_nproc;
2125   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2126 #if OMP_45_ENABLED
2127   KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2128 #else
2129   KMP_DEBUG_ASSERT(nthreads > 1);
2130 #endif
2131   KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0);
2132 
2133   while (1) { // Outer loop keeps trying to find tasks in case of single thread
2134     // getting tasks from target constructs
2135     while (1) { // Inner loop to find a task and execute it
2136       task = NULL;
2137       if (use_own_tasks) { // check on own queue first
2138         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2139       }
2140       if ((task == NULL) && (nthreads > 1)) { // Steal a task
2141         int asleep = 1;
2142         use_own_tasks = 0;
2143         // Try to steal from the last place I stole from successfully.
2144         if (victim == -2) { // haven't stolen anything yet
2145           victim = threads_data[tid].td.td_deque_last_stolen;
2146           if (victim !=
2147               -1) // if we have a last stolen from victim, get the thread
2148             other_thread = threads_data[victim].td.td_thr;
2149         }
2150         if (victim != -1) { // found last victim
2151           asleep = 0;
2152         } else if (!new_victim) { // no recent steals and we haven't already
2153           // used a new victim; select a random thread
2154           do { // Find a different thread to steal work from.
2155             // Pick a random thread. Initial plan was to cycle through all the
2156             // threads, and only return if we tried to steal from every thread,
2157             // and failed.  Arch says that's not such a great idea.
2158             victim = __kmp_get_random(thread) % (nthreads - 1);
2159             if (victim >= tid) {
2160               ++victim; // Adjusts random distribution to exclude self
2161             }
2162             // Found a potential victim
2163             other_thread = threads_data[victim].td.td_thr;
2164             // There is a slight chance that __kmp_enable_tasking() did not wake
2165             // up all threads waiting at the barrier.  If victim is sleeping,
2166             // then wake it up. Since we were going to pay the cache miss
2167             // penalty for referencing another thread's kmp_info_t struct
2168             // anyway,
2169             // the check shouldn't cost too much performance at this point. In
2170             // extra barrier mode, tasks do not sleep at the separate tasking
2171             // barrier, so this isn't a problem.
2172             asleep = 0;
2173             if ((__kmp_tasking_mode == tskm_task_teams) &&
2174                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2175                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2176                  NULL)) {
2177               asleep = 1;
2178               __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2179                                         other_thread->th.th_sleep_loc);
2180               // A sleeping thread should not have any tasks on it's queue.
2181               // There is a slight possibility that it resumes, steals a task
2182               // from another thread, which spawns more tasks, all in the time
2183               // that it takes this thread to check => don't write an assertion
2184               // that the victim's queue is empty.  Try stealing from a
2185               // different thread.
2186             }
2187           } while (asleep);
2188         }
2189 
2190         if (!asleep) {
2191           // We have a victim to try to steal from
2192           task = __kmp_steal_task(other_thread, gtid, task_team,
2193                                   unfinished_threads, thread_finished,
2194                                   is_constrained);
2195         }
2196         if (task != NULL) { // set last stolen to victim
2197           if (threads_data[tid].td.td_deque_last_stolen != victim) {
2198             threads_data[tid].td.td_deque_last_stolen = victim;
2199             // The pre-refactored code did not try more than 1 successful new
2200             // vicitm, unless the last one generated more local tasks;
2201             // new_victim keeps track of this
2202             new_victim = 1;
2203           }
2204         } else { // No tasks found; unset last_stolen
2205           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2206           victim = -2; // no successful victim found
2207         }
2208       }
2209 
2210       if (task == NULL) // break out of tasking loop
2211         break;
2212 
2213 // Found a task; execute it
2214 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2215       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2216         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2217           // get the object reliably
2218           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2219         }
2220         __kmp_itt_task_starting(itt_sync_obj);
2221       }
2222 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2223       __kmp_invoke_task(gtid, task, current_task);
2224 #if USE_ITT_BUILD
2225       if (itt_sync_obj != NULL)
2226         __kmp_itt_task_finished(itt_sync_obj);
2227 #endif /* USE_ITT_BUILD */
2228       // If this thread is only partway through the barrier and the condition is
2229       // met, then return now, so that the barrier gather/release pattern can
2230       // proceed. If this thread is in the last spin loop in the barrier,
2231       // waiting to be released, we know that the termination condition will not
2232       // be satisified, so don't waste any cycles checking it.
2233       if (flag == NULL || (!final_spin && flag->done_check())) {
2234         KA_TRACE(
2235             15,
2236             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2237              gtid));
2238         return TRUE;
2239       }
2240       if (thread->th.th_task_team == NULL) {
2241         break;
2242       }
2243       // Yield before executing next task
2244       KMP_YIELD(__kmp_library == library_throughput);
2245       // If execution of a stolen task results in more tasks being placed on our
2246       // run queue, reset use_own_tasks
2247       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2248         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2249                       "other tasks, restart\n",
2250                       gtid));
2251         use_own_tasks = 1;
2252         new_victim = 0;
2253       }
2254     }
2255 
2256 // The task source has been exhausted. If in final spin loop of barrier, check
2257 // if termination condition is satisfied.
2258 #if OMP_45_ENABLED
2259     // The work queue may be empty but there might be proxy tasks still
2260     // executing
2261     if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2262 #else
2263     if (final_spin)
2264 #endif
2265     {
2266       // First, decrement the #unfinished threads, if that has not already been
2267       // done.  This decrement might be to the spin location, and result in the
2268       // termination condition being satisfied.
2269       if (!*thread_finished) {
2270         kmp_int32 count;
2271 
2272         count = KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, unfinished_threads)) - 1;
2273         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2274                       "unfinished_threads to %d task_team=%p\n",
2275                       gtid, count, task_team));
2276         *thread_finished = TRUE;
2277       }
2278 
2279       // It is now unsafe to reference thread->th.th_team !!!
2280       // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2281       // thread to pass through the barrier, where it might reset each thread's
2282       // th.th_team field for the next parallel region. If we can steal more
2283       // work, we know that this has not happened yet.
2284       if (flag != NULL && flag->done_check()) {
2285         KA_TRACE(
2286             15,
2287             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2288              gtid));
2289         return TRUE;
2290       }
2291     }
2292 
2293     // If this thread's task team is NULL, master has recognized that there are
2294     // no more tasks; bail out
2295     if (thread->th.th_task_team == NULL) {
2296       KA_TRACE(15,
2297                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2298       return FALSE;
2299     }
2300 
2301 #if OMP_45_ENABLED
2302     // We could be getting tasks from target constructs; if this is the only
2303     // thread, keep trying to execute tasks from own queue
2304     if (nthreads == 1)
2305       use_own_tasks = 1;
2306     else
2307 #endif
2308     {
2309       KA_TRACE(15,
2310                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2311       return FALSE;
2312     }
2313   }
2314 }
2315 
2316 int __kmp_execute_tasks_32(
2317     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2318     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2319     kmp_int32 is_constrained) {
2320   return __kmp_execute_tasks_template(
2321       thread, gtid, flag, final_spin,
2322       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2323 }
2324 
2325 int __kmp_execute_tasks_64(
2326     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2327     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2328     kmp_int32 is_constrained) {
2329   return __kmp_execute_tasks_template(
2330       thread, gtid, flag, final_spin,
2331       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2332 }
2333 
2334 int __kmp_execute_tasks_oncore(
2335     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2336     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2337     kmp_int32 is_constrained) {
2338   return __kmp_execute_tasks_template(
2339       thread, gtid, flag, final_spin,
2340       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2341 }
2342 
2343 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2344 // next barrier so they can assist in executing enqueued tasks.
2345 // First thread in allocates the task team atomically.
2346 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
2347                                  kmp_info_t *this_thr) {
2348   kmp_thread_data_t *threads_data;
2349   int nthreads, i, is_init_thread;
2350 
2351   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
2352                 __kmp_gtid_from_thread(this_thr)));
2353 
2354   KMP_DEBUG_ASSERT(task_team != NULL);
2355   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2356 
2357   nthreads = task_team->tt.tt_nproc;
2358   KMP_DEBUG_ASSERT(nthreads > 0);
2359   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2360 
2361   // Allocate or increase the size of threads_data if necessary
2362   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
2363 
2364   if (!is_init_thread) {
2365     // Some other thread already set up the array.
2366     KA_TRACE(
2367         20,
2368         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2369          __kmp_gtid_from_thread(this_thr)));
2370     return;
2371   }
2372   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2373   KMP_DEBUG_ASSERT(threads_data != NULL);
2374 
2375   if ((__kmp_tasking_mode == tskm_task_teams) &&
2376       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
2377     // Release any threads sleeping at the barrier, so that they can steal
2378     // tasks and execute them.  In extra barrier mode, tasks do not sleep
2379     // at the separate tasking barrier, so this isn't a problem.
2380     for (i = 0; i < nthreads; i++) {
2381       volatile void *sleep_loc;
2382       kmp_info_t *thread = threads_data[i].td.td_thr;
2383 
2384       if (i == this_thr->th.th_info.ds.ds_tid) {
2385         continue;
2386       }
2387       // Since we haven't locked the thread's suspend mutex lock at this
2388       // point, there is a small window where a thread might be putting
2389       // itself to sleep, but hasn't set the th_sleep_loc field yet.
2390       // To work around this, __kmp_execute_tasks_template() periodically checks
2391       // see if other threads are sleeping (using the same random mechanism that
2392       // is used for task stealing) and awakens them if they are.
2393       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2394           NULL) {
2395         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2396                       __kmp_gtid_from_thread(this_thr),
2397                       __kmp_gtid_from_thread(thread)));
2398         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2399       } else {
2400         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2401                       __kmp_gtid_from_thread(this_thr),
2402                       __kmp_gtid_from_thread(thread)));
2403       }
2404     }
2405   }
2406 
2407   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
2408                 __kmp_gtid_from_thread(this_thr)));
2409 }
2410 
2411 /* // TODO: Check the comment consistency
2412  * Utility routines for "task teams".  A task team (kmp_task_t) is kind of
2413  * like a shadow of the kmp_team_t data struct, with a different lifetime.
2414  * After a child * thread checks into a barrier and calls __kmp_release() from
2415  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2416  * longer assume that the kmp_team_t structure is intact (at any moment, the
2417  * master thread may exit the barrier code and free the team data structure,
2418  * and return the threads to the thread pool).
2419  *
2420  * This does not work with the the tasking code, as the thread is still
2421  * expected to participate in the execution of any tasks that may have been
2422  * spawned my a member of the team, and the thread still needs access to all
2423  * to each thread in the team, so that it can steal work from it.
2424  *
2425  * Enter the existence of the kmp_task_team_t struct.  It employs a reference
2426  * counting mechanims, and is allocated by the master thread before calling
2427  * __kmp_<barrier_kind>_release, and then is release by the last thread to
2428  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
2429  * of the kmp_task_team_t structs for consecutive barriers can overlap
2430  * (and will, unless the master thread is the last thread to exit the barrier
2431  * release phase, which is not typical).
2432  *
2433  * The existence of such a struct is useful outside the context of tasking,
2434  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2435  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2436  * libraries.
2437  *
2438  * We currently use the existence of the threads array as an indicator that
2439  * tasks were spawned since the last barrier.  If the structure is to be
2440  * useful outside the context of tasking, then this will have to change, but
2441  * not settting the field minimizes the performance impact of tasking on
2442  * barriers, when no explicit tasks were spawned (pushed, actually).
2443  */
2444 
2445 static kmp_task_team_t *__kmp_free_task_teams =
2446     NULL; // Free list for task_team data structures
2447 // Lock for task team data structures
2448 static kmp_bootstrap_lock_t __kmp_task_team_lock =
2449     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2450 
2451 // __kmp_alloc_task_deque:
2452 // Allocates a task deque for a particular thread, and initialize the necessary
2453 // data structures relating to the deque.  This only happens once per thread
2454 // per task team since task teams are recycled. No lock is needed during
2455 // allocation since each thread allocates its own deque.
2456 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2457                                    kmp_thread_data_t *thread_data) {
2458   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2459   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2460 
2461   // Initialize last stolen task field to "none"
2462   thread_data->td.td_deque_last_stolen = -1;
2463 
2464   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2465   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2466   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2467 
2468   KE_TRACE(
2469       10,
2470       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2471        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2472   // Allocate space for task deque, and zero the deque
2473   // Cannot use __kmp_thread_calloc() because threads not around for
2474   // kmp_reap_task_team( ).
2475   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2476       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2477   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2478 }
2479 
2480 // __kmp_realloc_task_deque:
2481 // Re-allocates a task deque for a particular thread, copies the content from
2482 // the old deque and adjusts the necessary data structures relating to the
2483 // deque. This operation must be done with a the deque_lock being held
2484 static void __kmp_realloc_task_deque(kmp_info_t *thread,
2485                                      kmp_thread_data_t *thread_data) {
2486   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2487   kmp_int32 new_size = 2 * size;
2488 
2489   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
2490                 "%d] for thread_data %p\n",
2491                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
2492 
2493   kmp_taskdata_t **new_deque =
2494       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
2495 
2496   int i, j;
2497   for (i = thread_data->td.td_deque_head, j = 0; j < size;
2498        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
2499     new_deque[j] = thread_data->td.td_deque[i];
2500 
2501   __kmp_free(thread_data->td.td_deque);
2502 
2503   thread_data->td.td_deque_head = 0;
2504   thread_data->td.td_deque_tail = size;
2505   thread_data->td.td_deque = new_deque;
2506   thread_data->td.td_deque_size = new_size;
2507 }
2508 
2509 // __kmp_free_task_deque:
2510 // Deallocates a task deque for a particular thread. Happens at library
2511 // deallocation so don't need to reset all thread data fields.
2512 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
2513   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2514 
2515   if (thread_data->td.td_deque != NULL) {
2516     TCW_4(thread_data->td.td_deque_ntasks, 0);
2517     __kmp_free(thread_data->td.td_deque);
2518     thread_data->td.td_deque = NULL;
2519   }
2520   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2521 
2522 #ifdef BUILD_TIED_TASK_STACK
2523   // GEH: Figure out what to do here for td_susp_tied_tasks
2524   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
2525     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
2526   }
2527 #endif // BUILD_TIED_TASK_STACK
2528 }
2529 
2530 // __kmp_realloc_task_threads_data:
2531 // Allocates a threads_data array for a task team, either by allocating an
2532 // initial array or enlarging an existing array.  Only the first thread to get
2533 // the lock allocs or enlarges the array and re-initializes the array eleemnts.
2534 // That thread returns "TRUE", the rest return "FALSE".
2535 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2536 // The current size is given by task_team -> tt.tt_max_threads.
2537 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2538                                            kmp_task_team_t *task_team) {
2539   kmp_thread_data_t **threads_data_p;
2540   kmp_int32 nthreads, maxthreads;
2541   int is_init_thread = FALSE;
2542 
2543   if (TCR_4(task_team->tt.tt_found_tasks)) {
2544     // Already reallocated and initialized.
2545     return FALSE;
2546   }
2547 
2548   threads_data_p = &task_team->tt.tt_threads_data;
2549   nthreads = task_team->tt.tt_nproc;
2550   maxthreads = task_team->tt.tt_max_threads;
2551 
2552   // All threads must lock when they encounter the first task of the implicit
2553   // task region to make sure threads_data fields are (re)initialized before
2554   // used.
2555   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2556 
2557   if (!TCR_4(task_team->tt.tt_found_tasks)) {
2558     // first thread to enable tasking
2559     kmp_team_t *team = thread->th.th_team;
2560     int i;
2561 
2562     is_init_thread = TRUE;
2563     if (maxthreads < nthreads) {
2564 
2565       if (*threads_data_p != NULL) {
2566         kmp_thread_data_t *old_data = *threads_data_p;
2567         kmp_thread_data_t *new_data = NULL;
2568 
2569         KE_TRACE(
2570             10,
2571             ("__kmp_realloc_task_threads_data: T#%d reallocating "
2572              "threads data for task_team %p, new_size = %d, old_size = %d\n",
2573              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
2574         // Reallocate threads_data to have more elements than current array
2575         // Cannot use __kmp_thread_realloc() because threads not around for
2576         // kmp_reap_task_team( ).  Note all new array entries are initialized
2577         // to zero by __kmp_allocate().
2578         new_data = (kmp_thread_data_t *)__kmp_allocate(
2579             nthreads * sizeof(kmp_thread_data_t));
2580         // copy old data to new data
2581         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
2582                      (void *)old_data, maxthreads * sizeof(kmp_taskdata_t *));
2583 
2584 #ifdef BUILD_TIED_TASK_STACK
2585         // GEH: Figure out if this is the right thing to do
2586         for (i = maxthreads; i < nthreads; i++) {
2587           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2588           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2589         }
2590 #endif // BUILD_TIED_TASK_STACK
2591         // Install the new data and free the old data
2592         (*threads_data_p) = new_data;
2593         __kmp_free(old_data);
2594       } else {
2595         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
2596                       "threads data for task_team %p, size = %d\n",
2597                       __kmp_gtid_from_thread(thread), task_team, nthreads));
2598         // Make the initial allocate for threads_data array, and zero entries
2599         // Cannot use __kmp_thread_calloc() because threads not around for
2600         // kmp_reap_task_team( ).
2601         ANNOTATE_IGNORE_WRITES_BEGIN();
2602         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
2603             nthreads * sizeof(kmp_thread_data_t));
2604         ANNOTATE_IGNORE_WRITES_END();
2605 #ifdef BUILD_TIED_TASK_STACK
2606         // GEH: Figure out if this is the right thing to do
2607         for (i = 0; i < nthreads; i++) {
2608           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2609           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2610         }
2611 #endif // BUILD_TIED_TASK_STACK
2612       }
2613       task_team->tt.tt_max_threads = nthreads;
2614     } else {
2615       // If array has (more than) enough elements, go ahead and use it
2616       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
2617     }
2618 
2619     // initialize threads_data pointers back to thread_info structures
2620     for (i = 0; i < nthreads; i++) {
2621       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2622       thread_data->td.td_thr = team->t.t_threads[i];
2623 
2624       if (thread_data->td.td_deque_last_stolen >= nthreads) {
2625         // The last stolen field survives across teams / barrier, and the number
2626         // of threads may have changed.  It's possible (likely?) that a new
2627         // parallel region will exhibit the same behavior as previous region.
2628         thread_data->td.td_deque_last_stolen = -1;
2629       }
2630     }
2631 
2632     KMP_MB();
2633     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
2634   }
2635 
2636   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2637   return is_init_thread;
2638 }
2639 
2640 // __kmp_free_task_threads_data:
2641 // Deallocates a threads_data array for a task team, including any attached
2642 // tasking deques.  Only occurs at library shutdown.
2643 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
2644   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2645   if (task_team->tt.tt_threads_data != NULL) {
2646     int i;
2647     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
2648       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
2649     }
2650     __kmp_free(task_team->tt.tt_threads_data);
2651     task_team->tt.tt_threads_data = NULL;
2652   }
2653   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2654 }
2655 
2656 // __kmp_allocate_task_team:
2657 // Allocates a task team associated with a specific team, taking it from
2658 // the global task team free list if possible.  Also initializes data
2659 // structures.
2660 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
2661                                                  kmp_team_t *team) {
2662   kmp_task_team_t *task_team = NULL;
2663   int nthreads;
2664 
2665   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
2666                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
2667 
2668   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2669     // Take a task team from the task team pool
2670     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2671     if (__kmp_free_task_teams != NULL) {
2672       task_team = __kmp_free_task_teams;
2673       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
2674       task_team->tt.tt_next = NULL;
2675     }
2676     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2677   }
2678 
2679   if (task_team == NULL) {
2680     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
2681                   "task team for team %p\n",
2682                   __kmp_gtid_from_thread(thread), team));
2683     // Allocate a new task team if one is not available.
2684     // Cannot use __kmp_thread_malloc() because threads not around for
2685     // kmp_reap_task_team( ).
2686     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
2687     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
2688     // AC: __kmp_allocate zeroes returned memory
2689     // task_team -> tt.tt_threads_data = NULL;
2690     // task_team -> tt.tt_max_threads = 0;
2691     // task_team -> tt.tt_next = NULL;
2692   }
2693 
2694   TCW_4(task_team->tt.tt_found_tasks, FALSE);
2695 #if OMP_45_ENABLED
2696   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2697 #endif
2698   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
2699 
2700   TCW_4(task_team->tt.tt_unfinished_threads, nthreads);
2701   TCW_4(task_team->tt.tt_active, TRUE);
2702 
2703   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
2704                 "unfinished_threads init'd to %d\n",
2705                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
2706                 task_team->tt.tt_unfinished_threads));
2707   return task_team;
2708 }
2709 
2710 // __kmp_free_task_team:
2711 // Frees the task team associated with a specific thread, and adds it
2712 // to the global task team free list.
2713 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
2714   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
2715                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
2716 
2717   // Put task team back on free list
2718   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2719 
2720   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
2721   task_team->tt.tt_next = __kmp_free_task_teams;
2722   TCW_PTR(__kmp_free_task_teams, task_team);
2723 
2724   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2725 }
2726 
2727 // __kmp_reap_task_teams:
2728 // Free all the task teams on the task team free list.
2729 // Should only be done during library shutdown.
2730 // Cannot do anything that needs a thread structure or gtid since they are
2731 // already gone.
2732 void __kmp_reap_task_teams(void) {
2733   kmp_task_team_t *task_team;
2734 
2735   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2736     // Free all task_teams on the free list
2737     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2738     while ((task_team = __kmp_free_task_teams) != NULL) {
2739       __kmp_free_task_teams = task_team->tt.tt_next;
2740       task_team->tt.tt_next = NULL;
2741 
2742       // Free threads_data if necessary
2743       if (task_team->tt.tt_threads_data != NULL) {
2744         __kmp_free_task_threads_data(task_team);
2745       }
2746       __kmp_free(task_team);
2747     }
2748     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2749   }
2750 }
2751 
2752 // __kmp_wait_to_unref_task_teams:
2753 // Some threads could still be in the fork barrier release code, possibly
2754 // trying to steal tasks.  Wait for each thread to unreference its task team.
2755 void __kmp_wait_to_unref_task_teams(void) {
2756   kmp_info_t *thread;
2757   kmp_uint32 spins;
2758   int done;
2759 
2760   KMP_INIT_YIELD(spins);
2761 
2762   for (;;) {
2763     done = TRUE;
2764 
2765     // TODO: GEH - this may be is wrong because some sync would be necessary
2766     // in case threads are added to the pool during the traversal. Need to
2767     // verify that lock for thread pool is held when calling this routine.
2768     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
2769          thread = thread->th.th_next_pool) {
2770 #if KMP_OS_WINDOWS
2771       DWORD exit_val;
2772 #endif
2773       if (TCR_PTR(thread->th.th_task_team) == NULL) {
2774         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2775                       __kmp_gtid_from_thread(thread)));
2776         continue;
2777       }
2778 #if KMP_OS_WINDOWS
2779       // TODO: GEH - add this check for Linux* OS / OS X* as well?
2780       if (!__kmp_is_thread_alive(thread, &exit_val)) {
2781         thread->th.th_task_team = NULL;
2782         continue;
2783       }
2784 #endif
2785 
2786       done = FALSE; // Because th_task_team pointer is not NULL for this thread
2787 
2788       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
2789                     "unreference task_team\n",
2790                     __kmp_gtid_from_thread(thread)));
2791 
2792       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2793         volatile void *sleep_loc;
2794         // If the thread is sleeping, awaken it.
2795         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2796             NULL) {
2797           KA_TRACE(
2798               10,
2799               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2800                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
2801           __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2802         }
2803       }
2804     }
2805     if (done) {
2806       break;
2807     }
2808 
2809     // If we are oversubscribed, or have waited a bit (and library mode is
2810     // throughput), yield. Pause is in the following code.
2811     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2812     KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
2813   }
2814 }
2815 
2816 // __kmp_task_team_setup:  Create a task_team for the current team, but use
2817 // an already created, unused one if it already exists.
2818 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
2819   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2820 
2821   // If this task_team hasn't been created yet, allocate it. It will be used in
2822   // the region after the next.
2823   // If it exists, it is the current task team and shouldn't be touched yet as
2824   // it may still be in use.
2825   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
2826       (always || team->t.t_nproc > 1)) {
2827     team->t.t_task_team[this_thr->th.th_task_state] =
2828         __kmp_allocate_task_team(this_thr, team);
2829     KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
2830                   "for team %d at parity=%d\n",
2831                   __kmp_gtid_from_thread(this_thr),
2832                   team->t.t_task_team[this_thr->th.th_task_state],
2833                   ((team != NULL) ? team->t.t_id : -1),
2834                   this_thr->th.th_task_state));
2835   }
2836 
2837   // After threads exit the release, they will call sync, and then point to this
2838   // other task_team; make sure it is allocated and properly initialized. As
2839   // threads spin in the barrier release phase, they will continue to use the
2840   // previous task_team struct(above), until they receive the signal to stop
2841   // checking for tasks (they can't safely reference the kmp_team_t struct,
2842   // which could be reallocated by the master thread). No task teams are formed
2843   // for serialized teams.
2844   if (team->t.t_nproc > 1) {
2845     int other_team = 1 - this_thr->th.th_task_state;
2846     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
2847       team->t.t_task_team[other_team] =
2848           __kmp_allocate_task_team(this_thr, team);
2849       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
2850                     "task_team %p for team %d at parity=%d\n",
2851                     __kmp_gtid_from_thread(this_thr),
2852                     team->t.t_task_team[other_team],
2853                     ((team != NULL) ? team->t.t_id : -1), other_team));
2854     } else { // Leave the old task team struct in place for the upcoming region;
2855       // adjust as needed
2856       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2857       if (!task_team->tt.tt_active ||
2858           team->t.t_nproc != task_team->tt.tt_nproc) {
2859         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2860         TCW_4(task_team->tt.tt_found_tasks, FALSE);
2861 #if OMP_45_ENABLED
2862         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2863 #endif
2864         TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
2865         TCW_4(task_team->tt.tt_active, TRUE);
2866       }
2867       // if team size has changed, the first thread to enable tasking will
2868       // realloc threads_data if necessary
2869       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
2870                     "%p for team %d at parity=%d\n",
2871                     __kmp_gtid_from_thread(this_thr),
2872                     team->t.t_task_team[other_team],
2873                     ((team != NULL) ? team->t.t_id : -1), other_team));
2874     }
2875   }
2876 }
2877 
2878 // __kmp_task_team_sync: Propagation of task team data from team to threads
2879 // which happens just after the release phase of a team barrier.  This may be
2880 // called by any thread, but only for teams with # threads > 1.
2881 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
2882   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2883 
2884   // Toggle the th_task_state field, to switch which task_team this thread
2885   // refers to
2886   this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2887   // It is now safe to propagate the task team pointer from the team struct to
2888   // the current thread.
2889   TCW_PTR(this_thr->th.th_task_team,
2890           team->t.t_task_team[this_thr->th.th_task_state]);
2891   KA_TRACE(20,
2892            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
2893             "%p from Team #%d (parity=%d)\n",
2894             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
2895             ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2896 }
2897 
2898 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
2899 // barrier gather phase. Only called by master thread if #threads in team > 1 or
2900 // if proxy tasks were created.
2901 //
2902 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
2903 // by passing in 0 optionally as the last argument. When wait is zero, master
2904 // thread does not wait for unfinished_threads to reach 0.
2905 void __kmp_task_team_wait(
2906     kmp_info_t *this_thr,
2907     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
2908   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2909 
2910   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2911   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
2912 
2913   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
2914     if (wait) {
2915       KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
2916                     "(for unfinished_threads to reach 0) on task_team = %p\n",
2917                     __kmp_gtid_from_thread(this_thr), task_team));
2918       // Worker threads may have dropped through to release phase, but could
2919       // still be executing tasks. Wait here for tasks to complete. To avoid
2920       // memory contention, only master thread checks termination condition.
2921       kmp_flag_32 flag(
2922           RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads),
2923           0U);
2924       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2925     }
2926     // Deactivate the old task team, so that the worker threads will stop
2927     // referencing it while spinning.
2928     KA_TRACE(
2929         20,
2930         ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2931          "setting active to false, setting local and team's pointer to NULL\n",
2932          __kmp_gtid_from_thread(this_thr), task_team));
2933 #if OMP_45_ENABLED
2934     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
2935                      task_team->tt.tt_found_proxy_tasks == TRUE);
2936     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2937 #else
2938     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
2939 #endif
2940     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
2941     KMP_MB();
2942 
2943     TCW_PTR(this_thr->th.th_task_team, NULL);
2944   }
2945 }
2946 
2947 // __kmp_tasking_barrier:
2948 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier.
2949 // Internal function to execute all tasks prior to a regular barrier or a join
2950 // barrier. It is a full barrier itself, which unfortunately turns regular
2951 // barriers into double barriers and join barriers into 1 1/2 barriers.
2952 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
2953   volatile kmp_uint32 *spin = RCAST(
2954       volatile kmp_uint32 *,
2955       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
2956   int flag = FALSE;
2957   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
2958 
2959 #if USE_ITT_BUILD
2960   KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
2961 #endif /* USE_ITT_BUILD */
2962   kmp_flag_32 spin_flag(spin, 0U);
2963   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
2964                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
2965 #if USE_ITT_BUILD
2966     // TODO: What about itt_sync_obj??
2967     KMP_FSYNC_SPIN_PREPARE(CCAST(void *, RCAST(volatile void *, spin)));
2968 #endif /* USE_ITT_BUILD */
2969 
2970     if (TCR_4(__kmp_global.g.g_done)) {
2971       if (__kmp_global.g.g_abort)
2972         __kmp_abort_thread();
2973       break;
2974     }
2975     KMP_YIELD(TRUE); // GH: We always yield here
2976   }
2977 #if USE_ITT_BUILD
2978   KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, RCAST(volatile void *, spin)));
2979 #endif /* USE_ITT_BUILD */
2980 }
2981 
2982 #if OMP_45_ENABLED
2983 
2984 // __kmp_give_task puts a task into a given thread queue if:
2985 //  - the queue for that thread was created
2986 //  - there's space in that queue
2987 // Because of this, __kmp_push_task needs to check if there's space after
2988 // getting the lock
2989 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
2990                             kmp_int32 pass) {
2991   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
2992   kmp_task_team_t *task_team = taskdata->td_task_team;
2993 
2994   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
2995                 taskdata, tid));
2996 
2997   // If task_team is NULL something went really bad...
2998   KMP_DEBUG_ASSERT(task_team != NULL);
2999 
3000   bool result = false;
3001   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3002 
3003   if (thread_data->td.td_deque == NULL) {
3004     // There's no queue in this thread, go find another one
3005     // We're guaranteed that at least one thread has a queue
3006     KA_TRACE(30,
3007              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3008               tid, taskdata));
3009     return result;
3010   }
3011 
3012   if (TCR_4(thread_data->td.td_deque_ntasks) >=
3013       TASK_DEQUE_SIZE(thread_data->td)) {
3014     KA_TRACE(
3015         30,
3016         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3017          taskdata, tid));
3018 
3019     // if this deque is bigger than the pass ratio give a chance to another
3020     // thread
3021     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3022       return result;
3023 
3024     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3025     __kmp_realloc_task_deque(thread, thread_data);
3026 
3027   } else {
3028 
3029     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3030 
3031     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3032         TASK_DEQUE_SIZE(thread_data->td)) {
3033       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3034                     "thread %d.\n",
3035                     taskdata, tid));
3036 
3037       // if this deque is bigger than the pass ratio give a chance to another
3038       // thread
3039       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3040         goto release_and_exit;
3041 
3042       __kmp_realloc_task_deque(thread, thread_data);
3043     }
3044   }
3045 
3046   // lock is held here, and there is space in the deque
3047 
3048   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3049   // Wrap index.
3050   thread_data->td.td_deque_tail =
3051       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3052   TCW_4(thread_data->td.td_deque_ntasks,
3053         TCR_4(thread_data->td.td_deque_ntasks) + 1);
3054 
3055   result = true;
3056   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3057                 taskdata, tid));
3058 
3059 release_and_exit:
3060   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3061 
3062   return result;
3063 }
3064 
3065 /* The finish of the proxy tasks is divided in two pieces:
3066     - the top half is the one that can be done from a thread outside the team
3067     - the bottom half must be run from a them within the team
3068 
3069    In order to run the bottom half the task gets queued back into one of the
3070    threads of the team. Once the td_incomplete_child_task counter of the parent
3071    is decremented the threads can leave the barriers. So, the bottom half needs
3072    to be queued before the counter is decremented. The top half is therefore
3073    divided in two parts:
3074     - things that can be run before queuing the bottom half
3075     - things that must be run after queuing the bottom half
3076 
3077    This creates a second race as the bottom half can free the task before the
3078    second top half is executed. To avoid this we use the
3079    td_incomplete_child_task of the proxy task to synchronize the top and bottom
3080    half. */
3081 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3082   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3083   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3084   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3085   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3086 
3087   taskdata->td_flags.complete = 1; // mark the task as completed
3088 
3089   if (taskdata->td_taskgroup)
3090     KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count);
3091 
3092   // Create an imaginary children for this task so the bottom half cannot
3093   // release the task before we have completed the second top half
3094   TCI_4(taskdata->td_incomplete_child_tasks);
3095 }
3096 
3097 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3098   kmp_int32 children = 0;
3099 
3100   // Predecrement simulated by "- 1" calculation
3101   children =
3102       KMP_TEST_THEN_DEC32(
3103           CCAST(kmp_int32 *, &taskdata->td_parent->td_incomplete_child_tasks)) -
3104       1;
3105   KMP_DEBUG_ASSERT(children >= 0);
3106 
3107   // Remove the imaginary children
3108   TCD_4(taskdata->td_incomplete_child_tasks);
3109 }
3110 
3111 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3112   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3113   kmp_info_t *thread = __kmp_threads[gtid];
3114 
3115   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3116   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3117                    1); // top half must run before bottom half
3118 
3119   // We need to wait to make sure the top half is finished
3120   // Spinning here should be ok as this should happen quickly
3121   while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
3122     ;
3123 
3124   __kmp_release_deps(gtid, taskdata);
3125   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3126 }
3127 
3128 /*!
3129 @ingroup TASKING
3130 @param gtid Global Thread ID of encountering thread
3131 @param ptask Task which execution is completed
3132 
3133 Execute the completation of a proxy task from a thread of that is part of the
3134 team. Run first and bottom halves directly.
3135 */
3136 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3137   KMP_DEBUG_ASSERT(ptask != NULL);
3138   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3139   KA_TRACE(
3140       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3141            gtid, taskdata));
3142 
3143   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3144 
3145   __kmp_first_top_half_finish_proxy(taskdata);
3146   __kmp_second_top_half_finish_proxy(taskdata);
3147   __kmp_bottom_half_finish_proxy(gtid, ptask);
3148 
3149   KA_TRACE(10,
3150            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3151             gtid, taskdata));
3152 }
3153 
3154 /*!
3155 @ingroup TASKING
3156 @param ptask Task which execution is completed
3157 
3158 Execute the completation of a proxy task from a thread that could not belong to
3159 the team.
3160 */
3161 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3162   KMP_DEBUG_ASSERT(ptask != NULL);
3163   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3164 
3165   KA_TRACE(
3166       10,
3167       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3168        taskdata));
3169 
3170   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3171 
3172   __kmp_first_top_half_finish_proxy(taskdata);
3173 
3174   // Enqueue task to complete bottom half completion from a thread within the
3175   // corresponding team
3176   kmp_team_t *team = taskdata->td_team;
3177   kmp_int32 nthreads = team->t.t_nproc;
3178   kmp_info_t *thread;
3179 
3180   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3181   // but we cannot use __kmp_get_random here
3182   kmp_int32 start_k = 0;
3183   kmp_int32 pass = 1;
3184   kmp_int32 k = start_k;
3185 
3186   do {
3187     // For now we're just linearly trying to find a thread
3188     thread = team->t.t_threads[k];
3189     k = (k + 1) % nthreads;
3190 
3191     // we did a full pass through all the threads
3192     if (k == start_k)
3193       pass = pass << 1;
3194 
3195   } while (!__kmp_give_task(thread, k, ptask, pass));
3196 
3197   __kmp_second_top_half_finish_proxy(taskdata);
3198 
3199   KA_TRACE(
3200       10,
3201       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3202        taskdata));
3203 }
3204 
3205 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3206 // for taskloop
3207 //
3208 // thread:   allocating thread
3209 // task_src: pointer to source task to be duplicated
3210 // returns:  a pointer to the allocated kmp_task_t structure (task).
3211 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3212   kmp_task_t *task;
3213   kmp_taskdata_t *taskdata;
3214   kmp_taskdata_t *taskdata_src;
3215   kmp_taskdata_t *parent_task = thread->th.th_current_task;
3216   size_t shareds_offset;
3217   size_t task_size;
3218 
3219   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3220                 task_src));
3221   taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3222   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3223                    TASK_FULL); // it should not be proxy task
3224   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3225   task_size = taskdata_src->td_size_alloc;
3226 
3227   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3228   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3229                 task_size));
3230 #if USE_FAST_MEMORY
3231   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3232 #else
3233   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3234 #endif /* USE_FAST_MEMORY */
3235   KMP_MEMCPY(taskdata, taskdata_src, task_size);
3236 
3237   task = KMP_TASKDATA_TO_TASK(taskdata);
3238 
3239   // Initialize new task (only specific fields not affected by memcpy)
3240   taskdata->td_task_id = KMP_GEN_TASK_ID();
3241   if (task->shareds != NULL) { // need setup shareds pointer
3242     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3243     task->shareds = &((char *)taskdata)[shareds_offset];
3244     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3245                      0);
3246   }
3247   taskdata->td_alloc_thread = thread;
3248   taskdata->td_taskgroup =
3249       parent_task
3250           ->td_taskgroup; // task inherits the taskgroup from the parent task
3251 
3252   // Only need to keep track of child task counts if team parallel and tasking
3253   // not serialized
3254   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3255     KMP_TEST_THEN_INC32(
3256         CCAST(kmp_int32 *, &parent_task->td_incomplete_child_tasks));
3257     if (parent_task->td_taskgroup)
3258       KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count);
3259     // Only need to keep track of allocated child tasks for explicit tasks since
3260     // implicit not deallocated
3261     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3262       KMP_TEST_THEN_INC32(
3263           CCAST(kmp_int32 *, &taskdata->td_parent->td_allocated_child_tasks));
3264   }
3265 
3266   KA_TRACE(20,
3267            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3268             thread, taskdata, taskdata->td_parent));
3269 #if OMPT_SUPPORT
3270   __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
3271                        (void *)task->routine);
3272 #endif
3273   return task;
3274 }
3275 
3276 // Routine optionally generated by th ecompiler for setting the lastprivate flag
3277 // and calling needed constructors for private/firstprivate objects
3278 // (used to form taskloop tasks from pattern task)
3279 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3280 
3281 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3282 //
3283 // loc       Source location information
3284 // gtid      Global thread ID
3285 // task      Task with whole loop iteration range
3286 // lb        Pointer to loop lower bound
3287 // ub        Pointer to loop upper bound
3288 // st        Loop stride
3289 // sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3290 // grainsize Schedule value if specified
3291 // task_dup  Tasks duplication routine
3292 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3293                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3294                            int sched, kmp_uint64 grainsize, void *task_dup) {
3295   KMP_COUNT_BLOCK(OMP_TASKLOOP);
3296   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3297   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3298   kmp_uint64 tc;
3299   kmp_uint64 lower = *lb; // compiler provides global bounds here
3300   kmp_uint64 upper = *ub;
3301   kmp_uint64 i, num_tasks = 0, extras = 0;
3302   kmp_info_t *thread = __kmp_threads[gtid];
3303   kmp_taskdata_t *current_task = thread->th.th_current_task;
3304   kmp_task_t *next_task;
3305   kmp_int32 lastpriv = 0;
3306   size_t lower_offset =
3307       (char *)lb - (char *)task; // remember offset of lb in the task structure
3308   size_t upper_offset =
3309       (char *)ub - (char *)task; // remember offset of ub in the task structure
3310 
3311   // compute trip count
3312   if (st == 1) { // most common case
3313     tc = upper - lower + 1;
3314   } else if (st < 0) {
3315     tc = (lower - upper) / (-st) + 1;
3316   } else { // st > 0
3317     tc = (upper - lower) / st + 1;
3318   }
3319   if (tc == 0) {
3320     KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
3321     // free the pattern task and exit
3322     __kmp_task_start(gtid, task, current_task);
3323     // do not execute anything for zero-trip loop
3324     __kmp_task_finish(gtid, task, current_task);
3325     return;
3326   }
3327 
3328   // compute num_tasks/grainsize based on the input provided
3329   switch (sched) {
3330   case 0: // no schedule clause specified, we can choose the default
3331     // let's try to schedule (team_size*10) tasks
3332     grainsize = thread->th.th_team_nproc * 10;
3333   case 2: // num_tasks provided
3334     if (grainsize > tc) {
3335       num_tasks = tc; // too big num_tasks requested, adjust values
3336       grainsize = 1;
3337       extras = 0;
3338     } else {
3339       num_tasks = grainsize;
3340       grainsize = tc / num_tasks;
3341       extras = tc % num_tasks;
3342     }
3343     break;
3344   case 1: // grainsize provided
3345     if (grainsize > tc) {
3346       num_tasks = 1; // too big grainsize requested, adjust values
3347       grainsize = tc;
3348       extras = 0;
3349     } else {
3350       num_tasks = tc / grainsize;
3351       grainsize =
3352           tc /
3353           num_tasks; // adjust grainsize for balanced distribution of iterations
3354       extras = tc % num_tasks;
3355     }
3356     break;
3357   default:
3358     KMP_ASSERT2(0, "unknown scheduling of taskloop");
3359   }
3360   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3361   KMP_DEBUG_ASSERT(num_tasks > extras);
3362   KMP_DEBUG_ASSERT(num_tasks > 0);
3363   KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
3364                 "%lld, extras %lld\n",
3365                 gtid, num_tasks, grainsize, extras));
3366 
3367   // Main loop, launch num_tasks tasks, assign grainsize iterations each task
3368   for (i = 0; i < num_tasks; ++i) {
3369     kmp_uint64 chunk_minus_1;
3370     if (extras == 0) {
3371       chunk_minus_1 = grainsize - 1;
3372     } else {
3373       chunk_minus_1 = grainsize;
3374       --extras; // first extras iterations get bigger chunk (grainsize+1)
3375     }
3376     upper = lower + st * chunk_minus_1;
3377     if (i == num_tasks - 1) {
3378       // schedule the last task, set lastprivate flag
3379       lastpriv = 1;
3380 #if KMP_DEBUG
3381       if (st == 1)
3382         KMP_DEBUG_ASSERT(upper == *ub);
3383       else if (st > 0)
3384         KMP_DEBUG_ASSERT(upper + st > *ub);
3385       else
3386         KMP_DEBUG_ASSERT(upper + st < *ub);
3387 #endif
3388     }
3389     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3390     *(kmp_uint64 *)((char *)next_task + lower_offset) =
3391         lower; // adjust task-specific bounds
3392     *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
3393     if (ptask_dup != NULL)
3394       ptask_dup(next_task, task,
3395                 lastpriv); // set lastprivate flag, construct fistprivates, etc.
3396     KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
3397                   "%lld (offsets %p %p)\n",
3398                   gtid, next_task, lower, upper, lower_offset, upper_offset));
3399     __kmp_omp_task(gtid, next_task, true); // schedule new task
3400     lower = upper + st; // adjust lower bound for the next iteration
3401   }
3402   // free the pattern task and exit
3403   __kmp_task_start(gtid, task, current_task);
3404   // do not execute the pattern task, just do bookkeeping
3405   __kmp_task_finish(gtid, task, current_task);
3406 }
3407 
3408 /*!
3409 @ingroup TASKING
3410 @param loc       Source location information
3411 @param gtid      Global thread ID
3412 @param task      Task structure
3413 @param if_val    Value of the if clause
3414 @param lb        Pointer to loop lower bound
3415 @param ub        Pointer to loop upper bound
3416 @param st        Loop stride
3417 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
3418 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3419 @param grainsize Schedule value if specified
3420 @param task_dup  Tasks duplication routine
3421 
3422 Execute the taskloop construct.
3423 */
3424 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3425                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
3426                      int sched, kmp_uint64 grainsize, void *task_dup) {
3427   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3428   KMP_DEBUG_ASSERT(task != NULL);
3429 
3430   KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub "
3431                 "%lld st %lld, grain %llu(%d)\n",
3432                 gtid, taskdata, *lb, *ub, st, grainsize, sched));
3433 
3434   // check if clause value first
3435   if (if_val == 0) { // if(0) specified, mark task as serial
3436     taskdata->td_flags.task_serial = 1;
3437     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3438   }
3439   if (nogroup == 0) {
3440     __kmpc_taskgroup(loc, gtid);
3441   }
3442 
3443   if (1 /* AC: use some heuristic here to choose task scheduling method */) {
3444     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize,
3445                           task_dup);
3446   }
3447 
3448   if (nogroup == 0) {
3449     __kmpc_end_taskgroup(loc, gtid);
3450   }
3451   KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3452 }
3453 
3454 #endif
3455