1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_stats.h"
20 #include "kmp_wait_release.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* forward declaration */
29 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
30                                  kmp_info_t *this_thr);
31 static void __kmp_alloc_task_deque(kmp_info_t *thread,
32                                    kmp_thread_data_t *thread_data);
33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
34                                            kmp_task_team_t *task_team);
35 
36 #ifdef OMP_45_ENABLED
37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
38 #endif
39 
40 #ifdef BUILD_TIED_TASK_STACK
41 
42 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
43 //  from top do bottom
44 //
45 //  gtid: global thread identifier for thread containing stack
46 //  thread_data: thread data for task team thread containing stack
47 //  threshold: value above which the trace statement triggers
48 //  location: string identifying call site of this function (for trace)
49 static void __kmp_trace_task_stack(kmp_int32 gtid,
50                                    kmp_thread_data_t *thread_data,
51                                    int threshold, char *location) {
52   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
53   kmp_taskdata_t **stack_top = task_stack->ts_top;
54   kmp_int32 entries = task_stack->ts_entries;
55   kmp_taskdata_t *tied_task;
56 
57   KA_TRACE(
58       threshold,
59       ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
60        "first_block = %p, stack_top = %p \n",
61        location, gtid, entries, task_stack->ts_first_block, stack_top));
62 
63   KMP_DEBUG_ASSERT(stack_top != NULL);
64   KMP_DEBUG_ASSERT(entries > 0);
65 
66   while (entries != 0) {
67     KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
68     // fix up ts_top if we need to pop from previous block
69     if (entries & TASK_STACK_INDEX_MASK == 0) {
70       kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
71 
72       stack_block = stack_block->sb_prev;
73       stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
74     }
75 
76     // finish bookkeeping
77     stack_top--;
78     entries--;
79 
80     tied_task = *stack_top;
81 
82     KMP_DEBUG_ASSERT(tied_task != NULL);
83     KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
84 
85     KA_TRACE(threshold,
86              ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
87               "stack_top=%p, tied_task=%p\n",
88               location, gtid, entries, stack_top, tied_task));
89   }
90   KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
91 
92   KA_TRACE(threshold,
93            ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
94             location, gtid));
95 }
96 
97 //  __kmp_init_task_stack: initialize the task stack for the first time
98 //  after a thread_data structure is created.
99 //  It should not be necessary to do this again (assuming the stack works).
100 //
101 //  gtid: global thread identifier of calling thread
102 //  thread_data: thread data for task team thread containing stack
103 static void __kmp_init_task_stack(kmp_int32 gtid,
104                                   kmp_thread_data_t *thread_data) {
105   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
106   kmp_stack_block_t *first_block;
107 
108   // set up the first block of the stack
109   first_block = &task_stack->ts_first_block;
110   task_stack->ts_top = (kmp_taskdata_t **)first_block;
111   memset((void *)first_block, '\0',
112          TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
113 
114   // initialize the stack to be empty
115   task_stack->ts_entries = TASK_STACK_EMPTY;
116   first_block->sb_next = NULL;
117   first_block->sb_prev = NULL;
118 }
119 
120 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
121 //
122 //  gtid: global thread identifier for calling thread
123 //  thread_data: thread info for thread containing stack
124 static void __kmp_free_task_stack(kmp_int32 gtid,
125                                   kmp_thread_data_t *thread_data) {
126   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
127   kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
128 
129   KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
130   // free from the second block of the stack
131   while (stack_block != NULL) {
132     kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
133 
134     stack_block->sb_next = NULL;
135     stack_block->sb_prev = NULL;
136     if (stack_block != &task_stack->ts_first_block) {
137       __kmp_thread_free(thread,
138                         stack_block); // free the block, if not the first
139     }
140     stack_block = next_block;
141   }
142   // initialize the stack to be empty
143   task_stack->ts_entries = 0;
144   task_stack->ts_top = NULL;
145 }
146 
147 //  __kmp_push_task_stack: Push the tied task onto the task stack.
148 //     Grow the stack if necessary by allocating another block.
149 //
150 //  gtid: global thread identifier for calling thread
151 //  thread: thread info for thread containing stack
152 //  tied_task: the task to push on the stack
153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
154                                   kmp_taskdata_t *tied_task) {
155   // GEH - need to consider what to do if tt_threads_data not allocated yet
156   kmp_thread_data_t *thread_data =
157       &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
158   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
159 
160   if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
161     return; // Don't push anything on stack if team or team tasks are serialized
162   }
163 
164   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
165   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
166 
167   KA_TRACE(20,
168            ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
169             gtid, thread, tied_task));
170   // Store entry
171   *(task_stack->ts_top) = tied_task;
172 
173   // Do bookkeeping for next push
174   task_stack->ts_top++;
175   task_stack->ts_entries++;
176 
177   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
178     // Find beginning of this task block
179     kmp_stack_block_t *stack_block =
180         (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
181 
182     // Check if we already have a block
183     if (stack_block->sb_next !=
184         NULL) { // reset ts_top to beginning of next block
185       task_stack->ts_top = &stack_block->sb_next->sb_block[0];
186     } else { // Alloc new block and link it up
187       kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
188           thread, sizeof(kmp_stack_block_t));
189 
190       task_stack->ts_top = &new_block->sb_block[0];
191       stack_block->sb_next = new_block;
192       new_block->sb_prev = stack_block;
193       new_block->sb_next = NULL;
194 
195       KA_TRACE(
196           30,
197           ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
198            gtid, tied_task, new_block));
199     }
200   }
201   KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
202                 tied_task));
203 }
204 
205 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
206 //  the task, just check to make sure it matches the ending task passed in.
207 //
208 //  gtid: global thread identifier for the calling thread
209 //  thread: thread info structure containing stack
210 //  tied_task: the task popped off the stack
211 //  ending_task: the task that is ending (should match popped task)
212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
213                                  kmp_taskdata_t *ending_task) {
214   // GEH - need to consider what to do if tt_threads_data not allocated yet
215   kmp_thread_data_t *thread_data =
216       &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
217   kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
218   kmp_taskdata_t *tied_task;
219 
220   if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
221     // Don't pop anything from stack if team or team tasks are serialized
222     return;
223   }
224 
225   KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
226   KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
227 
228   KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
229                 thread));
230 
231   // fix up ts_top if we need to pop from previous block
232   if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
233     kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
234 
235     stack_block = stack_block->sb_prev;
236     task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
237   }
238 
239   // finish bookkeeping
240   task_stack->ts_top--;
241   task_stack->ts_entries--;
242 
243   tied_task = *(task_stack->ts_top);
244 
245   KMP_DEBUG_ASSERT(tied_task != NULL);
246   KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
247   KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
248 
249   KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
250                 tied_task));
251   return;
252 }
253 #endif /* BUILD_TIED_TASK_STACK */
254 
255 //  __kmp_push_task: Add a task to the thread's deque
256 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
257   kmp_info_t *thread = __kmp_threads[gtid];
258   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
259   kmp_task_team_t *task_team = thread->th.th_task_team;
260   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
261   kmp_thread_data_t *thread_data;
262 
263   KA_TRACE(20,
264            ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
265 
266   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
267     // untied task needs to increment counter so that the task structure is not
268     // freed prematurely
269     kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
270     KA_TRACE(
271         20,
272         ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
273          gtid, counter, taskdata));
274   }
275 
276   // The first check avoids building task_team thread data if serialized
277   if (taskdata->td_flags.task_serial) {
278     KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
279                   "TASK_NOT_PUSHED for task %p\n",
280                   gtid, taskdata));
281     return TASK_NOT_PUSHED;
282   }
283 
284   // Now that serialized tasks have returned, we can assume that we are not in
285   // immediate exec mode
286   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
287   if (!KMP_TASKING_ENABLED(task_team)) {
288     __kmp_enable_tasking(task_team, thread);
289   }
290   KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
291   KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
292 
293   // Find tasking deque specific to encountering thread
294   thread_data = &task_team->tt.tt_threads_data[tid];
295 
296   // No lock needed since only owner can allocate
297   if (thread_data->td.td_deque == NULL) {
298     __kmp_alloc_task_deque(thread, thread_data);
299   }
300 
301   // Check if deque is full
302   if (TCR_4(thread_data->td.td_deque_ntasks) >=
303       TASK_DEQUE_SIZE(thread_data->td)) {
304     KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
305                   "TASK_NOT_PUSHED for task %p\n",
306                   gtid, taskdata));
307     return TASK_NOT_PUSHED;
308   }
309 
310   // Lock the deque for the task push operation
311   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
312 
313 #if OMP_45_ENABLED
314   // Need to recheck as we can get a proxy task from a thread outside of OpenMP
315   if (TCR_4(thread_data->td.td_deque_ntasks) >=
316       TASK_DEQUE_SIZE(thread_data->td)) {
317     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
318     KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning "
319                   "TASK_NOT_PUSHED for task %p\n",
320                   gtid, taskdata));
321     return TASK_NOT_PUSHED;
322   }
323 #else
324   // Must have room since no thread can add tasks but calling thread
325   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
326                    TASK_DEQUE_SIZE(thread_data->td));
327 #endif
328 
329   thread_data->td.td_deque[thread_data->td.td_deque_tail] =
330       taskdata; // Push taskdata
331   // Wrap index.
332   thread_data->td.td_deque_tail =
333       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
334   TCW_4(thread_data->td.td_deque_ntasks,
335         TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
336 
337   KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
338                 "task=%p ntasks=%d head=%u tail=%u\n",
339                 gtid, taskdata, thread_data->td.td_deque_ntasks,
340                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
341 
342   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
343 
344   return TASK_SUCCESSFULLY_PUSHED;
345 }
346 
347 // __kmp_pop_current_task_from_thread: set up current task from called thread
348 // when team ends
349 //
350 // this_thr: thread structure to set current_task in.
351 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
352   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
353                 "this_thread=%p, curtask=%p, "
354                 "curtask_parent=%p\n",
355                 0, this_thr, this_thr->th.th_current_task,
356                 this_thr->th.th_current_task->td_parent));
357 
358   this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
359 
360   KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
361                 "this_thread=%p, curtask=%p, "
362                 "curtask_parent=%p\n",
363                 0, this_thr, this_thr->th.th_current_task,
364                 this_thr->th.th_current_task->td_parent));
365 }
366 
367 // __kmp_push_current_task_to_thread: set up current task in called thread for a
368 // new team
369 //
370 // this_thr: thread structure to set up
371 // team: team for implicit task data
372 // tid: thread within team to set up
373 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
374                                        int tid) {
375   // current task of the thread is a parent of the new just created implicit
376   // tasks of new team
377   KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
378                 "curtask=%p "
379                 "parent_task=%p\n",
380                 tid, this_thr, this_thr->th.th_current_task,
381                 team->t.t_implicit_task_taskdata[tid].td_parent));
382 
383   KMP_DEBUG_ASSERT(this_thr != NULL);
384 
385   if (tid == 0) {
386     if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
387       team->t.t_implicit_task_taskdata[0].td_parent =
388           this_thr->th.th_current_task;
389       this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
390     }
391   } else {
392     team->t.t_implicit_task_taskdata[tid].td_parent =
393         team->t.t_implicit_task_taskdata[0].td_parent;
394     this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
395   }
396 
397   KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
398                 "curtask=%p "
399                 "parent_task=%p\n",
400                 tid, this_thr, this_thr->th.th_current_task,
401                 team->t.t_implicit_task_taskdata[tid].td_parent));
402 }
403 
404 // __kmp_task_start: bookkeeping for a task starting execution
405 //
406 // GTID: global thread id of calling thread
407 // task: task starting execution
408 // current_task: task suspending
409 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
410                              kmp_taskdata_t *current_task) {
411   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
412   kmp_info_t *thread = __kmp_threads[gtid];
413 
414   KA_TRACE(10,
415            ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
416             gtid, taskdata, current_task));
417 
418   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
419 
420   // mark currently executing task as suspended
421   // TODO: GEH - make sure root team implicit task is initialized properly.
422   // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
423   current_task->td_flags.executing = 0;
424 
425 // Add task to stack if tied
426 #ifdef BUILD_TIED_TASK_STACK
427   if (taskdata->td_flags.tiedness == TASK_TIED) {
428     __kmp_push_task_stack(gtid, thread, taskdata);
429   }
430 #endif /* BUILD_TIED_TASK_STACK */
431 
432   // mark starting task as executing and as current task
433   thread->th.th_current_task = taskdata;
434 
435   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
436                    taskdata->td_flags.tiedness == TASK_UNTIED);
437   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
438                    taskdata->td_flags.tiedness == TASK_UNTIED);
439   taskdata->td_flags.started = 1;
440   taskdata->td_flags.executing = 1;
441   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
442   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
443 
444   // GEH TODO: shouldn't we pass some sort of location identifier here?
445   // APT: yes, we will pass location here.
446   // need to store current thread state (in a thread or taskdata structure)
447   // before setting work_state, otherwise wrong state is set after end of task
448 
449   KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
450 
451 #if OMPT_SUPPORT
452   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
453     kmp_taskdata_t *parent = taskdata->td_parent;
454     ompt_callbacks.ompt_callback(ompt_event_task_begin)(
455         parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
456         parent ? &(parent->ompt_task_info.frame) : NULL,
457         taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
458   }
459 #endif
460 #if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
461   /* OMPT emit all dependences if requested by the tool */
462   if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
463       ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
464     ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
465         taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
466         taskdata->ompt_task_info.ndeps);
467     /* We can now free the allocated memory for the dependencies */
468     KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
469     taskdata->ompt_task_info.deps = NULL;
470     taskdata->ompt_task_info.ndeps = 0;
471   }
472 #endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
473 
474   return;
475 }
476 
477 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
478 // execution
479 //
480 // loc_ref: source location information; points to beginning of task block.
481 // gtid: global thread number.
482 // task: task thunk for the started task.
483 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
484                                kmp_task_t *task) {
485   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
486   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
487 
488   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
489                 "current_task=%p\n",
490                 gtid, loc_ref, taskdata, current_task));
491 
492   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
493     // untied task needs to increment counter so that the task structure is not
494     // freed prematurely
495     kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
496     KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
497                   "incremented for task %p\n",
498                   gtid, counter, taskdata));
499   }
500 
501   taskdata->td_flags.task_serial =
502       1; // Execute this task immediately, not deferred.
503   __kmp_task_start(gtid, task, current_task);
504 
505   KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
506                 loc_ref, taskdata));
507 
508   return;
509 }
510 
511 #ifdef TASK_UNUSED
512 // __kmpc_omp_task_begin: report that a given task has started execution
513 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
514 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
515   kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
516 
517   KA_TRACE(
518       10,
519       ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
520        gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
521 
522   __kmp_task_start(gtid, task, current_task);
523 
524   KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
525                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
526   return;
527 }
528 #endif // TASK_UNUSED
529 
530 // __kmp_free_task: free the current task space and the space for shareds
531 //
532 // gtid: Global thread ID of calling thread
533 // taskdata: task to free
534 // thread: thread data structure of caller
535 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
536                             kmp_info_t *thread) {
537   KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
538                 taskdata));
539 
540   // Check to make sure all flags and counters have the correct values
541   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
542   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
543   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
544   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
545   KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
546                    taskdata->td_flags.task_serial == 1);
547   KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);
548 
549   taskdata->td_flags.freed = 1;
550   ANNOTATE_HAPPENS_BEFORE(taskdata);
551 // deallocate the taskdata and shared variable blocks associated with this task
552 #if USE_FAST_MEMORY
553   __kmp_fast_free(thread, taskdata);
554 #else /* ! USE_FAST_MEMORY */
555   __kmp_thread_free(thread, taskdata);
556 #endif
557 
558   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
559 }
560 
561 // __kmp_free_task_and_ancestors: free the current task and ancestors without
562 // children
563 //
564 // gtid: Global thread ID of calling thread
565 // taskdata: task to free
566 // thread: thread data structure of caller
567 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
568                                           kmp_taskdata_t *taskdata,
569                                           kmp_info_t *thread) {
570 #if OMP_45_ENABLED
571   // Proxy tasks must always be allowed to free their parents
572   // because they can be run in background even in serial mode.
573   kmp_int32 team_serial =
574       (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
575       !taskdata->td_flags.proxy;
576 #else
577   kmp_int32 team_serial =
578       taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
579 #endif
580   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
581 
582   kmp_int32 children =
583       KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
584   KMP_DEBUG_ASSERT(children >= 0);
585 
586   // Now, go up the ancestor tree to see if any ancestors can now be freed.
587   while (children == 0) {
588     kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
589 
590     KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
591                   "and freeing itself\n",
592                   gtid, taskdata));
593 
594     // --- Deallocate my ancestor task ---
595     __kmp_free_task(gtid, taskdata, thread);
596 
597     taskdata = parent_taskdata;
598 
599     // Stop checking ancestors at implicit task instead of walking up ancestor
600     // tree to avoid premature deallocation of ancestors.
601     if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
602       return;
603 
604     // Predecrement simulated by "- 1" calculation
605     children = KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
606     KMP_DEBUG_ASSERT(children >= 0);
607   }
608 
609   KA_TRACE(
610       20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
611            "not freeing it yet\n",
612            gtid, taskdata, children));
613 }
614 
615 // __kmp_task_finish: bookkeeping to do when a task finishes execution
616 //
617 // gtid: global thread ID for calling thread
618 // task: task to be finished
619 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
620 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
621                               kmp_taskdata_t *resumed_task) {
622   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
623   kmp_info_t *thread = __kmp_threads[gtid];
624   kmp_task_team_t *task_team =
625       thread->th.th_task_team; // might be NULL for serial teams...
626   kmp_int32 children = 0;
627 
628 #if OMPT_SUPPORT
629   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
630     kmp_taskdata_t *parent = taskdata->td_parent;
631     ompt_callbacks.ompt_callback(ompt_event_task_end)(
632         taskdata->ompt_task_info.task_id);
633   }
634 #endif
635 
636   KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
637                 "task %p\n",
638                 gtid, taskdata, resumed_task));
639 
640   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
641 
642 // Pop task from stack if tied
643 #ifdef BUILD_TIED_TASK_STACK
644   if (taskdata->td_flags.tiedness == TASK_TIED) {
645     __kmp_pop_task_stack(gtid, thread, taskdata);
646   }
647 #endif /* BUILD_TIED_TASK_STACK */
648 
649   if (taskdata->td_flags.tiedness == TASK_UNTIED) {
650     // untied task needs to check the counter so that the task structure is not
651     // freed prematurely
652     kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
653     KA_TRACE(
654         20,
655         ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
656          gtid, counter, taskdata));
657     if (counter > 0) {
658       // untied task is not done, to be continued possibly by other thread, do
659       // not free it now
660       if (resumed_task == NULL) {
661         KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
662         resumed_task = taskdata->td_parent; // In a serialized task, the resumed
663         // task is the parent
664       }
665       thread->th.th_current_task = resumed_task; // restore current_task
666       resumed_task->td_flags.executing = 1; // resume previous task
667       KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
668                     "resuming task %p\n",
669                     gtid, taskdata, resumed_task));
670       return;
671     }
672   }
673 
674   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
675   taskdata->td_flags.complete = 1; // mark the task as completed
676   KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
677   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
678 
679   // Only need to keep track of count if team parallel and tasking not
680   // serialized
681   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
682     // Predecrement simulated by "- 1" calculation
683     children =
684         KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) -
685         1;
686     KMP_DEBUG_ASSERT(children >= 0);
687 #if OMP_40_ENABLED
688     if (taskdata->td_taskgroup)
689       KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
690 #if OMP_45_ENABLED
691   }
692   // if we found proxy tasks there could exist a dependency chain
693   // with the proxy task as origin
694   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
695       (task_team && task_team->tt.tt_found_proxy_tasks)) {
696 #endif
697     __kmp_release_deps(gtid, taskdata);
698 #endif
699   }
700 
701   // td_flags.executing must be marked as 0 after __kmp_release_deps has been
702   // called. Othertwise, if a task is executed immediately from the release_deps
703   // code, the flag will be reset to 1 again by this same function
704   KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
705   taskdata->td_flags.executing = 0; // suspend the finishing task
706 
707   KA_TRACE(
708       20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
709            gtid, taskdata, children));
710 
711 #if OMP_40_ENABLED
712   /* If the tasks' destructor thunk flag has been set, we need to invoke the
713      destructor thunk that has been generated by the compiler. The code is
714      placed here, since at this point other tasks might have been released
715      hence overlapping the destructor invokations with some other work in the
716      released tasks.  The OpenMP spec is not specific on when the destructors
717      are invoked, so we should be free to choose. */
718   if (taskdata->td_flags.destructors_thunk) {
719     kmp_routine_entry_t destr_thunk = task->data1.destructors;
720     KMP_ASSERT(destr_thunk);
721     destr_thunk(gtid, task);
722   }
723 #endif // OMP_40_ENABLED
724 
725   // bookkeeping for resuming task:
726   // GEH - note tasking_ser => task_serial
727   KMP_DEBUG_ASSERT(
728       (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
729       taskdata->td_flags.task_serial);
730   if (taskdata->td_flags.task_serial) {
731     if (resumed_task == NULL) {
732       resumed_task = taskdata->td_parent; // In a serialized task, the resumed
733       // task is the parent
734     } else
735 #if OMP_45_ENABLED
736         if (!(task_team && task_team->tt.tt_found_proxy_tasks))
737 #endif
738     {
739       // verify resumed task passed in points to parent
740       KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
741     }
742   } else {
743     KMP_DEBUG_ASSERT(resumed_task !=
744                      NULL); // verify that resumed task is passed as arguemnt
745   }
746 
747   // Free this task and then ancestor tasks if they have no children.
748   // Restore th_current_task first as suggested by John:
749   // johnmc: if an asynchronous inquiry peers into the runtime system
750   // it doesn't see the freed task as the current task.
751   thread->th.th_current_task = resumed_task;
752   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
753 
754   // TODO: GEH - make sure root team implicit task is initialized properly.
755   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
756   resumed_task->td_flags.executing = 1; // resume previous task
757 
758   KA_TRACE(
759       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
760            gtid, taskdata, resumed_task));
761 
762   return;
763 }
764 
765 // __kmpc_omp_task_complete_if0: report that a task has completed execution
766 //
767 // loc_ref: source location information; points to end of task block.
768 // gtid: global thread number.
769 // task: task thunk for the completed task.
770 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
771                                   kmp_task_t *task) {
772   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
773                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
774   // this routine will provide task to resume
775   __kmp_task_finish(gtid, task, NULL);
776 
777   KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
778                 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
779   return;
780 }
781 
782 #ifdef TASK_UNUSED
783 // __kmpc_omp_task_complete: report that a task has completed execution
784 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
785 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
786                               kmp_task_t *task) {
787   KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
788                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
789 
790   __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume
791 
792   KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
793                 loc_ref, KMP_TASK_TO_TASKDATA(task)));
794   return;
795 }
796 #endif // TASK_UNUSED
797 
798 #if OMPT_SUPPORT
799 // __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
800 //  only be called after ompt_tool, so we already know whether ompt is enabled
801 // or not.
802 static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
803                                         void *function) {
804   if (ompt_enabled) {
805     task->ompt_task_info.task_id = __ompt_task_id_new(tid);
806     task->ompt_task_info.function = function;
807     task->ompt_task_info.frame.exit_runtime_frame = NULL;
808     task->ompt_task_info.frame.reenter_runtime_frame = NULL;
809 #if OMP_40_ENABLED
810     task->ompt_task_info.ndeps = 0;
811     task->ompt_task_info.deps = NULL;
812 #endif /* OMP_40_ENABLED */
813   }
814 }
815 #endif
816 
817 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
818 // task for a given thread
819 //
820 // loc_ref:  reference to source location of parallel region
821 // this_thr:  thread data structure corresponding to implicit task
822 // team: team for this_thr
823 // tid: thread id of given thread within team
824 // set_curr_task: TRUE if need to push current task to thread
825 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
826 // have already been done elsewhere.
827 // TODO: Get better loc_ref.  Value passed in may be NULL
828 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
829                               kmp_team_t *team, int tid, int set_curr_task) {
830   kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
831 
832   KF_TRACE(
833       10,
834       ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
835        tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
836 
837   task->td_task_id = KMP_GEN_TASK_ID();
838   task->td_team = team;
839   //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
840   //    in debugger)
841   task->td_ident = loc_ref;
842   task->td_taskwait_ident = NULL;
843   task->td_taskwait_counter = 0;
844   task->td_taskwait_thread = 0;
845 
846   task->td_flags.tiedness = TASK_TIED;
847   task->td_flags.tasktype = TASK_IMPLICIT;
848 #if OMP_45_ENABLED
849   task->td_flags.proxy = TASK_FULL;
850 #endif
851 
852   // All implicit tasks are executed immediately, not deferred
853   task->td_flags.task_serial = 1;
854   task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
855   task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
856 
857   task->td_flags.started = 1;
858   task->td_flags.executing = 1;
859   task->td_flags.complete = 0;
860   task->td_flags.freed = 0;
861 
862 #if OMP_40_ENABLED
863   task->td_depnode = NULL;
864 #endif
865 
866   if (set_curr_task) { // only do this init first time thread is created
867     task->td_incomplete_child_tasks = 0;
868     // Not used: don't need to deallocate implicit task
869     task->td_allocated_child_tasks = 0;
870 #if OMP_40_ENABLED
871     task->td_taskgroup = NULL; // An implicit task does not have taskgroup
872     task->td_dephash = NULL;
873 #endif
874     __kmp_push_current_task_to_thread(this_thr, team, tid);
875   } else {
876     KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
877     KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
878   }
879 
880 #if OMPT_SUPPORT
881   __kmp_task_init_ompt(task, tid, NULL);
882 #endif
883 
884   KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
885                 team, task));
886 }
887 
888 // __kmp_finish_implicit_task: Release resources associated to implicit tasks
889 // at the end of parallel regions. Some resources are kept for reuse in the next
890 // parallel region.
891 //
892 // thread:  thread data structure corresponding to implicit task
893 void __kmp_finish_implicit_task(kmp_info_t *thread) {
894   kmp_taskdata_t *task = thread->th.th_current_task;
895   if (task->td_dephash)
896     __kmp_dephash_free_entries(thread, task->td_dephash);
897 }
898 
899 // __kmp_free_implicit_task: Release resources associated to implicit tasks
900 // when these are destroyed regions
901 //
902 // thread:  thread data structure corresponding to implicit task
903 void __kmp_free_implicit_task(kmp_info_t *thread) {
904   kmp_taskdata_t *task = thread->th.th_current_task;
905   if (task->td_dephash)
906     __kmp_dephash_free(thread, task->td_dephash);
907   task->td_dephash = NULL;
908 }
909 
910 // Round up a size to a power of two specified by val: Used to insert padding
911 // between structures co-allocated using a single malloc() call
912 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
913   if (size & (val - 1)) {
914     size &= ~(val - 1);
915     if (size <= KMP_SIZE_T_MAX - val) {
916       size += val; // Round up if there is no overflow.
917     }; // if
918   }; // if
919   return size;
920 } // __kmp_round_up_to_va
921 
922 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
923 //
924 // loc_ref: source location information
925 // gtid: global thread number.
926 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
927 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
928 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
929 // private vars accessed in task.
930 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
931 // in task.
932 // task_entry: Pointer to task code entry point generated by compiler.
933 // returns: a pointer to the allocated kmp_task_t structure (task).
934 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
935                              kmp_tasking_flags_t *flags,
936                              size_t sizeof_kmp_task_t, size_t sizeof_shareds,
937                              kmp_routine_entry_t task_entry) {
938   kmp_task_t *task;
939   kmp_taskdata_t *taskdata;
940   kmp_info_t *thread = __kmp_threads[gtid];
941   kmp_team_t *team = thread->th.th_team;
942   kmp_taskdata_t *parent_task = thread->th.th_current_task;
943   size_t shareds_offset;
944 
945   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
946                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
947                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
948                 sizeof_shareds, task_entry));
949 
950   if (parent_task->td_flags.final) {
951     if (flags->merged_if0) {
952     }
953     flags->final = 1;
954   }
955 
956 #if OMP_45_ENABLED
957   if (flags->proxy == TASK_PROXY) {
958     flags->tiedness = TASK_UNTIED;
959     flags->merged_if0 = 1;
960 
961     /* are we running in a sequential parallel or tskm_immediate_exec... we need
962        tasking support enabled */
963     if ((thread->th.th_task_team) == NULL) {
964       /* This should only happen if the team is serialized
965           setup a task team and propagate it to the thread */
966       KMP_DEBUG_ASSERT(team->t.t_serialized);
967       KA_TRACE(30,
968                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
969                 gtid));
970       __kmp_task_team_setup(
971           thread, team,
972           1); // 1 indicates setup the current team regardless of nthreads
973       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
974     }
975     kmp_task_team_t *task_team = thread->th.th_task_team;
976 
977     /* tasking must be enabled now as the task might not be pushed */
978     if (!KMP_TASKING_ENABLED(task_team)) {
979       KA_TRACE(
980           30,
981           ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
982       __kmp_enable_tasking(task_team, thread);
983       kmp_int32 tid = thread->th.th_info.ds.ds_tid;
984       kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
985       // No lock needed since only owner can allocate
986       if (thread_data->td.td_deque == NULL) {
987         __kmp_alloc_task_deque(thread, thread_data);
988       }
989     }
990 
991     if (task_team->tt.tt_found_proxy_tasks == FALSE)
992       TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
993   }
994 #endif
995 
996   // Calculate shared structure offset including padding after kmp_task_t struct
997   // to align pointers in shared struct
998   shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
999   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1000 
1001   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1002   KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1003                 shareds_offset));
1004   KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1005                 sizeof_shareds));
1006 
1007 // Avoid double allocation here by combining shareds with taskdata
1008 #if USE_FAST_MEMORY
1009   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1010                                                                sizeof_shareds);
1011 #else /* ! USE_FAST_MEMORY */
1012   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1013                                                                sizeof_shareds);
1014 #endif /* USE_FAST_MEMORY */
1015   ANNOTATE_HAPPENS_AFTER(taskdata);
1016 
1017   task = KMP_TASKDATA_TO_TASK(taskdata);
1018 
1019 // Make sure task & taskdata are aligned appropriately
1020 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1021   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1022   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1023 #else
1024   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1025   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1026 #endif
1027   if (sizeof_shareds > 0) {
1028     // Avoid double allocation here by combining shareds with taskdata
1029     task->shareds = &((char *)taskdata)[shareds_offset];
1030     // Make sure shareds struct is aligned to pointer size
1031     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1032                      0);
1033   } else {
1034     task->shareds = NULL;
1035   }
1036   task->routine = task_entry;
1037   task->part_id = 0; // AC: Always start with 0 part id
1038 
1039   taskdata->td_task_id = KMP_GEN_TASK_ID();
1040   taskdata->td_team = team;
1041   taskdata->td_alloc_thread = thread;
1042   taskdata->td_parent = parent_task;
1043   taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1044   taskdata->td_untied_count = 0;
1045   taskdata->td_ident = loc_ref;
1046   taskdata->td_taskwait_ident = NULL;
1047   taskdata->td_taskwait_counter = 0;
1048   taskdata->td_taskwait_thread = 0;
1049   KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1050 #if OMP_45_ENABLED
1051   // avoid copying icvs for proxy tasks
1052   if (flags->proxy == TASK_FULL)
1053 #endif
1054     copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1055 
1056   taskdata->td_flags.tiedness = flags->tiedness;
1057   taskdata->td_flags.final = flags->final;
1058   taskdata->td_flags.merged_if0 = flags->merged_if0;
1059 #if OMP_40_ENABLED
1060   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1061 #endif // OMP_40_ENABLED
1062 #if OMP_45_ENABLED
1063   taskdata->td_flags.proxy = flags->proxy;
1064   taskdata->td_task_team = thread->th.th_task_team;
1065   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1066 #endif
1067   taskdata->td_flags.tasktype = TASK_EXPLICIT;
1068 
1069   // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1070   taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1071 
1072   // GEH - TODO: fix this to copy parent task's value of team_serial flag
1073   taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1074 
1075   // GEH - Note we serialize the task if the team is serialized to make sure
1076   // implicit parallel region tasks are not left until program termination to
1077   // execute. Also, it helps locality to execute immediately.
1078 
1079   taskdata->td_flags.task_serial =
1080       (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1081        taskdata->td_flags.tasking_ser);
1082 
1083   taskdata->td_flags.started = 0;
1084   taskdata->td_flags.executing = 0;
1085   taskdata->td_flags.complete = 0;
1086   taskdata->td_flags.freed = 0;
1087 
1088   taskdata->td_flags.native = flags->native;
1089 
1090   taskdata->td_incomplete_child_tasks = 0;
1091   taskdata->td_allocated_child_tasks = 1; // start at one because counts current
1092 // task and children
1093 #if OMP_40_ENABLED
1094   taskdata->td_taskgroup =
1095       parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1096   taskdata->td_dephash = NULL;
1097   taskdata->td_depnode = NULL;
1098 #endif
1099 
1100 // Only need to keep track of child task counts if team parallel and tasking not
1101 // serialized or if it is a proxy task
1102 #if OMP_45_ENABLED
1103   if (flags->proxy == TASK_PROXY ||
1104       !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1105 #else
1106   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1107 #endif
1108   {
1109     KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
1110 #if OMP_40_ENABLED
1111     if (parent_task->td_taskgroup)
1112       KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
1113 #endif
1114     // Only need to keep track of allocated child tasks for explicit tasks since
1115     // implicit not deallocated
1116     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1117       KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
1118     }
1119   }
1120 
1121   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1122                 gtid, taskdata, taskdata->td_parent));
1123   ANNOTATE_HAPPENS_BEFORE(task);
1124 
1125 #if OMPT_SUPPORT
1126   __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
1127 #endif
1128 
1129   return task;
1130 }
1131 
1132 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1133                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
1134                                   size_t sizeof_shareds,
1135                                   kmp_routine_entry_t task_entry) {
1136   kmp_task_t *retval;
1137   kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1138 
1139   input_flags->native = FALSE;
1140 // __kmp_task_alloc() sets up all other runtime flags
1141 
1142 #if OMP_45_ENABLED
1143   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1144                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1145                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1146                 input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1147                 sizeof_shareds, task_entry));
1148 #else
1149   KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1150                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1151                 gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1152                 sizeof_kmp_task_t, sizeof_shareds, task_entry));
1153 #endif
1154 
1155   retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1156                             sizeof_shareds, task_entry);
1157 
1158   KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1159 
1160   return retval;
1161 }
1162 
1163 //  __kmp_invoke_task: invoke the specified task
1164 //
1165 // gtid: global thread ID of caller
1166 // task: the task to invoke
1167 // current_task: the task to resume after task invokation
1168 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1169                               kmp_taskdata_t *current_task) {
1170   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1171   kmp_uint64 cur_time;
1172 #if OMP_40_ENABLED
1173   int discard = 0 /* false */;
1174 #endif
1175   KA_TRACE(
1176       30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1177            gtid, taskdata, current_task));
1178   KMP_DEBUG_ASSERT(task);
1179 #if OMP_45_ENABLED
1180   if (taskdata->td_flags.proxy == TASK_PROXY &&
1181       taskdata->td_flags.complete == 1) {
1182     // This is a proxy task that was already completed but it needs to run
1183     // its bottom-half finish
1184     KA_TRACE(
1185         30,
1186         ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1187          gtid, taskdata));
1188 
1189     __kmp_bottom_half_finish_proxy(gtid, task);
1190 
1191     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1192                   "proxy task %p, resuming task %p\n",
1193                   gtid, taskdata, current_task));
1194 
1195     return;
1196   }
1197 #endif
1198 
1199 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1200   if (__kmp_forkjoin_frames_mode == 3) {
1201     // Get the current time stamp to measure task execution time to correct
1202     // barrier imbalance time
1203     cur_time = __itt_get_timestamp();
1204   }
1205 #endif
1206 
1207 #if OMP_45_ENABLED
1208   // Proxy tasks are not handled by the runtime
1209   if (taskdata->td_flags.proxy != TASK_PROXY) {
1210 #endif
1211     ANNOTATE_HAPPENS_AFTER(task);
1212     __kmp_task_start(gtid, task, current_task);
1213 #if OMP_45_ENABLED
1214   }
1215 #endif
1216 
1217 #if OMPT_SUPPORT
1218   ompt_thread_info_t oldInfo;
1219   kmp_info_t *thread;
1220   if (ompt_enabled) {
1221     // Store the threads states and restore them after the task
1222     thread = __kmp_threads[gtid];
1223     oldInfo = thread->th.ompt_thread_info;
1224     thread->th.ompt_thread_info.wait_id = 0;
1225     thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1226     taskdata->ompt_task_info.frame.exit_runtime_frame =
1227         __builtin_frame_address(0);
1228   }
1229 #endif
1230 
1231 #if OMP_40_ENABLED
1232   // TODO: cancel tasks if the parallel region has also been cancelled
1233   // TODO: check if this sequence can be hoisted above __kmp_task_start
1234   // if cancellation has been enabled for this run ...
1235   if (__kmp_omp_cancellation) {
1236     kmp_info_t *this_thr = __kmp_threads[gtid];
1237     kmp_team_t *this_team = this_thr->th.th_team;
1238     kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1239     if ((taskgroup && taskgroup->cancel_request) ||
1240         (this_team->t.t_cancel_request == cancel_parallel)) {
1241       KMP_COUNT_BLOCK(TASK_cancelled);
1242       // this task belongs to a task group and we need to cancel it
1243       discard = 1 /* true */;
1244     }
1245   }
1246 
1247   // Invoke the task routine and pass in relevant data.
1248   // Thunks generated by gcc take a different argument list.
1249   if (!discard) {
1250 #if KMP_STATS_ENABLED
1251     KMP_COUNT_BLOCK(TASK_executed);
1252     switch (KMP_GET_THREAD_STATE()) {
1253     case FORK_JOIN_BARRIER:
1254       KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1255       break;
1256     case PLAIN_BARRIER:
1257       KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1258       break;
1259     case TASKYIELD:
1260       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1261       break;
1262     case TASKWAIT:
1263       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1264       break;
1265     case TASKGROUP:
1266       KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1267       break;
1268     default:
1269       KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1270       break;
1271     }
1272 #endif // KMP_STATS_ENABLED
1273 #endif // OMP_40_ENABLED
1274 
1275 #if OMPT_SUPPORT && OMPT_TRACE
1276     /* let OMPT know that we're about to run this task */
1277     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
1278       ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1279           current_task->ompt_task_info.task_id,
1280           taskdata->ompt_task_info.task_id);
1281     }
1282 #endif
1283 
1284 #ifdef KMP_GOMP_COMPAT
1285     if (taskdata->td_flags.native) {
1286       ((void (*)(void *))(*(task->routine)))(task->shareds);
1287     } else
1288 #endif /* KMP_GOMP_COMPAT */
1289     {
1290       (*(task->routine))(gtid, task);
1291     }
1292     KMP_POP_PARTITIONED_TIMER();
1293 
1294 #if OMPT_SUPPORT && OMPT_TRACE
1295     /* let OMPT know that we're returning to the callee task */
1296     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
1297       ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1298           taskdata->ompt_task_info.task_id,
1299           current_task->ompt_task_info.task_id);
1300     }
1301 #endif
1302 
1303 #if OMP_40_ENABLED
1304   }
1305 #endif // OMP_40_ENABLED
1306 
1307 #if OMPT_SUPPORT
1308   if (ompt_enabled) {
1309     thread->th.ompt_thread_info = oldInfo;
1310     taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
1311   }
1312 #endif
1313 
1314 #if OMP_45_ENABLED
1315   // Proxy tasks are not handled by the runtime
1316   if (taskdata->td_flags.proxy != TASK_PROXY) {
1317 #endif
1318     ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1319     __kmp_task_finish(gtid, task, current_task);
1320 #if OMP_45_ENABLED
1321   }
1322 #endif
1323 
1324 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1325   // Barrier imbalance - correct arrive time after the task finished
1326   if (__kmp_forkjoin_frames_mode == 3) {
1327     kmp_info_t *this_thr = __kmp_threads[gtid];
1328     if (this_thr->th.th_bar_arrive_time) {
1329       this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1330     }
1331   }
1332 #endif
1333   KA_TRACE(
1334       30,
1335       ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1336        gtid, taskdata, current_task));
1337   return;
1338 }
1339 
1340 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1341 //
1342 // loc_ref: location of original task pragma (ignored)
1343 // gtid: Global Thread ID of encountering thread
1344 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1345 // Returns:
1346 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1347 //    be resumed later.
1348 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1349 //    resumed later.
1350 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1351                                 kmp_task_t *new_task) {
1352   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1353 
1354   KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1355                 loc_ref, new_taskdata));
1356 
1357   /* Should we execute the new task or queue it? For now, let's just always try
1358      to queue it.  If the queue fills up, then we'll execute it.  */
1359 
1360   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1361   { // Execute this task immediately
1362     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1363     new_taskdata->td_flags.task_serial = 1;
1364     __kmp_invoke_task(gtid, new_task, current_task);
1365   }
1366 
1367   KA_TRACE(
1368       10,
1369       ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1370        "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1371        gtid, loc_ref, new_taskdata));
1372 
1373   ANNOTATE_HAPPENS_BEFORE(new_task);
1374   return TASK_CURRENT_NOT_QUEUED;
1375 }
1376 
1377 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1378 //
1379 // gtid: Global Thread ID of encountering thread
1380 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1381 // serialize_immediate: if TRUE then if the task is executed immediately its
1382 // execution will be serialized
1383 // Returns:
1384 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1385 //    be resumed later.
1386 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1387 //    resumed later.
1388 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1389                          bool serialize_immediate) {
1390   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1391 
1392 #if OMPT_SUPPORT
1393   if (ompt_enabled) {
1394     new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1395         __builtin_frame_address(1);
1396   }
1397 #endif
1398 
1399 /* Should we execute the new task or queue it? For now, let's just always try to
1400    queue it.  If the queue fills up, then we'll execute it.  */
1401 #if OMP_45_ENABLED
1402   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1403       __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1404 #else
1405   if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1406 #endif
1407   { // Execute this task immediately
1408     kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1409     if (serialize_immediate)
1410       new_taskdata->td_flags.task_serial = 1;
1411     __kmp_invoke_task(gtid, new_task, current_task);
1412   }
1413 
1414 #if OMPT_SUPPORT
1415   if (ompt_enabled) {
1416     new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1417   }
1418 #endif
1419 
1420   ANNOTATE_HAPPENS_BEFORE(new_task);
1421   return TASK_CURRENT_NOT_QUEUED;
1422 }
1423 
1424 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1425 // non-thread-switchable task from the parent thread only!
1426 //
1427 // loc_ref: location of original task pragma (ignored)
1428 // gtid: Global Thread ID of encountering thread
1429 // new_task: non-thread-switchable task thunk allocated by
1430 // __kmp_omp_task_alloc()
1431 // Returns:
1432 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1433 //    be resumed later.
1434 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1435 //    resumed later.
1436 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1437                           kmp_task_t *new_task) {
1438   kmp_int32 res;
1439   KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1440 
1441 #if KMP_DEBUG
1442   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1443 #endif
1444   KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1445                 new_taskdata));
1446 
1447   res = __kmp_omp_task(gtid, new_task, true);
1448 
1449   KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1450                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1451                 gtid, loc_ref, new_taskdata));
1452   return res;
1453 }
1454 
1455 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1456 // complete
1457 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1458   kmp_taskdata_t *taskdata;
1459   kmp_info_t *thread;
1460   int thread_finished = FALSE;
1461   KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1462 
1463   KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1464 
1465   if (__kmp_tasking_mode != tskm_immediate_exec) {
1466     thread = __kmp_threads[gtid];
1467     taskdata = thread->th.th_current_task;
1468 #if OMPT_SUPPORT && OMPT_TRACE
1469     ompt_task_id_t my_task_id;
1470     ompt_parallel_id_t my_parallel_id;
1471 
1472     if (ompt_enabled) {
1473       kmp_team_t *team = thread->th.th_team;
1474       my_task_id = taskdata->ompt_task_info.task_id;
1475       my_parallel_id = team->t.ompt_team_info.parallel_id;
1476 
1477       taskdata->ompt_task_info.frame.reenter_runtime_frame =
1478           __builtin_frame_address(1);
1479       if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1480         ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id,
1481                                                                 my_task_id);
1482       }
1483     }
1484 #endif
1485 
1486 // Debugger: The taskwait is active. Store location and thread encountered the
1487 // taskwait.
1488 #if USE_ITT_BUILD
1489 // Note: These values are used by ITT events as well.
1490 #endif /* USE_ITT_BUILD */
1491     taskdata->td_taskwait_counter += 1;
1492     taskdata->td_taskwait_ident = loc_ref;
1493     taskdata->td_taskwait_thread = gtid + 1;
1494 
1495 #if USE_ITT_BUILD
1496     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1497     if (itt_sync_obj != NULL)
1498       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1499 #endif /* USE_ITT_BUILD */
1500 
1501     bool must_wait =
1502         !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1503 
1504 #if OMP_45_ENABLED
1505     must_wait = must_wait || (thread->th.th_task_team != NULL &&
1506                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
1507 #endif
1508     if (must_wait) {
1509       kmp_flag_32 flag(
1510           RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks),
1511           0U);
1512       while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) {
1513         flag.execute_tasks(thread, gtid, FALSE,
1514                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1515                            __kmp_task_stealing_constraint);
1516       }
1517     }
1518 #if USE_ITT_BUILD
1519     if (itt_sync_obj != NULL)
1520       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1521 #endif /* USE_ITT_BUILD */
1522 
1523     // Debugger:  The taskwait is completed. Location remains, but thread is
1524     // negated.
1525     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1526 
1527 #if OMPT_SUPPORT && OMPT_TRACE
1528     if (ompt_enabled) {
1529       if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1530         ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id,
1531                                                               my_task_id);
1532       }
1533       taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1534     }
1535 #endif
1536     ANNOTATE_HAPPENS_AFTER(taskdata);
1537   }
1538 
1539   KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1540                 "returning TASK_CURRENT_NOT_QUEUED\n",
1541                 gtid, taskdata));
1542 
1543   return TASK_CURRENT_NOT_QUEUED;
1544 }
1545 
1546 // __kmpc_omp_taskyield: switch to a different task
1547 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1548   kmp_taskdata_t *taskdata;
1549   kmp_info_t *thread;
1550   int thread_finished = FALSE;
1551 
1552   KMP_COUNT_BLOCK(OMP_TASKYIELD);
1553   KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1554 
1555   KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1556                 gtid, loc_ref, end_part));
1557 
1558   if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1559     thread = __kmp_threads[gtid];
1560     taskdata = thread->th.th_current_task;
1561 // Should we model this as a task wait or not?
1562 // Debugger: The taskwait is active. Store location and thread encountered the
1563 // taskwait.
1564 #if USE_ITT_BUILD
1565 // Note: These values are used by ITT events as well.
1566 #endif /* USE_ITT_BUILD */
1567     taskdata->td_taskwait_counter += 1;
1568     taskdata->td_taskwait_ident = loc_ref;
1569     taskdata->td_taskwait_thread = gtid + 1;
1570 
1571 #if USE_ITT_BUILD
1572     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1573     if (itt_sync_obj != NULL)
1574       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1575 #endif /* USE_ITT_BUILD */
1576     if (!taskdata->td_flags.team_serial) {
1577       kmp_task_team_t *task_team = thread->th.th_task_team;
1578       if (task_team != NULL) {
1579         if (KMP_TASKING_ENABLED(task_team)) {
1580           __kmp_execute_tasks_32(
1581               thread, gtid, NULL, FALSE,
1582               &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1583               __kmp_task_stealing_constraint);
1584         }
1585       }
1586     }
1587 #if USE_ITT_BUILD
1588     if (itt_sync_obj != NULL)
1589       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1590 #endif /* USE_ITT_BUILD */
1591 
1592     // Debugger:  The taskwait is completed. Location remains, but thread is
1593     // negated.
1594     taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1595   }
1596 
1597   KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1598                 "returning TASK_CURRENT_NOT_QUEUED\n",
1599                 gtid, taskdata));
1600 
1601   return TASK_CURRENT_NOT_QUEUED;
1602 }
1603 
1604 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1605 #if OMP_45_ENABLED
1606 // Task Reduction implementation
1607 
1608 typedef struct kmp_task_red_flags {
1609   unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
1610   unsigned reserved31 : 31;
1611 } kmp_task_red_flags_t;
1612 
1613 // internal structure for reduction data item related info
1614 typedef struct kmp_task_red_data {
1615   void *reduce_shar; // shared reduction item
1616   size_t reduce_size; // size of data item
1617   void *reduce_priv; // thread specific data
1618   void *reduce_pend; // end of private data for comparison op
1619   void *reduce_init; // data initialization routine
1620   void *reduce_fini; // data finalization routine
1621   void *reduce_comb; // data combiner routine
1622   kmp_task_red_flags_t flags; // flags for additional info from compiler
1623 } kmp_task_red_data_t;
1624 
1625 // structure sent us by compiler - one per reduction item
1626 typedef struct kmp_task_red_input {
1627   void *reduce_shar; // shared reduction item
1628   size_t reduce_size; // size of data item
1629   void *reduce_init; // data initialization routine
1630   void *reduce_fini; // data finalization routine
1631   void *reduce_comb; // data combiner routine
1632   kmp_task_red_flags_t flags; // flags for additional info from compiler
1633 } kmp_task_red_input_t;
1634 
1635 /*!
1636 @ingroup TASKING
1637 @param gtid      Global thread ID
1638 @param num       Number of data items to reduce
1639 @param data      Array of data for reduction
1640 @return The taskgroup identifier
1641 
1642 Initialize task reduction for the taskgroup.
1643 */
1644 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
1645   kmp_info_t *thread = __kmp_threads[gtid];
1646   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
1647   kmp_int32 nth = thread->th.th_team_nproc;
1648   kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
1649   kmp_task_red_data_t *arr;
1650 
1651   // check input data just in case
1652   KMP_ASSERT(tg != NULL);
1653   KMP_ASSERT(data != NULL);
1654   KMP_ASSERT(num > 0);
1655   if (nth == 1) {
1656     KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1657                   gtid, tg));
1658     return (void *)tg;
1659   }
1660   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1661                 gtid, tg, num));
1662   arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
1663       thread, num * sizeof(kmp_task_red_data_t));
1664   for (int i = 0; i < num; ++i) {
1665     void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
1666     size_t size = input[i].reduce_size - 1;
1667     // round the size up to cache line per thread-specific item
1668     size += CACHE_LINE - size % CACHE_LINE;
1669     KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1670     arr[i].reduce_shar = input[i].reduce_shar;
1671     arr[i].reduce_size = size;
1672     arr[i].reduce_init = input[i].reduce_init;
1673     arr[i].reduce_fini = input[i].reduce_fini;
1674     arr[i].reduce_comb = input[i].reduce_comb;
1675     arr[i].flags = input[i].flags;
1676     if (!input[i].flags.lazy_priv) {
1677       // allocate cache-line aligned block and fill it with zeros
1678       arr[i].reduce_priv = __kmp_allocate(nth * size);
1679       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
1680       if (f_init != NULL) {
1681         // initialize thread-specific items
1682         for (int j = 0; j < nth; ++j) {
1683           f_init((char *)(arr[i].reduce_priv) + j * size);
1684         }
1685       }
1686     } else {
1687       // only allocate space for pointers now,
1688       // objects will be lazily allocated/initialized once requested
1689       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
1690     }
1691   }
1692   tg->reduce_data = (void *)arr;
1693   tg->reduce_num_data = num;
1694   return (void *)tg;
1695 }
1696 
1697 /*!
1698 @ingroup TASKING
1699 @param gtid    Global thread ID
1700 @param tskgrp  The taskgroup ID (optional)
1701 @param data    Shared location of the item
1702 @return The pointer to per-thread data
1703 
1704 Get thread-specific location of data item
1705 */
1706 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
1707   kmp_info_t *thread = __kmp_threads[gtid];
1708   kmp_int32 nth = thread->th.th_team_nproc;
1709   if (nth == 1)
1710     return data; // nothing to do
1711 
1712   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
1713   if (tg == NULL)
1714     tg = thread->th.th_current_task->td_taskgroup;
1715   KMP_ASSERT(tg != NULL);
1716   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
1717   kmp_int32 num = tg->reduce_num_data;
1718   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1719 
1720   KMP_ASSERT(data != NULL);
1721   while (tg != NULL) {
1722     for (int i = 0; i < num; ++i) {
1723       if (!arr[i].flags.lazy_priv) {
1724         if (data == arr[i].reduce_shar ||
1725             (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1726           return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1727       } else {
1728         // check shared location first
1729         void **p_priv = (void **)(arr[i].reduce_priv);
1730         if (data == arr[i].reduce_shar)
1731           goto found;
1732         // check if we get some thread specific location as parameter
1733         for (int j = 0; j < nth; ++j)
1734           if (data == p_priv[j])
1735             goto found;
1736         continue; // not found, continue search
1737       found:
1738         if (p_priv[tid] == NULL) {
1739           // allocate thread specific object lazily
1740           void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
1741           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1742           if (f_init != NULL) {
1743             f_init(p_priv[tid]);
1744           }
1745         }
1746         return p_priv[tid];
1747       }
1748     }
1749     tg = tg->parent;
1750     arr = (kmp_task_red_data_t *)(tg->reduce_data);
1751     num = tg->reduce_num_data;
1752   }
1753   KMP_ASSERT2(0, "Unknown task reduction item");
1754   return NULL; // ERROR, this line never executed
1755 }
1756 
1757 // Finalize task reduction.
1758 // Called from __kmpc_end_taskgroup()
1759 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
1760   kmp_int32 nth = th->th.th_team_nproc;
1761   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
1762   kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
1763   kmp_int32 num = tg->reduce_num_data;
1764   for (int i = 0; i < num; ++i) {
1765     void *sh_data = arr[i].reduce_shar;
1766     void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
1767     void (*f_comb)(void *, void *) =
1768         (void (*)(void *, void *))(arr[i].reduce_comb);
1769     if (!arr[i].flags.lazy_priv) {
1770       void *pr_data = arr[i].reduce_priv;
1771       size_t size = arr[i].reduce_size;
1772       for (int j = 0; j < nth; ++j) {
1773         void *priv_data = (char *)pr_data + j * size;
1774         f_comb(sh_data, priv_data); // combine results
1775         if (f_fini)
1776           f_fini(priv_data); // finalize if needed
1777       }
1778     } else {
1779       void **pr_data = (void **)(arr[i].reduce_priv);
1780       for (int j = 0; j < nth; ++j) {
1781         if (pr_data[j] != NULL) {
1782           f_comb(sh_data, pr_data[j]); // combine results
1783           if (f_fini)
1784             f_fini(pr_data[j]); // finalize if needed
1785           __kmp_free(pr_data[j]);
1786         }
1787       }
1788     }
1789     __kmp_free(arr[i].reduce_priv);
1790   }
1791   __kmp_thread_free(th, arr);
1792   tg->reduce_data = NULL;
1793   tg->reduce_num_data = 0;
1794 }
1795 #endif
1796 
1797 #if OMP_40_ENABLED
1798 // __kmpc_taskgroup: Start a new taskgroup
1799 void __kmpc_taskgroup(ident_t *loc, int gtid) {
1800   kmp_info_t *thread = __kmp_threads[gtid];
1801   kmp_taskdata_t *taskdata = thread->th.th_current_task;
1802   kmp_taskgroup_t *tg_new =
1803       (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
1804   KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
1805   tg_new->count = 0;
1806   tg_new->cancel_request = cancel_noreq;
1807   tg_new->parent = taskdata->td_taskgroup;
1808 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1809 #if OMP_45_ENABLED
1810   tg_new->reduce_data = NULL;
1811   tg_new->reduce_num_data = 0;
1812 #endif
1813   taskdata->td_taskgroup = tg_new;
1814 }
1815 
1816 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1817 //                       and its descendants are complete
1818 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
1819   kmp_info_t *thread = __kmp_threads[gtid];
1820   kmp_taskdata_t *taskdata = thread->th.th_current_task;
1821   kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1822   int thread_finished = FALSE;
1823 
1824   KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
1825   KMP_DEBUG_ASSERT(taskgroup != NULL);
1826   KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
1827 
1828   if (__kmp_tasking_mode != tskm_immediate_exec) {
1829 #if USE_ITT_BUILD
1830     // For ITT the taskgroup wait is similar to taskwait until we need to
1831     // distinguish them
1832     void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1833     if (itt_sync_obj != NULL)
1834       __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1835 #endif /* USE_ITT_BUILD */
1836 
1837 #if OMP_45_ENABLED
1838     if (!taskdata->td_flags.team_serial ||
1839         (thread->th.th_task_team != NULL &&
1840          thread->th.th_task_team->tt.tt_found_proxy_tasks))
1841 #else
1842     if (!taskdata->td_flags.team_serial)
1843 #endif
1844     {
1845       kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U);
1846       while (TCR_4(taskgroup->count) != 0) {
1847         flag.execute_tasks(thread, gtid, FALSE,
1848                            &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1849                            __kmp_task_stealing_constraint);
1850       }
1851     }
1852 
1853 #if USE_ITT_BUILD
1854     if (itt_sync_obj != NULL)
1855       __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1856 #endif /* USE_ITT_BUILD */
1857   }
1858   KMP_DEBUG_ASSERT(taskgroup->count == 0);
1859 
1860 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1861 #if OMP_45_ENABLED
1862   if (taskgroup->reduce_data != NULL) // need to reduce?
1863     __kmp_task_reduction_fini(thread, taskgroup);
1864 #endif
1865   // Restore parent taskgroup for the current task
1866   taskdata->td_taskgroup = taskgroup->parent;
1867   __kmp_thread_free(thread, taskgroup);
1868 
1869   KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
1870                 gtid, taskdata));
1871   ANNOTATE_HAPPENS_AFTER(taskdata);
1872 }
1873 #endif
1874 
1875 // __kmp_remove_my_task: remove a task from my own deque
1876 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
1877                                         kmp_task_team_t *task_team,
1878                                         kmp_int32 is_constrained) {
1879   kmp_task_t *task;
1880   kmp_taskdata_t *taskdata;
1881   kmp_thread_data_t *thread_data;
1882   kmp_uint32 tail;
1883 
1884   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1885   KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
1886                    NULL); // Caller should check this condition
1887 
1888   thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
1889 
1890   KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1891                 gtid, thread_data->td.td_deque_ntasks,
1892                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1893 
1894   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1895     KA_TRACE(10,
1896              ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
1897               "ntasks=%d head=%u tail=%u\n",
1898               gtid, thread_data->td.td_deque_ntasks,
1899               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1900     return NULL;
1901   }
1902 
1903   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
1904 
1905   if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1906     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1907     KA_TRACE(10,
1908              ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1909               "ntasks=%d head=%u tail=%u\n",
1910               gtid, thread_data->td.td_deque_ntasks,
1911               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1912     return NULL;
1913   }
1914 
1915   tail = (thread_data->td.td_deque_tail - 1) &
1916          TASK_DEQUE_MASK(thread_data->td); // Wrap index.
1917   taskdata = thread_data->td.td_deque[tail];
1918 
1919   if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1920     // we need to check if the candidate obeys task scheduling constraint:
1921     // only child of current task can be scheduled
1922     kmp_taskdata_t *current = thread->th.th_current_task;
1923     kmp_int32 level = current->td_level;
1924     kmp_taskdata_t *parent = taskdata->td_parent;
1925     while (parent != current && parent->td_level > level) {
1926       parent = parent->td_parent; // check generation up to the level of the
1927       // current task
1928       KMP_DEBUG_ASSERT(parent != NULL);
1929     }
1930     if (parent != current) {
1931       // If the tail task is not a child, then no other child can appear in the
1932       // deque.
1933       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1934       KA_TRACE(10,
1935                ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1936                 "ntasks=%d head=%u tail=%u\n",
1937                 gtid, thread_data->td.td_deque_ntasks,
1938                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1939       return NULL;
1940     }
1941   }
1942 
1943   thread_data->td.td_deque_tail = tail;
1944   TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
1945 
1946   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1947 
1948   KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
1949                 "ntasks=%d head=%u tail=%u\n",
1950                 gtid, taskdata, thread_data->td.td_deque_ntasks,
1951                 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1952 
1953   task = KMP_TASKDATA_TO_TASK(taskdata);
1954   return task;
1955 }
1956 
1957 // __kmp_steal_task: remove a task from another thread's deque
1958 // Assume that calling thread has already checked existence of
1959 // task_team thread_data before calling this routine.
1960 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
1961                                     kmp_task_team_t *task_team,
1962                                     volatile kmp_int32 *unfinished_threads,
1963                                     int *thread_finished,
1964                                     kmp_int32 is_constrained) {
1965   kmp_task_t *task;
1966   kmp_taskdata_t *taskdata;
1967   kmp_thread_data_t *victim_td, *threads_data;
1968   kmp_int32 victim_tid;
1969 
1970   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1971 
1972   threads_data = task_team->tt.tt_threads_data;
1973   KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
1974 
1975   victim_tid = victim->th.th_info.ds.ds_tid;
1976   victim_td = &threads_data[victim_tid];
1977 
1978   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
1979                 "task_team=%p ntasks=%d "
1980                 "head=%u tail=%u\n",
1981                 gtid, __kmp_gtid_from_thread(victim), task_team,
1982                 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1983                 victim_td->td.td_deque_tail));
1984 
1985   if ((TCR_4(victim_td->td.td_deque_ntasks) ==
1986        0) || // Caller should not check this condition
1987       (TCR_PTR(victim->th.th_task_team) !=
1988        task_team)) // GEH: why would this happen?
1989   {
1990     KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
1991                   "task_team=%p "
1992                   "ntasks=%d head=%u tail=%u\n",
1993                   gtid, __kmp_gtid_from_thread(victim), task_team,
1994                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1995                   victim_td->td.td_deque_tail));
1996     return NULL;
1997   }
1998 
1999   __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2000 
2001   // Check again after we acquire the lock
2002   if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
2003       (TCR_PTR(victim->th.th_task_team) !=
2004        task_team)) // GEH: why would this happen?
2005   {
2006     __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2007     KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2008                   "task_team=%p "
2009                   "ntasks=%d head=%u tail=%u\n",
2010                   gtid, __kmp_gtid_from_thread(victim), task_team,
2011                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2012                   victim_td->td.td_deque_tail));
2013     return NULL;
2014   }
2015 
2016   KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2017 
2018   taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2019   if (is_constrained) {
2020     // we need to check if the candidate obeys task scheduling constraint:
2021     // only descendant of current task can be scheduled
2022     kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
2023     kmp_int32 level = current->td_level;
2024     kmp_taskdata_t *parent = taskdata->td_parent;
2025     while (parent != current && parent->td_level > level) {
2026       parent = parent->td_parent; // check generation up to the level of the
2027       // current task
2028       KMP_DEBUG_ASSERT(parent != NULL);
2029     }
2030     if (parent != current) {
2031       // If the head task is not a descendant of the current task then do not
2032       // steal it. No other task in victim's deque can be a descendant of the
2033       // current task.
2034       __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2035       KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
2036                     "T#%d: task_team=%p "
2037                     "ntasks=%d head=%u tail=%u\n",
2038                     gtid,
2039                     __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
2040                     task_team, victim_td->td.td_deque_ntasks,
2041                     victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2042       return NULL;
2043     }
2044   }
2045   // Bump head pointer and Wrap.
2046   victim_td->td.td_deque_head =
2047       (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2048   if (*thread_finished) {
2049     // We need to un-mark this victim as a finished victim.  This must be done
2050     // before releasing the lock, or else other threads (starting with the
2051     // master victim) might be prematurely released from the barrier!!!
2052     kmp_int32 count;
2053 
2054     count = KMP_TEST_THEN_INC32(unfinished_threads);
2055 
2056     KA_TRACE(
2057         20,
2058         ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2059          gtid, count + 1, task_team));
2060 
2061     *thread_finished = FALSE;
2062   }
2063   TCW_4(victim_td->td.td_deque_ntasks,
2064         TCR_4(victim_td->td.td_deque_ntasks) - 1);
2065 
2066   __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2067 
2068   KMP_COUNT_BLOCK(TASK_stolen);
2069   KA_TRACE(
2070       10,
2071       ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
2072        "ntasks=%d head=%u tail=%u\n",
2073        gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
2074        victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2075        victim_td->td.td_deque_tail));
2076 
2077   task = KMP_TASKDATA_TO_TASK(taskdata);
2078   return task;
2079 }
2080 
2081 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2082 // condition is statisfied (return true) or there are none left (return false).
2083 //
2084 // final_spin is TRUE if this is the spin at the release barrier.
2085 // thread_finished indicates whether the thread is finished executing all
2086 // the tasks it has on its deque, and is at the release barrier.
2087 // spinner is the location on which to spin.
2088 // spinner == NULL means only execute a single task and return.
2089 // checker is the value to check to terminate the spin.
2090 template <class C>
2091 static inline int __kmp_execute_tasks_template(
2092     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2093     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2094     kmp_int32 is_constrained) {
2095   kmp_task_team_t *task_team = thread->th.th_task_team;
2096   kmp_thread_data_t *threads_data;
2097   kmp_task_t *task;
2098   kmp_info_t *other_thread;
2099   kmp_taskdata_t *current_task = thread->th.th_current_task;
2100   volatile kmp_int32 *unfinished_threads;
2101   kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
2102                       tid = thread->th.th_info.ds.ds_tid;
2103 
2104   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2105   KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2106 
2107   if (task_team == NULL)
2108     return FALSE;
2109 
2110   KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2111                 "*thread_finished=%d\n",
2112                 gtid, final_spin, *thread_finished));
2113 
2114   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2115   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2116   KMP_DEBUG_ASSERT(threads_data != NULL);
2117 
2118   nthreads = task_team->tt.tt_nproc;
2119   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2120 #if OMP_45_ENABLED
2121   KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2122 #else
2123   KMP_DEBUG_ASSERT(nthreads > 1);
2124 #endif
2125   KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0);
2126 
2127   while (1) { // Outer loop keeps trying to find tasks in case of single thread
2128     // getting tasks from target constructs
2129     while (1) { // Inner loop to find a task and execute it
2130       task = NULL;
2131       if (use_own_tasks) { // check on own queue first
2132         task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2133       }
2134       if ((task == NULL) && (nthreads > 1)) { // Steal a task
2135         int asleep = 1;
2136         use_own_tasks = 0;
2137         // Try to steal from the last place I stole from successfully.
2138         if (victim == -2) { // haven't stolen anything yet
2139           victim = threads_data[tid].td.td_deque_last_stolen;
2140           if (victim !=
2141               -1) // if we have a last stolen from victim, get the thread
2142             other_thread = threads_data[victim].td.td_thr;
2143         }
2144         if (victim != -1) { // found last victim
2145           asleep = 0;
2146         } else if (!new_victim) { // no recent steals and we haven't already
2147           // used a new victim; select a random thread
2148           do { // Find a different thread to steal work from.
2149             // Pick a random thread. Initial plan was to cycle through all the
2150             // threads, and only return if we tried to steal from every thread,
2151             // and failed.  Arch says that's not such a great idea.
2152             victim = __kmp_get_random(thread) % (nthreads - 1);
2153             if (victim >= tid) {
2154               ++victim; // Adjusts random distribution to exclude self
2155             }
2156             // Found a potential victim
2157             other_thread = threads_data[victim].td.td_thr;
2158             // There is a slight chance that __kmp_enable_tasking() did not wake
2159             // up all threads waiting at the barrier.  If victim is sleeping,
2160             // then wake it up. Since we were going to pay the cache miss
2161             // penalty for referencing another thread's kmp_info_t struct
2162             // anyway,
2163             // the check shouldn't cost too much performance at this point. In
2164             // extra barrier mode, tasks do not sleep at the separate tasking
2165             // barrier, so this isn't a problem.
2166             asleep = 0;
2167             if ((__kmp_tasking_mode == tskm_task_teams) &&
2168                 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2169                 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2170                  NULL)) {
2171               asleep = 1;
2172               __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2173                                         other_thread->th.th_sleep_loc);
2174               // A sleeping thread should not have any tasks on it's queue.
2175               // There is a slight possibility that it resumes, steals a task
2176               // from another thread, which spawns more tasks, all in the time
2177               // that it takes this thread to check => don't write an assertion
2178               // that the victim's queue is empty.  Try stealing from a
2179               // different thread.
2180             }
2181           } while (asleep);
2182         }
2183 
2184         if (!asleep) {
2185           // We have a victim to try to steal from
2186           task = __kmp_steal_task(other_thread, gtid, task_team,
2187                                   unfinished_threads, thread_finished,
2188                                   is_constrained);
2189         }
2190         if (task != NULL) { // set last stolen to victim
2191           if (threads_data[tid].td.td_deque_last_stolen != victim) {
2192             threads_data[tid].td.td_deque_last_stolen = victim;
2193             // The pre-refactored code did not try more than 1 successful new
2194             // vicitm, unless the last one generated more local tasks;
2195             // new_victim keeps track of this
2196             new_victim = 1;
2197           }
2198         } else { // No tasks found; unset last_stolen
2199           KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2200           victim = -2; // no successful victim found
2201         }
2202       }
2203 
2204       if (task == NULL) // break out of tasking loop
2205         break;
2206 
2207 // Found a task; execute it
2208 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2209       if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2210         if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2211           // get the object reliably
2212           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2213         }
2214         __kmp_itt_task_starting(itt_sync_obj);
2215       }
2216 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2217       __kmp_invoke_task(gtid, task, current_task);
2218 #if USE_ITT_BUILD
2219       if (itt_sync_obj != NULL)
2220         __kmp_itt_task_finished(itt_sync_obj);
2221 #endif /* USE_ITT_BUILD */
2222       // If this thread is only partway through the barrier and the condition is
2223       // met, then return now, so that the barrier gather/release pattern can
2224       // proceed. If this thread is in the last spin loop in the barrier,
2225       // waiting to be released, we know that the termination condition will not
2226       // be satisified, so don't waste any cycles checking it.
2227       if (flag == NULL || (!final_spin && flag->done_check())) {
2228         KA_TRACE(
2229             15,
2230             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2231              gtid));
2232         return TRUE;
2233       }
2234       if (thread->th.th_task_team == NULL) {
2235         break;
2236       }
2237       // Yield before executing next task
2238       KMP_YIELD(__kmp_library == library_throughput);
2239       // If execution of a stolen task results in more tasks being placed on our
2240       // run queue, reset use_own_tasks
2241       if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2242         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2243                       "other tasks, restart\n",
2244                       gtid));
2245         use_own_tasks = 1;
2246         new_victim = 0;
2247       }
2248     }
2249 
2250 // The task source has been exhausted. If in final spin loop of barrier, check
2251 // if termination condition is satisfied.
2252 #if OMP_45_ENABLED
2253     // The work queue may be empty but there might be proxy tasks still
2254     // executing
2255     if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2256 #else
2257     if (final_spin)
2258 #endif
2259     {
2260       // First, decrement the #unfinished threads, if that has not already been
2261       // done.  This decrement might be to the spin location, and result in the
2262       // termination condition being satisfied.
2263       if (!*thread_finished) {
2264         kmp_int32 count;
2265 
2266         count = KMP_TEST_THEN_DEC32(unfinished_threads) - 1;
2267         KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2268                       "unfinished_threads to %d task_team=%p\n",
2269                       gtid, count, task_team));
2270         *thread_finished = TRUE;
2271       }
2272 
2273       // It is now unsafe to reference thread->th.th_team !!!
2274       // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2275       // thread to pass through the barrier, where it might reset each thread's
2276       // th.th_team field for the next parallel region. If we can steal more
2277       // work, we know that this has not happened yet.
2278       if (flag != NULL && flag->done_check()) {
2279         KA_TRACE(
2280             15,
2281             ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2282              gtid));
2283         return TRUE;
2284       }
2285     }
2286 
2287     // If this thread's task team is NULL, master has recognized that there are
2288     // no more tasks; bail out
2289     if (thread->th.th_task_team == NULL) {
2290       KA_TRACE(15,
2291                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2292       return FALSE;
2293     }
2294 
2295 #if OMP_45_ENABLED
2296     // We could be getting tasks from target constructs; if this is the only
2297     // thread, keep trying to execute tasks from own queue
2298     if (nthreads == 1)
2299       use_own_tasks = 1;
2300     else
2301 #endif
2302     {
2303       KA_TRACE(15,
2304                ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2305       return FALSE;
2306     }
2307   }
2308 }
2309 
2310 int __kmp_execute_tasks_32(
2311     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2312     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2313     kmp_int32 is_constrained) {
2314   return __kmp_execute_tasks_template(
2315       thread, gtid, flag, final_spin,
2316       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2317 }
2318 
2319 int __kmp_execute_tasks_64(
2320     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2321     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2322     kmp_int32 is_constrained) {
2323   return __kmp_execute_tasks_template(
2324       thread, gtid, flag, final_spin,
2325       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2326 }
2327 
2328 int __kmp_execute_tasks_oncore(
2329     kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2330     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2331     kmp_int32 is_constrained) {
2332   return __kmp_execute_tasks_template(
2333       thread, gtid, flag, final_spin,
2334       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2335 }
2336 
2337 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2338 // next barrier so they can assist in executing enqueued tasks.
2339 // First thread in allocates the task team atomically.
2340 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
2341                                  kmp_info_t *this_thr) {
2342   kmp_thread_data_t *threads_data;
2343   int nthreads, i, is_init_thread;
2344 
2345   KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
2346                 __kmp_gtid_from_thread(this_thr)));
2347 
2348   KMP_DEBUG_ASSERT(task_team != NULL);
2349   KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2350 
2351   nthreads = task_team->tt.tt_nproc;
2352   KMP_DEBUG_ASSERT(nthreads > 0);
2353   KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2354 
2355   // Allocate or increase the size of threads_data if necessary
2356   is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
2357 
2358   if (!is_init_thread) {
2359     // Some other thread already set up the array.
2360     KA_TRACE(
2361         20,
2362         ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2363          __kmp_gtid_from_thread(this_thr)));
2364     return;
2365   }
2366   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2367   KMP_DEBUG_ASSERT(threads_data != NULL);
2368 
2369   if ((__kmp_tasking_mode == tskm_task_teams) &&
2370       (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
2371     // Release any threads sleeping at the barrier, so that they can steal
2372     // tasks and execute them.  In extra barrier mode, tasks do not sleep
2373     // at the separate tasking barrier, so this isn't a problem.
2374     for (i = 0; i < nthreads; i++) {
2375       volatile void *sleep_loc;
2376       kmp_info_t *thread = threads_data[i].td.td_thr;
2377 
2378       if (i == this_thr->th.th_info.ds.ds_tid) {
2379         continue;
2380       }
2381       // Since we haven't locked the thread's suspend mutex lock at this
2382       // point, there is a small window where a thread might be putting
2383       // itself to sleep, but hasn't set the th_sleep_loc field yet.
2384       // To work around this, __kmp_execute_tasks_template() periodically checks
2385       // see if other threads are sleeping (using the same random mechanism that
2386       // is used for task stealing) and awakens them if they are.
2387       if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2388           NULL) {
2389         KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2390                       __kmp_gtid_from_thread(this_thr),
2391                       __kmp_gtid_from_thread(thread)));
2392         __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2393       } else {
2394         KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2395                       __kmp_gtid_from_thread(this_thr),
2396                       __kmp_gtid_from_thread(thread)));
2397       }
2398     }
2399   }
2400 
2401   KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
2402                 __kmp_gtid_from_thread(this_thr)));
2403 }
2404 
2405 /* // TODO: Check the comment consistency
2406  * Utility routines for "task teams".  A task team (kmp_task_t) is kind of
2407  * like a shadow of the kmp_team_t data struct, with a different lifetime.
2408  * After a child * thread checks into a barrier and calls __kmp_release() from
2409  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2410  * longer assume that the kmp_team_t structure is intact (at any moment, the
2411  * master thread may exit the barrier code and free the team data structure,
2412  * and return the threads to the thread pool).
2413  *
2414  * This does not work with the the tasking code, as the thread is still
2415  * expected to participate in the execution of any tasks that may have been
2416  * spawned my a member of the team, and the thread still needs access to all
2417  * to each thread in the team, so that it can steal work from it.
2418  *
2419  * Enter the existence of the kmp_task_team_t struct.  It employs a reference
2420  * counting mechanims, and is allocated by the master thread before calling
2421  * __kmp_<barrier_kind>_release, and then is release by the last thread to
2422  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
2423  * of the kmp_task_team_t structs for consecutive barriers can overlap
2424  * (and will, unless the master thread is the last thread to exit the barrier
2425  * release phase, which is not typical).
2426  *
2427  * The existence of such a struct is useful outside the context of tasking,
2428  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2429  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2430  * libraries.
2431  *
2432  * We currently use the existence of the threads array as an indicator that
2433  * tasks were spawned since the last barrier.  If the structure is to be
2434  * useful outside the context of tasking, then this will have to change, but
2435  * not settting the field minimizes the performance impact of tasking on
2436  * barriers, when no explicit tasks were spawned (pushed, actually).
2437  */
2438 
2439 static kmp_task_team_t *__kmp_free_task_teams =
2440     NULL; // Free list for task_team data structures
2441 // Lock for task team data structures
2442 static kmp_bootstrap_lock_t __kmp_task_team_lock =
2443     KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2444 
2445 // __kmp_alloc_task_deque:
2446 // Allocates a task deque for a particular thread, and initialize the necessary
2447 // data structures relating to the deque.  This only happens once per thread
2448 // per task team since task teams are recycled. No lock is needed during
2449 // allocation since each thread allocates its own deque.
2450 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2451                                    kmp_thread_data_t *thread_data) {
2452   __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2453   KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2454 
2455   // Initialize last stolen task field to "none"
2456   thread_data->td.td_deque_last_stolen = -1;
2457 
2458   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2459   KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2460   KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2461 
2462   KE_TRACE(
2463       10,
2464       ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2465        __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2466   // Allocate space for task deque, and zero the deque
2467   // Cannot use __kmp_thread_calloc() because threads not around for
2468   // kmp_reap_task_team( ).
2469   thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2470       INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2471   thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2472 }
2473 
2474 // __kmp_realloc_task_deque:
2475 // Re-allocates a task deque for a particular thread, copies the content from
2476 // the old deque and adjusts the necessary data structures relating to the
2477 // deque. This operation must be done with a the deque_lock being held
2478 static void __kmp_realloc_task_deque(kmp_info_t *thread,
2479                                      kmp_thread_data_t *thread_data) {
2480   kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2481   kmp_int32 new_size = 2 * size;
2482 
2483   KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
2484                 "%d] for thread_data %p\n",
2485                 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
2486 
2487   kmp_taskdata_t **new_deque =
2488       (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
2489 
2490   int i, j;
2491   for (i = thread_data->td.td_deque_head, j = 0; j < size;
2492        i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
2493     new_deque[j] = thread_data->td.td_deque[i];
2494 
2495   __kmp_free(thread_data->td.td_deque);
2496 
2497   thread_data->td.td_deque_head = 0;
2498   thread_data->td.td_deque_tail = size;
2499   thread_data->td.td_deque = new_deque;
2500   thread_data->td.td_deque_size = new_size;
2501 }
2502 
2503 // __kmp_free_task_deque:
2504 // Deallocates a task deque for a particular thread. Happens at library
2505 // deallocation so don't need to reset all thread data fields.
2506 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
2507   if (thread_data->td.td_deque != NULL) {
2508     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2509     TCW_4(thread_data->td.td_deque_ntasks, 0);
2510     __kmp_free(thread_data->td.td_deque);
2511     thread_data->td.td_deque = NULL;
2512     __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2513   }
2514 
2515 #ifdef BUILD_TIED_TASK_STACK
2516   // GEH: Figure out what to do here for td_susp_tied_tasks
2517   if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
2518     __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
2519   }
2520 #endif // BUILD_TIED_TASK_STACK
2521 }
2522 
2523 // __kmp_realloc_task_threads_data:
2524 // Allocates a threads_data array for a task team, either by allocating an
2525 // initial array or enlarging an existing array.  Only the first thread to get
2526 // the lock allocs or enlarges the array and re-initializes the array eleemnts.
2527 // That thread returns "TRUE", the rest return "FALSE".
2528 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2529 // The current size is given by task_team -> tt.tt_max_threads.
2530 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2531                                            kmp_task_team_t *task_team) {
2532   kmp_thread_data_t **threads_data_p;
2533   kmp_int32 nthreads, maxthreads;
2534   int is_init_thread = FALSE;
2535 
2536   if (TCR_4(task_team->tt.tt_found_tasks)) {
2537     // Already reallocated and initialized.
2538     return FALSE;
2539   }
2540 
2541   threads_data_p = &task_team->tt.tt_threads_data;
2542   nthreads = task_team->tt.tt_nproc;
2543   maxthreads = task_team->tt.tt_max_threads;
2544 
2545   // All threads must lock when they encounter the first task of the implicit
2546   // task region to make sure threads_data fields are (re)initialized before
2547   // used.
2548   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2549 
2550   if (!TCR_4(task_team->tt.tt_found_tasks)) {
2551     // first thread to enable tasking
2552     kmp_team_t *team = thread->th.th_team;
2553     int i;
2554 
2555     is_init_thread = TRUE;
2556     if (maxthreads < nthreads) {
2557 
2558       if (*threads_data_p != NULL) {
2559         kmp_thread_data_t *old_data = *threads_data_p;
2560         kmp_thread_data_t *new_data = NULL;
2561 
2562         KE_TRACE(
2563             10,
2564             ("__kmp_realloc_task_threads_data: T#%d reallocating "
2565              "threads data for task_team %p, new_size = %d, old_size = %d\n",
2566              __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
2567         // Reallocate threads_data to have more elements than current array
2568         // Cannot use __kmp_thread_realloc() because threads not around for
2569         // kmp_reap_task_team( ).  Note all new array entries are initialized
2570         // to zero by __kmp_allocate().
2571         new_data = (kmp_thread_data_t *)__kmp_allocate(
2572             nthreads * sizeof(kmp_thread_data_t));
2573         // copy old data to new data
2574         KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
2575                      (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
2576 
2577 #ifdef BUILD_TIED_TASK_STACK
2578         // GEH: Figure out if this is the right thing to do
2579         for (i = maxthreads; i < nthreads; i++) {
2580           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2581           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2582         }
2583 #endif // BUILD_TIED_TASK_STACK
2584         // Install the new data and free the old data
2585         (*threads_data_p) = new_data;
2586         __kmp_free(old_data);
2587       } else {
2588         KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
2589                       "threads data for task_team %p, size = %d\n",
2590                       __kmp_gtid_from_thread(thread), task_team, nthreads));
2591         // Make the initial allocate for threads_data array, and zero entries
2592         // Cannot use __kmp_thread_calloc() because threads not around for
2593         // kmp_reap_task_team( ).
2594         ANNOTATE_IGNORE_WRITES_BEGIN();
2595         *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
2596             nthreads * sizeof(kmp_thread_data_t));
2597         ANNOTATE_IGNORE_WRITES_END();
2598 #ifdef BUILD_TIED_TASK_STACK
2599         // GEH: Figure out if this is the right thing to do
2600         for (i = 0; i < nthreads; i++) {
2601           kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2602           __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2603         }
2604 #endif // BUILD_TIED_TASK_STACK
2605       }
2606       task_team->tt.tt_max_threads = nthreads;
2607     } else {
2608       // If array has (more than) enough elements, go ahead and use it
2609       KMP_DEBUG_ASSERT(*threads_data_p != NULL);
2610     }
2611 
2612     // initialize threads_data pointers back to thread_info structures
2613     for (i = 0; i < nthreads; i++) {
2614       kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2615       thread_data->td.td_thr = team->t.t_threads[i];
2616 
2617       if (thread_data->td.td_deque_last_stolen >= nthreads) {
2618         // The last stolen field survives across teams / barrier, and the number
2619         // of threads may have changed.  It's possible (likely?) that a new
2620         // parallel region will exhibit the same behavior as previous region.
2621         thread_data->td.td_deque_last_stolen = -1;
2622       }
2623     }
2624 
2625     KMP_MB();
2626     TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
2627   }
2628 
2629   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2630   return is_init_thread;
2631 }
2632 
2633 // __kmp_free_task_threads_data:
2634 // Deallocates a threads_data array for a task team, including any attached
2635 // tasking deques.  Only occurs at library shutdown.
2636 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
2637   __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2638   if (task_team->tt.tt_threads_data != NULL) {
2639     int i;
2640     for (i = 0; i < task_team->tt.tt_max_threads; i++) {
2641       __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
2642     }
2643     __kmp_free(task_team->tt.tt_threads_data);
2644     task_team->tt.tt_threads_data = NULL;
2645   }
2646   __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2647 }
2648 
2649 // __kmp_allocate_task_team:
2650 // Allocates a task team associated with a specific team, taking it from
2651 // the global task team free list if possible.  Also initializes data
2652 // structures.
2653 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
2654                                                  kmp_team_t *team) {
2655   kmp_task_team_t *task_team = NULL;
2656   int nthreads;
2657 
2658   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
2659                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
2660 
2661   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2662     // Take a task team from the task team pool
2663     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2664     if (__kmp_free_task_teams != NULL) {
2665       task_team = __kmp_free_task_teams;
2666       TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
2667       task_team->tt.tt_next = NULL;
2668     }
2669     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2670   }
2671 
2672   if (task_team == NULL) {
2673     KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
2674                   "task team for team %p\n",
2675                   __kmp_gtid_from_thread(thread), team));
2676     // Allocate a new task team if one is not available.
2677     // Cannot use __kmp_thread_malloc() because threads not around for
2678     // kmp_reap_task_team( ).
2679     task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
2680     __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
2681     // AC: __kmp_allocate zeroes returned memory
2682     // task_team -> tt.tt_threads_data = NULL;
2683     // task_team -> tt.tt_max_threads = 0;
2684     // task_team -> tt.tt_next = NULL;
2685   }
2686 
2687   TCW_4(task_team->tt.tt_found_tasks, FALSE);
2688 #if OMP_45_ENABLED
2689   TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2690 #endif
2691   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
2692 
2693   TCW_4(task_team->tt.tt_unfinished_threads, nthreads);
2694   TCW_4(task_team->tt.tt_active, TRUE);
2695 
2696   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
2697                 "unfinished_threads init'd to %d\n",
2698                 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
2699                 task_team->tt.tt_unfinished_threads));
2700   return task_team;
2701 }
2702 
2703 // __kmp_free_task_team:
2704 // Frees the task team associated with a specific thread, and adds it
2705 // to the global task team free list.
2706 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
2707   KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
2708                 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
2709 
2710   // Put task team back on free list
2711   __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2712 
2713   KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
2714   task_team->tt.tt_next = __kmp_free_task_teams;
2715   TCW_PTR(__kmp_free_task_teams, task_team);
2716 
2717   __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2718 }
2719 
2720 // __kmp_reap_task_teams:
2721 // Free all the task teams on the task team free list.
2722 // Should only be done during library shutdown.
2723 // Cannot do anything that needs a thread structure or gtid since they are
2724 // already gone.
2725 void __kmp_reap_task_teams(void) {
2726   kmp_task_team_t *task_team;
2727 
2728   if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2729     // Free all task_teams on the free list
2730     __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2731     while ((task_team = __kmp_free_task_teams) != NULL) {
2732       __kmp_free_task_teams = task_team->tt.tt_next;
2733       task_team->tt.tt_next = NULL;
2734 
2735       // Free threads_data if necessary
2736       if (task_team->tt.tt_threads_data != NULL) {
2737         __kmp_free_task_threads_data(task_team);
2738       }
2739       __kmp_free(task_team);
2740     }
2741     __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2742   }
2743 }
2744 
2745 // __kmp_wait_to_unref_task_teams:
2746 // Some threads could still be in the fork barrier release code, possibly
2747 // trying to steal tasks.  Wait for each thread to unreference its task team.
2748 void __kmp_wait_to_unref_task_teams(void) {
2749   kmp_info_t *thread;
2750   kmp_uint32 spins;
2751   int done;
2752 
2753   KMP_INIT_YIELD(spins);
2754 
2755   for (;;) {
2756     done = TRUE;
2757 
2758     // TODO: GEH - this may be is wrong because some sync would be necessary
2759     // in case threads are added to the pool during the traversal. Need to
2760     // verify that lock for thread pool is held when calling this routine.
2761     for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
2762          thread = thread->th.th_next_pool) {
2763 #if KMP_OS_WINDOWS
2764       DWORD exit_val;
2765 #endif
2766       if (TCR_PTR(thread->th.th_task_team) == NULL) {
2767         KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2768                       __kmp_gtid_from_thread(thread)));
2769         continue;
2770       }
2771 #if KMP_OS_WINDOWS
2772       // TODO: GEH - add this check for Linux* OS / OS X* as well?
2773       if (!__kmp_is_thread_alive(thread, &exit_val)) {
2774         thread->th.th_task_team = NULL;
2775         continue;
2776       }
2777 #endif
2778 
2779       done = FALSE; // Because th_task_team pointer is not NULL for this thread
2780 
2781       KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
2782                     "unreference task_team\n",
2783                     __kmp_gtid_from_thread(thread)));
2784 
2785       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2786         volatile void *sleep_loc;
2787         // If the thread is sleeping, awaken it.
2788         if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2789             NULL) {
2790           KA_TRACE(
2791               10,
2792               ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2793                __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
2794           __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2795         }
2796       }
2797     }
2798     if (done) {
2799       break;
2800     }
2801 
2802     // If we are oversubscribed, or have waited a bit (and library mode is
2803     // throughput), yield. Pause is in the following code.
2804     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2805     KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
2806   }
2807 }
2808 
2809 // __kmp_task_team_setup:  Create a task_team for the current team, but use
2810 // an already created, unused one if it already exists.
2811 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
2812   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2813 
2814   // If this task_team hasn't been created yet, allocate it. It will be used in
2815   // the region after the next.
2816   // If it exists, it is the current task team and shouldn't be touched yet as
2817   // it may still be in use.
2818   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
2819       (always || team->t.t_nproc > 1)) {
2820     team->t.t_task_team[this_thr->th.th_task_state] =
2821         __kmp_allocate_task_team(this_thr, team);
2822     KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
2823                   "for team %d at parity=%d\n",
2824                   __kmp_gtid_from_thread(this_thr),
2825                   team->t.t_task_team[this_thr->th.th_task_state],
2826                   ((team != NULL) ? team->t.t_id : -1),
2827                   this_thr->th.th_task_state));
2828   }
2829 
2830   // After threads exit the release, they will call sync, and then point to this
2831   // other task_team; make sure it is allocated and properly initialized. As
2832   // threads spin in the barrier release phase, they will continue to use the
2833   // previous task_team struct(above), until they receive the signal to stop
2834   // checking for tasks (they can't safely reference the kmp_team_t struct,
2835   // which could be reallocated by the master thread). No task teams are formed
2836   // for serialized teams.
2837   if (team->t.t_nproc > 1) {
2838     int other_team = 1 - this_thr->th.th_task_state;
2839     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
2840       team->t.t_task_team[other_team] =
2841           __kmp_allocate_task_team(this_thr, team);
2842       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
2843                     "task_team %p for team %d at parity=%d\n",
2844                     __kmp_gtid_from_thread(this_thr),
2845                     team->t.t_task_team[other_team],
2846                     ((team != NULL) ? team->t.t_id : -1), other_team));
2847     } else { // Leave the old task team struct in place for the upcoming region;
2848       // adjust as needed
2849       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2850       if (!task_team->tt.tt_active ||
2851           team->t.t_nproc != task_team->tt.tt_nproc) {
2852         TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2853         TCW_4(task_team->tt.tt_found_tasks, FALSE);
2854 #if OMP_45_ENABLED
2855         TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2856 #endif
2857         TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
2858         TCW_4(task_team->tt.tt_active, TRUE);
2859       }
2860       // if team size has changed, the first thread to enable tasking will
2861       // realloc threads_data if necessary
2862       KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
2863                     "%p for team %d at parity=%d\n",
2864                     __kmp_gtid_from_thread(this_thr),
2865                     team->t.t_task_team[other_team],
2866                     ((team != NULL) ? team->t.t_id : -1), other_team));
2867     }
2868   }
2869 }
2870 
2871 // __kmp_task_team_sync: Propagation of task team data from team to threads
2872 // which happens just after the release phase of a team barrier.  This may be
2873 // called by any thread, but only for teams with # threads > 1.
2874 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
2875   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2876 
2877   // Toggle the th_task_state field, to switch which task_team this thread
2878   // refers to
2879   this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2880   // It is now safe to propagate the task team pointer from the team struct to
2881   // the current thread.
2882   TCW_PTR(this_thr->th.th_task_team,
2883           team->t.t_task_team[this_thr->th.th_task_state]);
2884   KA_TRACE(20,
2885            ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
2886             "%p from Team #%d (parity=%d)\n",
2887             __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
2888             ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2889 }
2890 
2891 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
2892 // barrier gather phase. Only called by master thread if #threads in team > 1 or
2893 // if proxy tasks were created.
2894 //
2895 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
2896 // by passing in 0 optionally as the last argument. When wait is zero, master
2897 // thread does not wait for unfinished_threads to reach 0.
2898 void __kmp_task_team_wait(
2899     kmp_info_t *this_thr,
2900     kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
2901   kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2902 
2903   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2904   KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
2905 
2906   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
2907     if (wait) {
2908       KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
2909                     "(for unfinished_threads to reach 0) on task_team = %p\n",
2910                     __kmp_gtid_from_thread(this_thr), task_team));
2911       // Worker threads may have dropped through to release phase, but could
2912       // still be executing tasks. Wait here for tasks to complete. To avoid
2913       // memory contention, only master thread checks termination condition.
2914       kmp_flag_32 flag(
2915           RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads),
2916           0U);
2917       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2918     }
2919     // Deactivate the old task team, so that the worker threads will stop
2920     // referencing it while spinning.
2921     KA_TRACE(
2922         20,
2923         ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2924          "setting active to false, setting local and team's pointer to NULL\n",
2925          __kmp_gtid_from_thread(this_thr), task_team));
2926 #if OMP_45_ENABLED
2927     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
2928                      task_team->tt.tt_found_proxy_tasks == TRUE);
2929     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2930 #else
2931     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
2932 #endif
2933     TCW_SYNC_4(task_team->tt.tt_active, FALSE);
2934     KMP_MB();
2935 
2936     TCW_PTR(this_thr->th.th_task_team, NULL);
2937   }
2938 }
2939 
2940 // __kmp_tasking_barrier:
2941 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier.
2942 // Internal function to execute all tasks prior to a regular barrier or a join
2943 // barrier. It is a full barrier itself, which unfortunately turns regular
2944 // barriers into double barriers and join barriers into 1 1/2 barriers.
2945 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
2946   volatile kmp_uint32 *spin = RCAST(
2947       volatile kmp_uint32 *,
2948       &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
2949   int flag = FALSE;
2950   KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
2951 
2952 #if USE_ITT_BUILD
2953   KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
2954 #endif /* USE_ITT_BUILD */
2955   kmp_flag_32 spin_flag(spin, 0U);
2956   while (!spin_flag.execute_tasks(thread, gtid, TRUE,
2957                                   &flag USE_ITT_BUILD_ARG(NULL), 0)) {
2958 #if USE_ITT_BUILD
2959     // TODO: What about itt_sync_obj??
2960     KMP_FSYNC_SPIN_PREPARE(CCAST(kmp_uint32 *, spin));
2961 #endif /* USE_ITT_BUILD */
2962 
2963     if (TCR_4(__kmp_global.g.g_done)) {
2964       if (__kmp_global.g.g_abort)
2965         __kmp_abort_thread();
2966       break;
2967     }
2968     KMP_YIELD(TRUE); // GH: We always yield here
2969   }
2970 #if USE_ITT_BUILD
2971   KMP_FSYNC_SPIN_ACQUIRED(CCAST(kmp_uint32 *, spin));
2972 #endif /* USE_ITT_BUILD */
2973 }
2974 
2975 #if OMP_45_ENABLED
2976 
2977 // __kmp_give_task puts a task into a given thread queue if:
2978 //  - the queue for that thread was created
2979 //  - there's space in that queue
2980 // Because of this, __kmp_push_task needs to check if there's space after
2981 // getting the lock
2982 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
2983                             kmp_int32 pass) {
2984   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
2985   kmp_task_team_t *task_team = taskdata->td_task_team;
2986 
2987   KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
2988                 taskdata, tid));
2989 
2990   // If task_team is NULL something went really bad...
2991   KMP_DEBUG_ASSERT(task_team != NULL);
2992 
2993   bool result = false;
2994   kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
2995 
2996   if (thread_data->td.td_deque == NULL) {
2997     // There's no queue in this thread, go find another one
2998     // We're guaranteed that at least one thread has a queue
2999     KA_TRACE(30,
3000              ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3001               tid, taskdata));
3002     return result;
3003   }
3004 
3005   if (TCR_4(thread_data->td.td_deque_ntasks) >=
3006       TASK_DEQUE_SIZE(thread_data->td)) {
3007     KA_TRACE(
3008         30,
3009         ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3010          taskdata, tid));
3011 
3012     // if this deque is bigger than the pass ratio give a chance to another
3013     // thread
3014     if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3015       return result;
3016 
3017     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3018     __kmp_realloc_task_deque(thread, thread_data);
3019 
3020   } else {
3021 
3022     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3023 
3024     if (TCR_4(thread_data->td.td_deque_ntasks) >=
3025         TASK_DEQUE_SIZE(thread_data->td)) {
3026       KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3027                     "thread %d.\n",
3028                     taskdata, tid));
3029 
3030       // if this deque is bigger than the pass ratio give a chance to another
3031       // thread
3032       if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3033         goto release_and_exit;
3034 
3035       __kmp_realloc_task_deque(thread, thread_data);
3036     }
3037   }
3038 
3039   // lock is held here, and there is space in the deque
3040 
3041   thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3042   // Wrap index.
3043   thread_data->td.td_deque_tail =
3044       (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3045   TCW_4(thread_data->td.td_deque_ntasks,
3046         TCR_4(thread_data->td.td_deque_ntasks) + 1);
3047 
3048   result = true;
3049   KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3050                 taskdata, tid));
3051 
3052 release_and_exit:
3053   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3054 
3055   return result;
3056 }
3057 
3058 /* The finish of the proxy tasks is divided in two pieces:
3059     - the top half is the one that can be done from a thread outside the team
3060     - the bottom half must be run from a them within the team
3061 
3062    In order to run the bottom half the task gets queued back into one of the
3063    threads of the team. Once the td_incomplete_child_task counter of the parent
3064    is decremented the threads can leave the barriers. So, the bottom half needs
3065    to be queued before the counter is decremented. The top half is therefore
3066    divided in two parts:
3067     - things that can be run before queuing the bottom half
3068     - things that must be run after queuing the bottom half
3069 
3070    This creates a second race as the bottom half can free the task before the
3071    second top half is executed. To avoid this we use the
3072    td_incomplete_child_task of the proxy task to synchronize the top and bottom
3073    half. */
3074 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3075   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3076   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3077   KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3078   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3079 
3080   taskdata->td_flags.complete = 1; // mark the task as completed
3081 
3082   if (taskdata->td_taskgroup)
3083     KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count);
3084 
3085   // Create an imaginary children for this task so the bottom half cannot
3086   // release the task before we have completed the second top half
3087   TCI_4(taskdata->td_incomplete_child_tasks);
3088 }
3089 
3090 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3091   kmp_int32 children = 0;
3092 
3093   // Predecrement simulated by "- 1" calculation
3094   children =
3095       KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3096   KMP_DEBUG_ASSERT(children >= 0);
3097 
3098   // Remove the imaginary children
3099   TCD_4(taskdata->td_incomplete_child_tasks);
3100 }
3101 
3102 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3103   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3104   kmp_info_t *thread = __kmp_threads[gtid];
3105 
3106   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3107   KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3108                    1); // top half must run before bottom half
3109 
3110   // We need to wait to make sure the top half is finished
3111   // Spinning here should be ok as this should happen quickly
3112   while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
3113     ;
3114 
3115   __kmp_release_deps(gtid, taskdata);
3116   __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3117 }
3118 
3119 /*!
3120 @ingroup TASKING
3121 @param gtid Global Thread ID of encountering thread
3122 @param ptask Task which execution is completed
3123 
3124 Execute the completation of a proxy task from a thread of that is part of the
3125 team. Run first and bottom halves directly.
3126 */
3127 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3128   KMP_DEBUG_ASSERT(ptask != NULL);
3129   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3130   KA_TRACE(
3131       10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3132            gtid, taskdata));
3133 
3134   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3135 
3136   __kmp_first_top_half_finish_proxy(taskdata);
3137   __kmp_second_top_half_finish_proxy(taskdata);
3138   __kmp_bottom_half_finish_proxy(gtid, ptask);
3139 
3140   KA_TRACE(10,
3141            ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3142             gtid, taskdata));
3143 }
3144 
3145 /*!
3146 @ingroup TASKING
3147 @param ptask Task which execution is completed
3148 
3149 Execute the completation of a proxy task from a thread that could not belong to
3150 the team.
3151 */
3152 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3153   KMP_DEBUG_ASSERT(ptask != NULL);
3154   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3155 
3156   KA_TRACE(
3157       10,
3158       ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3159        taskdata));
3160 
3161   KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3162 
3163   __kmp_first_top_half_finish_proxy(taskdata);
3164 
3165   // Enqueue task to complete bottom half completion from a thread within the
3166   // corresponding team
3167   kmp_team_t *team = taskdata->td_team;
3168   kmp_int32 nthreads = team->t.t_nproc;
3169   kmp_info_t *thread;
3170 
3171   // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3172   // but we cannot use __kmp_get_random here
3173   kmp_int32 start_k = 0;
3174   kmp_int32 pass = 1;
3175   kmp_int32 k = start_k;
3176 
3177   do {
3178     // For now we're just linearly trying to find a thread
3179     thread = team->t.t_threads[k];
3180     k = (k + 1) % nthreads;
3181 
3182     // we did a full pass through all the threads
3183     if (k == start_k)
3184       pass = pass << 1;
3185 
3186   } while (!__kmp_give_task(thread, k, ptask, pass));
3187 
3188   __kmp_second_top_half_finish_proxy(taskdata);
3189 
3190   KA_TRACE(
3191       10,
3192       ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3193        taskdata));
3194 }
3195 
3196 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3197 // for taskloop
3198 //
3199 // thread:   allocating thread
3200 // task_src: pointer to source task to be duplicated
3201 // returns:  a pointer to the allocated kmp_task_t structure (task).
3202 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3203   kmp_task_t *task;
3204   kmp_taskdata_t *taskdata;
3205   kmp_taskdata_t *taskdata_src;
3206   kmp_taskdata_t *parent_task = thread->th.th_current_task;
3207   size_t shareds_offset;
3208   size_t task_size;
3209 
3210   KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3211                 task_src));
3212   taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3213   KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3214                    TASK_FULL); // it should not be proxy task
3215   KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3216   task_size = taskdata_src->td_size_alloc;
3217 
3218   // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3219   KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3220                 task_size));
3221 #if USE_FAST_MEMORY
3222   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3223 #else
3224   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3225 #endif /* USE_FAST_MEMORY */
3226   KMP_MEMCPY(taskdata, taskdata_src, task_size);
3227 
3228   task = KMP_TASKDATA_TO_TASK(taskdata);
3229 
3230   // Initialize new task (only specific fields not affected by memcpy)
3231   taskdata->td_task_id = KMP_GEN_TASK_ID();
3232   if (task->shareds != NULL) { // need setup shareds pointer
3233     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3234     task->shareds = &((char *)taskdata)[shareds_offset];
3235     KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3236                      0);
3237   }
3238   taskdata->td_alloc_thread = thread;
3239   taskdata->td_parent = parent_task;
3240   taskdata->td_taskgroup =
3241       parent_task
3242           ->td_taskgroup; // task inherits the taskgroup from the parent task
3243 
3244   // Only need to keep track of child task counts if team parallel and tasking
3245   // not serialized
3246   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3247     KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
3248     if (parent_task->td_taskgroup)
3249       KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count);
3250     // Only need to keep track of allocated child tasks for explicit tasks since
3251     // implicit not deallocated
3252     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3253       KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
3254   }
3255 
3256   KA_TRACE(20,
3257            ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3258             thread, taskdata, taskdata->td_parent));
3259 #if OMPT_SUPPORT
3260   __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
3261                        (void *)task->routine);
3262 #endif
3263   return task;
3264 }
3265 
3266 // Routine optionally generated by the compiler for setting the lastprivate flag
3267 // and calling needed constructors for private/firstprivate objects
3268 // (used to form taskloop tasks from pattern task)
3269 // Parameters: dest task, src task, lastprivate flag.
3270 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3271 
3272 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3273 //
3274 // loc       Source location information
3275 // gtid      Global thread ID
3276 // task      Pattern task, exposes the loop iteration range
3277 // lb        Pointer to loop lower bound in task structure
3278 // ub        Pointer to loop upper bound in task structure
3279 // st        Loop stride
3280 // ub_glob   Global upper bound (used for lastprivate check)
3281 // num_tasks Number of tasks to execute
3282 // grainsize Number of loop iterations per task
3283 // extras    Number of chunks with grainsize+1 iterations
3284 // tc        Iterations count
3285 // task_dup  Tasks duplication routine
3286 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3287                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3288                            kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3289                            kmp_uint64 grainsize, kmp_uint64 extras,
3290                            kmp_uint64 tc, void *task_dup) {
3291   KMP_COUNT_BLOCK(OMP_TASKLOOP);
3292   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3293   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3294   kmp_uint64 lower = *lb; // compiler provides global bounds here
3295   kmp_uint64 upper = *ub;
3296   kmp_uint64 i;
3297   kmp_info_t *thread = __kmp_threads[gtid];
3298   kmp_taskdata_t *current_task = thread->th.th_current_task;
3299   kmp_task_t *next_task;
3300   kmp_int32 lastpriv = 0;
3301   size_t lower_offset =
3302       (char *)lb - (char *)task; // remember offset of lb in the task structure
3303   size_t upper_offset =
3304       (char *)ub - (char *)task; // remember offset of ub in the task structure
3305 
3306   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3307   KMP_DEBUG_ASSERT(num_tasks > extras);
3308   KMP_DEBUG_ASSERT(num_tasks > 0);
3309   KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
3310                 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
3311                 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
3312                 task_dup));
3313 
3314   // Launch num_tasks tasks, assign grainsize iterations each task
3315   for (i = 0; i < num_tasks; ++i) {
3316     kmp_uint64 chunk_minus_1;
3317     if (extras == 0) {
3318       chunk_minus_1 = grainsize - 1;
3319     } else {
3320       chunk_minus_1 = grainsize;
3321       --extras; // first extras iterations get bigger chunk (grainsize+1)
3322     }
3323     upper = lower + st * chunk_minus_1;
3324     if (i == num_tasks - 1) {
3325       // schedule the last task, set lastprivate flag if needed
3326       if (st == 1) { // most common case
3327         KMP_DEBUG_ASSERT(upper == *ub);
3328         if (upper == ub_glob)
3329           lastpriv = 1;
3330       } else if (st > 0) { // positive loop stride
3331         KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
3332         if ((kmp_uint64)st > ub_glob - upper)
3333           lastpriv = 1;
3334       } else { // negative loop stride
3335         KMP_DEBUG_ASSERT(upper + st < *ub);
3336         if (upper - ub_glob < (kmp_uint64)(-st))
3337           lastpriv = 1;
3338       }
3339     }
3340     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3341     // adjust task-specific bounds
3342     *(kmp_uint64 *)((char *)next_task + lower_offset) = lower;
3343     *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
3344     if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc.
3345       ptask_dup(next_task, task, lastpriv);
3346     KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, "
3347                   "upper %lld (offsets %p %p)\n",
3348                   gtid, next_task, lower, upper, lower_offset, upper_offset));
3349     __kmp_omp_task(gtid, next_task, true); // schedule new task
3350     lower = upper + st; // adjust lower bound for the next iteration
3351   }
3352   // free the pattern task and exit
3353   __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
3354   // do not execute the pattern task, just do internal bookkeeping
3355   __kmp_task_finish(gtid, task, current_task);
3356 }
3357 
3358 // Structure to keep taskloop parameters for auxiliary task
3359 // kept in the shareds of the task structure.
3360 typedef struct __taskloop_params {
3361   kmp_task_t *task;
3362   kmp_uint64 *lb;
3363   kmp_uint64 *ub;
3364   void *task_dup;
3365   kmp_int64 st;
3366   kmp_uint64 ub_glob;
3367   kmp_uint64 num_tasks;
3368   kmp_uint64 grainsize;
3369   kmp_uint64 extras;
3370   kmp_uint64 tc;
3371   kmp_uint64 num_t_min;
3372 } __taskloop_params_t;
3373 
3374 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
3375                           kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
3376                           kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
3377                           void *);
3378 
3379 // Execute part of the the taskloop submitted as a task.
3380 int __kmp_taskloop_task(int gtid, void *ptask) {
3381   __taskloop_params_t *p =
3382       (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
3383   kmp_task_t *task = p->task;
3384   kmp_uint64 *lb = p->lb;
3385   kmp_uint64 *ub = p->ub;
3386   void *task_dup = p->task_dup;
3387   //  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3388   kmp_int64 st = p->st;
3389   kmp_uint64 ub_glob = p->ub_glob;
3390   kmp_uint64 num_tasks = p->num_tasks;
3391   kmp_uint64 grainsize = p->grainsize;
3392   kmp_uint64 extras = p->extras;
3393   kmp_uint64 tc = p->tc;
3394   kmp_uint64 num_t_min = p->num_t_min;
3395 #if KMP_DEBUG
3396   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3397   KMP_DEBUG_ASSERT(task != NULL);
3398   KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
3399                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
3400                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
3401                 task_dup));
3402 #endif
3403   KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
3404   if (num_tasks > num_t_min)
3405     __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3406                          grainsize, extras, tc, num_t_min, task_dup);
3407   else
3408     __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3409                           grainsize, extras, tc, task_dup);
3410 
3411   KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
3412   return 0;
3413 }
3414 
3415 // Schedule part of the the taskloop as a task,
3416 // execute the rest of the the taskloop.
3417 //
3418 // loc       Source location information
3419 // gtid      Global thread ID
3420 // task      Pattern task, exposes the loop iteration range
3421 // lb        Pointer to loop lower bound in task structure
3422 // ub        Pointer to loop upper bound in task structure
3423 // st        Loop stride
3424 // ub_glob   Global upper bound (used for lastprivate check)
3425 // num_tasks Number of tasks to execute
3426 // grainsize Number of loop iterations per task
3427 // extras    Number of chunks with grainsize+1 iterations
3428 // tc        Iterations count
3429 // num_t_min Threashold to launch tasks recursively
3430 // task_dup  Tasks duplication routine
3431 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
3432                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3433                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3434                           kmp_uint64 grainsize, kmp_uint64 extras,
3435                           kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
3436 #if KMP_DEBUG
3437   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3438   KMP_DEBUG_ASSERT(task != NULL);
3439   KMP_DEBUG_ASSERT(num_tasks > num_t_min);
3440   KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
3441                 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
3442                 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
3443                 task_dup));
3444 #endif
3445   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3446   kmp_uint64 lower = *lb;
3447   kmp_uint64 upper = *ub;
3448   kmp_info_t *thread = __kmp_threads[gtid];
3449   //  kmp_taskdata_t *current_task = thread->th.th_current_task;
3450   kmp_task_t *next_task;
3451   kmp_int32 lastpriv = 0;
3452   size_t lower_offset =
3453       (char *)lb - (char *)task; // remember offset of lb in the task structure
3454   size_t upper_offset =
3455       (char *)ub - (char *)task; // remember offset of ub in the task structure
3456 
3457   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3458   KMP_DEBUG_ASSERT(num_tasks > extras);
3459   KMP_DEBUG_ASSERT(num_tasks > 0);
3460 
3461   // split the loop in two halves
3462   kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
3463   kmp_uint64 gr_size0 = grainsize;
3464   kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
3465   kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
3466   if (n_tsk0 <= extras) {
3467     gr_size0++; // integrate extras into grainsize
3468     ext0 = 0; // no extra iters in 1st half
3469     ext1 = extras - n_tsk0; // remaining extras
3470     tc0 = gr_size0 * n_tsk0;
3471     tc1 = tc - tc0;
3472   } else { // n_tsk0 > extras
3473     ext1 = 0; // no extra iters in 2nd half
3474     ext0 = extras;
3475     tc1 = grainsize * n_tsk1;
3476     tc0 = tc - tc1;
3477   }
3478   ub0 = lower + st * (tc0 - 1);
3479   lb1 = ub0 + st;
3480 
3481   // create pattern task for 2nd half of the loop
3482   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
3483   // adjust lower bound (upper bound is not changed) for the 2nd half
3484   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
3485   if (ptask_dup != NULL) // construct fistprivates, etc.
3486     ptask_dup(next_task, task, 0);
3487   *ub = ub0; // adjust upper bound for the 1st half
3488 
3489   // create auxiliary task for 2nd half of the loop
3490   kmp_task_t *new_task =
3491       __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
3492                             sizeof(__taskloop_params_t), &__kmp_taskloop_task);
3493   __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
3494   p->task = next_task;
3495   p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
3496   p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
3497   p->task_dup = task_dup;
3498   p->st = st;
3499   p->ub_glob = ub_glob;
3500   p->num_tasks = n_tsk1;
3501   p->grainsize = grainsize;
3502   p->extras = ext1;
3503   p->tc = tc1;
3504   p->num_t_min = num_t_min;
3505   __kmp_omp_task(gtid, new_task, true); // schedule new task
3506 
3507   // execute the 1st half of current subrange
3508   if (n_tsk0 > num_t_min)
3509     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
3510                          ext0, tc0, num_t_min, task_dup);
3511   else
3512     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
3513                           gr_size0, ext0, tc0, task_dup);
3514 
3515   KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
3516 }
3517 
3518 /*!
3519 @ingroup TASKING
3520 @param loc       Source location information
3521 @param gtid      Global thread ID
3522 @param task      Task structure
3523 @param if_val    Value of the if clause
3524 @param lb        Pointer to loop lower bound in task structure
3525 @param ub        Pointer to loop upper bound in task structure
3526 @param st        Loop stride
3527 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
3528 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3529 @param grainsize Schedule value if specified
3530 @param task_dup  Tasks duplication routine
3531 
3532 Execute the taskloop construct.
3533 */
3534 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3535                      kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
3536                      int sched, kmp_uint64 grainsize, void *task_dup) {
3537   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3538   KMP_DEBUG_ASSERT(task != NULL);
3539 
3540   KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
3541                 "grain %llu(%d), dup %p\n",
3542                 gtid, taskdata, *lb, *ub, st, grainsize, sched, task_dup));
3543 
3544   if (nogroup == 0)
3545     __kmpc_taskgroup(loc, gtid);
3546 
3547   // =========================================================================
3548   // calculate loop parameters
3549   kmp_uint64 tc;
3550   kmp_uint64 lower = *lb; // compiler provides global bounds here
3551   kmp_uint64 upper = *ub;
3552   kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
3553   kmp_uint64 num_tasks = 0, extras = 0;
3554   kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
3555   kmp_info_t *thread = __kmp_threads[gtid];
3556   kmp_taskdata_t *current_task = thread->th.th_current_task;
3557 
3558   // compute trip count
3559   if (st == 1) { // most common case
3560     tc = upper - lower + 1;
3561   } else if (st < 0) {
3562     tc = (lower - upper) / (-st) + 1;
3563   } else { // st > 0
3564     tc = (upper - lower) / st + 1;
3565   }
3566   if (tc == 0) {
3567     KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
3568     // free the pattern task and exit
3569     __kmp_task_start(gtid, task, current_task);
3570     // do not execute anything for zero-trip loop
3571     __kmp_task_finish(gtid, task, current_task);
3572     return;
3573   }
3574   if (num_tasks_min == 0)
3575     // TODO: can we choose better default heuristic?
3576     num_tasks_min =
3577         KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
3578 
3579   // compute num_tasks/grainsize based on the input provided
3580   switch (sched) {
3581   case 0: // no schedule clause specified, we can choose the default
3582     // let's try to schedule (team_size*10) tasks
3583     grainsize = thread->th.th_team_nproc * 10;
3584   case 2: // num_tasks provided
3585     if (grainsize > tc) {
3586       num_tasks = tc; // too big num_tasks requested, adjust values
3587       grainsize = 1;
3588       extras = 0;
3589     } else {
3590       num_tasks = grainsize;
3591       grainsize = tc / num_tasks;
3592       extras = tc % num_tasks;
3593     }
3594     break;
3595   case 1: // grainsize provided
3596     if (grainsize > tc) {
3597       num_tasks = 1; // too big grainsize requested, adjust values
3598       grainsize = tc;
3599       extras = 0;
3600     } else {
3601       num_tasks = tc / grainsize;
3602       // adjust grainsize for balanced distribution of iterations
3603       grainsize = tc / num_tasks;
3604       extras = tc % num_tasks;
3605     }
3606     break;
3607   default:
3608     KMP_ASSERT2(0, "unknown scheduling of taskloop");
3609   }
3610   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3611   KMP_DEBUG_ASSERT(num_tasks > extras);
3612   KMP_DEBUG_ASSERT(num_tasks > 0);
3613   // =========================================================================
3614 
3615   // check if clause value first
3616   if (if_val == 0) { // if(0) specified, mark task as serial
3617     taskdata->td_flags.task_serial = 1;
3618     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3619     // always start serial tasks linearly
3620     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
3621                           grainsize, extras, tc, task_dup);
3622   } else if (num_tasks > num_tasks_min) {
3623     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
3624                   "(%lld), grain %llu, extras %llu\n",
3625                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
3626     __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
3627                          grainsize, extras, tc, num_tasks_min, task_dup);
3628   } else {
3629     KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
3630                   "(%lld), grain %llu, extras %llu\n",
3631                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
3632     __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
3633                           grainsize, extras, tc, task_dup);
3634   }
3635 
3636   if (nogroup == 0)
3637     __kmpc_end_taskgroup(loc, gtid);
3638   KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3639 }
3640 
3641 #endif
3642