1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_wait_release.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* ------------------------------------------------------------------------ */
29 /* ------------------------------------------------------------------------ */
30 
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr );
34 static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
35 static int  __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
36 
37 #ifdef OMP_45_ENABLED
38 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
39 #endif
40 
41 #ifdef BUILD_TIED_TASK_STACK
42 
43 //---------------------------------------------------------------------------
44 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
45 //     from top do bottom
46 //
47 //  gtid: global thread identifier for thread containing stack
48 //  thread_data: thread data for task team thread containing stack
49 //  threshold: value above which the trace statement triggers
50 //  location: string identifying call site of this function (for trace)
51 
52 static void
53 __kmp_trace_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data, int threshold, char *location )
54 {
55     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
56     kmp_taskdata_t **stack_top = task_stack -> ts_top;
57     kmp_int32 entries = task_stack -> ts_entries;
58     kmp_taskdata_t *tied_task;
59 
60     KA_TRACE(threshold, ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
61                          "first_block = %p, stack_top = %p \n",
62                          location, gtid, entries, task_stack->ts_first_block, stack_top ) );
63 
64     KMP_DEBUG_ASSERT( stack_top != NULL );
65     KMP_DEBUG_ASSERT( entries > 0 );
66 
67     while ( entries != 0 )
68     {
69         KMP_DEBUG_ASSERT( stack_top != & task_stack->ts_first_block.sb_block[0] );
70         // fix up ts_top if we need to pop from previous block
71         if ( entries & TASK_STACK_INDEX_MASK == 0 )
72         {
73             kmp_stack_block_t *stack_block = (kmp_stack_block_t *) (stack_top) ;
74 
75             stack_block = stack_block -> sb_prev;
76             stack_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
77         }
78 
79         // finish bookkeeping
80         stack_top--;
81         entries--;
82 
83         tied_task = * stack_top;
84 
85         KMP_DEBUG_ASSERT( tied_task != NULL );
86         KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
87 
88         KA_TRACE(threshold, ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
89                              "stack_top=%p, tied_task=%p\n",
90                              location, gtid, entries, stack_top, tied_task ) );
91     }
92     KMP_DEBUG_ASSERT( stack_top == & task_stack->ts_first_block.sb_block[0] );
93 
94     KA_TRACE(threshold, ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
95                          location, gtid ) );
96 }
97 
98 //---------------------------------------------------------------------------
99 //  __kmp_init_task_stack: initialize the task stack for the first time
100 //    after a thread_data structure is created.
101 //    It should not be necessary to do this again (assuming the stack works).
102 //
103 //  gtid: global thread identifier of calling thread
104 //  thread_data: thread data for task team thread containing stack
105 
106 static void
107 __kmp_init_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
108 {
109     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
110     kmp_stack_block_t *first_block;
111 
112     // set up the first block of the stack
113     first_block = & task_stack -> ts_first_block;
114     task_stack -> ts_top = (kmp_taskdata_t **) first_block;
115     memset( (void *) first_block, '\0', TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
116 
117     // initialize the stack to be empty
118     task_stack  -> ts_entries = TASK_STACK_EMPTY;
119     first_block -> sb_next = NULL;
120     first_block -> sb_prev = NULL;
121 }
122 
123 
124 //---------------------------------------------------------------------------
125 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 //  gtid: global thread identifier for calling thread
128 //  thread_data: thread info for thread containing stack
129 
130 static void
131 __kmp_free_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
132 {
133     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
134     kmp_stack_block_t *stack_block = & task_stack -> ts_first_block;
135 
136     KMP_DEBUG_ASSERT( task_stack -> ts_entries == TASK_STACK_EMPTY );
137     // free from the second block of the stack
138     while ( stack_block != NULL ) {
139         kmp_stack_block_t *next_block = (stack_block) ? stack_block -> sb_next : NULL;
140 
141         stack_block -> sb_next = NULL;
142         stack_block -> sb_prev = NULL;
143         if (stack_block != & task_stack -> ts_first_block) {
144             __kmp_thread_free( thread, stack_block );  // free the block, if not the first
145         }
146         stack_block = next_block;
147     }
148     // initialize the stack to be empty
149     task_stack -> ts_entries = 0;
150     task_stack -> ts_top = NULL;
151 }
152 
153 
154 //---------------------------------------------------------------------------
155 //  __kmp_push_task_stack: Push the tied task onto the task stack.
156 //     Grow the stack if necessary by allocating another block.
157 //
158 //  gtid: global thread identifier for calling thread
159 //  thread: thread info for thread containing stack
160 //  tied_task: the task to push on the stack
161 
162 static void
163 __kmp_push_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t * tied_task )
164 {
165     // GEH - need to consider what to do if tt_threads_data not allocated yet
166     kmp_thread_data_t *thread_data = & thread -> th.th_task_team ->
167                                         tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
168     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
169 
170     if ( tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser ) {
171         return;  // Don't push anything on stack if team or team tasks are serialized
172     }
173 
174     KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
175     KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
176 
177     KA_TRACE(20, ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
178                   gtid, thread, tied_task ) );
179     // Store entry
180     * (task_stack -> ts_top) = tied_task;
181 
182     // Do bookkeeping for next push
183     task_stack -> ts_top++;
184     task_stack -> ts_entries++;
185 
186     if ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK == 0 )
187     {
188         // Find beginning of this task block
189         kmp_stack_block_t *stack_block =
190              (kmp_stack_block_t *) (task_stack -> ts_top - TASK_STACK_BLOCK_SIZE);
191 
192         // Check if we already have a block
193         if ( stack_block -> sb_next != NULL )
194         {    // reset ts_top to beginning of next block
195             task_stack -> ts_top = & stack_block -> sb_next -> sb_block[0];
196         }
197         else
198         {   // Alloc new block and link it up
199             kmp_stack_block_t *new_block = (kmp_stack_block_t *)
200               __kmp_thread_calloc(thread, sizeof(kmp_stack_block_t));
201 
202             task_stack -> ts_top  = & new_block -> sb_block[0];
203             stack_block -> sb_next = new_block;
204             new_block  -> sb_prev = stack_block;
205             new_block  -> sb_next = NULL;
206 
207             KA_TRACE(30, ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
208                           gtid, tied_task, new_block ) );
209         }
210     }
211     KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
212 }
213 
214 //---------------------------------------------------------------------------
215 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
216 //     the task, just check to make sure it matches the ending task passed in.
217 //
218 //  gtid: global thread identifier for the calling thread
219 //  thread: thread info structure containing stack
220 //  tied_task: the task popped off the stack
221 //  ending_task: the task that is ending (should match popped task)
222 
223 static void
224 __kmp_pop_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *ending_task )
225 {
226     // GEH - need to consider what to do if tt_threads_data not allocated yet
227     kmp_thread_data_t *thread_data = & thread -> th.th_task_team -> tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
228     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
229     kmp_taskdata_t *tied_task;
230 
231     if ( ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser ) {
232         return;  // Don't pop anything from stack if team or team tasks are serialized
233     }
234 
235     KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
236     KMP_DEBUG_ASSERT( task_stack -> ts_entries > 0 );
237 
238     KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, thread ) );
239 
240     // fix up ts_top if we need to pop from previous block
241     if ( task_stack -> ts_entries & TASK_STACK_INDEX_MASK == 0 )
242     {
243         kmp_stack_block_t *stack_block =
244            (kmp_stack_block_t *) (task_stack -> ts_top) ;
245 
246         stack_block = stack_block -> sb_prev;
247         task_stack -> ts_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
248     }
249 
250     // finish bookkeeping
251     task_stack -> ts_top--;
252     task_stack -> ts_entries--;
253 
254     tied_task = * (task_stack -> ts_top );
255 
256     KMP_DEBUG_ASSERT( tied_task != NULL );
257     KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
258     KMP_DEBUG_ASSERT( tied_task == ending_task );  // If we built the stack correctly
259 
260     KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
261     return;
262 }
263 #endif /* BUILD_TIED_TASK_STACK */
264 
265 //---------------------------------------------------
266 //  __kmp_push_task: Add a task to the thread's deque
267 
268 static kmp_int32
269 __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
270 {
271     kmp_info_t *        thread = __kmp_threads[ gtid ];
272     kmp_taskdata_t *    taskdata = KMP_TASK_TO_TASKDATA(task);
273     kmp_task_team_t *   task_team = thread->th.th_task_team;
274     kmp_int32           tid = __kmp_tid_from_gtid( gtid );
275     kmp_thread_data_t * thread_data;
276 
277     KA_TRACE(20, ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata ) );
278 
279     if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
280         // untied task needs to increment counter so that the task structure is not freed prematurely
281         kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
282         KA_TRACE(20, ( "__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
283                        gtid, counter, taskdata ) );
284     }
285 
286     // The first check avoids building task_team thread data if serialized
287     if ( taskdata->td_flags.task_serial ) {
288         KA_TRACE(20, ( "__kmp_push_task: T#%d team serialized; returning TASK_NOT_PUSHED for task %p\n",
289                        gtid, taskdata ) );
290         return TASK_NOT_PUSHED;
291     }
292 
293     // Now that serialized tasks have returned, we can assume that we are not in immediate exec mode
294     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
295     if ( ! KMP_TASKING_ENABLED(task_team) ) {
296          __kmp_enable_tasking( task_team, thread );
297     }
298     KMP_DEBUG_ASSERT( TCR_4(task_team -> tt.tt_found_tasks) == TRUE );
299     KMP_DEBUG_ASSERT( TCR_PTR(task_team -> tt.tt_threads_data) != NULL );
300 
301     // Find tasking deque specific to encountering thread
302     thread_data = & task_team -> tt.tt_threads_data[ tid ];
303 
304     // No lock needed since only owner can allocate
305     if (thread_data -> td.td_deque == NULL ) {
306         __kmp_alloc_task_deque( thread, thread_data );
307     }
308 
309     // Check if deque is full
310     if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
311     {
312         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
313                        gtid, taskdata ) );
314         return TASK_NOT_PUSHED;
315     }
316 
317     // Lock the deque for the task push operation
318     __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
319 
320 #if OMP_45_ENABLED
321     // Need to recheck as we can get a proxy task from a thread outside of OpenMP
322     if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
323     {
324         __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
325         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
326                        gtid, taskdata ) );
327         return TASK_NOT_PUSHED;
328     }
329 #else
330     // Must have room since no thread can add tasks but calling thread
331     KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td) );
332 #endif
333 
334     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;  // Push taskdata
335     // Wrap index.
336     thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
337     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);             // Adjust task count
338 
339     KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
340           "task=%p ntasks=%d head=%u tail=%u\n",
341           gtid, taskdata, thread_data->td.td_deque_ntasks,
342           thread_data->td.td_deque_head, thread_data->td.td_deque_tail) );
343 
344     __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
345 
346     return TASK_SUCCESSFULLY_PUSHED;
347 }
348 
349 
350 //-----------------------------------------------------------------------------------------
351 // __kmp_pop_current_task_from_thread: set up current task from called thread when team ends
352 // this_thr: thread structure to set current_task in.
353 
354 void
355 __kmp_pop_current_task_from_thread( kmp_info_t *this_thr )
356 {
357     KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(enter): T#%d this_thread=%p, curtask=%p, "
358                    "curtask_parent=%p\n",
359                    0, this_thr, this_thr -> th.th_current_task,
360                    this_thr -> th.th_current_task -> td_parent ) );
361 
362     this_thr -> th.th_current_task = this_thr -> th.th_current_task -> td_parent;
363 
364     KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(exit): T#%d this_thread=%p, curtask=%p, "
365                    "curtask_parent=%p\n",
366                    0, this_thr, this_thr -> th.th_current_task,
367                    this_thr -> th.th_current_task -> td_parent ) );
368 }
369 
370 
371 //---------------------------------------------------------------------------------------
372 // __kmp_push_current_task_to_thread: set up current task in called thread for a new team
373 // this_thr: thread structure to set up
374 // team: team for implicit task data
375 // tid: thread within team to set up
376 
377 void
378 __kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid )
379 {
380     // current task of the thread is a parent of the new just created implicit tasks of new team
381     KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p curtask=%p "
382                     "parent_task=%p\n",
383                     tid, this_thr, this_thr->th.th_current_task,
384                     team->t.t_implicit_task_taskdata[tid].td_parent ) );
385 
386     KMP_DEBUG_ASSERT (this_thr != NULL);
387 
388     if( tid == 0 ) {
389         if( this_thr->th.th_current_task != & team -> t.t_implicit_task_taskdata[ 0 ] ) {
390             team -> t.t_implicit_task_taskdata[ 0 ].td_parent = this_thr->th.th_current_task;
391             this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ 0 ];
392         }
393     } else {
394         team -> t.t_implicit_task_taskdata[ tid ].td_parent = team -> t.t_implicit_task_taskdata[ 0 ].td_parent;
395         this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ tid ];
396     }
397 
398     KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p curtask=%p "
399                     "parent_task=%p\n",
400                     tid, this_thr, this_thr->th.th_current_task,
401                     team->t.t_implicit_task_taskdata[tid].td_parent ) );
402 }
403 
404 
405 //----------------------------------------------------------------------
406 // __kmp_task_start: bookkeeping for a task starting execution
407 // GTID: global thread id of calling thread
408 // task: task starting execution
409 // current_task: task suspending
410 
411 static void
412 __kmp_task_start( kmp_int32 gtid, kmp_task_t * task, kmp_taskdata_t * current_task )
413 {
414     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
415     kmp_info_t * thread = __kmp_threads[ gtid ];
416 
417     KA_TRACE(10, ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
418                   gtid, taskdata, current_task) );
419 
420     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
421 
422     // mark currently executing task as suspended
423     // TODO: GEH - make sure root team implicit task is initialized properly.
424     // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
425     current_task -> td_flags.executing = 0;
426 
427     // Add task to stack if tied
428 #ifdef BUILD_TIED_TASK_STACK
429     if ( taskdata -> td_flags.tiedness == TASK_TIED )
430     {
431         __kmp_push_task_stack( gtid, thread, taskdata );
432     }
433 #endif /* BUILD_TIED_TASK_STACK */
434 
435     // mark starting task as executing and as current task
436     thread -> th.th_current_task = taskdata;
437 
438     KMP_DEBUG_ASSERT( taskdata->td_flags.started == 0 || taskdata->td_flags.tiedness == TASK_UNTIED );
439     KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 || taskdata->td_flags.tiedness == TASK_UNTIED );
440     taskdata -> td_flags.started = 1;
441     taskdata -> td_flags.executing = 1;
442     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
443     KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
444 
445     // GEH TODO: shouldn't we pass some sort of location identifier here?
446     // APT: yes, we will pass location here.
447     // need to store current thread state (in a thread or taskdata structure)
448     // before setting work_state, otherwise wrong state is set after end of task
449 
450     KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n",
451                   gtid, taskdata ) );
452 
453 #if OMPT_SUPPORT
454     if (ompt_enabled &&
455         ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
456         kmp_taskdata_t *parent = taskdata->td_parent;
457         ompt_callbacks.ompt_callback(ompt_event_task_begin)(
458             parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
459             parent ? &(parent->ompt_task_info.frame) : NULL,
460             taskdata->ompt_task_info.task_id,
461             taskdata->ompt_task_info.function);
462     }
463 #endif
464 #if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
465     /* OMPT emit all dependences if requested by the tool */
466     if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
467         ompt_callbacks.ompt_callback(ompt_event_task_dependences))
468 	{
469         ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
470             taskdata->ompt_task_info.task_id,
471             taskdata->ompt_task_info.deps,
472             taskdata->ompt_task_info.ndeps
473         );
474 		/* We can now free the allocated memory for the dependencies */
475 		KMP_OMPT_DEPS_FREE (thread, taskdata->ompt_task_info.deps);
476         taskdata->ompt_task_info.deps = NULL;
477         taskdata->ompt_task_info.ndeps = 0;
478     }
479 #endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
480 
481     return;
482 }
483 
484 
485 //----------------------------------------------------------------------
486 // __kmpc_omp_task_begin_if0: report that a given serialized task has started execution
487 // loc_ref: source location information; points to beginning of task block.
488 // gtid: global thread number.
489 // task: task thunk for the started task.
490 
491 void
492 __kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
493 {
494     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
495     kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
496 
497     KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p current_task=%p\n",
498                   gtid, loc_ref, taskdata, current_task ) );
499 
500     if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
501         // untied task needs to increment counter so that the task structure is not freed prematurely
502         kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
503         KA_TRACE(20, ( "__kmpc_omp_task_begin_if0: T#%d untied_count (%d) incremented for task %p\n",
504                        gtid, counter, taskdata ) );
505     }
506 
507     taskdata -> td_flags.task_serial = 1;  // Execute this task immediately, not deferred.
508     __kmp_task_start( gtid, task, current_task );
509 
510     KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n",
511                   gtid, loc_ref, taskdata ) );
512 
513     return;
514 }
515 
516 #ifdef TASK_UNUSED
517 //----------------------------------------------------------------------
518 // __kmpc_omp_task_begin: report that a given task has started execution
519 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
520 
521 void
522 __kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
523 {
524     kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
525 
526     KA_TRACE(10, ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
527                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task ) );
528 
529     __kmp_task_start( gtid, task, current_task );
530 
531     KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n",
532                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
533 
534     return;
535 }
536 #endif // TASK_UNUSED
537 
538 
539 //-------------------------------------------------------------------------------------
540 // __kmp_free_task: free the current task space and the space for shareds
541 // gtid: Global thread ID of calling thread
542 // taskdata: task to free
543 // thread: thread data structure of caller
544 
545 static void
546 __kmp_free_task( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
547 {
548     KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n",
549                   gtid, taskdata) );
550 
551     // Check to make sure all flags and counters have the correct values
552     KMP_DEBUG_ASSERT( taskdata->td_flags.tasktype == TASK_EXPLICIT );
553     KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 );
554     KMP_DEBUG_ASSERT( taskdata->td_flags.complete == 1 );
555     KMP_DEBUG_ASSERT( taskdata->td_flags.freed == 0 );
556     KMP_DEBUG_ASSERT( TCR_4(taskdata->td_allocated_child_tasks) == 0  || taskdata->td_flags.task_serial == 1);
557     KMP_DEBUG_ASSERT( TCR_4(taskdata->td_incomplete_child_tasks) == 0 );
558 
559     taskdata->td_flags.freed = 1;
560     ANNOTATE_HAPPENS_BEFORE(taskdata);
561     // deallocate the taskdata and shared variable blocks associated with this task
562     #if USE_FAST_MEMORY
563         __kmp_fast_free( thread, taskdata );
564     #else /* ! USE_FAST_MEMORY */
565         __kmp_thread_free( thread, taskdata );
566     #endif
567 
568     KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n",
569                   gtid, taskdata) );
570 }
571 
572 //-------------------------------------------------------------------------------------
573 // __kmp_free_task_and_ancestors: free the current task and ancestors without children
574 //
575 // gtid: Global thread ID of calling thread
576 // taskdata: task to free
577 // thread: thread data structure of caller
578 
579 static void
580 __kmp_free_task_and_ancestors( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
581 {
582 #if OMP_45_ENABLED
583     // Proxy tasks must always be allowed to free their parents
584     // because they can be run in background even in serial mode.
585     kmp_int32 team_serial = ( taskdata->td_flags.team_serial ||
586         taskdata->td_flags.tasking_ser ) && !taskdata->td_flags.proxy;
587 #else
588     kmp_int32 team_serial = taskdata->td_flags.team_serial ||
589         taskdata->td_flags.tasking_ser;
590 #endif
591     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
592 
593     kmp_int32 children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
594     KMP_DEBUG_ASSERT( children >= 0 );
595 
596     // Now, go up the ancestor tree to see if any ancestors can now be freed.
597     while ( children == 0 )
598     {
599         kmp_taskdata_t * parent_taskdata = taskdata -> td_parent;
600 
601         KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
602                       "and freeing itself\n", gtid, taskdata) );
603 
604         // --- Deallocate my ancestor task ---
605         __kmp_free_task( gtid, taskdata, thread );
606 
607         taskdata = parent_taskdata;
608 
609         // Stop checking ancestors at implicit task
610         // instead of walking up ancestor tree to avoid premature deallocation of ancestors.
611         if ( team_serial || taskdata -> td_flags.tasktype == TASK_IMPLICIT )
612             return;
613 
614         // Predecrement simulated by "- 1" calculation
615         children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
616         KMP_DEBUG_ASSERT( children >= 0 );
617     }
618 
619     KA_TRACE(20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
620                   "not freeing it yet\n", gtid, taskdata, children) );
621 }
622 
623 //---------------------------------------------------------------------
624 // __kmp_task_finish: bookkeeping to do when a task finishes execution
625 // gtid: global thread ID for calling thread
626 // task: task to be finished
627 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
628 
629 static void
630 __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task )
631 {
632     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
633     kmp_info_t * thread = __kmp_threads[ gtid ];
634     kmp_task_team_t * task_team = thread->th.th_task_team; // might be NULL for serial teams...
635     kmp_int32 children = 0;
636 
637 #if OMPT_SUPPORT
638     if (ompt_enabled &&
639         ompt_callbacks.ompt_callback(ompt_event_task_end)) {
640         kmp_taskdata_t *parent = taskdata->td_parent;
641         ompt_callbacks.ompt_callback(ompt_event_task_end)(
642             taskdata->ompt_task_info.task_id);
643     }
644 #endif
645 
646     KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n",
647                   gtid, taskdata, resumed_task) );
648 
649     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
650 
651     // Pop task from stack if tied
652 #ifdef BUILD_TIED_TASK_STACK
653     if ( taskdata -> td_flags.tiedness == TASK_TIED )
654     {
655         __kmp_pop_task_stack( gtid, thread, taskdata );
656     }
657 #endif /* BUILD_TIED_TASK_STACK */
658 
659     if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
660         // untied task needs to check the counter so that the task structure is not freed prematurely
661         kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
662         KA_TRACE(20, ( "__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
663                        gtid, counter, taskdata ) );
664         if ( counter > 0 ) {
665             // untied task is not done, to be continued possibly by other thread, do not free it now
666             if (resumed_task == NULL) {
667                 KMP_DEBUG_ASSERT( taskdata->td_flags.task_serial );
668                 resumed_task = taskdata->td_parent;  // In a serialized task, the resumed task is the parent
669             }
670             thread->th.th_current_task = resumed_task; // restore current_task
671             resumed_task->td_flags.executing = 1;  // resume previous task
672             KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, resuming task %p\n",
673                           gtid, taskdata, resumed_task) );
674             return;
675         }
676     }
677 
678     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
679     taskdata -> td_flags.complete = 1;   // mark the task as completed
680     KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 1 );
681     KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
682 
683     // Only need to keep track of count if team parallel and tasking not serialized
684     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
685         // Predecrement simulated by "- 1" calculation
686         children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
687         KMP_DEBUG_ASSERT( children >= 0 );
688 #if OMP_40_ENABLED
689         if ( taskdata->td_taskgroup )
690             KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
691 #if OMP_45_ENABLED
692     }
693     // if we found proxy tasks there could exist a dependency chain
694     // with the proxy task as origin
695     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) || (task_team && task_team->tt.tt_found_proxy_tasks) ) {
696 #endif
697         __kmp_release_deps(gtid,taskdata);
698 #endif
699     }
700 
701     // td_flags.executing  must be marked as 0 after __kmp_release_deps has been called
702     // Othertwise, if a task is executed immediately from the release_deps code
703     // the flag will be reset to 1 again by this same function
704     KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 );
705     taskdata -> td_flags.executing = 0;  // suspend the finishing task
706 
707     KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
708                   gtid, taskdata, children) );
709 
710 #if OMP_40_ENABLED
711     /* If the tasks' destructor thunk flag has been set, we need to invoke the
712        destructor thunk that has been generated by the compiler.
713        The code is placed here, since at this point other tasks might have been released
714        hence overlapping the destructor invokations with some other work in the
715        released tasks.  The OpenMP spec is not specific on when the destructors are
716        invoked, so we should be free to choose.
717     */
718     if (taskdata->td_flags.destructors_thunk) {
719         kmp_routine_entry_t destr_thunk = task->data1.destructors;
720         KMP_ASSERT(destr_thunk);
721         destr_thunk(gtid, task);
722     }
723 #endif // OMP_40_ENABLED
724 
725     // bookkeeping for resuming task:
726     // GEH - note tasking_ser => task_serial
727     KMP_DEBUG_ASSERT( (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
728                        taskdata->td_flags.task_serial);
729     if ( taskdata->td_flags.task_serial )
730     {
731         if (resumed_task == NULL) {
732             resumed_task = taskdata->td_parent;  // In a serialized task, the resumed task is the parent
733         }
734         else
735 #if OMP_45_ENABLED
736              if ( !(task_team && task_team->tt.tt_found_proxy_tasks) )
737 #endif
738         {
739             // verify resumed task passed in points to parent
740             KMP_DEBUG_ASSERT( resumed_task == taskdata->td_parent );
741         }
742     }
743     else {
744         KMP_DEBUG_ASSERT( resumed_task != NULL );        // verify that resumed task is passed as arguemnt
745     }
746 
747     // Free this task and then ancestor tasks if they have no children.
748     // Restore th_current_task first as suggested by John:
749     // johnmc: if an asynchronous inquiry peers into the runtime system
750     // it doesn't see the freed task as the current task.
751     thread->th.th_current_task = resumed_task;
752     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
753 
754     // TODO: GEH - make sure root team implicit task is initialized properly.
755     // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
756     resumed_task->td_flags.executing = 1;  // resume previous task
757 
758     KA_TRACE(10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
759                   gtid, taskdata, resumed_task) );
760 
761     return;
762 }
763 
764 //---------------------------------------------------------------------
765 // __kmpc_omp_task_complete_if0: report that a task has completed execution
766 // loc_ref: source location information; points to end of task block.
767 // gtid: global thread number.
768 // task: task thunk for the completed task.
769 
770 void
771 __kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
772 {
773     KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
774                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
775 
776     __kmp_task_finish( gtid, task, NULL );  // this routine will provide task to resume
777 
778     KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
779                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
780 
781     return;
782 }
783 
784 #ifdef TASK_UNUSED
785 //---------------------------------------------------------------------
786 // __kmpc_omp_task_complete: report that a task has completed execution
787 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
788 
789 void
790 __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
791 {
792     KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n",
793                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
794 
795     __kmp_task_finish( gtid, task, NULL );  // Not sure how to find task to resume
796 
797     KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n",
798                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
799     return;
800 }
801 #endif // TASK_UNUSED
802 
803 
804 #if OMPT_SUPPORT
805 //----------------------------------------------------------------------------------------------------
806 // __kmp_task_init_ompt:
807 //   Initialize OMPT fields maintained by a task. This will only be called after
808 //   ompt_tool, so we already know whether ompt is enabled or not.
809 
810 static inline void
811 __kmp_task_init_ompt( kmp_taskdata_t * task, int tid, void * function )
812 {
813     if (ompt_enabled) {
814         task->ompt_task_info.task_id = __ompt_task_id_new(tid);
815         task->ompt_task_info.function = function;
816         task->ompt_task_info.frame.exit_runtime_frame = NULL;
817         task->ompt_task_info.frame.reenter_runtime_frame = NULL;
818 #if OMP_40_ENABLED
819         task->ompt_task_info.ndeps = 0;
820         task->ompt_task_info.deps = NULL;
821 #endif /* OMP_40_ENABLED */
822     }
823 }
824 #endif
825 
826 
827 //----------------------------------------------------------------------------------------------------
828 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit task for a given thread
829 //
830 // loc_ref:  reference to source location of parallel region
831 // this_thr:  thread data structure corresponding to implicit task
832 // team: team for this_thr
833 // tid: thread id of given thread within team
834 // set_curr_task: TRUE if need to push current task to thread
835 // NOTE: Routine does not set up the implicit task ICVS.  This is assumed to have already been done elsewhere.
836 // TODO: Get better loc_ref.  Value passed in may be NULL
837 
838 void
839 __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task )
840 {
841     kmp_taskdata_t * task   = & team->t.t_implicit_task_taskdata[ tid ];
842 
843     KF_TRACE(10, ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
844                   tid, team, task, set_curr_task ? "TRUE" : "FALSE" ) );
845 
846     task->td_task_id  = KMP_GEN_TASK_ID();
847     task->td_team     = team;
848 //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info in debugger)
849     task->td_ident    = loc_ref;
850     task->td_taskwait_ident   = NULL;
851     task->td_taskwait_counter = 0;
852     task->td_taskwait_thread  = 0;
853 
854     task->td_flags.tiedness    = TASK_TIED;
855     task->td_flags.tasktype    = TASK_IMPLICIT;
856 #if OMP_45_ENABLED
857     task->td_flags.proxy       = TASK_FULL;
858 #endif
859 
860     // All implicit tasks are executed immediately, not deferred
861     task->td_flags.task_serial = 1;
862     task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
863     task->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
864 
865     task->td_flags.started     = 1;
866     task->td_flags.executing   = 1;
867     task->td_flags.complete    = 0;
868     task->td_flags.freed       = 0;
869 
870 #if OMP_40_ENABLED
871     task->td_depnode = NULL;
872 #endif
873 
874     if (set_curr_task) {  // only do this initialization the first time a thread is created
875         task->td_incomplete_child_tasks = 0;
876         task->td_allocated_child_tasks  = 0; // Not used because do not need to deallocate implicit task
877 #if OMP_40_ENABLED
878         task->td_taskgroup = NULL;           // An implicit task does not have taskgroup
879         task->td_dephash = NULL;
880 #endif
881         __kmp_push_current_task_to_thread( this_thr, team, tid );
882     } else {
883         KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
884         KMP_DEBUG_ASSERT(task->td_allocated_child_tasks  == 0);
885     }
886 
887 #if OMPT_SUPPORT
888     __kmp_task_init_ompt(task, tid, NULL);
889 #endif
890 
891     KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
892                   tid, team, task ) );
893 }
894 
895 
896 //-----------------------------------------------------------------------------
897 //// __kmp_finish_implicit_task: Release resources associated to implicit tasks
898 //// at the end of parallel regions. Some resources are kept for reuse in the
899 //// next parallel region.
900 ////
901 //// thread:  thread data structure corresponding to implicit task
902 //
903 void
904 __kmp_finish_implicit_task(kmp_info_t *thread)
905 {
906     kmp_taskdata_t *task = thread->th.th_current_task;
907     if (task->td_dephash)
908         __kmp_dephash_free_entries(thread, task->td_dephash);
909 }
910 
911 
912 //-----------------------------------------------------------------------------
913 //// __kmp_free_implicit_task: Release resources associated to implicit tasks
914 //// when these are destroyed regions
915 ////
916 //// thread:  thread data structure corresponding to implicit task
917 //
918 void
919 __kmp_free_implicit_task(kmp_info_t *thread)
920 {
921     kmp_taskdata_t *task = thread->th.th_current_task;
922     if (task->td_dephash)
923         __kmp_dephash_free(thread, task->td_dephash);
924     task->td_dephash = NULL;
925 }
926 
927 
928 // Round up a size to a power of two specified by val
929 // Used to insert padding between structures co-allocated using a single malloc() call
930 static size_t
931 __kmp_round_up_to_val( size_t size, size_t val ) {
932     if ( size & ( val - 1 ) ) {
933         size &= ~ ( val - 1 );
934         if ( size <= KMP_SIZE_T_MAX - val ) {
935             size += val;    // Round up if there is no overflow.
936         }; // if
937     }; // if
938     return size;
939 } // __kmp_round_up_to_va
940 
941 
942 //---------------------------------------------------------------------------------
943 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
944 //
945 // loc_ref: source location information
946 // gtid: global thread number.
947 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' task encountered.
948 //        Converted from kmp_int32 to kmp_tasking_flags_t in routine.
949 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including private vars accessed in task.
950 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed in task.
951 // task_entry: Pointer to task code entry point generated by compiler.
952 // returns: a pointer to the allocated kmp_task_t structure (task).
953 
954 kmp_task_t *
955 __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
956                   size_t sizeof_kmp_task_t, size_t sizeof_shareds,
957                   kmp_routine_entry_t task_entry )
958 {
959     kmp_task_t *task;
960     kmp_taskdata_t *taskdata;
961     kmp_info_t *thread = __kmp_threads[ gtid ];
962     kmp_team_t *team = thread->th.th_team;
963     kmp_taskdata_t *parent_task = thread->th.th_current_task;
964     size_t shareds_offset;
965 
966     KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
967                   "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
968                   gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
969                   sizeof_shareds, task_entry) );
970 
971     if ( parent_task->td_flags.final ) {
972         if (flags->merged_if0) {
973         }
974         flags->final = 1;
975     }
976 
977 #if OMP_45_ENABLED
978     if ( flags->proxy == TASK_PROXY ) {
979         flags->tiedness = TASK_UNTIED;
980         flags->merged_if0 = 1;
981 
982         /* are we running in a sequential parallel or tskm_immediate_exec... we need tasking support enabled */
983         if ( (thread->th.th_task_team) == NULL ) {
984             /* This should only happen if the team is serialized
985                 setup a task team and propagate it to the thread
986             */
987             KMP_DEBUG_ASSERT(team->t.t_serialized);
988             KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
989             __kmp_task_team_setup(thread,team,1); // 1 indicates setup the current team regardless of nthreads
990             thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
991         }
992         kmp_task_team_t * task_team = thread->th.th_task_team;
993 
994         /* tasking must be enabled now as the task might not be pushed */
995         if ( !KMP_TASKING_ENABLED( task_team ) ) {
996             KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
997             __kmp_enable_tasking( task_team, thread );
998             kmp_int32 tid = thread->th.th_info.ds.ds_tid;
999             kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
1000             // No lock needed since only owner can allocate
1001             if (thread_data -> td.td_deque == NULL ) {
1002                 __kmp_alloc_task_deque( thread, thread_data );
1003             }
1004         }
1005 
1006         if ( task_team->tt.tt_found_proxy_tasks == FALSE )
1007           TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
1008     }
1009 #endif
1010 
1011     // Calculate shared structure offset including padding after kmp_task_t struct
1012     // to align pointers in shared struct
1013     shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
1014     shareds_offset = __kmp_round_up_to_val( shareds_offset, sizeof( void * ));
1015 
1016     // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1017     KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n",
1018                   gtid, shareds_offset) );
1019     KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n",
1020                   gtid, sizeof_shareds) );
1021 
1022     // Avoid double allocation here by combining shareds with taskdata
1023     #if USE_FAST_MEMORY
1024     taskdata = (kmp_taskdata_t *) __kmp_fast_allocate( thread, shareds_offset + sizeof_shareds );
1025     #else /* ! USE_FAST_MEMORY */
1026     taskdata = (kmp_taskdata_t *) __kmp_thread_malloc( thread, shareds_offset + sizeof_shareds );
1027     #endif /* USE_FAST_MEMORY */
1028     ANNOTATE_HAPPENS_AFTER(taskdata);
1029 
1030     task                      = KMP_TASKDATA_TO_TASK(taskdata);
1031 
1032     // Make sure task & taskdata are aligned appropriately
1033 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1034     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(double)-1) ) == 0 );
1035     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(double)-1) ) == 0 );
1036 #else
1037     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(_Quad)-1) ) == 0 );
1038     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(_Quad)-1) ) == 0 );
1039 #endif
1040     if (sizeof_shareds > 0) {
1041         // Avoid double allocation here by combining shareds with taskdata
1042         task->shareds         = & ((char *) taskdata)[ shareds_offset ];
1043         // Make sure shareds struct is aligned to pointer size
1044         KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task->shareds) & (sizeof(void *)-1) ) == 0 );
1045     } else {
1046         task->shareds         = NULL;
1047     }
1048     task->routine             = task_entry;
1049     task->part_id             = 0;      // AC: Always start with 0 part id
1050 
1051     taskdata->td_task_id      = KMP_GEN_TASK_ID();
1052     taskdata->td_team         = team;
1053     taskdata->td_alloc_thread = thread;
1054     taskdata->td_parent       = parent_task;
1055     taskdata->td_level        = parent_task->td_level + 1; // increment nesting level
1056     taskdata->td_untied_count = 0;
1057     taskdata->td_ident        = loc_ref;
1058     taskdata->td_taskwait_ident   = NULL;
1059     taskdata->td_taskwait_counter = 0;
1060     taskdata->td_taskwait_thread  = 0;
1061     KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
1062 #if OMP_45_ENABLED
1063     // avoid copying icvs for proxy tasks
1064     if ( flags->proxy == TASK_FULL )
1065 #endif
1066        copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
1067 
1068     taskdata->td_flags.tiedness    = flags->tiedness;
1069     taskdata->td_flags.final       = flags->final;
1070     taskdata->td_flags.merged_if0  = flags->merged_if0;
1071 #if OMP_40_ENABLED
1072     taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1073 #endif // OMP_40_ENABLED
1074 #if OMP_45_ENABLED
1075     taskdata->td_flags.proxy           = flags->proxy;
1076     taskdata->td_task_team         = thread->th.th_task_team;
1077     taskdata->td_size_alloc        = shareds_offset + sizeof_shareds;
1078 #endif
1079     taskdata->td_flags.tasktype    = TASK_EXPLICIT;
1080 
1081     // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1082     taskdata->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
1083 
1084     // GEH - TODO: fix this to copy parent task's value of team_serial flag
1085     taskdata->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
1086 
1087     // GEH - Note we serialize the task if the team is serialized to make sure implicit parallel region
1088     //       tasks are not left until program termination to execute.  Also, it helps locality to execute
1089     //       immediately.
1090     taskdata->td_flags.task_serial = ( parent_task->td_flags.final
1091       || taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser );
1092 
1093     taskdata->td_flags.started     = 0;
1094     taskdata->td_flags.executing   = 0;
1095     taskdata->td_flags.complete    = 0;
1096     taskdata->td_flags.freed       = 0;
1097 
1098     taskdata->td_flags.native      = flags->native;
1099 
1100     taskdata->td_incomplete_child_tasks = 0;
1101     taskdata->td_allocated_child_tasks  = 1; // start at one because counts current task and children
1102 #if OMP_40_ENABLED
1103     taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
1104     taskdata->td_dephash = NULL;
1105     taskdata->td_depnode = NULL;
1106 #endif
1107 
1108     // Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
1109 #if OMP_45_ENABLED
1110     if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1111 #else
1112     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1113 #endif
1114     {
1115         KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
1116 #if OMP_40_ENABLED
1117         if ( parent_task->td_taskgroup )
1118             KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
1119 #endif
1120         // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
1121         if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT ) {
1122             KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
1123         }
1124     }
1125 
1126     KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1127                   gtid, taskdata, taskdata->td_parent) );
1128     ANNOTATE_HAPPENS_BEFORE(task);
1129 
1130 #if OMPT_SUPPORT
1131     __kmp_task_init_ompt(taskdata, gtid, (void*) task_entry);
1132 #endif
1133 
1134     return task;
1135 }
1136 
1137 
1138 kmp_task_t *
1139 __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
1140                        size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1141                        kmp_routine_entry_t task_entry )
1142 {
1143     kmp_task_t *retval;
1144     kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags;
1145 
1146     input_flags->native = FALSE;
1147     // __kmp_task_alloc() sets up all other runtime flags
1148 
1149 #if OMP_45_ENABLED
1150     KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1151                   "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1152                   gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1153                   input_flags->proxy ? "proxy" : "",
1154                   sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1155 #else
1156     KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1157                   "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1158                   gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1159                   sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1160 #endif
1161 
1162     retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1163                                sizeof_shareds, task_entry );
1164 
1165     KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval) );
1166 
1167     return retval;
1168 }
1169 
1170 //-----------------------------------------------------------
1171 //  __kmp_invoke_task: invoke the specified task
1172 //
1173 // gtid: global thread ID of caller
1174 // task: the task to invoke
1175 // current_task: the task to resume after task invokation
1176 
1177 static void
1178 __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_task )
1179 {
1180     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
1181     kmp_uint64 cur_time;
1182 #if OMP_40_ENABLED
1183     int discard = 0 /* false */;
1184 #endif
1185     KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1186                   gtid, taskdata, current_task) );
1187     KMP_DEBUG_ASSERT(task);
1188 #if OMP_45_ENABLED
1189     if ( taskdata->td_flags.proxy == TASK_PROXY &&
1190          taskdata->td_flags.complete == 1)
1191          {
1192             // This is a proxy task that was already completed but it needs to run
1193             // its bottom-half finish
1194             KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1195                   gtid, taskdata) );
1196 
1197             __kmp_bottom_half_finish_proxy(gtid,task);
1198 
1199             KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, resuming task %p\n", gtid, taskdata, current_task) );
1200 
1201             return;
1202          }
1203 #endif
1204 
1205 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1206     if(__kmp_forkjoin_frames_mode == 3) {
1207         // Get the current time stamp to measure task execution time to correct barrier imbalance time
1208         cur_time = __itt_get_timestamp();
1209     }
1210 #endif
1211 
1212 #if OMP_45_ENABLED
1213     // Proxy tasks are not handled by the runtime
1214     if ( taskdata->td_flags.proxy != TASK_PROXY ) {
1215 #endif
1216       ANNOTATE_HAPPENS_AFTER(task);
1217       __kmp_task_start( gtid, task, current_task );
1218 #if OMP_45_ENABLED
1219     }
1220 #endif
1221 
1222 #if OMPT_SUPPORT
1223     ompt_thread_info_t oldInfo;
1224     kmp_info_t * thread;
1225     if (ompt_enabled) {
1226         // Store the threads states and restore them after the task
1227         thread = __kmp_threads[ gtid ];
1228         oldInfo = thread->th.ompt_thread_info;
1229         thread->th.ompt_thread_info.wait_id = 0;
1230         thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1231         taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
1232     }
1233 #endif
1234 
1235 #if OMP_40_ENABLED
1236     // TODO: cancel tasks if the parallel region has also been cancelled
1237     // TODO: check if this sequence can be hoisted above __kmp_task_start
1238     // if cancellation has been enabled for this run ...
1239     if (__kmp_omp_cancellation) {
1240         kmp_info_t *this_thr = __kmp_threads [ gtid ];
1241         kmp_team_t * this_team = this_thr->th.th_team;
1242         kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1243         if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
1244             KMP_COUNT_BLOCK(TASK_cancelled);
1245             // this task belongs to a task group and we need to cancel it
1246             discard = 1 /* true */;
1247         }
1248     }
1249 
1250     //
1251     // Invoke the task routine and pass in relevant data.
1252     // Thunks generated by gcc take a different argument list.
1253     //
1254     if (!discard) {
1255 #if KMP_STATS_ENABLED
1256         KMP_COUNT_BLOCK(TASK_executed);
1257         switch(KMP_GET_THREAD_STATE()) {
1258          case FORK_JOIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); break;
1259          case PLAIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); break;
1260          case TASKYIELD: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); break;
1261          case TASKWAIT: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); break;
1262          case TASKGROUP: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); break;
1263          default: KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); break;
1264         }
1265 #endif // KMP_STATS_ENABLED
1266 #endif // OMP_40_ENABLED
1267 
1268 #if OMPT_SUPPORT && OMPT_TRACE
1269         /* let OMPT know that we're about to run this task */
1270         if (ompt_enabled &&
1271              ompt_callbacks.ompt_callback(ompt_event_task_switch))
1272         {
1273           ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1274             current_task->ompt_task_info.task_id,
1275             taskdata->ompt_task_info.task_id);
1276         }
1277 #endif
1278 
1279 #ifdef KMP_GOMP_COMPAT
1280         if (taskdata->td_flags.native) {
1281             ((void (*)(void *))(*(task->routine)))(task->shareds);
1282         }
1283         else
1284 #endif /* KMP_GOMP_COMPAT */
1285         {
1286             (*(task->routine))(gtid, task);
1287         }
1288         KMP_POP_PARTITIONED_TIMER();
1289 
1290 #if OMPT_SUPPORT && OMPT_TRACE
1291         /* let OMPT know that we're returning to the callee task */
1292         if (ompt_enabled &&
1293              ompt_callbacks.ompt_callback(ompt_event_task_switch))
1294         {
1295           ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1296             taskdata->ompt_task_info.task_id,
1297             current_task->ompt_task_info.task_id);
1298         }
1299 #endif
1300 
1301 #if OMP_40_ENABLED
1302     }
1303 #endif // OMP_40_ENABLED
1304 
1305 
1306 #if OMPT_SUPPORT
1307     if (ompt_enabled) {
1308         thread->th.ompt_thread_info = oldInfo;
1309         taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
1310     }
1311 #endif
1312 
1313 #if OMP_45_ENABLED
1314     // Proxy tasks are not handled by the runtime
1315     if ( taskdata->td_flags.proxy != TASK_PROXY ) {
1316 #endif
1317       ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1318       __kmp_task_finish( gtid, task, current_task );
1319 #if OMP_45_ENABLED
1320     }
1321 #endif
1322 
1323 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1324     // Barrier imbalance - correct arrive time after the task finished
1325     if(__kmp_forkjoin_frames_mode == 3) {
1326         kmp_info_t *this_thr = __kmp_threads [ gtid ];
1327         if(this_thr->th.th_bar_arrive_time) {
1328             this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1329         }
1330     }
1331 #endif
1332     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1333                   gtid, taskdata, current_task) );
1334     return;
1335 }
1336 
1337 //-----------------------------------------------------------------------
1338 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1339 //
1340 // loc_ref: location of original task pragma (ignored)
1341 // gtid: Global Thread ID of encountering thread
1342 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1343 // Returns:
1344 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1345 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1346 
1347 kmp_int32
1348 __kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1349 {
1350     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1351 
1352     KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n",
1353                   gtid, loc_ref, new_taskdata ) );
1354 
1355     /* Should we execute the new task or queue it?   For now, let's just always try to
1356        queue it.  If the queue fills up, then we'll execute it.  */
1357 
1358     if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1359     {                                                           // Execute this task immediately
1360         kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1361         new_taskdata->td_flags.task_serial = 1;
1362         __kmp_invoke_task( gtid, new_task, current_task );
1363     }
1364 
1365     KA_TRACE(10, ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1366                   "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref,
1367                   new_taskdata ) );
1368 
1369     ANNOTATE_HAPPENS_BEFORE(new_task);
1370     return TASK_CURRENT_NOT_QUEUED;
1371 }
1372 
1373 //---------------------------------------------------------------------
1374 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1375 // gtid: Global Thread ID of encountering thread
1376 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1377 // serialize_immediate: if TRUE then if the task is executed immediately its execution will be serialized
1378 // returns:
1379 //
1380 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1381 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1382 kmp_int32
1383 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate )
1384 {
1385     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1386 
1387 #if OMPT_SUPPORT
1388     if (ompt_enabled) {
1389         new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1390             __builtin_frame_address(1);
1391     }
1392 #endif
1393 
1394     /* Should we execute the new task or queue it?   For now, let's just always try to
1395        queue it.  If the queue fills up, then we'll execute it.  */
1396 #if OMP_45_ENABLED
1397     if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1398 #else
1399     if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1400 #endif
1401     {                                                           // Execute this task immediately
1402         kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1403         if ( serialize_immediate )
1404           new_taskdata -> td_flags.task_serial = 1;
1405         __kmp_invoke_task( gtid, new_task, current_task );
1406     }
1407 
1408 #if OMPT_SUPPORT
1409     if (ompt_enabled) {
1410         new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1411     }
1412 #endif
1413 
1414     ANNOTATE_HAPPENS_BEFORE(new_task);
1415     return TASK_CURRENT_NOT_QUEUED;
1416 }
1417 
1418 //---------------------------------------------------------------------
1419 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a non-thread-switchable task from
1420 // the parent thread only!
1421 // loc_ref: location of original task pragma (ignored)
1422 // gtid: Global Thread ID of encountering thread
1423 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1424 // returns:
1425 //
1426 //    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later.
1427 //    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later.
1428 
1429 kmp_int32
1430 __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1431 {
1432     kmp_int32 res;
1433     KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1434 
1435 #if KMP_DEBUG
1436     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1437 #endif
1438     KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n",
1439                   gtid, loc_ref, new_taskdata ) );
1440 
1441     res =  __kmp_omp_task(gtid,new_task,true);
1442 
1443     KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1444                   gtid, loc_ref, new_taskdata ) );
1445     return res;
1446 }
1447 
1448 //-------------------------------------------------------------------------------------
1449 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are complete
1450 
1451 kmp_int32
1452 __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
1453 {
1454     kmp_taskdata_t * taskdata;
1455     kmp_info_t * thread;
1456     int thread_finished = FALSE;
1457     KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1458 
1459     KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) );
1460 
1461     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1462         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1463 
1464         thread = __kmp_threads[ gtid ];
1465         taskdata = thread -> th.th_current_task;
1466 
1467 #if OMPT_SUPPORT && OMPT_TRACE
1468         ompt_task_id_t my_task_id;
1469         ompt_parallel_id_t my_parallel_id;
1470 
1471         if (ompt_enabled) {
1472             kmp_team_t *team = thread->th.th_team;
1473             my_task_id = taskdata->ompt_task_info.task_id;
1474             my_parallel_id = team->t.ompt_team_info.parallel_id;
1475 
1476             taskdata->ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
1477             if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1478                 ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(
1479                                 my_parallel_id, my_task_id);
1480             }
1481         }
1482 #endif
1483 
1484         // Debugger: The taskwait is active. Store location and thread encountered the taskwait.
1485 #if USE_ITT_BUILD
1486         // Note: These values are used by ITT events as well.
1487 #endif /* USE_ITT_BUILD */
1488         taskdata->td_taskwait_counter += 1;
1489         taskdata->td_taskwait_ident    = loc_ref;
1490         taskdata->td_taskwait_thread   = gtid + 1;
1491 
1492 #if USE_ITT_BUILD
1493         void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1494         if ( itt_sync_obj != NULL )
1495             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1496 #endif /* USE_ITT_BUILD */
1497 
1498         bool must_wait = ! taskdata->td_flags.team_serial && ! taskdata->td_flags.final;
1499 
1500 #if OMP_45_ENABLED
1501         must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks);
1502 #endif
1503         if (must_wait)
1504         {
1505             kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
1506             while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
1507                 flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1508                                    USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1509             }
1510         }
1511 #if USE_ITT_BUILD
1512         if ( itt_sync_obj != NULL )
1513             __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1514 #endif /* USE_ITT_BUILD */
1515 
1516         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1517         // Debugger:  The taskwait is completed. Location remains, but thread is negated.
1518         taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1519 
1520 #if OMPT_SUPPORT && OMPT_TRACE
1521         if (ompt_enabled) {
1522             if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1523                 ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(
1524                                 my_parallel_id, my_task_id);
1525             }
1526             taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1527         }
1528 #endif
1529         ANNOTATE_HAPPENS_AFTER(taskdata);
1530     }
1531 
1532     KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1533                   "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1534 
1535     return TASK_CURRENT_NOT_QUEUED;
1536 }
1537 
1538 
1539 //-------------------------------------------------
1540 // __kmpc_omp_taskyield: switch to a different task
1541 
1542 kmp_int32
1543 __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
1544 {
1545     kmp_taskdata_t * taskdata;
1546     kmp_info_t * thread;
1547     int thread_finished = FALSE;
1548 
1549     KMP_COUNT_BLOCK(OMP_TASKYIELD);
1550     KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1551 
1552     KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1553                   gtid, loc_ref, end_part) );
1554 
1555     if ( __kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel ) {
1556         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1557 
1558         thread = __kmp_threads[ gtid ];
1559         taskdata = thread -> th.th_current_task;
1560         // Should we model this as a task wait or not?
1561         // Debugger: The taskwait is active. Store location and thread encountered the taskwait.
1562 #if USE_ITT_BUILD
1563         // Note: These values are used by ITT events as well.
1564 #endif /* USE_ITT_BUILD */
1565         taskdata->td_taskwait_counter += 1;
1566         taskdata->td_taskwait_ident    = loc_ref;
1567         taskdata->td_taskwait_thread   = gtid + 1;
1568 
1569 #if USE_ITT_BUILD
1570         void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1571         if ( itt_sync_obj != NULL )
1572             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1573 #endif /* USE_ITT_BUILD */
1574         if ( ! taskdata->td_flags.team_serial ) {
1575             kmp_task_team_t * task_team = thread->th.th_task_team;
1576             if (task_team != NULL) {
1577                 if (KMP_TASKING_ENABLED(task_team)) {
1578                     __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished
1579                                             USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1580                 }
1581             }
1582         }
1583 #if USE_ITT_BUILD
1584         if ( itt_sync_obj != NULL )
1585             __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1586 #endif /* USE_ITT_BUILD */
1587 
1588         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1589         // Debugger:  The taskwait is completed. Location remains, but thread is negated.
1590         taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1591     }
1592 
1593     KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1594                   "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1595 
1596     return TASK_CURRENT_NOT_QUEUED;
1597 }
1598 
1599 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1600 #if OMP_45_ENABLED
1601 //
1602 // Task Reduction implementation
1603 //
1604 
// Per-item flags supplied with each task reduction item (32 bits in total).
typedef struct kmp_task_red_flags {
    unsigned  lazy_priv : 1;  // hint: (1) allocate per-thread copies lazily on first request (big objects)
    unsigned  reserved31 : 31; // remaining bits of the 32-bit word; not read in this section
} kmp_task_red_flags_t;
1609 
1610 // internal structure for reduction data item related info
// Runtime-side record kept per reduction item; an array of these is attached to the
// taskgroup (reduce_data / reduce_num_data) by __kmpc_task_reduction_init().
typedef struct kmp_task_red_data {
    void       *reduce_shar; // shared reduction item
    size_t      reduce_size; // size of one per-thread item, rounded up to a multiple of CACHE_LINE
    void       *reduce_priv; // thread specific data: one block of nth items, or (lazy_priv) an array of nth pointers
    void       *reduce_pend; // end of private data for comparison op (set only when lazy_priv is 0)
    void       *reduce_init; // data initialization routine, called as void(*)(void*); may be NULL
    void       *reduce_fini; // data finalization routine, called as void(*)(void*); may be NULL
    void       *reduce_comb; // data combiner routine, called as void(*)(void* shared, void* priv); mandatory
    kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_data_t;
1621 
1622 // structure sent us by compiler - one per reduction item
// Description of one reduction item as passed in the 'data' array of
// __kmpc_task_reduction_init(); its fields are copied into kmp_task_red_data_t there.
typedef struct kmp_task_red_input {
    void       *reduce_shar; // shared reduction item
    size_t      reduce_size; // size of data item (before cache-line rounding)
    void       *reduce_init; // data initialization routine, void(*)(void*); may be NULL
    void       *reduce_fini; // data finalization routine, void(*)(void*); may be NULL
    void       *reduce_comb; // data combiner routine, void(*)(void*,void*); must not be NULL
    kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_input_t;
1631 
1632 /*!
1633 @ingroup TASKING
1634 @param gtid      Global thread ID
1635 @param num       Number of data items to reduce
1636 @param data      Array of data for reduction
1637 @return The taskgroup identifier
1638 
1639 Initialize task reduction for the taskgroup.
1640 */
1641 void*
1642 __kmpc_task_reduction_init(int gtid, int num, void *data)
1643 {
1644     kmp_info_t * thread = __kmp_threads[gtid];
1645     kmp_taskgroup_t * tg = thread->th.th_current_task->td_taskgroup;
1646     kmp_int32 nth = thread->th.th_team_nproc;
1647     kmp_task_red_input_t *input = (kmp_task_red_input_t*)data;
1648     kmp_task_red_data_t *arr;
1649 
1650     // check input data just in case
1651     KMP_ASSERT(tg != NULL);
1652     KMP_ASSERT(data != NULL);
1653     KMP_ASSERT(num > 0);
1654     if (nth == 1) {
1655         KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1656                 gtid, tg));
1657         return (void*)tg;
1658     }
1659     KA_TRACE(10,("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1660                  gtid, tg, num));
1661     arr = (kmp_task_red_data_t*)__kmp_thread_malloc(thread, num * sizeof(kmp_task_red_data_t));
1662     for (int i = 0; i < num; ++i) {
1663         void(*f_init)(void*) = (void(*)(void*))(input[i].reduce_init);
1664         size_t size = input[i].reduce_size - 1;
1665         // round the size up to cache line per thread-specific item
1666         size += CACHE_LINE - size % CACHE_LINE;
1667         KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1668         arr[i].reduce_shar = input[i].reduce_shar;
1669         arr[i].reduce_size = size;
1670         arr[i].reduce_init = input[i].reduce_init;
1671         arr[i].reduce_fini = input[i].reduce_fini;
1672         arr[i].reduce_comb = input[i].reduce_comb;
1673         arr[i].flags       = input[i].flags;
1674         if (!input[i].flags.lazy_priv) {
1675             // allocate cache-line aligned block and fill it with zeros
1676             arr[i].reduce_priv = __kmp_allocate(nth * size);
1677             arr[i].reduce_pend = (char*)(arr[i].reduce_priv) + nth * size;
1678             if (f_init != NULL) {
1679                 // initialize thread-specific items
1680                 for (int j = 0; j < nth; ++j) {
1681                     f_init((char*)(arr[i].reduce_priv) + j * size);
1682                 }
1683             }
1684         } else {
1685             // only allocate space for pointers now,
1686             // objects will be lazily allocated/initialized once requested
1687             arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void*));
1688         }
1689     }
1690     tg->reduce_data = (void*)arr;
1691     tg->reduce_num_data = num;
1692     return (void*)tg;
1693 }
1694 
1695 /*!
1696 @ingroup TASKING
1697 @param gtid    Global thread ID
1698 @param tskgrp  The taskgroup ID (optional)
1699 @param data    Shared location of the item
1700 @return The pointer to per-thread data
1701 
1702 Get thread-specific location of data item
1703 */
1704 void*
1705 __kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data)
1706 {
1707     kmp_info_t * thread = __kmp_threads[gtid];
1708     kmp_int32 nth = thread->th.th_team_nproc;
1709     if (nth == 1)
1710         return data; // nothing to do
1711 
1712     kmp_taskgroup_t *tg = (kmp_taskgroup_t*)tskgrp;
1713     if (tg == NULL)
1714         tg = thread->th.th_current_task->td_taskgroup;
1715     KMP_ASSERT(tg != NULL);
1716     kmp_task_red_data_t *arr = (kmp_task_red_data_t*)(tg->reduce_data);
1717     kmp_int32 num = tg->reduce_num_data;
1718     kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1719 
1720     KMP_ASSERT(data != NULL);
1721     while (tg != NULL) {
1722       for (int i = 0; i < num; ++i) {
1723         if (!arr[i].flags.lazy_priv) {
1724           if (data == arr[i].reduce_shar ||
1725              (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1726             return (char*)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1727         } else {
1728           // check shared location first
1729           void **p_priv = (void**)(arr[i].reduce_priv);
1730           if (data == arr[i].reduce_shar)
1731             goto found;
1732           // check if we get some thread specific location as parameter
1733           for (int j = 0; j < nth; ++j)
1734             if (data == p_priv[j])
1735               goto found;
1736           continue; // not found, continue search
1737         found:
1738           if (p_priv[tid] == NULL) {
1739             // allocate thread specific object lazily
1740             void(*f_init)(void*) = (void(*)(void*))(arr[i].reduce_init);
1741             p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1742             if (f_init != NULL) {
1743               f_init(p_priv[tid]);
1744             }
1745           }
1746           return p_priv[tid];
1747         }
1748       }
1749       tg = tg->parent;
1750       arr = (kmp_task_red_data_t*)(tg->reduce_data);
1751       num = tg->reduce_num_data;
1752     }
1753     KMP_ASSERT2(0, "Unknown task reduction item");
1754     return NULL; // ERROR, this line never executed
1755 }
1756 
1757 // Finalize task reduction.
1758 // Called from __kmpc_end_taskgroup()
1759 static void
1760 __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg)
1761 {
1762     kmp_int32 nth = th->th.th_team_nproc;
1763     KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
1764     kmp_task_red_data_t *arr = (kmp_task_red_data_t*)tg->reduce_data;
1765     kmp_int32 num = tg->reduce_num_data;
1766     for (int i = 0; i < num; ++i) {
1767         void *sh_data = arr[i].reduce_shar;
1768         void(*f_fini)(void*) = (void(*)(void*))(arr[i].reduce_fini);
1769         void(*f_comb)(void*,void*) = (void(*)(void*,void*))(arr[i].reduce_comb);
1770         if (!arr[i].flags.lazy_priv) {
1771             void *pr_data = arr[i].reduce_priv;
1772             size_t size = arr[i].reduce_size;
1773             for (int j = 0; j < nth; ++j) {
1774                 void * priv_data = (char*)pr_data + j * size;
1775                 f_comb(sh_data, priv_data); // combine results
1776                 if (f_fini)
1777                     f_fini(priv_data); // finalize if needed
1778             }
1779         } else {
1780             void **pr_data = (void**)(arr[i].reduce_priv);
1781             for (int j = 0; j < nth; ++j) {
1782                 if (pr_data[j] != NULL) {
1783                     f_comb(sh_data, pr_data[j]); // combine results
1784                     if (f_fini)
1785                         f_fini(pr_data[j]); // finalize if needed
1786                     __kmp_free(pr_data[j]);
1787                 }
1788             }
1789         }
1790         __kmp_free(arr[i].reduce_priv);
1791     }
1792     __kmp_thread_free(th, arr);
1793     tg->reduce_data = NULL;
1794     tg->reduce_num_data = 0;
1795 }
1796 #endif
1797 
1798 #if OMP_40_ENABLED
1799 //-------------------------------------------------------------------------------------
1800 // __kmpc_taskgroup: Start a new taskgroup
1801 
1802 void
1803 __kmpc_taskgroup( ident_t* loc, int gtid )
1804 {
1805     kmp_info_t      * thread = __kmp_threads[ gtid ];
1806     kmp_taskdata_t  * taskdata = thread->th.th_current_task;
1807     kmp_taskgroup_t * tg_new =
1808         (kmp_taskgroup_t *)__kmp_thread_malloc( thread, sizeof( kmp_taskgroup_t ) );
1809     KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new) );
1810     tg_new->count = 0;
1811     tg_new->cancel_request = cancel_noreq;
1812     tg_new->parent = taskdata->td_taskgroup;
1813 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1814 #if OMP_45_ENABLED
1815     tg_new->reduce_data = NULL;
1816     tg_new->reduce_num_data = 0;
1817 #endif
1818     taskdata->td_taskgroup = tg_new;
1819 }
1820 
1821 
1822 //-------------------------------------------------------------------------------------
1823 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1824 //                       and its descendants are complete
1825 
1826 void
1827 __kmpc_end_taskgroup( ident_t* loc, int gtid )
1828 {
1829     kmp_info_t      * thread = __kmp_threads[ gtid ];
1830     kmp_taskdata_t  * taskdata = thread->th.th_current_task;
1831     kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1832     int thread_finished = FALSE;
1833 
1834     KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
1835     KMP_DEBUG_ASSERT( taskgroup != NULL );
1836     KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
1837 
1838     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1839 #if USE_ITT_BUILD
1840         // For ITT the taskgroup wait is similar to taskwait until we need to distinguish them
1841         void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1842         if ( itt_sync_obj != NULL )
1843             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1844 #endif /* USE_ITT_BUILD */
1845 
1846 #if OMP_45_ENABLED
1847         if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
1848 #else
1849         if ( ! taskdata->td_flags.team_serial )
1850 #endif
1851         {
1852             kmp_flag_32 flag(&(taskgroup->count), 0U);
1853             while ( TCR_4(taskgroup->count) != 0 ) {
1854                 flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1855                                    USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1856             }
1857         }
1858 
1859 #if USE_ITT_BUILD
1860         if ( itt_sync_obj != NULL )
1861             __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1862 #endif /* USE_ITT_BUILD */
1863     }
1864     KMP_DEBUG_ASSERT( taskgroup->count == 0 );
1865 
1866 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1867 #if OMP_45_ENABLED
1868     if( taskgroup->reduce_data != NULL ) // need to reduce?
1869         __kmp_task_reduction_fini(thread, taskgroup);
1870 #endif
1871     // Restore parent taskgroup for the current task
1872     taskdata->td_taskgroup = taskgroup->parent;
1873     __kmp_thread_free( thread, taskgroup );
1874 
1875     KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", gtid, taskdata) );
1876     ANNOTATE_HAPPENS_AFTER(taskdata);
1877 }
1878 #endif
1879 
1880 
1881 //------------------------------------------------------
1882 // __kmp_remove_my_task: remove a task from my own deque
1883 
1884 static kmp_task_t *
1885 __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task_team,
1886                       kmp_int32 is_constrained )
1887 {
1888     kmp_task_t * task;
1889     kmp_taskdata_t * taskdata;
1890     kmp_thread_data_t *thread_data;
1891     kmp_uint32 tail;
1892 
1893     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1894     KMP_DEBUG_ASSERT( task_team -> tt.tt_threads_data != NULL ); // Caller should check this condition
1895 
1896         thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
1897 
1898     KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1899                   gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1900                   thread_data->td.td_deque_tail) );
1901 
1902     if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1903         KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1904                       gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1905                       thread_data->td.td_deque_tail) );
1906         return NULL;
1907     }
1908 
1909     __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
1910 
1911     if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1912         __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1913         KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1914                       gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1915                       thread_data->td.td_deque_tail) );
1916         return NULL;
1917     }
1918 
1919     tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(thread_data->td);  // Wrap index.
1920     taskdata = thread_data -> td.td_deque[ tail ];
1921 
1922     if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1923         // we need to check if the candidate obeys task scheduling constraint:
1924         // only child of current task can be scheduled
1925         kmp_taskdata_t * current = thread->th.th_current_task;
1926         kmp_int32        level = current->td_level;
1927         kmp_taskdata_t * parent = taskdata->td_parent;
1928         while ( parent != current && parent->td_level > level ) {
1929             parent = parent->td_parent;  // check generation up to the level of the current task
1930             KMP_DEBUG_ASSERT(parent != NULL);
1931         }
1932         if ( parent != current ) {
1933             // If the tail task is not a child, then no other child can appear in the deque.
1934             __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1935             KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1936                           gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1937                           thread_data->td.td_deque_tail) );
1938             return NULL;
1939         }
1940     }
1941 
1942     thread_data -> td.td_deque_tail = tail;
1943     TCW_4(thread_data -> td.td_deque_ntasks, thread_data -> td.td_deque_ntasks - 1);
1944 
1945     __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
1946 
1947     KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
1948                   gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1949                   thread_data->td.td_deque_tail) );
1950 
1951     task = KMP_TASKDATA_TO_TASK( taskdata );
1952     return task;
1953 }
1954 
1955 
1956 //-----------------------------------------------------------
1957 // __kmp_steal_task: remove a task from another thread's deque
1958 // Assume that calling thread has already checked existence of
1959 // task_team thread_data before calling this routine.
1960 
1961 static kmp_task_t *
1962 __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
1963                   volatile kmp_uint32 *unfinished_threads, int *thread_finished,
1964                   kmp_int32 is_constrained )
1965 {
1966     kmp_task_t * task;
1967     kmp_taskdata_t * taskdata;
1968     kmp_thread_data_t *victim_td, *threads_data;
1969     kmp_int32 victim_tid;
1970 
1971     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1972 
1973     threads_data = task_team -> tt.tt_threads_data;
1974     KMP_DEBUG_ASSERT( threads_data != NULL );  // Caller should check this condition
1975 
1976     victim_tid = victim->th.th_info.ds.ds_tid;
1977     victim_td = & threads_data[ victim_tid ];
1978 
1979     KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: task_team=%p ntasks=%d "
1980                   "head=%u tail=%u\n",
1981                   gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1982                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1983 
1984     if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) || // Caller should not check this condition
1985          (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1986     {
1987         KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: task_team=%p "
1988                       "ntasks=%d head=%u tail=%u\n",
1989                       gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1990                       victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1991         return NULL;
1992     }
1993 
1994     __kmp_acquire_bootstrap_lock( & victim_td -> td.td_deque_lock );
1995 
1996     // Check again after we acquire the lock
1997     if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) ||
1998          (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1999     {
2000         __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
2001         KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
2002                       "ntasks=%d head=%u tail=%u\n",
2003                       gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
2004                       victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
2005         return NULL;
2006     }
2007 
2008     KMP_DEBUG_ASSERT( victim_td -> td.td_deque != NULL );
2009 
2010     taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2011     if ( is_constrained ) {
2012         // we need to check if the candidate obeys task scheduling constraint:
2013         // only descendant of current task can be scheduled
2014         kmp_taskdata_t * current = __kmp_threads[ gtid ]->th.th_current_task;
2015         kmp_int32        level = current->td_level;
2016         kmp_taskdata_t * parent = taskdata->td_parent;
2017         while ( parent != current && parent->td_level > level ) {
2018             parent = parent->td_parent;  // check generation up to the level of the current task
2019             KMP_DEBUG_ASSERT(parent != NULL);
2020         }
2021         if ( parent != current ) {
2022             // If the head task is not a descendant of the current task then do not
2023             // steal it. No other task in victim's deque can be a descendant of the
2024             // current task.
2025             __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
2026             KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
2027                           "ntasks=%d head=%u tail=%u\n",
2028                           gtid, __kmp_gtid_from_thread( threads_data[victim_tid].td.td_thr ),
2029                           task_team, victim_td->td.td_deque_ntasks,
2030                           victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
2031             return NULL;
2032         }
2033     }
2034     // Bump head pointer and Wrap.
2035     victim_td->td.td_deque_head = (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2036     if (*thread_finished) {
2037         // We need to un-mark this victim as a finished victim.  This must be done before
2038         // releasing the lock, or else other threads (starting with the master victim)
2039         // might be prematurely released from the barrier!!!
2040         kmp_uint32 count;
2041 
2042         count = KMP_TEST_THEN_INC32( (kmp_int32 *)unfinished_threads );
2043 
2044         KA_TRACE(20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2045                       gtid, count + 1, task_team) );
2046 
2047         *thread_finished = FALSE;
2048     }
2049     TCW_4(victim_td -> td.td_deque_ntasks, TCR_4(victim_td -> td.td_deque_ntasks) - 1);
2050 
2051     __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
2052 
2053     KMP_COUNT_BLOCK(TASK_stolen);
2054     KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
2055                   "ntasks=%d head=%u tail=%u\n",
2056                   gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
2057                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2058                   victim_td->td.td_deque_tail) );
2059 
2060     task = KMP_TASKDATA_TO_TASK( taskdata );
2061     return task;
2062 }
2063 
2064 
2065 //-----------------------------------------------------------------------------
2066 // __kmp_execute_tasks_template: Choose and execute tasks until either the condition
// is satisfied (return true) or there are none left (return false).
2068 // final_spin is TRUE if this is the spin at the release barrier.
2069 // thread_finished indicates whether the thread is finished executing all
2070 // the tasks it has on its deque, and is at the release barrier.
2071 // spinner is the location on which to spin.
2072 // spinner == NULL means only execute a single task and return.
2073 // checker is the value to check to terminate the spin.
2074 template <class C>
2075 static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2076                                                int *thread_finished
2077                                                USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2078 {
2079     kmp_task_team_t *     task_team = thread->th.th_task_team;
2080     kmp_thread_data_t *   threads_data;
2081     kmp_task_t *          task;
2082     kmp_info_t *          other_thread;
2083     kmp_taskdata_t *      current_task = thread -> th.th_current_task;
2084     volatile kmp_uint32 * unfinished_threads;
2085     kmp_int32             nthreads, victim=-2, use_own_tasks=1, new_victim=0, tid=thread->th.th_info.ds.ds_tid;
2086 
2087     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2088     KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );
2089 
2090     if (task_team == NULL) return FALSE;
2091 
2092     KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
2093                   gtid, final_spin, *thread_finished) );
2094 
2095     thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2096     threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
2097     KMP_DEBUG_ASSERT( threads_data != NULL );
2098 
2099     nthreads = task_team -> tt.tt_nproc;
2100     unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
2101 #if OMP_45_ENABLED
2102     KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2103 #else
2104     KMP_DEBUG_ASSERT( nthreads > 1 );
2105 #endif
2106     KMP_DEBUG_ASSERT( (int)(TCR_4(*unfinished_threads)) >= 0 );
2107 
2108     while (1) { // Outer loop keeps trying to find tasks in case of single thread getting tasks from target constructs
2109         while (1) { // Inner loop to find a task and execute it
2110             task = NULL;
2111             if (use_own_tasks) { // check on own queue first
2112                 task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained );
2113             }
2114             if ((task == NULL) && (nthreads > 1)) { // Steal a task
2115                 int asleep = 1;
2116                 use_own_tasks = 0;
2117                 // Try to steal from the last place I stole from successfully.
2118                 if (victim == -2) { // haven't stolen anything yet
2119                     victim = threads_data[tid].td.td_deque_last_stolen;
2120                     if (victim != -1) // if we have a last stolen from victim, get the thread
2121                         other_thread = threads_data[victim].td.td_thr;
2122                 }
2123                 if (victim != -1) { // found last victim
2124                     asleep = 0;
2125                 }
2126                 else if (!new_victim) { // no recent steals and we haven't already used a new victim; select a random thread
2127                     do { // Find a different thread to steal work from.
2128                         // Pick a random thread. Initial plan was to cycle through all the threads, and only return if
2129                         // we tried to steal from every thread, and failed.  Arch says that's not such a great idea.
2130                         victim = __kmp_get_random(thread) % (nthreads - 1);
2131                         if (victim >= tid) {
2132                             ++victim;  // Adjusts random distribution to exclude self
2133                         }
2134                         // Found a potential victim
2135                         other_thread = threads_data[victim].td.td_thr;
2136                         // There is a slight chance that __kmp_enable_tasking() did not wake up all threads
2137                         // waiting at the barrier.  If victim is sleeping, then wake it up.  Since we were going to
2138                         // pay the cache miss penalty for referencing another thread's kmp_info_t struct anyway,
2139                         // the check shouldn't cost too much performance at this point. In extra barrier mode, tasks
2140                         // do not sleep at the separate tasking barrier, so this isn't a problem.
2141                         asleep = 0;
2142                         if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
2143                              (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2144                              (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
2145                             asleep = 1;
2146                             __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
2147                             // A sleeping thread should not have any tasks on it's queue. There is a slight
2148                             // possibility that it resumes, steals a task from another thread, which spawns more
2149                             // tasks, all in the time that it takes this thread to check => don't write an assertion
2150                             // that the victim's queue is empty.  Try stealing from a different thread.
2151                         }
2152                     } while (asleep);
2153                 }
2154 
2155                 if (!asleep) {
2156                     // We have a victim to try to steal from
2157                     task = __kmp_steal_task(other_thread, gtid, task_team, unfinished_threads, thread_finished, is_constrained);
2158                 }
2159                 if (task != NULL) { // set last stolen to victim
2160                     if (threads_data[tid].td.td_deque_last_stolen != victim) {
2161                         threads_data[tid].td.td_deque_last_stolen = victim;
2162                         // The pre-refactored code did not try more than 1 successful new vicitm,
2163                         // unless the last one generated more local tasks; new_victim keeps track of this
2164                         new_victim = 1;
2165                     }
2166                 }
2167                 else { // No tasks found; unset last_stolen
2168                     KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2169                     victim = -2; // no successful victim found
2170                 }
2171             }
2172 
2173             if (task == NULL) // break out of tasking loop
2174                 break;
2175 
2176             // Found a task; execute it
2177 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2178             if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
2179                 if ( itt_sync_obj == NULL ) { // we are at fork barrier where we could not get the object reliably
2180                     itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
2181                 }
2182                 __kmp_itt_task_starting( itt_sync_obj );
2183             }
2184 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2185             __kmp_invoke_task( gtid, task, current_task );
2186 #if USE_ITT_BUILD
2187             if ( itt_sync_obj != NULL ) __kmp_itt_task_finished( itt_sync_obj );
2188 #endif /* USE_ITT_BUILD */
2189             // If this thread is only partway through the barrier and the condition is met, then return now,
2190             // so that the barrier gather/release pattern can proceed. If this thread is in the last spin loop
2191             // in the barrier, waiting to be released, we know that the termination condition will not be
2192             // satisified, so don't waste any cycles checking it.
2193             if (flag == NULL || (!final_spin && flag->done_check())) {
2194                 KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
2195                 return TRUE;
2196             }
2197             if (thread->th.th_task_team == NULL) {
2198                 break;
2199             }
2200             KMP_YIELD( __kmp_library == library_throughput );   // Yield before executing next task
2201             // If execution of a stolen task results in more tasks being placed on our run queue, reset use_own_tasks
2202             if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2203                 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid));
2204                 use_own_tasks = 1;
2205                 new_victim = 0;
2206             }
2207         }
2208 
2209         // The task source has been exhausted. If in final spin loop of barrier, check if termination condition is satisfied.
2210 #if OMP_45_ENABLED
2211         // The work queue may be empty but there might be proxy tasks still executing
2212         if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2213 #else
2214         if (final_spin)
2215 #endif
2216         {
2217             // First, decrement the #unfinished threads, if that has not already been done.  This decrement
2218             // might be to the spin location, and result in the termination condition being satisfied.
2219             if (! *thread_finished) {
2220                 kmp_uint32 count;
2221 
2222                 count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
2223                 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec unfinished_threads to %d task_team=%p\n",
2224                               gtid, count, task_team) );
2225                 *thread_finished = TRUE;
2226             }
2227 
2228             // It is now unsafe to reference thread->th.th_team !!!
2229             // Decrementing task_team->tt.tt_unfinished_threads can allow the master thread to pass through
2230             // the barrier, where it might reset each thread's th.th_team field for the next parallel region.
2231             // If we can steal more work, we know that this has not happened yet.
2232             if (flag != NULL && flag->done_check()) {
2233                 KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
2234                 return TRUE;
2235             }
2236         }
2237 
2238         // If this thread's task team is NULL, master has recognized that there are no more tasks; bail out
2239         if (thread->th.th_task_team == NULL) {
2240             KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid) );
2241             return FALSE;
2242         }
2243 
2244 #if OMP_45_ENABLED
2245         // We could be getting tasks from target constructs; if this is the only thread, keep trying to execute
2246         // tasks from own queue
2247         if (nthreads == 1)
2248             use_own_tasks = 1;
2249         else
2250 #endif
2251         {
2252             KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid) );
2253             return FALSE;
2254         }
2255     }
2256 }
2257 
2258 int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2259                            int *thread_finished
2260                            USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2261 {
2262     return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2263                                         USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2264 }
2265 
2266 int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2267                            int *thread_finished
2268                            USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2269 {
2270     return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2271                                         USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2272 }
2273 
2274 int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2275                                int *thread_finished
2276                                USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2277 {
2278     return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2279                                         USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2280 }
2281 
2282 
2283 
2284 //-----------------------------------------------------------------------------
2285 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2286 // next barrier so they can assist in executing enqueued tasks.
2287 // First thread in allocates the task team atomically.
2288 
2289 static void
2290 __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
2291 {
2292     kmp_thread_data_t *threads_data;
2293     int nthreads, i, is_init_thread;
2294 
2295     KA_TRACE( 10, ( "__kmp_enable_tasking(enter): T#%d\n",
2296                     __kmp_gtid_from_thread( this_thr ) ) );
2297 
2298     KMP_DEBUG_ASSERT(task_team != NULL);
2299     KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2300 
2301     nthreads = task_team->tt.tt_nproc;
2302     KMP_DEBUG_ASSERT(nthreads > 0);
2303     KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2304 
2305     // Allocate or increase the size of threads_data if necessary
2306     is_init_thread = __kmp_realloc_task_threads_data( this_thr, task_team );
2307 
2308     if (!is_init_thread) {
2309         // Some other thread already set up the array.
2310         KA_TRACE( 20, ( "__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2311                         __kmp_gtid_from_thread( this_thr ) ) );
2312         return;
2313     }
2314     threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
2315     KMP_DEBUG_ASSERT( threads_data != NULL );
2316 
2317     if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
2318          ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) )
2319     {
2320         // Release any threads sleeping at the barrier, so that they can steal
2321         // tasks and execute them.  In extra barrier mode, tasks do not sleep
2322         // at the separate tasking barrier, so this isn't a problem.
2323         for (i = 0; i < nthreads; i++) {
2324             volatile void *sleep_loc;
2325             kmp_info_t *thread = threads_data[i].td.td_thr;
2326 
2327             if (i == this_thr->th.th_info.ds.ds_tid) {
2328                 continue;
2329             }
2330             // Since we haven't locked the thread's suspend mutex lock at this
2331             // point, there is a small window where a thread might be putting
2332             // itself to sleep, but hasn't set the th_sleep_loc field yet.
2333             // To work around this, __kmp_execute_tasks_template() periodically checks
2334             // see if other threads are sleeping (using the same random
2335             // mechanism that is used for task stealing) and awakens them if
2336             // they are.
2337             if ( ( sleep_loc = TCR_PTR( thread -> th.th_sleep_loc) ) != NULL )
2338             {
2339                 KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2340                                  __kmp_gtid_from_thread( this_thr ),
2341                                  __kmp_gtid_from_thread( thread ) ) );
2342                 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2343             }
2344             else {
2345                 KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2346                                  __kmp_gtid_from_thread( this_thr ),
2347                                  __kmp_gtid_from_thread( thread ) ) );
2348             }
2349         }
2350     }
2351 
2352     KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n",
2353                     __kmp_gtid_from_thread( this_thr ) ) );
2354 }
2355 
2356 
2357 /* ------------------------------------------------------------------------ */
/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
 * of like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
2366  *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access
 * to each thread in the team, so that it can steal work from it.
2371  *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
2379  *
2380  * The existence of such a struct is useful outside the context of tasking,
2381  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2382  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2383  * libraries.
2384  *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
2390  */
2391 
2392 
2393 static kmp_task_team_t *__kmp_free_task_teams = NULL;           // Free list for task_team data structures
2394 // Lock for task team data structures
2395 static kmp_bootstrap_lock_t __kmp_task_team_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_task_team_lock );
2396 
2397 
2398 //------------------------------------------------------------------------------
2399 // __kmp_alloc_task_deque:
2400 // Allocates a task deque for a particular thread, and initialize the necessary
2401 // data structures relating to the deque.  This only happens once per thread
2402 // per task team since task teams are recycled.
2403 // No lock is needed during allocation since each thread allocates its own
2404 // deque.
2405 
2406 static void
2407 __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2408 {
2409     __kmp_init_bootstrap_lock( & thread_data -> td.td_deque_lock );
2410     KMP_DEBUG_ASSERT( thread_data -> td.td_deque == NULL );
2411 
2412     // Initialize last stolen task field to "none"
2413     thread_data -> td.td_deque_last_stolen = -1;
2414 
2415     KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) == 0 );
2416     KMP_DEBUG_ASSERT( thread_data -> td.td_deque_head == 0 );
2417     KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
2418 
2419     KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2420                    __kmp_gtid_from_thread( thread ), INITIAL_TASK_DEQUE_SIZE, thread_data ) );
2421     // Allocate space for task deque, and zero the deque
2422     // Cannot use __kmp_thread_calloc() because threads not around for
2423     // kmp_reap_task_team( ).
2424     thread_data -> td.td_deque = (kmp_taskdata_t **)
2425             __kmp_allocate( INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2426 	thread_data -> td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2427 }
2428 
2429 //------------------------------------------------------------------------------
2430 // __kmp_realloc_task_deque:
2431 // Re-allocates a task deque for a particular thread, copies the content from the old deque
2432 // and adjusts the necessary data structures relating to the deque.
2433 // This operation must be done with a the deque_lock being held
2434 
2435 static void __kmp_realloc_task_deque ( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2436 {
2437     kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2438     kmp_int32 new_size = 2 * size;
2439 
2440     KE_TRACE( 10, ( "__kmp_realloc_task_deque: T#%d reallocating deque[from %d to %d] for thread_data %p\n",
2441                   __kmp_gtid_from_thread( thread ), size, new_size, thread_data ) );
2442 
2443     kmp_taskdata_t ** new_deque = (kmp_taskdata_t **) __kmp_allocate( new_size * sizeof(kmp_taskdata_t *));
2444 
2445     int i,j;
2446     for ( i = thread_data->td.td_deque_head, j = 0; j < size; i = (i+1) & TASK_DEQUE_MASK(thread_data->td), j++ )
2447        new_deque[j] = thread_data->td.td_deque[i];
2448 
2449     __kmp_free(thread_data->td.td_deque);
2450 
2451     thread_data -> td.td_deque_head = 0;
2452     thread_data -> td.td_deque_tail = size;
2453     thread_data -> td.td_deque = new_deque;
2454     thread_data -> td.td_deque_size = new_size;
2455 }
2456 
2457 //------------------------------------------------------------------------------
2458 // __kmp_free_task_deque:
2459 // Deallocates a task deque for a particular thread.
2460 // Happens at library deallocation so don't need to reset all thread data fields.
2461 
2462 static void
2463 __kmp_free_task_deque( kmp_thread_data_t *thread_data )
2464 {
2465     __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
2466 
2467     if ( thread_data -> td.td_deque != NULL ) {
2468         TCW_4(thread_data -> td.td_deque_ntasks, 0);
2469          __kmp_free( thread_data -> td.td_deque );
2470         thread_data -> td.td_deque = NULL;
2471     }
2472     __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
2473 
2474 #ifdef BUILD_TIED_TASK_STACK
2475     // GEH: Figure out what to do here for td_susp_tied_tasks
2476     if ( thread_data -> td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY ) {
2477         __kmp_free_task_stack( __kmp_thread_from_gtid( gtid ), thread_data );
2478     }
2479 #endif // BUILD_TIED_TASK_STACK
2480 }
2481 
2482 
2483 //------------------------------------------------------------------------------
2484 // __kmp_realloc_task_threads_data:
2485 // Allocates a threads_data array for a task team, either by allocating an initial
2486 // array or enlarging an existing array.  Only the first thread to get the lock
2487 // allocs or enlarges the array and re-initializes the array eleemnts.
2488 // That thread returns "TRUE", the rest return "FALSE".
2489 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2490 // The current size is given by task_team -> tt.tt_max_threads.
2491 
2492 static int
2493 __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team )
2494 {
2495     kmp_thread_data_t ** threads_data_p;
2496     kmp_int32            nthreads, maxthreads;
2497     int                  is_init_thread = FALSE;
2498 
2499     if ( TCR_4(task_team -> tt.tt_found_tasks) ) {
2500         // Already reallocated and initialized.
2501         return FALSE;
2502     }
2503 
2504     threads_data_p = & task_team -> tt.tt_threads_data;
2505     nthreads   = task_team -> tt.tt_nproc;
2506     maxthreads = task_team -> tt.tt_max_threads;
2507 
2508     // All threads must lock when they encounter the first task of the implicit task
2509     // region to make sure threads_data fields are (re)initialized before used.
2510     __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2511 
2512     if ( ! TCR_4(task_team -> tt.tt_found_tasks) ) {
2513         // first thread to enable tasking
2514         kmp_team_t *team = thread -> th.th_team;
2515         int i;
2516 
2517         is_init_thread = TRUE;
2518         if ( maxthreads < nthreads ) {
2519 
2520             if ( *threads_data_p != NULL ) {
2521                 kmp_thread_data_t *old_data = *threads_data_p;
2522                 kmp_thread_data_t *new_data = NULL;
2523 
2524                 KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d reallocating "
2525                                "threads data for task_team %p, new_size = %d, old_size = %d\n",
2526                                __kmp_gtid_from_thread( thread ), task_team,
2527                                nthreads, maxthreads ) );
2528                 // Reallocate threads_data to have more elements than current array
2529                 // Cannot use __kmp_thread_realloc() because threads not around for
2530                 // kmp_reap_task_team( ).  Note all new array entries are initialized
2531                 // to zero by __kmp_allocate().
2532                 new_data = (kmp_thread_data_t *)
2533                             __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2534                 // copy old data to new data
2535                 KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
2536                               (void *) old_data,
2537                               maxthreads * sizeof(kmp_taskdata_t *) );
2538 
2539 #ifdef BUILD_TIED_TASK_STACK
2540                 // GEH: Figure out if this is the right thing to do
2541                 for (i = maxthreads; i < nthreads; i++) {
2542                     kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2543                     __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2544                 }
2545 #endif // BUILD_TIED_TASK_STACK
2546                 // Install the new data and free the old data
2547                 (*threads_data_p) = new_data;
2548                 __kmp_free( old_data );
2549             }
2550             else {
2551                 KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d allocating "
2552                                "threads data for task_team %p, size = %d\n",
2553                                __kmp_gtid_from_thread( thread ), task_team, nthreads ) );
2554                 // Make the initial allocate for threads_data array, and zero entries
2555                 // Cannot use __kmp_thread_calloc() because threads not around for
2556                 // kmp_reap_task_team( ).
2557                 ANNOTATE_IGNORE_WRITES_BEGIN();
2558                 *threads_data_p = (kmp_thread_data_t *)
2559                                   __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2560                 ANNOTATE_IGNORE_WRITES_END();
2561 #ifdef BUILD_TIED_TASK_STACK
2562                 // GEH: Figure out if this is the right thing to do
2563                 for (i = 0; i < nthreads; i++) {
2564                     kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2565                     __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2566                 }
2567 #endif // BUILD_TIED_TASK_STACK
2568             }
2569             task_team -> tt.tt_max_threads = nthreads;
2570         }
2571         else {
2572             // If array has (more than) enough elements, go ahead and use it
2573             KMP_DEBUG_ASSERT( *threads_data_p != NULL );
2574         }
2575 
2576         // initialize threads_data pointers back to thread_info structures
2577         for (i = 0; i < nthreads; i++) {
2578             kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2579             thread_data -> td.td_thr = team -> t.t_threads[i];
2580 
2581             if ( thread_data -> td.td_deque_last_stolen >= nthreads) {
2582                 // The last stolen field survives across teams / barrier, and the number
2583                 // of threads may have changed.  It's possible (likely?) that a new
2584                 // parallel region will exhibit the same behavior as the previous region.
2585                 thread_data -> td.td_deque_last_stolen = -1;
2586             }
2587         }
2588 
2589         KMP_MB();
2590         TCW_SYNC_4(task_team -> tt.tt_found_tasks, TRUE);
2591     }
2592 
2593     __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2594     return is_init_thread;
2595 }
2596 
2597 
2598 //------------------------------------------------------------------------------
2599 // __kmp_free_task_threads_data:
2600 // Deallocates a threads_data array for a task team, including any attached
2601 // tasking deques.  Only occurs at library shutdown.
2602 
2603 static void
2604 __kmp_free_task_threads_data( kmp_task_team_t *task_team )
2605 {
2606     __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2607     if ( task_team -> tt.tt_threads_data != NULL ) {
2608         int i;
2609         for (i = 0; i < task_team->tt.tt_max_threads; i++ ) {
2610             __kmp_free_task_deque( & task_team -> tt.tt_threads_data[i] );
2611         }
2612         __kmp_free( task_team -> tt.tt_threads_data );
2613         task_team -> tt.tt_threads_data = NULL;
2614     }
2615     __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2616 }
2617 
2618 
2619 //------------------------------------------------------------------------------
2620 // __kmp_allocate_task_team:
2621 // Allocates a task team associated with a specific team, taking it from
2622 // the global task team free list if possible.  Also initializes data structures.
2623 
2624 static kmp_task_team_t *
2625 __kmp_allocate_task_team( kmp_info_t *thread, kmp_team_t *team )
2626 {
2627     kmp_task_team_t *task_team = NULL;
2628     int nthreads;
2629 
2630     KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d entering; team = %p\n",
2631                     (thread ? __kmp_gtid_from_thread( thread ) : -1), team ) );
2632 
2633     if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2634         // Take a task team from the task team pool
2635         __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2636         if (__kmp_free_task_teams != NULL) {
2637             task_team = __kmp_free_task_teams;
2638             TCW_PTR(__kmp_free_task_teams, task_team -> tt.tt_next);
2639             task_team -> tt.tt_next = NULL;
2640         }
2641         __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2642     }
2643 
2644     if (task_team == NULL) {
2645         KE_TRACE( 10, ( "__kmp_allocate_task_team: T#%d allocating "
2646                        "task team for team %p\n",
2647                        __kmp_gtid_from_thread( thread ), team ) );
2648         // Allocate a new task team if one is not available.
2649         // Cannot use __kmp_thread_malloc() because threads not around for
2650         // kmp_reap_task_team( ).
2651         task_team = (kmp_task_team_t *) __kmp_allocate( sizeof(kmp_task_team_t) );
2652         __kmp_init_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2653         //task_team -> tt.tt_threads_data = NULL;   // AC: __kmp_allocate zeroes returned memory
2654         //task_team -> tt.tt_max_threads = 0;
2655         //task_team -> tt.tt_next = NULL;
2656     }
2657 
2658     TCW_4(task_team -> tt.tt_found_tasks, FALSE);
2659 #if OMP_45_ENABLED
2660     TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
2661 #endif
2662     task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
2663 
2664     TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
2665     TCW_4( task_team -> tt.tt_active, TRUE );
2666 
2667     KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d exiting; task_team = %p unfinished_threads init'd to %d\n",
2668                     (thread ? __kmp_gtid_from_thread( thread ) : -1), task_team, task_team -> tt.tt_unfinished_threads) );
2669     return task_team;
2670 }
2671 
2672 
2673 //------------------------------------------------------------------------------
2674 // __kmp_free_task_team:
2675 // Frees the task team associated with a specific thread, and adds it
2676 // to the global task team free list.
2677 
2678 void
2679 __kmp_free_task_team( kmp_info_t *thread, kmp_task_team_t *task_team )
2680 {
2681     KA_TRACE( 20, ( "__kmp_free_task_team: T#%d task_team = %p\n",
2682                     thread ? __kmp_gtid_from_thread( thread ) : -1, task_team ) );
2683 
2684     // Put task team back on free list
2685     __kmp_acquire_bootstrap_lock( & __kmp_task_team_lock );
2686 
2687     KMP_DEBUG_ASSERT( task_team -> tt.tt_next == NULL );
2688     task_team -> tt.tt_next = __kmp_free_task_teams;
2689     TCW_PTR(__kmp_free_task_teams, task_team);
2690 
2691     __kmp_release_bootstrap_lock( & __kmp_task_team_lock );
2692 }
2693 
2694 
2695 //------------------------------------------------------------------------------
2696 // __kmp_reap_task_teams:
2697 // Free all the task teams on the task team free list.
2698 // Should only be done during library shutdown.
2699 // Cannot do anything that needs a thread structure or gtid since they are already gone.
2700 
2701 void
2702 __kmp_reap_task_teams( void )
2703 {
2704     kmp_task_team_t   *task_team;
2705 
2706     if ( TCR_PTR(__kmp_free_task_teams) != NULL ) {
2707         // Free all task_teams on the free list
2708         __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2709         while ( ( task_team = __kmp_free_task_teams ) != NULL ) {
2710             __kmp_free_task_teams = task_team -> tt.tt_next;
2711             task_team -> tt.tt_next = NULL;
2712 
2713             // Free threads_data if necessary
2714             if ( task_team -> tt.tt_threads_data != NULL ) {
2715                 __kmp_free_task_threads_data( task_team );
2716             }
2717             __kmp_free( task_team );
2718         }
2719         __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2720     }
2721 }
2722 
2723 //------------------------------------------------------------------------------
2724 // __kmp_wait_to_unref_task_teams:
2725 // Some threads could still be in the fork barrier release code, possibly
2726 // trying to steal tasks.  Wait for each thread to unreference its task team.
2727 //
2728 void
2729 __kmp_wait_to_unref_task_teams(void)
2730 {
2731     kmp_info_t *thread;
2732     kmp_uint32 spins;
2733     int done;
2734 
2735     KMP_INIT_YIELD( spins );
2736 
2737     for (;;) {
2738         done = TRUE;
2739 
2740         // TODO: GEH - this may be is wrong because some sync would be necessary
2741         //             in case threads are added to the pool during the traversal.
2742         //             Need to verify that lock for thread pool is held when calling
2743         //             this routine.
2744         for (thread = (kmp_info_t *)__kmp_thread_pool;
2745              thread != NULL;
2746              thread = thread->th.th_next_pool)
2747         {
2748 #if KMP_OS_WINDOWS
2749             DWORD exit_val;
2750 #endif
2751             if ( TCR_PTR(thread->th.th_task_team) == NULL ) {
2752                 KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2753                                __kmp_gtid_from_thread( thread ) ) );
2754                 continue;
2755             }
2756 #if KMP_OS_WINDOWS
2757             // TODO: GEH - add this check for Linux* OS / OS X* as well?
2758             if (!__kmp_is_thread_alive(thread, &exit_val)) {
2759                 thread->th.th_task_team = NULL;
2760                 continue;
2761             }
2762 #endif
2763 
2764             done = FALSE;  // Because th_task_team pointer is not NULL for this thread
2765 
2766             KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to unreference task_team\n",
2767                            __kmp_gtid_from_thread( thread ) ) );
2768 
2769             if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
2770                 volatile void *sleep_loc;
2771                 // If the thread is sleeping, awaken it.
2772                 if ( ( sleep_loc = TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) {
2773                     KA_TRACE( 10, ( "__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2774                                     __kmp_gtid_from_thread( thread ), __kmp_gtid_from_thread( thread ) ) );
2775                     __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2776                 }
2777             }
2778         }
2779         if (done) {
2780             break;
2781         }
2782 
2783         // If we are oversubscribed,
2784         // or have waited a bit (and library mode is throughput), yield.
2785         // Pause is in the following code.
2786         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2787         KMP_YIELD_SPIN( spins );        // Yields only if KMP_LIBRARY=throughput
2788     }
2789 }
2790 
2791 
2792 //------------------------------------------------------------------------------
2793 // __kmp_task_team_setup:  Create a task_team for the current team, but use
2794 // an already created, unused one if it already exists.
2795 void
2796 __kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int always )
2797 {
2798     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2799 
2800     // If this task_team hasn't been created yet, allocate it. It will be used in the region after the next.
2801     // If it exists, it is the current task team and shouldn't be touched yet as it may still be in use.
2802     if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && (always || team->t.t_nproc > 1) ) {
2803         team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team( this_thr, team );
2804         KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p for team %d at parity=%d\n",
2805                       __kmp_gtid_from_thread(this_thr), team->t.t_task_team[this_thr->th.th_task_state],
2806                       ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2807     }
2808 
2809     // After threads exit the release, they will call sync, and then point to this other task_team; make sure it is
2810     // allocated and properly initialized. As threads spin in the barrier release phase, they will continue to use the
2811     // previous task_team struct(above), until they receive the signal to stop checking for tasks (they can't safely
2812     // reference the kmp_team_t struct, which could be reallocated by the master thread). No task teams are formed for
2813     // serialized teams.
2814     if (team->t.t_nproc > 1) {
2815         int other_team = 1 - this_thr->th.th_task_state;
2816         if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
2817                 team->t.t_task_team[other_team] = __kmp_allocate_task_team( this_thr, team );
2818                 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new task_team %p for team %d at parity=%d\n",
2819                                 __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2820                               ((team != NULL) ? team->t.t_id : -1), other_team ));
2821         }
2822         else { // Leave the old task team struct in place for the upcoming region; adjust as needed
2823             kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2824             if (!task_team->tt.tt_active || team->t.t_nproc != task_team->tt.tt_nproc) {
2825                 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2826                 TCW_4(task_team->tt.tt_found_tasks, FALSE);
2827 #if OMP_45_ENABLED
2828                 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2829 #endif
2830                 TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc );
2831                 TCW_4(task_team->tt.tt_active, TRUE );
2832             }
2833             // if team size has changed, the first thread to enable tasking will realloc threads_data if necessary
2834             KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team %p for team %d at parity=%d\n",
2835                           __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2836                           ((team != NULL) ? team->t.t_id : -1), other_team ));
2837         }
2838     }
2839 }
2840 
2841 
2842 //------------------------------------------------------------------------------
2843 // __kmp_task_team_sync: Propagation of task team data from team to threads
2844 // which happens just after the release phase of a team barrier.  This may be
2845 // called by any thread, but only for teams with # threads > 1.
2846 
2847 void
2848 __kmp_task_team_sync( kmp_info_t *this_thr, kmp_team_t *team )
2849 {
2850     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2851 
2852     // Toggle the th_task_state field, to switch which task_team this thread refers to
2853     this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2854     // It is now safe to propagate the task team pointer from the team struct to the current thread.
2855     TCW_PTR(this_thr->th.th_task_team, team->t.t_task_team[this_thr->th.th_task_state]);
2856     KA_TRACE(20, ("__kmp_task_team_sync: Thread T#%d task team switched to task_team %p from Team #%d (parity=%d)\n",
2857                   __kmp_gtid_from_thread( this_thr ), this_thr->th.th_task_team,
2858                   ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2859 }
2860 
2861 
2862 //--------------------------------------------------------------------------------------------
2863 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the barrier gather
2864 // phase.  Only called by master thread if #threads in team > 1 or if proxy tasks were created.
2865 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off by passing in 0
2866 // optionally as the last argument. When wait is zero, master thread does not wait for
2867 // unfinished_threads to reach 0.
2868 void
2869 __kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
2870                       USE_ITT_BUILD_ARG(void * itt_sync_obj)
2871                       , int wait)
2872 {
2873     kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2874 
2875     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2876     KMP_DEBUG_ASSERT( task_team == this_thr->th.th_task_team );
2877 
2878     if ( ( task_team != NULL ) && KMP_TASKING_ENABLED(task_team) ) {
2879         if (wait) {
2880             KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks (for unfinished_threads to reach 0) on task_team = %p\n",
2881                           __kmp_gtid_from_thread(this_thr), task_team));
2882             // Worker threads may have dropped through to release phase, but could still be executing tasks. Wait
2883             // here for tasks to complete. To avoid memory contention, only master thread checks termination condition.
2884             kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
2885             flag.wait(this_thr, TRUE
2886                       USE_ITT_BUILD_ARG(itt_sync_obj));
2887         }
2888         // Deactivate the old task team, so that the worker threads will stop referencing it while spinning.
2889         KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2890                       "setting active to false, setting local and team's pointer to NULL\n",
2891                       __kmp_gtid_from_thread(this_thr), task_team));
2892 #if OMP_45_ENABLED
2893         KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
2894         TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
2895 #else
2896         KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
2897 #endif
2898         TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2899         KMP_MB();
2900 
2901         TCW_PTR(this_thr->th.th_task_team, NULL);
2902     }
2903 }
2904 
2905 
2906 //------------------------------------------------------------------------------
2907 // __kmp_tasking_barrier:
2908 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier.
2909 // Internal function to execute all tasks prior to a regular barrier or a
2910 // join barrier.  It is a full barrier itself, which unfortunately turns
2911 // regular barriers into double barriers and join barriers into 1 1/2
2912 // barriers.
2913 void
2914 __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
2915 {
2916     volatile kmp_uint32 *spin = &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
2917     int flag = FALSE;
2918     KMP_DEBUG_ASSERT( __kmp_tasking_mode == tskm_extra_barrier );
2919 
2920 #if USE_ITT_BUILD
2921     KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL );
2922 #endif /* USE_ITT_BUILD */
2923     kmp_flag_32 spin_flag(spin, 0U);
2924     while (! spin_flag.execute_tasks(thread, gtid, TRUE, &flag
2925                                      USE_ITT_BUILD_ARG(NULL), 0 ) ) {
2926 #if USE_ITT_BUILD
2927         // TODO: What about itt_sync_obj??
2928         KMP_FSYNC_SPIN_PREPARE( spin );
2929 #endif /* USE_ITT_BUILD */
2930 
2931         if( TCR_4(__kmp_global.g.g_done) ) {
2932             if( __kmp_global.g.g_abort )
2933                 __kmp_abort_thread( );
2934             break;
2935         }
2936         KMP_YIELD( TRUE );       // GH: We always yield here
2937     }
2938 #if USE_ITT_BUILD
2939     KMP_FSYNC_SPIN_ACQUIRED( (void*) spin );
2940 #endif /* USE_ITT_BUILD */
2941 }
2942 
2943 
2944 #if OMP_45_ENABLED
2945 
2946 /* __kmp_give_task puts a task into a given thread queue if:
2947     - the queue for that thread was created
2948     - there's space in that queue
2949 
2950     Because of this, __kmp_push_task needs to check if there's space after getting the lock
2951  */
2952 static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task, kmp_int32 pass )
2953 {
2954     kmp_taskdata_t *    taskdata = KMP_TASK_TO_TASKDATA(task);
2955     kmp_task_team_t *	task_team = taskdata->td_task_team;
2956 
2957     KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
2958 
2959     // If task_team is NULL something went really bad...
2960     KMP_DEBUG_ASSERT( task_team != NULL );
2961 
2962     bool result = false;
2963     kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
2964 
2965     if (thread_data -> td.td_deque == NULL ) {
2966         // There's no queue in this thread, go find another one
2967         // We're guaranteed that at least one thread has a queue
2968         KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
2969         return result;
2970     }
2971 
2972     if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
2973     {
2974         KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2975 
2976         // if this deque is bigger than the pass ratio give a chance to another thread
2977         if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass ) return result;
2978 
2979         __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2980         __kmp_realloc_task_deque(thread,thread_data);
2981 
2982     } else {
2983 
2984        __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2985 
2986        if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
2987        {
2988            KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2989 
2990            // if this deque is bigger than the pass ratio give a chance to another thread
2991            if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass )
2992               goto release_and_exit;
2993 
2994            __kmp_realloc_task_deque(thread,thread_data);
2995        }
2996     }
2997 
2998     // lock is held here, and there is space in the deque
2999 
3000     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
3001     // Wrap index.
3002     thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
3003     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
3004 
3005     result = true;
3006     KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", taskdata, tid ) );
3007 
3008 release_and_exit:
3009     __kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
3010 
3011      return result;
3012 }
3013 
3014 
/* The finish of a proxy task is divided in two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

    In order to run the bottom half the task gets queued back into one of the threads of the team.
    Once the td_incomplete_child_tasks counter of the parent is decremented the threads can leave the barriers.
    So, the bottom half needs to be queued before the counter is decremented. The top half is therefore divided in two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

    This creates a second race as the bottom half can free the task before the second top half is executed. To avoid this
    we use the td_incomplete_child_tasks counter of the proxy task itself to synchronize the top and bottom half.
*/
3028 
3029 static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
3030 {
3031     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
3032     KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
3033     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
3034     KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
3035 
3036     taskdata -> td_flags.complete = 1;   // mark the task as completed
3037 
3038     if ( taskdata->td_taskgroup )
3039        KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
3040 
3041     // Create an imaginary children for this task so the bottom half cannot release the task before we have completed the second top half
3042     TCI_4(taskdata->td_incomplete_child_tasks);
3043 }
3044 
3045 static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
3046 {
3047     kmp_int32 children = 0;
3048 
3049     // Predecrement simulated by "- 1" calculation
3050     children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
3051     KMP_DEBUG_ASSERT( children >= 0 );
3052 
3053     // Remove the imaginary children
3054     TCD_4(taskdata->td_incomplete_child_tasks);
3055 }
3056 
3057 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
3058 {
3059     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
3060     kmp_info_t * thread = __kmp_threads[ gtid ];
3061 
3062     KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
3063     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
3064 
3065     // We need to wait to make sure the top half is finished
3066     // Spinning here should be ok as this should happen quickly
3067     while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
3068 
3069     __kmp_release_deps(gtid,taskdata);
3070     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3071 }
3072 
3073 /*!
3074 @ingroup TASKING
3075 @param gtid Global Thread ID of encountering thread
3076 @param ptask Task which execution is completed
3077 
3078 Execute the completation of a proxy task from a thread of that is part of the team. Run first and bottom halves directly.
3079 */
3080 void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
3081 {
3082     KMP_DEBUG_ASSERT( ptask != NULL );
3083     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
3084     KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
3085 
3086     KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
3087 
3088     __kmp_first_top_half_finish_proxy(taskdata);
3089     __kmp_second_top_half_finish_proxy(taskdata);
3090     __kmp_bottom_half_finish_proxy(gtid,ptask);
3091 
3092     KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
3093 }
3094 
3095 /*!
3096 @ingroup TASKING
3097 @param ptask Task which execution is completed
3098 
3099 Execute the completation of a proxy task from a thread that could not belong to the team.
3100 */
3101 void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
3102 {
3103     KMP_DEBUG_ASSERT( ptask != NULL );
3104     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
3105 
3106     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
3107 
3108     KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
3109 
3110     __kmp_first_top_half_finish_proxy(taskdata);
3111 
3112     // Enqueue task to complete bottom half completion from a thread within the corresponding team
3113     kmp_team_t * team = taskdata->td_team;
3114     kmp_int32 nthreads = team->t.t_nproc;
3115     kmp_info_t *thread;
3116 
3117     //This should be similar to start_k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
3118     kmp_int32 start_k = 0;
3119     kmp_int32 pass = 1;
3120     kmp_int32 k = start_k;
3121 
3122     do {
3123         //For now we're just linearly trying to find a thread
3124         thread = team->t.t_threads[k];
3125         k = (k+1) % nthreads;
3126 
3127         // we did a full pass through all the threads
3128         if ( k == start_k ) pass = pass << 1;
3129 
3130     } while ( !__kmp_give_task( thread, k,  ptask, pass ) );
3131 
3132     __kmp_second_top_half_finish_proxy(taskdata);
3133 
3134     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
3135 }
3136 
3137 //---------------------------------------------------------------------------------
3138 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
3139 //
3140 // thread:   allocating thread
3141 // task_src: pointer to source task to be duplicated
3142 // returns:  a pointer to the allocated kmp_task_t structure (task).
3143 kmp_task_t *
3144 __kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
3145 {
3146     kmp_task_t     *task;
3147     kmp_taskdata_t *taskdata;
3148     kmp_taskdata_t *taskdata_src;
3149     kmp_taskdata_t *parent_task = thread->th.th_current_task;
3150     size_t shareds_offset;
3151     size_t task_size;
3152 
3153     KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
3154     taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
3155     KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be proxy task
3156     KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
3157     task_size = taskdata_src->td_size_alloc;
3158 
3159     // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3160     KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
3161     #if USE_FAST_MEMORY
3162     taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
3163     #else
3164     taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
3165     #endif /* USE_FAST_MEMORY */
3166     KMP_MEMCPY(taskdata, taskdata_src, task_size);
3167 
3168     task = KMP_TASKDATA_TO_TASK(taskdata);
3169 
3170     // Initialize new task (only specific fields not affected by memcpy)
3171     taskdata->td_task_id = KMP_GEN_TASK_ID();
3172     if( task->shareds != NULL ) { // need setup shareds pointer
3173         shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
3174         task->shareds = &((char*)taskdata)[shareds_offset];
3175         KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
3176     }
3177     taskdata->td_alloc_thread = thread;
3178     taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
3179 
3180     // Only need to keep track of child task counts if team parallel and tasking not serialized
3181     if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
3182         KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
3183         if ( parent_task->td_taskgroup )
3184             KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
3185         // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
3186         if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
3187             KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
3188     }
3189 
3190     KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3191                   thread, taskdata, taskdata->td_parent) );
3192 #if OMPT_SUPPORT
3193     __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
3194 #endif
3195     return task;
3196 }
3197 
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task).
// Arguments, in the order __kmp_taskloop_linear passes them: the newly
// duplicated task, the pattern task it was copied from, and the lastprivate
// flag (1 for the last task of the loop, 0 otherwise).
typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3202 
3203 //---------------------------------------------------------------------------------
3204 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3205 //
3206 // loc       Source location information
3207 // gtid      Global thread ID
3208 // task      Task with whole loop iteration range
3209 // lb        Pointer to loop lower bound
3210 // ub        Pointer to loop upper bound
3211 // st        Loop stride
3212 // sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3213 // grainsize Schedule value if specified
3214 // task_dup  Tasks duplication routine
3215 void
3216 __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3217                 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3218                 int sched, kmp_uint64 grainsize, void *task_dup )
3219 {
3220     KMP_COUNT_BLOCK(OMP_TASKLOOP);
3221     KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3222     p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3223     kmp_uint64 tc;
3224     kmp_uint64 lower = *lb; // compiler provides global bounds here
3225     kmp_uint64 upper = *ub;
3226     kmp_uint64 i, num_tasks = 0, extras = 0;
3227     kmp_info_t *thread = __kmp_threads[gtid];
3228     kmp_taskdata_t *current_task = thread->th.th_current_task;
3229     kmp_task_t *next_task;
3230     kmp_int32 lastpriv = 0;
3231     size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
3232     size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
3233 
3234     // compute trip count
3235     if ( st == 1 ) {   // most common case
3236         tc = upper - lower + 1;
3237     } else if ( st < 0 ) {
3238         tc = (lower - upper) / (-st) + 1;
3239     } else {       // st > 0
3240         tc = (upper - lower) / st + 1;
3241     }
3242     if(tc == 0) {
3243         KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
3244         // free the pattern task and exit
3245         __kmp_task_start( gtid, task, current_task );
3246         // do not execute anything for zero-trip loop
3247         __kmp_task_finish( gtid, task, current_task );
3248         return;
3249     }
3250 
3251     // compute num_tasks/grainsize based on the input provided
3252     switch( sched ) {
3253     case 0: // no schedule clause specified, we can choose the default
3254             // let's try to schedule (team_size*10) tasks
3255         grainsize = thread->th.th_team_nproc * 10;
3256     case 2: // num_tasks provided
3257         if( grainsize > tc ) {
3258             num_tasks = tc;   // too big num_tasks requested, adjust values
3259             grainsize = 1;
3260             extras = 0;
3261         } else {
3262             num_tasks = grainsize;
3263             grainsize = tc / num_tasks;
3264             extras = tc % num_tasks;
3265         }
3266         break;
3267     case 1: // grainsize provided
3268         if( grainsize > tc ) {
3269             num_tasks = 1;    // too big grainsize requested, adjust values
3270             grainsize = tc;
3271             extras = 0;
3272         } else {
3273             num_tasks = tc / grainsize;
3274             grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
3275             extras = tc % num_tasks;
3276         }
3277         break;
3278     default:
3279         KMP_ASSERT2(0, "unknown scheduling of taskloop");
3280     }
3281     KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3282     KMP_DEBUG_ASSERT(num_tasks > extras);
3283     KMP_DEBUG_ASSERT(num_tasks > 0);
3284     KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize %lld, extras %lld\n",
3285                   gtid, num_tasks, grainsize, extras));
3286 
3287     // Main loop, launch num_tasks tasks, assign grainsize iterations each task
3288     for( i = 0; i < num_tasks; ++i ) {
3289         kmp_uint64 chunk_minus_1;
3290         if( extras == 0 ) {
3291             chunk_minus_1 = grainsize - 1;
3292         } else {
3293             chunk_minus_1 = grainsize;
3294             --extras; // first extras iterations get bigger chunk (grainsize+1)
3295         }
3296         upper = lower + st * chunk_minus_1;
3297         if( i == num_tasks - 1 ) {
3298             // schedule the last task, set lastprivate flag
3299             lastpriv = 1;
3300 #if KMP_DEBUG
3301             if( st == 1 )
3302                 KMP_DEBUG_ASSERT(upper == *ub);
3303             else if( st > 0 )
3304                 KMP_DEBUG_ASSERT(upper+st > *ub);
3305             else
3306                 KMP_DEBUG_ASSERT(upper+st < *ub);
3307 #endif
3308         }
3309         next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3310         *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
3311         *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
3312         if( ptask_dup != NULL )
3313             ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct fistprivates, etc.
3314         KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper %lld (offsets %p %p)\n",
3315                       gtid, next_task, lower, upper, lower_offset, upper_offset));
3316         __kmp_omp_task(gtid, next_task, true); // schedule new task
3317         lower = upper + st; // adjust lower bound for the next iteration
3318     }
3319     // free the pattern task and exit
3320     __kmp_task_start( gtid, task, current_task );
3321     // do not execute the pattern task, just do bookkeeping
3322     __kmp_task_finish( gtid, task, current_task );
3323 }
3324 
3325 /*!
3326 @ingroup TASKING
3327 @param loc       Source location information
3328 @param gtid      Global thread ID
3329 @param task      Task structure
3330 @param if_val    Value of the if clause
3331 @param lb        Pointer to loop lower bound
3332 @param ub        Pointer to loop upper bound
3333 @param st        Loop stride
3334 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
3335 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3336 @param grainsize Schedule value if specified
3337 @param task_dup  Tasks duplication routine
3338 
3339 Execute the taskloop construct.
3340 */
3341 void
3342 __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3343                 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3344                 int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
3345 {
3346     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
3347     KMP_DEBUG_ASSERT( task != NULL );
3348 
3349     KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
3350         gtid, taskdata, *lb, *ub, st, grainsize, sched));
3351 
3352     // check if clause value first
3353     if( if_val == 0 ) { // if(0) specified, mark task as serial
3354         taskdata->td_flags.task_serial = 1;
3355         taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3356     }
3357     if( nogroup == 0 ) {
3358         __kmpc_taskgroup( loc, gtid );
3359     }
3360 
3361     if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
3362         __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
3363     }
3364 
3365     if( nogroup == 0 ) {
3366         __kmpc_end_taskgroup( loc, gtid );
3367     }
3368     KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3369 }
3370 
3371 #endif
3372